Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH] IPv6: DAD from bonding iface is treated as dup address from others
From: Yinglin Sun @ 2011-10-07  0:03 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: Neil Horman, David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev
In-Reply-To: <CAN17JHUo00BGbsQ0wDjhcY-wWRuGr-in-i_JBzE5__jgO5be=g@mail.gmail.com>

On Thu, Oct 6, 2011 at 3:17 PM, Yinglin Sun <Yinglin.Sun@emc.com> wrote:
>
> On Thu, Oct 6, 2011 at 12:05 PM, Jay Vosburgh <fubar@us.ibm.com> wrote:
> >
> > Neil Horman <nhorman@tuxdriver.com> wrote:
> >
> > >On Wed, Oct 05, 2011 at 08:59:10PM -0700, Yinglin Sun wrote:
> > >> Steps to reproduce this issue:
> > >> 1. create bond0 over eth0 and eth1, set the mode to balance-xor
> > >> 2. add an IPv6 address to bond0
> > >> 3. DAD packet is sent out from one slave and then is looped back from
> > >> the other slave. Therefore, it is treated as a duplicate address and
> > >> stays tentative afterwards:
> > >>    kern.info:
> > >>        Oct  5 11:50:18 testvm1 kernel: [  129.224353] bond0: IPv6 duplicate address 1234::1 detected!
> > >>
> > >> Signed-off-by: Yinglin Sun <Yinglin.Sun@emc.com>
> > >> ---
> > >>  net/ipv6/ndisc.c |   15 +++++++++++++--
> > >>  1 files changed, 13 insertions(+), 2 deletions(-)
> > >>
> > >> diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
> > >> index 9da6e02..c82f4c7 100644
> > >> --- a/net/ipv6/ndisc.c
> > >> +++ b/net/ipv6/ndisc.c
> > >> @@ -809,9 +809,10 @@ static void ndisc_recv_ns(struct sk_buff *skb)
> > >>
> > >>              if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) {
> > >>                      if (dad) {
> > >> +                            const unsigned char *sadr;
> > >> +                            sadr = skb_mac_header(skb);
> > >> +
> > >>                              if (dev->type == ARPHRD_IEEE802_TR) {
> > >> -                                    const unsigned char *sadr;
> > >> -                                    sadr = skb_mac_header(skb);
> > >>                                      if (((sadr[8] ^ dev->dev_addr[0]) & 0x7f) == 0 &&
> > >>                                          sadr[9] == dev->dev_addr[1] &&
> > >>                                          sadr[10] == dev->dev_addr[2] &&
> > >> @@ -821,6 +822,16 @@ static void ndisc_recv_ns(struct sk_buff *skb)
> > >>                                              /* looped-back to us */
> > >>                                              goto out;
> > >>                                      }
> > >> +                            } else if (dev->type == ARPHRD_ETHER) {
> > >> +                                    if (sadr[6] == dev->dev_addr[0] &&
> > >> +                                        sadr[7] == dev->dev_addr[1] &&
> > >> +                                        sadr[8] == dev->dev_addr[2] &&
> > >> +                                        sadr[9] == dev->dev_addr[3] &&
> > >> +                                        sadr[10] == dev->dev_addr[4] &&
> > >> +                                        sadr[11] == dev->dev_addr[5]) {
> > >> +                                            /* looped-back to us */
> > >> +                                            goto out;
> > >> +                                    }
> > >>                              }
> > >>
> > >>                              /*
> > >> --
> > >> 1.7.4.1
> > >>
> > >Nack, This seems like it will just completely break DAD.  What if theres another
> > >system out there with the same mac address.  A response from that system would
> > >get dropped by this filter, instead of causing The local system to stop using
> > >the address.  What you really want to do is modify
> > >bond_should_deliver_exact_match to detect this frame on the inactive slave or
> > >some such, and drop the frame there.
> >
> >        Also NACK; and adding a bit of information.  The balance-xor
> > mode is nominally expecting to interact with a switch whose ports are
> > set for etherchannel ("static link aggregation"), in which case the
> > switch will not loop the packet back around.
> >
> >        If your switch can do etherchannel, then enable it and the
> > problem should go away.  If your switch cannot do this, then you may
> > have other issues, because all of the multicast or broadcast packets
> > going out any bonding slave will loop around to another slave.  You
> > could also use 802.3ad / LACP if you switch supports that.
> >
> >        For balance-xor (or balance-rr, for that matter) mode to a
> > non-etherchannel switch, it's going to be difficult, if not impossible,
> > to modify bond_should_deliver_exact_match, because there are no inactive
> > slaves.  In this mode, bonding is expecting the switch to balance
> > incoming traffic across the ports, and not deliver looped back packets
> > or duplicates.  There are no restrictions on what type of traffic
> > (mcast, bcast, ucast) may arrive on any given port.
> >
> >        I can't think of a way to make the non-etherchannel case work
> > for balance-xor (or balance-rr) without breaking the DAD functionality
> > in the case of an actual duplicate.  I'm not aware of a way to
> > distinguish a looped back DAD probe from an actual duplicate address
> > probe elsewhere on the network.
> >
>
> Hi Neil & Jay,
>
> Thanks a lot for the comments.
>
> The use case is to add IPv6 address on the bonding interface first,
> and then set up port channel on switch. We'll hit this issue and the
> new address will stay tentative and unusable after port channel is set
> up on switch. This patch is for this valid use case.
>
> Except failover mode, all slaves are active on receiving packets, so
> we are receiving such looped back DAD and the bonding driver cannot
> ignore them. I cannot think of a way to distinguish if a DAD is looped
> back or from someone else having the same mac address. They look the
> same to the host. If there is another machine having the same mac
> address, this code path gets executed if both are doing DAD at the
> same time for the same IPv6 address. Maybe we should find out what the
> specification defines for this case?
>

RFC4862 has a discussion about this issue:
http://tools.ietf.org/html/rfc4862#appendix-A
The better solution could be to record the number of DAD sent out. If
we received more DAD packets than we sent out, there is someone else
on the network who has the same mac address and sent DAD for the same
IPv6 address. However, this solution doesn't work with bonding
interface, since all other active slaves but the one sending out DAD
will receive packet looped back. It doesn't seem there is a simple
solution for this issue.

Yinglin

^ permalink raw reply

* Re: [PATCH] IPv6: DAD from bonding iface is treated as dup address from others
From: Jay Vosburgh @ 2011-10-07  0:59 UTC (permalink / raw)
  To: Yinglin Sun
  Cc: Neil Horman, David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev
In-Reply-To: <CAN17JHWgYyrz0WFmXiyR8Ep1W-zGjeQEKCADmX6zMT=ZfQ8n7Q@mail.gmail.com>

Yinglin Sun <Yinglin.Sun@emc.com> wrote:

>On Thu, Oct 6, 2011 at 3:17 PM, Yinglin Sun <Yinglin.Sun@emc.com> wrote:
>>
>> On Thu, Oct 6, 2011 at 12:05 PM, Jay Vosburgh <fubar@us.ibm.com> wrote:
>> >
>> > Neil Horman <nhorman@tuxdriver.com> wrote:
>> >
>> > >On Wed, Oct 05, 2011 at 08:59:10PM -0700, Yinglin Sun wrote:
>> > >> Steps to reproduce this issue:
>> > >> 1. create bond0 over eth0 and eth1, set the mode to balance-xor
>> > >> 2. add an IPv6 address to bond0
>> > >> 3. DAD packet is sent out from one slave and then is looped back from
>> > >> the other slave. Therefore, it is treated as a duplicate address and
>> > >> stays tentative afterwards:
>> > >>    kern.info:
>> > >>        Oct  5 11:50:18 testvm1 kernel: [  129.224353] bond0: IPv6 duplicate address 1234::1 detected!

[...]

>> > >Nack, This seems like it will just completely break DAD.  What if theres another
>> > >system out there with the same mac address.  A response from that system would
>> > >get dropped by this filter, instead of causing The local system to stop using
>> > >the address.  What you really want to do is modify
>> > >bond_should_deliver_exact_match to detect this frame on the inactive slave or
>> > >some such, and drop the frame there.
>> >
>> >        Also NACK; and adding a bit of information.  The balance-xor
>> > mode is nominally expecting to interact with a switch whose ports are
>> > set for etherchannel ("static link aggregation"), in which case the
>> > switch will not loop the packet back around.
>> >
>> >        If your switch can do etherchannel, then enable it and the
>> > problem should go away.  If your switch cannot do this, then you may
>> > have other issues, because all of the multicast or broadcast packets
>> > going out any bonding slave will loop around to another slave.  You
>> > could also use 802.3ad / LACP if you switch supports that.
>> >
>> >        For balance-xor (or balance-rr, for that matter) mode to a
>> > non-etherchannel switch, it's going to be difficult, if not impossible,
>> > to modify bond_should_deliver_exact_match, because there are no inactive
>> > slaves.  In this mode, bonding is expecting the switch to balance
>> > incoming traffic across the ports, and not deliver looped back packets
>> > or duplicates.  There are no restrictions on what type of traffic
>> > (mcast, bcast, ucast) may arrive on any given port.
>> >
>> >        I can't think of a way to make the non-etherchannel case work
>> > for balance-xor (or balance-rr) without breaking the DAD functionality
>> > in the case of an actual duplicate.  I'm not aware of a way to
>> > distinguish a looped back DAD probe from an actual duplicate address
>> > probe elsewhere on the network.
>> >
>>
>> Hi Neil & Jay,
>>
>> Thanks a lot for the comments.
>>
>> The use case is to add IPv6 address on the bonding interface first,
>> and then set up port channel on switch. We'll hit this issue and the
>> new address will stay tentative and unusable after port channel is set
>> up on switch. This patch is for this valid use case.
>>
>> Except failover mode, all slaves are active on receiving packets, so
>> we are receiving such looped back DAD and the bonding driver cannot
>> ignore them. I cannot think of a way to distinguish if a DAD is looped
>> back or from someone else having the same mac address. They look the
>> same to the host. If there is another machine having the same mac
>> address, this code path gets executed if both are doing DAD at the
>> same time for the same IPv6 address. Maybe we should find out what the
>> specification defines for this case?
>>
>
>RFC4862 has a discussion about this issue:
>http://tools.ietf.org/html/rfc4862#appendix-A
>The better solution could be to record the number of DAD sent out. If
>we received more DAD packets than we sent out, there is someone else
>on the network who has the same mac address and sent DAD for the same
>IPv6 address. However, this solution doesn't work with bonding
>interface, since all other active slaves but the one sending out DAD
>will receive packet looped back. It doesn't seem there is a simple
>solution for this issue.

	Why are you setting up the port channel after configuring the
bond?

	As a possible workaround, if you have control over the setup
process (perhaps it's some sort of manual process), adding one slave to
the bond, leaving the other soon-to-be slaves down, then setting up the
switch, and finally adding the remaining slaves should work around the
issue, since if the bond has only one slave it won't see any looped
packets.

	Or you could bring the bond up as active-backup, then change the
mode to balance-xor once the switch is configured.

	Ultimately, though, the problem stems from the settings mismatch
between the switch and the bonding system; balance-xor is meant to
interoperate with etherchannel, and when the switch is not configured
properly, correct behavior is difficult to guarantee.

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply

* Re: [PATCH] IPv6: DAD from bonding iface is treated as dup address from others
From: Yinglin Sun @ 2011-10-07  1:24 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: Neil Horman, David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev
In-Reply-To: <7122.1317949193@death>

On Thu, Oct 6, 2011 at 5:59 PM, Jay Vosburgh <fubar@us.ibm.com> wrote:
> Yinglin Sun <Yinglin.Sun@emc.com> wrote:
>
>>On Thu, Oct 6, 2011 at 3:17 PM, Yinglin Sun <Yinglin.Sun@emc.com> wrote:
>>>
>>> On Thu, Oct 6, 2011 at 12:05 PM, Jay Vosburgh <fubar@us.ibm.com> wrote:
>>> >
>>> > Neil Horman <nhorman@tuxdriver.com> wrote:
>>> >
>>> > >On Wed, Oct 05, 2011 at 08:59:10PM -0700, Yinglin Sun wrote:
>>> > >> Steps to reproduce this issue:
>>> > >> 1. create bond0 over eth0 and eth1, set the mode to balance-xor
>>> > >> 2. add an IPv6 address to bond0
>>> > >> 3. DAD packet is sent out from one slave and then is looped back from
>>> > >> the other slave. Therefore, it is treated as a duplicate address and
>>> > >> stays tentative afterwards:
>>> > >>    kern.info:
>>> > >>        Oct  5 11:50:18 testvm1 kernel: [  129.224353] bond0: IPv6 duplicate address 1234::1 detected!
>
> [...]
>
>>> > >Nack, This seems like it will just completely break DAD.  What if theres another
>>> > >system out there with the same mac address.  A response from that system would
>>> > >get dropped by this filter, instead of causing The local system to stop using
>>> > >the address.  What you really want to do is modify
>>> > >bond_should_deliver_exact_match to detect this frame on the inactive slave or
>>> > >some such, and drop the frame there.
>>> >
>>> >        Also NACK; and adding a bit of information.  The balance-xor
>>> > mode is nominally expecting to interact with a switch whose ports are
>>> > set for etherchannel ("static link aggregation"), in which case the
>>> > switch will not loop the packet back around.
>>> >
>>> >        If your switch can do etherchannel, then enable it and the
>>> > problem should go away.  If your switch cannot do this, then you may
>>> > have other issues, because all of the multicast or broadcast packets
>>> > going out any bonding slave will loop around to another slave.  You
>>> > could also use 802.3ad / LACP if you switch supports that.
>>> >
>>> >        For balance-xor (or balance-rr, for that matter) mode to a
>>> > non-etherchannel switch, it's going to be difficult, if not impossible,
>>> > to modify bond_should_deliver_exact_match, because there are no inactive
>>> > slaves.  In this mode, bonding is expecting the switch to balance
>>> > incoming traffic across the ports, and not deliver looped back packets
>>> > or duplicates.  There are no restrictions on what type of traffic
>>> > (mcast, bcast, ucast) may arrive on any given port.
>>> >
>>> >        I can't think of a way to make the non-etherchannel case work
>>> > for balance-xor (or balance-rr) without breaking the DAD functionality
>>> > in the case of an actual duplicate.  I'm not aware of a way to
>>> > distinguish a looped back DAD probe from an actual duplicate address
>>> > probe elsewhere on the network.
>>> >
>>>
>>> Hi Neil & Jay,
>>>
>>> Thanks a lot for the comments.
>>>
>>> The use case is to add IPv6 address on the bonding interface first,
>>> and then set up port channel on switch. We'll hit this issue and the
>>> new address will stay tentative and unusable after port channel is set
>>> up on switch. This patch is for this valid use case.
>>>
>>> Except failover mode, all slaves are active on receiving packets, so
>>> we are receiving such looped back DAD and the bonding driver cannot
>>> ignore them. I cannot think of a way to distinguish if a DAD is looped
>>> back or from someone else having the same mac address. They look the
>>> same to the host. If there is another machine having the same mac
>>> address, this code path gets executed if both are doing DAD at the
>>> same time for the same IPv6 address. Maybe we should find out what the
>>> specification defines for this case?
>>>
>>
>>RFC4862 has a discussion about this issue:
>>http://tools.ietf.org/html/rfc4862#appendix-A
>>The better solution could be to record the number of DAD sent out. If
>>we received more DAD packets than we sent out, there is someone else
>>on the network who has the same mac address and sent DAD for the same
>>IPv6 address. However, this solution doesn't work with bonding
>>interface, since all other active slaves but the one sending out DAD
>>will receive packet looped back. It doesn't seem there is a simple
>>solution for this issue.
>
>        Why are you setting up the port channel after configuring the
> bond?
>
>        As a possible workaround, if you have control over the setup
> process (perhaps it's some sort of manual process), adding one slave to
> the bond, leaving the other soon-to-be slaves down, then setting up the
> switch, and finally adding the remaining slaves should work around the
> issue, since if the bond has only one slave it won't see any looped
> packets.
>
>        Or you could bring the bond up as active-backup, then change the
> mode to balance-xor once the switch is configured.
>
>        Ultimately, though, the problem stems from the settings mismatch
> between the switch and the bonding system; balance-xor is meant to
> interoperate with etherchannel, and when the switch is not configured
> properly, correct behavior is difficult to guarantee.
>

Jay,

Thanks a lot for the suggestion.

It's mainly about usability. We would like to provide customers with
consistent IPv6 configuration procedures as IPv4.  Such workarounds
could be confusing and generate customer calls.

Yinglin

^ permalink raw reply

* Re: big picture UDP/IP performance question re 2.6.18 -> 2.6.32
From: starlight @ 2011-10-07  2:24 UTC (permalink / raw)
  To: linux-kernel, netdev, Peter Zijlstra, Christoph Lameter,
	Eric Dumazet, Will

UPDATE

Hi all, I'm back with a significant update.

First, discovered that the problem with UDP was a tightening of the reverse-path filter logic in newer kernels.  To make the test work had to configure 'net.ipv4.conf.default.rp_filter = 0' in 'sysctl.conf'.  Was able to rerun all the tests in the nominal UDP sockets mode instead of packet socket mode.

Second I was intrigued by an assertion put forward that older kernels are always better than newer kernels.  So with some effort I managed to coerce 2.6.9(rhel4) into running on the Opteron 6174 CPU, though it only recognized eight of the twelve cores.


Here are the CPU results.  User and sys are expressed in jiffies.

kernel version      cpu total   user     sys        IRQ/s
2.6.18-194.8.1.el5  02:18:40  625666  206423 (24.8%)  18k
2.6.9-101.EL(rhel4) 02:31:12  689024  218198 (24.0%)  18k*
2.6.32-71.29.1.el6  02:50:14  602191  419276 (41.0%)  64k
2.6.39.4(vanilla)   02:42:35  629817  345674 (35.4%)  85k

kernel version      cpu total   user     sys        IRQ/s
2.6.18-194.8.1.el5         -       -       -            -
2.6.9-101.EL(rhel4)    +9.0%  +10.1%   +5.7%           0%*
2.6.32-71.29.1.el6    +22.7%   -3.8%   +103%        +255%
2.6.39.4(vanilla)     +17.2%   +0.7%  +67.4%        +372%

* test speed was run at 2/3rds to equalize
  the per-core frame rate since four cores
  were disabled


Here are some latency data samples.  Report interval is 10 seconds, all latency columns are in milliseconds.

2.6.18-194.8.1.el5
sample     min      max     mean    sigma
198283   0.011    3.139    0.239    0.169
206597   0.012    3.085    0.237    0.178
195939   0.012    4.220    0.266    0.211
206378   0.013    4.006    0.274    0.218
211994   0.012    3.771    0.248    0.184
222325   0.011    3.106    0.234    0.156
210871   0.011    2.693    0.254    0.177

2.6.9-101.EL(rhel4)
sample     min      max     mean    sigma
139786   0.013    3.637    0.335    0.238 
149788   0.013    3.957    0.384    0.258 
144065   0.014    7.637    0.376    0.283 
142088   0.012    3.996    0.398    0.281 
141026   0.014    4.174    0.383    0.253 
143387   0.014    4.457    0.366    0.236 
147699   0.013    4.615    0.359    0.238 

2.6.32-71.29.1.el6
sample     min      max     mean    sigma
206452   0.015    4.516    0.268    0.197
195740   0.016    4.227    0.277    0.206
206644   0.012    3.412    0.276    0.194
212008   0.012    2.569    0.269    0.182
222119   0.011    2.523    0.266    0.178
211377   0.012    2.779    0.277    0.178
214113   0.013    2.680    0.277    0.184

2.6.39.4(vanilla)
sample     min      max     mean    sigma
198530   0.012    2.736    0.274    0.147
200148   0.012    1.971    0.272    0.145
209972   0.010    2.975    0.270    0.150
219775   0.012    2.595    0.263    0.151
215601   0.015    2.554    0.276    0.153
211549   0.010    3.075    0.282    0.158
219332   0.012    2.658    0.271    0.144

2.6.9(rhel4) was a bit worse than 2.6.18(rhel5).  However the fact that it ran on eight cores instead of twelve may have subtly affected the outcome.  Also this kernel was built with the gcc 3.4.6 RH distro compiler which may generate code less well optimized for the 6174 CPU.  Therefore I view it as pretty close to a tie, but 2.6.18(rhel5) is the obviously better choice for the superior hardware support it provides.

Despite consuming a great deal more CPU, the newer kernels made a good showing on latency in this mode where only one thread wakeup occurs per arriving frame.  Both have slighter tighter latency distributions and slightly improved worst-case latencies.  2.6.39.4 is somewhat better than 2.6.32(rhel6) on all fronts.  Nonetheless I find the much higher CPU consumption to be a major negative since this reduces the overall capacity available on identical hardware.

-----

Now for the tests performed earlier and to which all my comments prior to this post apply.  Ran these only because I thought UDP was broken, but now I find I'm pleased with the added perspective they provide.  In this mode our application reads all the data from four interfaces via packet sockets (one per interface), and then requeues the packets to a pool of worker threads.  Potentially two application worker thread wakeup events occur for each frame, though wakeups only occur when work queues become empty.  The worker thread pool consists of nine threads with packets routed by similar type to maximize cache locality during processing.

2.6.18-194.8.1.el5
sample     min      max     mean    sigma
217629   0.015    1.841    0.150    0.080
217523   0.015    1.213    0.155    0.076
209183   0.014    0.624    0.129    0.066
220726   0.014    1.255    0.160    0.087
220400   0.015    3.374    0.197    0.172
238151   0.014    1.275    0.182    0.090
249239   0.016    1.399    0.197    0.093

2.6.9-101.EL(rhel4)
sample     min      max     mean    sigma
138561   0.019    3.163    0.333    0.236
144033   0.014    3.691    0.337    0.240
147437   0.016    3.802    0.320    0.226
147297   0.016    4.063    0.351    0.292
156178   0.018    3.560    0.322    0.244
166156   0.019    3.983    0.326    0.246
161930   0.017    3.441    0.311    0.197

2.6.32-71.29.1.el6
sample     min      max     mean    sigma
210340   0.017    8.734    0.344    0.638
220199   0.020    7.319    0.341    0.530
216868   0.019    6.376    0.332    0.544
211556   0.019    7.494    0.318    0.493
219462   0.014    7.472    0.344    0.545
225027   0.022    8.103    0.382    0.639
245629   0.017    8.713    0.382    0.594

2.6.39.4(vanilla)
sample     min      max     mean    sigma
253791   0.020    6.127    0.505    0.544
256739   0.020    6.960    0.535    0.577
258719   0.019    8.116    0.500    0.541
244735   0.018    6.781    0.542    0.634
250195   0.021    8.205    0.531    0.568

In this mode latency is quite a lot better with the 2.6.18(rhel5) kernel and seriously worse in the newer kernels.  Perhaps what is happening in 2.6.18(rhel) is that frames are being received on one set of cores and processed on a different set, with relatively few actual thread sleep/wake events.  The cache on each core is hot for the task it is performing, and due to the specialization of each worker thread less code cache pressure is present.  In 2.6.39.4 I can only say that the context switch rate was in the typical 200k/sec range where older kernels ran at less than half that.  Sorry now that I did not record CS rates more carefully.

Here's CPU consumption:

kernel version      cpu total   user     sys        IRQ/s
2.6.18-194.8.1.el5  02:07:16  615516  148152 (19.4%)  28k 
2.6.9-101.EL(rhel4) 02:30:22  696344  205953 (22.8%)  20k
2.6.32-71.29.1.el6  02:15:50  585276  229767 (28.1%) 163k
2.6.39.4(vanilla)   02:27:44  658074  228420 (25.7%) 165k

kernel version      cpu total   user     sys        IRQ/s
2.6.18-194.8.1.el5         -       -       -            -
2.6.9-101.EL(rhel4)   +18.1%  +13.1%   +39.0%        -29%
2.6.32-71.29.1.el6     +6.7%   -4.9%   +55.0%       +482%
2.6.39.4(vanilla)     +16.0%   +6.0%   +54.1%       +489%

The 2.6.18(rhel5) kernel performed significantly better here than in the many-threaded UDP-mode which I attribute to the close matching of the thread and core count and the reasons stated in the previous paragraph.  CPU consumption of 2.6.9(rhel4) with thirteen active threads scheduled on eight cores was about the same here as it was with the large UDP-mode thread count.

Relative to the many-threaded UDP-mode, matching of thread and core counts seems to help .32(rhel6) a great deal, but to a lesser degree for .39.

As with UDP-mode, system overhead is substantialy higher with newer kernels.

-----

Note that some tests were run with packets sockets and a large thread pool, but not all scenarios were covered and I find it less interesting than the small thread pool so it is omitted here.

Have perf reports for all 2.6.39.4 runs and a perf report for the the packet socket run on 2.6.32(rhel6).  If anyone is interested let me know and I'll post them.

-----

Overall our application runs best with 2.6.18(rhel5) in all regards excepting a slight improvement in latency distribution where, despite higher CPU consumption, the .32 and .39 kernels are better at 50% CPU load.  Naturally at higher data rates the newer kernel latency will go to pieces sooner and the absolute maximum performance attainable is substantially lower.

Until truly spectacular core counts arrive 2.6.18 remains the kernel of choice.

We would find it attractive if an option to compile newer kernels with a circa 2.6.18 O(1) scheduler was made available in the future.  Not a problem if doing so would disable any number of dependent kernel features.  Target here is HPC where in our view less is more and virtuzlization, resource allocation fairness (not to mention the SCHED_OTHER priority policy) etc. have little or no utility.  A shorter scheduler code path-length and lower cache pressure are more important than enhanced functionality.  The original O(1) scheduler appears to handle chains of related-processing thread hand-offs dramatically better.

^ permalink raw reply

* Re: big picture UDP/IP performance question re 2.6.18 -> 2.6.32
From: starlight @ 2011-10-07  2:33 UTC (permalink / raw)
  To: linux-kernel, netdev

[repost with wrapped lines]

UPDATE

Hi all, I'm back with a significant update.

First, discovered that the problem with UDP was a
tightening of the reverse-path filter logic in
newer kernels.  To make the test work had to
configure 'net.ipv4.conf.default.rp_filter = 0' in
'sysctl.conf'.  Was able to rerun all the tests in
the nominal UDP sockets mode instead of packet
socket mode.

Second I was intrigued by an assertion put forward
that older kernels are always better than newer
kernels.  So with some effort I managed to coerce
2.6.9(rhel4) into running on the Opteron 6174 CPU,
though it only recognized eight of the twelve
cores.


Here are the CPU results.  User and sys are expressed in jiffies.

kernel version      cpu total   user     sys        IRQ/s
2.6.18-194.8.1.el5  02:18:40  625666  206423 (24.8%)  18k
2.6.9-101.EL(rhel4) 02:31:12  689024  218198 (24.0%)  18k*
2.6.32-71.29.1.el6  02:50:14  602191  419276 (41.0%)  64k
2.6.39.4(vanilla)   02:42:35  629817  345674 (35.4%)  85k

kernel version      cpu total   user     sys        IRQ/s
2.6.18-194.8.1.el5         -       -       -            -
2.6.9-101.EL(rhel4)    +9.0%  +10.1%   +5.7%           0%*
2.6.32-71.29.1.el6    +22.7%   -3.8%   +103%        +255%
2.6.39.4(vanilla)     +17.2%   +0.7%  +67.4%        +372%

* test speed was run at 2/3rds to equalize
  the per-core frame rate since four cores
  were disabled


Here are some latency data samples.  Report
interval is 10 seconds, all latency columns are in
milliseconds.


2.6.18-194.8.1.el5
sample     min      max     mean    sigma
198283   0.011    3.139    0.239    0.169
206597   0.012    3.085    0.237    0.178
195939   0.012    4.220    0.266    0.211
206378   0.013    4.006    0.274    0.218
211994   0.012    3.771    0.248    0.184
222325   0.011    3.106    0.234    0.156
210871   0.011    2.693    0.254    0.177

2.6.9-101.EL(rhel4)
sample     min      max     mean    sigma
139786   0.013    3.637    0.335    0.238 
149788   0.013    3.957    0.384    0.258 
144065   0.014    7.637    0.376    0.283 
142088   0.012    3.996    0.398    0.281 
141026   0.014    4.174    0.383    0.253 
143387   0.014    4.457    0.366    0.236 
147699   0.013    4.615    0.359    0.238 

2.6.32-71.29.1.el6
sample     min      max     mean    sigma
206452   0.015    4.516    0.268    0.197
195740   0.016    4.227    0.277    0.206
206644   0.012    3.412    0.276    0.194
212008   0.012    2.569    0.269    0.182
222119   0.011    2.523    0.266    0.178
211377   0.012    2.779    0.277    0.178
214113   0.013    2.680    0.277    0.184

2.6.39.4(vanilla)
sample     min      max     mean    sigma
198530   0.012    2.736    0.274    0.147
200148   0.012    1.971    0.272    0.145
209972   0.010    2.975    0.270    0.150
219775   0.012    2.595    0.263    0.151
215601   0.015    2.554    0.276    0.153
211549   0.010    3.075    0.282    0.158
219332   0.012    2.658    0.271    0.144

2.6.9(rhel4) was a bit worse than 2.6.18(rhel5).
However the fact that it ran on eight cores
instead of twelve may have subtly affected the
outcome.  Also this kernel was built with the gcc
3.4.6 RH distro compiler which may generate code
less well optimized for the 6174 CPU.  Therefore I
view it as pretty close to a tie, but
2.6.18(rhel5) is the obviously better choice for
the superior hardware support it provides.

Despite consuming a great deal more CPU, the newer
kernels made a good showing on latency in this
mode where only one thread wakeup occurs per
arriving frame.  Both have slighter tighter
latency distributions and slightly improved
worst-case latencies.  2.6.39.4 is somewhat better
than 2.6.32(rhel6) on all fronts.  Nonetheless I
find the much higher CPU consumption to be a major
negative since this reduces the overall capacity
available on identical hardware.

-----

Now for the tests performed earlier and to which
all my comments prior to this post apply.  Ran
these only because I thought UDP was broken, but
now I find I'm pleased with the added perspective
they provide.  In this mode our application reads
all the data from four interfaces via packet
sockets (one per interface), and then requeues the
packets to a pool of worker threads.  Potentially
two application worker thread wakeup events occur
for each frame, though wakeups only occur when
work queues become empty.  The worker thread pool
consists of nine threads with packets routed by
similar type to maximize cache locality during
processing.

2.6.18-194.8.1.el5
sample     min      max     mean    sigma
217629   0.015    1.841    0.150    0.080
217523   0.015    1.213    0.155    0.076
209183   0.014    0.624    0.129    0.066
220726   0.014    1.255    0.160    0.087
220400   0.015    3.374    0.197    0.172
238151   0.014    1.275    0.182    0.090
249239   0.016    1.399    0.197    0.093

2.6.9-101.EL(rhel4)
sample     min      max     mean    sigma
138561   0.019    3.163    0.333    0.236
144033   0.014    3.691    0.337    0.240
147437   0.016    3.802    0.320    0.226
147297   0.016    4.063    0.351    0.292
156178   0.018    3.560    0.322    0.244
166156   0.019    3.983    0.326    0.246
161930   0.017    3.441    0.311    0.197

2.6.32-71.29.1.el6
sample     min      max     mean    sigma
210340   0.017    8.734    0.344    0.638
220199   0.020    7.319    0.341    0.530
216868   0.019    6.376    0.332    0.544
211556   0.019    7.494    0.318    0.493
219462   0.014    7.472    0.344    0.545
225027   0.022    8.103    0.382    0.639
245629   0.017    8.713    0.382    0.594

2.6.39.4(vanilla)
sample     min      max     mean    sigma
253791   0.020    6.127    0.505    0.544
256739   0.020    6.960    0.535    0.577
258719   0.019    8.116    0.500    0.541
244735   0.018    6.781    0.542    0.634
250195   0.021    8.205    0.531    0.568

In this mode latency is quite a lot better with
the 2.6.18(rhel5) kernel and seriously worse in
the newer kernels.  Perhaps what is happening in
2.6.18(rhel) is that frames are being received on
one set of cores and processed on a different set,
with relatively few actual thread sleep/wake
events.  The cache on each core is hot for the
task it is performing, and due to the
specialization of each worker thread less code
cache pressure is present.  In 2.6.39.4 I can only
say that the context switch rate was in the
typical 200k/sec range where older kernels ran at
less than half that.  Sorry now that I did not
record CS rates more carefully.


Here's CPU consumption:

kernel version      cpu total   user     sys        IRQ/s
2.6.18-194.8.1.el5  02:07:16  615516  148152 (19.4%)  28k 
2.6.9-101.EL(rhel4) 02:30:22  696344  205953 (22.8%)  20k
2.6.32-71.29.1.el6  02:15:50  585276  229767 (28.1%) 163k
2.6.39.4(vanilla)   02:27:44  658074  228420 (25.7%) 165k

kernel version      cpu total   user     sys        IRQ/s
2.6.18-194.8.1.el5         -       -       -            -
2.6.9-101.EL(rhel4)   +18.1%  +13.1%   +39.0%        -29%
2.6.32-71.29.1.el6     +6.7%   -4.9%   +55.0%       +482%
2.6.39.4(vanilla)     +16.0%   +6.0%   +54.1%       +489%

The 2.6.18(rhel5) kernel performed significantly
better here than in the many-threaded UDP-mode
which I attribute to the close matching of the
thread and core count and the reasons stated in
the previous paragraph.  CPU consumption of
2.6.9(rhel4) with thirteen active threads
scheduled on eight cores was about the same here
as it was with the large UDP-mode thread count.

Relative to the many-threaded UDP-mode, matching
of thread and core counts seems to help .32(rhel6)
a great deal, but to a lesser degree for .39.

As with UDP-mode, system overhead is substantialy
higher with newer kernels.

-----

Note that some tests were run with packets sockets
and a large thread pool, but not all scenarios
were covered and I find it less interesting than
the small thread pool so it is omitted here.

Have perf reports for all 2.6.39.4 runs and a perf
report for the the packet socket run on
2.6.32(rhel6).  If anyone is interested let me
know and I'll post them.

-----

Overall our application runs best with
2.6.18(rhel5) in all regards excepting a slight
improvement in latency distribution where, despite
higher CPU consumption, the .32 and .39 kernels
are better at 50% CPU load.  Naturally at higher
data rates the newer kernel latency will go to
pieces sooner and the absolute maximum performance
attainable is substantially lower.

Until truly spectacular core counts arrive 2.6.18
remains the kernel of choice.

We would find it attractive if an option to
compile newer kernels with a circa 2.6.18 O(1)
scheduler was made available in the future.  Not a
problem if doing so would disable any number of
dependent kernel features.  Target here is HPC
where in our view less is more and virtuzlization,
resource allocation fairness (not to mention the
SCHED_OTHER priority policy) etc. have little or
no utility.  A shorter scheduler code path-length
and lower cache pressure are more important than
enhanced functionality.  The original O(1)
scheduler appears to handle chains of
related-processing thread hand-offs dramatically
better.

^ permalink raw reply

* Re: big picture UDP/IP performance question re 2.6.18 -> 2.6.32
From: starlight @ 2011-10-07  3:27 UTC (permalink / raw)
  To: linux-kernel, netdev, Peter Zijlstra, Christoph Lameter,
	Eric Dumazet, Will

After writing the last post, the large
difference in IRQ rate between the older
and newer kernels caught my eye.

I wonder if the hugely lower rate in the older
kernels reflects a more agile shifting
into and out of NAPI mode by the network
bottom-half.

In this test the sending system
pulses data out on millisecond boundaries
due to the behavior of nsleep(), which
is used to establish the playback pace.

If the older kernels are switching to NAPI
for much of surge and the switching out
once the pulse falls off, it might
conceivably result in much better latency
and overall performance.

All tests were run with Intel 82571 
network interfaces and the 'e1000e'
device driver.  Some used the driver
packaged with the kernel, some used
Intel driver compiled from the source
found on sourceforge.net.  Never could
detected any difference between the two.

Since data in the production environment
also tends to arrive in bursts, I don't find
the pulsing playback behavior a detriment.

^ permalink raw reply

* Re: Bug#644281: linux-image-3.0.0-1-amd64: problem after changing mtu size on jme kernel module
From: Ben Hutchings @ 2011-10-07  4:17 UTC (permalink / raw)
  To: Guo-Fu Tseng; +Cc: 644281, kantacki, netdev
In-Reply-To: <20111004181353.3212.57206.reportbug@ASUS>

[-- Attachment #1: Type: text/plain, Size: 815 bytes --]

On Tue, 2011-10-04 at 20:13 +0200, kantacki wrote:
> Package: linux-2.6
> Version: 3.0.0-3
> Severity: important
> Tags: upstream
> 
> Hello,
> 
> When I try to execute:
> ifconfig eth0 mtu 4000 up
> or
> ifconfig eth0 mtu 7198 up
> my nic stops working and changing back to mtu 1500 does not help.
> To use my nic again I have to execute:
> rmmod jme
> modprome jme
> and the the command ifconfig eth0 mtu 1500 $IP up works and enables the network
> 
> My network card is Jmicron
[...]

Can you look into this, please?

It appears that jme_change_mtu() doesn't stop the RX path and doesn't
immediately reallocate RX buffers, and I think it should.  Maybe that's
not the problem though.

Ben.

-- 
Ben Hutchings
For every action, there is an equal and opposite criticism. - Harrison

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply

* Re: A new 40G Network driver ready to submit to the kernel tree
From: Joyce Yu - System Software @ 2011-10-07  4:39 UTC (permalink / raw)
  To: Francois Romieu; +Cc: netdev, asweisun
In-Reply-To: <20111006110056.GB27447@electric-eye.fr.zoreil.com>


> It is a bit old. It could be better to rebase it against David Miller's
> net-next tree as the drivers/net/ ethernet device tree has undergone
> some changes.
>
> The net-next tree is currently available at:
>
> git://github.com/davem330/net-next.git
>
>   
I ported the driver to the net-next tree.
>
> The remarks from 04/11 are still relevant but it will be a good start.
>
> Did it went through internal reviewing by some usual Linux folks at
> oracle (hint, hint) ?
>
>   
Yes.

What is the next step?

Thanks,
Joyce

^ permalink raw reply

* Re: A new 40G Network driver ready to submit to the kernel tree
From: Jeff Kirsher @ 2011-10-07  5:08 UTC (permalink / raw)
  To: Joyce Yu - System Software; +Cc: Francois Romieu, netdev, asweisun
In-Reply-To: <4E8E8278.90601@oracle.com>

[-- Attachment #1: Type: text/plain, Size: 672 bytes --]

On 10/06/2011 09:39 PM, Joyce Yu - System Software wrote:
>
>> It is a bit old. It could be better to rebase it against David Miller's
>> net-next tree as the drivers/net/ ethernet device tree has undergone
>> some changes.
>>
>> The net-next tree is currently available at:
>>
>> git://github.com/davem330/net-next.git
>>
>>   
> I ported the driver to the net-next tree.
>>
>> The remarks from 04/11 are still relevant but it will be a good start.
>>
>> Did it went through internal reviewing by some usual Linux folks at
>> oracle (hint, hint) ?
>>
>>   
> Yes.
>
> What is the next step? 

Submit the patches to netdev for review/acceptance.


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 900 bytes --]

^ permalink raw reply

* Re: Asserting ECN from userspace?
From: Eric Dumazet @ 2011-10-07  5:23 UTC (permalink / raw)
  To: David Täht; +Cc: netdev, bloat-devel
In-Reply-To: <4E8BF6B2.6030101@gmail.com>

Le mardi 04 octobre 2011 à 23:18 -0700, David Täht a écrit :
> No sooner had I noted (with pleasure) the kernel's new ability to
> correctly set the dscp bits on IPv6 TCP streams without messing with the
> negotiated ECN status, that I found several use cases where being able
> to assert ECN from userspace (for either ipv4, or ipv6) would be useful.
> 
> 1) Applications such as bittorrent (transmission, etc) that are much
> more aware of their overall environment could assert ECN on their UDP
> streams to indicate congestion.
> 
> 2) Test tools. It would be nice to be able, from userspace, to easily
> diagnose if ECN was working on a stream, end to end, and being able to
> set and receive the ECN bits on a less algorithmic basis (ie, not wedged
> deep within a kernel aqm such as RED or SFB)
> 
> 3) Web Proxies. A web proxy could note when it was experiencing
> congestion on one side of the proxied connection (or another) and signal
> the other side to slow down.
> 
> Ah, ECN, we hardly know ye.
> 
> as for item 1 I'm hard pressed to think of a case where setting the ECN
> bits on udp streams would introduce a security problem.
> 
> As for 2, can live without.
> 
> As for 3... perhaps a grantable network capability? A proxy could
> acquire privs to twiddle those bits before dropping root privs.
> 
> That begs the question of how to see those bits in the first place. OOB
> data?
> 
> And twiddling them, on a per stream basis, for a single packet, would
> seem to require something more robust than setsockopt/getsockopt
> (although that would work for udp streams)
> 

For UDP, its really easy to set ECT(0) or ECT(1) in your outgoing
frames, and test as well ECT(0),ECT(1),ECN in incoming frames.

For the sending part:
int tos = 0x2; /* ECT(1) */
setsockopt(fd, IP_PROTOIP, IP_TOS, &tos, sizeof(tos));

To be able to get the TOS value :

int on = 1;
setsockopt(fd, IP_PROTOIP, IP_RECVTOS, &on, sizeof(on));

RECVTOS (since Linux 2.2)
     If  enabled the IP_TOS ancillary message is passed with incoming packets.
     It contains a byte which specifies the Type of Service/Precedence
     field of the packet header.  Expects a boolean integer flag.

^ permalink raw reply

* Re: [PATCH 1/8] vxge: convert to SKB paged frag API.
From: Ian Campbell @ 2011-10-07  5:29 UTC (permalink / raw)
  To: David Miller; +Cc: mirqus@gmail.com, netdev@vger.kernel.org, jdmason@kudzu.us
In-Reply-To: <20111006.175632.863279389111584127.davem@davemloft.net>

On Thu, 2011-10-06 at 22:56 +0100, David Miller wrote:
> From: Ian Campbell <Ian.Campbell@eu.citrix.com>
> Date: Thu, 6 Oct 2011 22:25:02 +0100
> 
> > On Thu, 2011-10-06 at 22:16 +0100, David Miller wrote:
> >> From: Ian Campbell <Ian.Campbell@eu.citrix.com>
> >> Date: Thu, 6 Oct 2011 22:08:30 +0100
> >> 
> >> > You are absolutely right, I've no idea how I missed the very obvious
> >> > warning this produces. Incremental patch is below, sorry about this!
> >> 
> >> Would really love to know what your patch is against, since I
> >> fixed this problem when I commited your original patch.
> > 
> > It was against e878d78b9a74 + the original bad patch, i.e.:
> > 
> > $ git log --pretty=oneline net-next/master^..HEAD
> > 80f4b53b3d2c009689178d12de0bc108ddd580cd net: fix argument to dma_mapping_error after conversion to skb_frag_dma_map
> > 669b1ce22eedd0f7bac048299feb06c67804ed83 net: use DMA_x_DEVICE and dma_mapping_error with skb_frag_dma_map
> > e878d78b9a7403fabc89ecc93c56928b74d14f01 virtio-net: Verify page list size before fitting into skb
> > 
> > AFAICT e878d78b9a7403fabc89ecc93c56928b74d14f01 is still the head of the
> > public git://github.com/davem330/net-next master.
> 
> I'm and idiot, I didn't push it out when I left the office :-/
> 
> Sorry.

No problem.

> But when your commit appears it will have the dma_mapping_error() stuff
> fixed up, so don't worry about it. :-)

Thanks!

Ian.

^ permalink raw reply

* Re: big picture UDP/IP performance question re 2.6.18  -> 2.6.32
From: Eric Dumazet @ 2011-10-07  5:40 UTC (permalink / raw)
  To: starlight
  Cc: linux-kernel, netdev, Peter Zijlstra, Christoph Lameter,
	Willy Tarreau, Ingo Molnar, Stephen Hemminger, Benjamin LaHaise,
	Joe Perches, Chetan Loke, Con Kolivas, Serge Belyshev
In-Reply-To: <6.2.5.6.2.20111006231958.039bb570@binnacle.cx>

Le jeudi 06 octobre 2011 à 23:27 -0400, starlight@binnacle.cx a écrit :
> After writing the last post, the large
> difference in IRQ rate between the older
> and newer kernels caught my eye.
> 
> I wonder if the hugely lower rate in the older
> kernels reflects a more agile shifting
> into and out of NAPI mode by the network
> bottom-half.
> 
> In this test the sending system
> pulses data out on millisecond boundaries
> due to the behavior of nsleep(), which
> is used to establish the playback pace.
> 
> If the older kernels are switching to NAPI
> for much of surge and the switching out
> once the pulse falls off, it might
> conceivably result in much better latency
> and overall performance.
> 
> All tests were run with Intel 82571 
> network interfaces and the 'e1000e'
> device driver.  Some used the driver
> packaged with the kernel, some used
> Intel driver compiled from the source
> found on sourceforge.net.  Never could
> detected any difference between the two.
> 
> Since data in the production environment
> also tends to arrive in bursts, I don't find
> the pulsing playback behavior a detriment.
> 

Thats exactly the opposite : Your old kernel is not fast enough to
enter/exit NAPI on every incoming frame.

Instead of one IRQ per incoming frame, you have less interrupts :
A napi run processes more than 1 frame.

Now increase your incoming rate, and you'll discover a new kernel will
be able to process more frames without losses.

About your thread model :

You have one thread that reads the incoming frame, and do a distribution
on several queues based on some flow parameters. Then you wakeup a
second thread.

This kind of model is very expensive and triggers lot of false sharing.

New kernels are able to perform this fanout in kernel land.

You really should take a look at Documentation/networking/scaling.txt

[ An other way of doing this fanout is using some iptables rules :
check following commit changelog for an idea ]

commit e8648a1fdb54da1f683784b36a17aa65ea56e931
Author: Eric Dumazet <eric.dumazet@gmail.com>
Date:   Fri Jul 23 12:59:36 2010 +0200

    netfilter: add xt_cpu match
    
    In some situations a CPU match permits a better spreading of
    connections, or select targets only for a given cpu.
    
    With Remote Packet Steering or multiqueue NIC and appropriate IRQ
    affinities, we can distribute trafic on available cpus, per session.
    (all RX packets for a given flow is handled by a given cpu)
    
    Some legacy applications being not SMP friendly, one way to scale a
    server is to run multiple copies of them.
    
    Instead of randomly choosing an instance, we can use the cpu number as a
    key so that softirq handler for a whole instance is running on a single
    cpu, maximizing cache effects in TCP/UDP stacks.
    
    Using NAT for example, a four ways machine might run four copies of
    server application, using a separate listening port for each instance,
    but still presenting an unique external port :
    
    iptables -t nat -A PREROUTING -p tcp --dport 80 -m cpu --cpu 0 \
            -j REDIRECT --to-port 8080
    
    iptables -t nat -A PREROUTING -p tcp --dport 80 -m cpu --cpu 1 \
            -j REDIRECT --to-port 8081
    
    iptables -t nat -A PREROUTING -p tcp --dport 80 -m cpu --cpu 2 \
            -j REDIRECT --to-port 8082
    
    iptables -t nat -A PREROUTING -p tcp --dport 80 -m cpu --cpu 3 \
            -j REDIRECT --to-port 8083
    

^ permalink raw reply

* Re: [PATCH] IPv6: DAD from bonding iface is treated as dup address from others
From: Chuck Anderson @ 2011-10-07  6:13 UTC (permalink / raw)
  To: netdev
In-Reply-To: <CAN17JHUkeCarKf3U-0EMEyL+RJvZGV3GSQWG7+TZFy7ZvD8jYA@mail.gmail.com>

On Thu, Oct 06, 2011 at 06:24:36PM -0700, Yinglin Sun wrote:
> On Thu, Oct 6, 2011 at 5:59 PM, Jay Vosburgh <fubar@us.ibm.com> wrote:
> >        Why are you setting up the port channel after configuring the
> > bond?
> >
> >        As a possible workaround, if you have control over the setup
> > process (perhaps it's some sort of manual process), adding one slave to
> > the bond, leaving the other soon-to-be slaves down, then setting up the
> > switch, and finally adding the remaining slaves should work around the
> > issue, since if the bond has only one slave it won't see any looped
> > packets.
> >
> >        Or you could bring the bond up as active-backup, then change the
> > mode to balance-xor once the switch is configured.
> >
> >        Ultimately, though, the problem stems from the settings mismatch
> > between the switch and the bonding system; balance-xor is meant to
> > interoperate with etherchannel, and when the switch is not configured
> > properly, correct behavior is difficult to guarantee.
> >
> 
> Jay,
> 
> Thanks a lot for the suggestion.
> 
> It's mainly about usability. We would like to provide customers with
> consistent IPv6 configuration procedures as IPv4.  Such workarounds
> could be confusing and generate customer calls.

You've created/encouraged your customers to create a broken network
configuration by connecting two bonded links to a non-bonded,
non-etherchannel switch port pair.  This type of misconfiguration,
when applied to inter-switch trunks, can cause major network issues,
like looping and broadcast storms, taking down the entire network
unless something like Spanning Tree is enabled to protect against such
accidental loops.  It should be avoided at all costs.  Luckily, if the
Linux host in this case is not being used as a switch/bridge, the
impact of this might not be so bad--perhaps limited to the IPv6 DAD
issue you report.

If you want better usability and plug-n-play bonding, then require
LACP/802.3ad to be used.  Please don't encourage your customers to
connect misconfigured devices to the network, thanks.

^ permalink raw reply

* Re: big picture UDP/IP performance question re 2.6.18 -> 2.6.32
From: starlight @ 2011-10-07  6:13 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: linux-kernel, netdev, Peter Zijlstra, Christoph Lameter,
	Willy Tarreau, Ingo Molnar, Stephen Hemminger, Benjamin LaHaise,
	Joe Perches, Chetan Loke, Con Kolivas, Serge Belyshev
In-Reply-To: <1317966007.3457.47.camel@edumazet-laptop>

At 07:40 AM 10/7/2011 +0200, Eric Dumazet wrote:
>
>Thats exactly the opposite : Your old kernel is not fast enough 
>to enter/exit NAPI on every incoming frame.
>
>Instead of one IRQ per incoming frame, you have less interrupts:
>A napi run processes more than 1 frame.

Please look at the data I posted.  Batching
appears to give 80us *better* latency in this
case--with the old kernel.

>Now increase your incoming rate, and you'll discover a new 
>kernel will be able to process more frames without losses.

Data loss happens mainly in relation to CPU
utilization-per-message-rate as buffers are
configured huge at all points.  So newer
kernels break down at significantly lower
message rates than older kernels.  Determined
this last year when testing SLES 11 and
unmodified 2.6.27.

I can run a max-rate comparison for 2.6.39.4
if you like.

>About your thread model :
>
>You have one thread that reads the incoming
>frame, and do a distribution on several queues
>based on some flow parameters. Then you wakeup 
>a second thread.
>
>This kind of model is very expensive and triggers
>lot of false sharing.

Please note my use of the word "nominal" and the
overall context.  Both thread-per-socket and
dual-thread handoff handling tests were performed
with the clear observation that the former is
the production model and works best at maximum
load.

However at 50% load (the test here), the
dual-thread handoff model is the clear winner
over *all* other scenarios.

>New kernels are able to perform this fanout in kernel land.

Yes, of course I am interested in Intel's flow
director and similar solutions, netfilter especially.
2.6.32 is only recently available in commercial
deployment and I will be looking at that next up.
Mainly I'll be looking at complete kernel bypass
with 10G.  Myricom looks like it might be good.
Tested Solarflare last year and it was a bust
for high volume UDP (one thread) but I've heard
that they fixed that and will revisit.

In relation to the above observation that
less CPU-per-packet is best for avoiding
data loss, it also correlates somewhat
(though not always) with better latency.

I've used the 'e1000e' 1G network interfaces
for these tests because they work better than
the multi-queue 'igb' (Intel 82576) and
'ixgbe' (Intel 82599) in all scenarios other
than maximum-stress load.  The reason is
apparently that the old 'e1000e' driver has
shorter, more efficient code paths while
'igb' and 'ixgbe' use significantly more
CPU to process the same number of packets.
I can quantify that if it is of interest.

At present the only place where multi-queue
NICs best four 1G NICs is at breaking-point
traffic loads where asymmetries in the traffic
can't be easily redistributed by the kernel
and the resulting hot-spots are weakest-link
breakpoints.

Please understand that I am not a curmudgeonly
Luddite.  I realize that sometimes it is
necessary to trade efficiency for scalability.
All I'm doing here is trying to quantify the
current state of affairs and make recommendations
in a commercial environment.  For the moment
all the excellent enhancements designed to
permit extreme scalability are costing too
much in efficiency to be worth using in
production.  When/if Tilera delivers their
100 core CPU in volume this state of affairs
will likely change.  I imagine both Intel
and AMD have many-core solutions in the pipe
as well, though it will be interesting to see
if Tilera has the essential patents and can
surpass the two majors in the market and the
courts.

^ permalink raw reply

* Re: Bug#644281: linux-image-3.0.0-1-amd64: problem after changing mtu size on jme kernel module
From: Guo-Fu Tseng @ 2011-10-07  6:24 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: 644281, kantacki, netdev
In-Reply-To: <1317961087.4024.28.camel@deadeye>

On Fri, 07 Oct 2011 05:17:52 +0100, Ben Hutchings wrote
> On Tue, 2011-10-04 at 20:13 +0200, kantacki wrote:
> > Package: linux-2.6
> > Version: 3.0.0-3
> > Severity: important
> > Tags: upstream
> > 
> > Hello,
> > 
> > When I try to execute:
> > ifconfig eth0 mtu 4000 up
> > or
> > ifconfig eth0 mtu 7198 up
> > my nic stops working and changing back to mtu 1500 does not help.
> > To use my nic again I have to execute:
> > rmmod jme
> > modprome jme
> > and the the command ifconfig eth0 mtu 1500 $IP up works and enables the network
> > 
> > My network card is Jmicron
> [...]
> 
> Can you look into this, please?
> 
> It appears that jme_change_mtu() doesn't stop the RX path and doesn't
> immediately reallocate RX buffers, and I think it should.  Maybe that's
> not the problem though.
> 
> Ben.
> 
> -- 
> Ben Hutchings
> For every action, there is an equal and opposite criticism. - Harrison

I'll find some time to look into it this weekend.
Thanks for the report. :)

Guo-Fu Tseng

^ permalink raw reply

* Re: loopback IP alias breaks tftp?
From: Joel Sing @ 2011-10-07  7:02 UTC (permalink / raw)
  To: Josh Boyer; +Cc: Julian Anastasov, netdev
In-Reply-To: <20111006132353.GF2479@zod.bos.redhat.com>

On 7 October 2011 00:23, Josh Boyer <jwboyer@redhat.com> wrote:
>
> On Thu, Oct 06, 2011 at 12:18:44AM +0300, Julian Anastasov wrote:
> >
> >       Hello,
> >
> > On Wed, 5 Oct 2011, Josh Boyer wrote:
> >
> > > Hi All,
> > >
> > > We've had a report [1] of a change in behavior when trying to use an IP
> > > alias to tftp from a loopback device.  Apparently the steps outlined in
> > > the bug worked in 2.6.35, and broke somewhere before 2.6.38.6.
> > >
> > > I can confirm the steps fail on a 3.0 based kernel and I'm trying to do
> > > a git bisect to find the commit involved, but I thought I would send
> > > this along to see if anyone might have an idea.  (Also, I'm not really
> > > sure how valid of a usecase this was to begin with.)
> >
> >       What about commit 9fc3bbb4a752f108cf096d96640f3b548bbbce6c ?
> >
> > ipv4/route.c: respect prefsrc for local routes
> >
> > http://marc.info/?t=129412232500001&r=1&w=2
> >
> > > [1] https://bugzilla.redhat.com/show_bug.cgi?id=739534
>
> Yep.  That is exactly what my git bisect said too.
>
> So now we have a change in behavior since that commit for the usecase
> described in the bug above, but I'm unsure if that usecase was ever
> really valid or if the commit had unintentional side effects.
>
> Joel (or anyone else) can you take a look and comment?

Prior to this commit the src address in the local routing table was
completely ignored, so connecting to a local address always used the
same source and destination addresses. However, this is not what was
configured in the local routing table:

  $ ifconfig lo:0 127.0.0.2
  $ ip route show table local | grep 127.0.0.2
  local 127.0.0.2 dev lo  proto kernel  scope host  src 127.0.0.1

When an interface has an alias configured, the source address
installed in the local routing table is always the primary address for
the interface. The tftp use case you've described now breaks due to
the way that in.tftpd is determining the reply address (as documented
in the in.tftpd manual page). This means that it is now responding
from 127.0.0.1 even though the client connected to 127.0.0.2. Binding
inetd to a specific address will avoid this issue.

I have not yet looked to see if there is a specific reason for the
source address selection, however one way of restoring the previous
behaviour (whilst still respecting the configured source address)
would be to use a default source address which matches the configured
address (as is done for primary addresses). I cannot immediately think
of a reason not to do this, but I've not gone looking at the code.

Worst case scenario if you specifically need the original behaviour
then the source address can be changed in the local routing table:

  $ ip route change table local local 127.0.0.2 dev lo:0 proto kernel
scope host src 127.0.0.2
  $ ip route show table local | grep 127.0.0.2
  local 127.0.0.2 dev lo  proto kernel  scope host  src 127.0.0.2

^ permalink raw reply

* [net-next] cs89x0: Move the driver into the Cirrus dir
From: Jeff Kirsher @ 2011-10-07  7:18 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, gospo, sassmann, Russell Nelson,
	Andrew Morton
In-Reply-To: <1317971926-23145-1-git-send-email-jeffrey.t.kirsher@intel.com>

The cs89x0 driver was initial placed in the apple/ when it
should have been placed in the cirrus/.  This resolves the
issue by moving the dirver and fixing up the respective
Kconfig(s) and Makefile(s).

Thanks to Sascha for reporting the issue.

CC: Russell Nelson <nelson@crynwr.com>
CC: Andrew Morton <akpm@linux-foundation.org>
Reported-by: Sascha Hauer <s.hauer@pengutronix.de>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/apple/Kconfig              |   22 +---------------------
 drivers/net/ethernet/apple/Makefile             |    1 -
 drivers/net/ethernet/cirrus/Kconfig             |   22 +++++++++++++++++++++-
 drivers/net/ethernet/cirrus/Makefile            |    1 +
 drivers/net/ethernet/{apple => cirrus}/cs89x0.c |    0
 drivers/net/ethernet/{apple => cirrus}/cs89x0.h |    0
 6 files changed, 23 insertions(+), 23 deletions(-)
 rename drivers/net/ethernet/{apple => cirrus}/cs89x0.c (100%)
 rename drivers/net/ethernet/{apple => cirrus}/cs89x0.h (100%)

diff --git a/drivers/net/ethernet/apple/Kconfig b/drivers/net/ethernet/apple/Kconfig
index 59d5c26..90ad2c1 100644
--- a/drivers/net/ethernet/apple/Kconfig
+++ b/drivers/net/ethernet/apple/Kconfig
@@ -5,8 +5,7 @@
 config NET_VENDOR_APPLE
 	bool "Apple devices"
 	default y
-	depends on (PPC_PMAC && PPC32) || MAC || ISA || EISA || MACH_IXDP2351 \
-		   || ARCH_IXDP2X01 || MACH_MX31ADS || MACH_QQ2440
+	depends on (PPC_PMAC && PPC32) || MACE || MAC
 	---help---
 	  If you have a network (Ethernet) card belonging to this class, say Y
 	  and read the Ethernet-HOWTO, available from
@@ -75,23 +74,4 @@ config MACMACE
 	  say Y and read the Ethernet-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>.
 
-config CS89x0
-	tristate "CS89x0 support"
-	depends on (ISA || EISA || MACH_IXDP2351 \
-		|| ARCH_IXDP2X01 || MACH_MX31ADS || MACH_QQ2440)
-	---help---
-	  Support for CS89x0 chipset based Ethernet cards. If you have a
-	  network (Ethernet) card of this type, say Y and read the
-	  Ethernet-HOWTO, available from
-	  <http://www.tldp.org/docs.html#howto> as well as
-	  <file:Documentation/networking/cs89x0.txt>.
-
-	  To compile this driver as a module, choose M here. The module
-	  will be called cs89x0.
-
-config CS89x0_NONISA_IRQ
-	def_bool y
-	depends on CS89x0 != n
-	depends on MACH_IXDP2351 || ARCH_IXDP2X01 || MACH_MX31ADS || MACH_QQ2440
-
 endif # NET_VENDOR_APPLE
diff --git a/drivers/net/ethernet/apple/Makefile b/drivers/net/ethernet/apple/Makefile
index 9d30086..0d3a591 100644
--- a/drivers/net/ethernet/apple/Makefile
+++ b/drivers/net/ethernet/apple/Makefile
@@ -5,5 +5,4 @@
 obj-$(CONFIG_MACE) += mace.o
 obj-$(CONFIG_BMAC) += bmac.o
 obj-$(CONFIG_MAC89x0) += mac89x0.o
-obj-$(CONFIG_CS89x0) += cs89x0.o
 obj-$(CONFIG_MACMACE) += macmace.o
diff --git a/drivers/net/ethernet/cirrus/Kconfig b/drivers/net/ethernet/cirrus/Kconfig
index e9386ef..6cbb81c 100644
--- a/drivers/net/ethernet/cirrus/Kconfig
+++ b/drivers/net/ethernet/cirrus/Kconfig
@@ -5,7 +5,8 @@
 config NET_VENDOR_CIRRUS
 	bool "Cirrus devices"
 	default y
-	depends on ARM && ARCH_EP93XX
+	depends on ISA || EISA || MACH_IXDP2351 || ARCH_IXDP2X01 \
+		|| MACH_MX31ADS || MACH_QQ2440 || (ARM && ARCH_EP93XX)
 	---help---
 	  If you have a network (Ethernet) card belonging to this class, say Y
 	  and read the Ethernet-HOWTO, available from
@@ -18,6 +19,25 @@ config NET_VENDOR_CIRRUS
 
 if NET_VENDOR_CIRRUS
 
+config CS89x0
+	tristate "CS89x0 support"
+	depends on (ISA || EISA || MACH_IXDP2351 \
+		|| ARCH_IXDP2X01 || MACH_MX31ADS || MACH_QQ2440)
+	---help---
+	  Support for CS89x0 chipset based Ethernet cards. If you have a
+	  network (Ethernet) card of this type, say Y and read the
+	  Ethernet-HOWTO, available from
+	  <http://www.tldp.org/docs.html#howto> as well as
+	  <file:Documentation/networking/cs89x0.txt>.
+
+	  To compile this driver as a module, choose M here. The module
+	  will be called cs89x0.
+
+config CS89x0_NONISA_IRQ
+	def_bool y
+	depends on CS89x0 != n
+	depends on MACH_IXDP2351 || ARCH_IXDP2X01 || MACH_MX31ADS || MACH_QQ2440
+
 config EP93XX_ETH
 	tristate "EP93xx Ethernet support"
 	depends on ARM && ARCH_EP93XX
diff --git a/drivers/net/ethernet/cirrus/Makefile b/drivers/net/ethernet/cirrus/Makefile
index 9905ea2..14bd77e 100644
--- a/drivers/net/ethernet/cirrus/Makefile
+++ b/drivers/net/ethernet/cirrus/Makefile
@@ -2,4 +2,5 @@
 # Makefile for the Cirrus network device drivers.
 #
 
+obj-$(CONFIG_CS89x0) += cs89x0.o
 obj-$(CONFIG_EP93XX_ETH) += ep93xx_eth.o
diff --git a/drivers/net/ethernet/apple/cs89x0.c b/drivers/net/ethernet/cirrus/cs89x0.c
similarity index 100%
rename from drivers/net/ethernet/apple/cs89x0.c
rename to drivers/net/ethernet/cirrus/cs89x0.c
diff --git a/drivers/net/ethernet/apple/cs89x0.h b/drivers/net/ethernet/cirrus/cs89x0.h
similarity index 100%
rename from drivers/net/ethernet/apple/cs89x0.h
rename to drivers/net/ethernet/cirrus/cs89x0.h
-- 
1.7.6.4

^ permalink raw reply related

* [net-next 00/13][pull request] Intel Wired LAN Driver Updates
From: Jeff Kirsher @ 2011-10-07  7:18 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, gospo, sassmann

The following series contains updates to e1000, e1000e, igb and ixgbe.  Here
is a quick summary:
  - e1000: 3 conversions (timers->threads, mdelay->msleep, mutex->rtnl)
  - e1000e: fix jumbo frames on 82579
  - igb: several cleanups to reduce stack space and improve performance
  - ixgbe: bump driver ver

The following are changes since commit e878d78b9a7403fabc89ecc93c56928b74d14f01:
  virtio-net: Verify page list size before fitting into skb
and are available in the git repository at
  git://github.com/Jkirsher/net-next.git

Alexander Duyck (8):
  igb: Make Tx budget for NAPI user adjustable
  igb: split buffer_info into tx_buffer_info and rx_buffer_info
  igb: Consolidate creation of Tx context descriptors into a single
    function
  igb: Make first and tx_buffer_info->next_to_watch into pointers
  igb: Create separate functions for generating cmd_type and olinfo
  igb: Cleanup protocol handling in transmit path
  igb: Combine all flag info fields into a single tx_flags structure
  igb: consolidate creation of Tx buffer info and data descriptor

Bruce Allan (1):
  e1000e: bad short packets received when jumbos enabled on 82579

Don Skidmore (1):
  ixgbe: bump version number

Jesse Brandeburg (3):
  e1000: convert hardware management from timers to threads
  e1000: convert mdelay to msleep
  e1000: convert to private mutex from rtnl

 drivers/net/ethernet/intel/e1000/e1000.h      |   12 +-
 drivers/net/ethernet/intel/e1000/e1000_hw.c   |   22 +-
 drivers/net/ethernet/intel/e1000/e1000_main.c |  169 +++---
 drivers/net/ethernet/intel/e1000e/ich8lan.c   |    2 +-
 drivers/net/ethernet/intel/igb/e1000_82575.h  |    2 +
 drivers/net/ethernet/intel/igb/igb.h          |   54 +-
 drivers/net/ethernet/intel/igb/igb_ethtool.c  |   16 +-
 drivers/net/ethernet/intel/igb/igb_main.c     |  840 ++++++++++++++-----------
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |    4 +-
 9 files changed, 600 insertions(+), 521 deletions(-)

-- 
1.7.6.4

^ permalink raw reply

* [net-next 01/13] ixgbe: bump version number
From: Jeff Kirsher @ 2011-10-07  7:18 UTC (permalink / raw)
  To: davem; +Cc: Don Skidmore, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1317971926-23145-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Don Skidmore <donald.c.skidmore@intel.com>

Bump the version string to better match pair up with the out of tree
driver that contains the same functionality.

Signed-off-by: Don Skidmore <donald.c.skidmore@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 1f936c8..1519a23 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -56,8 +56,8 @@ char ixgbe_driver_name[] = "ixgbe";
 static const char ixgbe_driver_string[] =
 			      "Intel(R) 10 Gigabit PCI Express Network Driver";
 #define MAJ 3
-#define MIN 4
-#define BUILD 8
+#define MIN 6
+#define BUILD 7
 #define DRV_VERSION __stringify(MAJ) "." __stringify(MIN) "." \
 	__stringify(BUILD) "-k"
 const char ixgbe_driver_version[] = DRV_VERSION;
-- 
1.7.6.4

^ permalink raw reply related

* [net-next 02/13] e1000: convert hardware management from timers to threads
From: Jeff Kirsher @ 2011-10-07  7:18 UTC (permalink / raw)
  To: davem
  Cc: Jesse Brandeburg, netdev, gospo, sassmann, Thomas Gleixner,
	Tushar Dave, Jeff Kirsher
In-Reply-To: <1317971926-23145-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Jesse Brandeburg <jesse.brandeburg@intel.com>

Thomas Gleixner (tglx) reported that e1000 was delaying for many milliseconds
(using mdelay) from inside timer/interrupt context.  None of these paths are
performance critical and can be moved into threads/work items.  This patch
implements the work items and the next patch changes the mdelays to msleeps.

Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Tushar Dave <tushar.n.dave@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000/e1000.h      |   10 +-
 drivers/net/ethernet/intel/e1000/e1000_main.c |  129 ++++++++++---------------
 2 files changed, 55 insertions(+), 84 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000/e1000.h b/drivers/net/ethernet/intel/e1000/e1000.h
index 4ea87b1..fc6fbbd 100644
--- a/drivers/net/ethernet/intel/e1000/e1000.h
+++ b/drivers/net/ethernet/intel/e1000/e1000.h
@@ -214,9 +214,6 @@ struct e1000_rx_ring {
 /* board specific private data structure */
 
 struct e1000_adapter {
-	struct timer_list tx_fifo_stall_timer;
-	struct timer_list watchdog_timer;
-	struct timer_list phy_info_timer;
 	unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
 	u16 mng_vlan_id;
 	u32 bd_number;
@@ -237,7 +234,6 @@ struct e1000_adapter {
 	u16 tx_itr;
 	u16 rx_itr;
 
-	struct work_struct reset_task;
 	u8 fc_autoneg;
 
 	/* TX */
@@ -310,8 +306,10 @@ struct e1000_adapter {
 
 	bool discarding;
 
-	struct work_struct fifo_stall_task;
-	struct work_struct phy_info_task;
+	struct work_struct reset_task;
+	struct delayed_work watchdog_task;
+	struct delayed_work fifo_stall_task;
+	struct delayed_work phy_info_task;
 };
 
 enum e1000_state_t {
diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
index 4bbc05a..a0c5ea0 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_main.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
@@ -131,10 +131,8 @@ static void e1000_clean_tx_ring(struct e1000_adapter *adapter,
 static void e1000_clean_rx_ring(struct e1000_adapter *adapter,
                                 struct e1000_rx_ring *rx_ring);
 static void e1000_set_rx_mode(struct net_device *netdev);
-static void e1000_update_phy_info(unsigned long data);
 static void e1000_update_phy_info_task(struct work_struct *work);
-static void e1000_watchdog(unsigned long data);
-static void e1000_82547_tx_fifo_stall(unsigned long data);
+static void e1000_watchdog(struct work_struct *work);
 static void e1000_82547_tx_fifo_stall_task(struct work_struct *work);
 static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb,
 				    struct net_device *netdev);
@@ -493,6 +491,15 @@ out:
 	return;
 }
 
+static void e1000_down_and_stop(struct e1000_adapter *adapter)
+{
+	set_bit(__E1000_DOWN, &adapter->flags);
+	cancel_work_sync(&adapter->reset_task);
+	cancel_delayed_work_sync(&adapter->watchdog_task);
+	cancel_delayed_work_sync(&adapter->phy_info_task);
+	cancel_delayed_work_sync(&adapter->fifo_stall_task);
+}
+
 void e1000_down(struct e1000_adapter *adapter)
 {
 	struct e1000_hw *hw = &adapter->hw;
@@ -522,13 +529,9 @@ void e1000_down(struct e1000_adapter *adapter)
 	/*
 	 * Setting DOWN must be after irq_disable to prevent
 	 * a screaming interrupt.  Setting DOWN also prevents
-	 * timers and tasks from rescheduling.
+	 * tasks from rescheduling.
 	 */
-	set_bit(__E1000_DOWN, &adapter->flags);
-
-	del_timer_sync(&adapter->tx_fifo_stall_timer);
-	del_timer_sync(&adapter->watchdog_timer);
-	del_timer_sync(&adapter->phy_info_timer);
+	e1000_down_and_stop(adapter);
 
 	adapter->link_speed = 0;
 	adapter->link_duplex = 0;
@@ -1120,21 +1123,12 @@ static int __devinit e1000_probe(struct pci_dev *pdev,
 	if (!is_valid_ether_addr(netdev->perm_addr))
 		e_err(probe, "Invalid MAC Address\n");
 
-	init_timer(&adapter->tx_fifo_stall_timer);
-	adapter->tx_fifo_stall_timer.function = e1000_82547_tx_fifo_stall;
-	adapter->tx_fifo_stall_timer.data = (unsigned long)adapter;
-
-	init_timer(&adapter->watchdog_timer);
-	adapter->watchdog_timer.function = e1000_watchdog;
-	adapter->watchdog_timer.data = (unsigned long) adapter;
-
-	init_timer(&adapter->phy_info_timer);
-	adapter->phy_info_timer.function = e1000_update_phy_info;
-	adapter->phy_info_timer.data = (unsigned long)adapter;
 
-	INIT_WORK(&adapter->fifo_stall_task, e1000_82547_tx_fifo_stall_task);
+	INIT_DELAYED_WORK(&adapter->watchdog_task, e1000_watchdog);
+	INIT_DELAYED_WORK(&adapter->fifo_stall_task,
+			  e1000_82547_tx_fifo_stall_task);
+	INIT_DELAYED_WORK(&adapter->phy_info_task, e1000_update_phy_info_task);
 	INIT_WORK(&adapter->reset_task, e1000_reset_task);
-	INIT_WORK(&adapter->phy_info_task, e1000_update_phy_info_task);
 
 	e1000_check_options(adapter);
 
@@ -1279,13 +1273,7 @@ static void __devexit e1000_remove(struct pci_dev *pdev)
 	struct e1000_adapter *adapter = netdev_priv(netdev);
 	struct e1000_hw *hw = &adapter->hw;
 
-	set_bit(__E1000_DOWN, &adapter->flags);
-	del_timer_sync(&adapter->tx_fifo_stall_timer);
-	del_timer_sync(&adapter->watchdog_timer);
-	del_timer_sync(&adapter->phy_info_timer);
-
-	cancel_work_sync(&adapter->reset_task);
-
+	e1000_down_and_stop(adapter);
 	e1000_release_manageability(adapter);
 
 	unregister_netdev(netdev);
@@ -1369,7 +1357,7 @@ static int __devinit e1000_alloc_queues(struct e1000_adapter *adapter)
  * The open entry point is called when a network interface is made
  * active by the system (IFF_UP).  At this point all resources needed
  * for transmit and receive operations are allocated, the interrupt
- * handler is registered with the OS, the watchdog timer is started,
+ * handler is registered with the OS, the watchdog task is started,
  * and the stack is notified that the interface is ready.
  **/
 
@@ -2331,46 +2319,32 @@ static void e1000_set_rx_mode(struct net_device *netdev)
 	kfree(mcarray);
 }
 
-/* Need to wait a few seconds after link up to get diagnostic information from
- * the phy */
-
-static void e1000_update_phy_info(unsigned long data)
-{
-	struct e1000_adapter *adapter = (struct e1000_adapter *)data;
-	schedule_work(&adapter->phy_info_task);
-}
-
+/**
+ * e1000_update_phy_info_task - get phy info
+ * @work: work struct contained inside adapter struct
+ *
+ * Need to wait a few seconds after link up to get diagnostic information from
+ * the phy
+ */
 static void e1000_update_phy_info_task(struct work_struct *work)
 {
 	struct e1000_adapter *adapter = container_of(work,
-	                                             struct e1000_adapter,
-	                                             phy_info_task);
-	struct e1000_hw *hw = &adapter->hw;
-
+						     struct e1000_adapter,
+						     phy_info_task.work);
 	rtnl_lock();
-	e1000_phy_get_info(hw, &adapter->phy_info);
+	e1000_phy_get_info(&adapter->hw, &adapter->phy_info);
 	rtnl_unlock();
 }
 
 /**
- * e1000_82547_tx_fifo_stall - Timer Call-back
- * @data: pointer to adapter cast into an unsigned long
- **/
-static void e1000_82547_tx_fifo_stall(unsigned long data)
-{
-	struct e1000_adapter *adapter = (struct e1000_adapter *)data;
-	schedule_work(&adapter->fifo_stall_task);
-}
-
-/**
  * e1000_82547_tx_fifo_stall_task - task to complete work
  * @work: work struct contained inside adapter struct
  **/
 static void e1000_82547_tx_fifo_stall_task(struct work_struct *work)
 {
 	struct e1000_adapter *adapter = container_of(work,
-	                                             struct e1000_adapter,
-	                                             fifo_stall_task);
+						     struct e1000_adapter,
+						     fifo_stall_task.work);
 	struct e1000_hw *hw = &adapter->hw;
 	struct net_device *netdev = adapter->netdev;
 	u32 tctl;
@@ -2393,7 +2367,7 @@ static void e1000_82547_tx_fifo_stall_task(struct work_struct *work)
 			atomic_set(&adapter->tx_fifo_stall, 0);
 			netif_wake_queue(netdev);
 		} else if (!test_bit(__E1000_DOWN, &adapter->flags)) {
-			mod_timer(&adapter->tx_fifo_stall_timer, jiffies + 1);
+			schedule_delayed_work(&adapter->fifo_stall_task, 1);
 		}
 	}
 	rtnl_unlock();
@@ -2437,12 +2411,14 @@ bool e1000_has_link(struct e1000_adapter *adapter)
 }
 
 /**
- * e1000_watchdog - Timer Call-back
- * @data: pointer to adapter cast into an unsigned long
+ * e1000_watchdog - work function
+ * @work: work struct contained inside adapter struct
  **/
-static void e1000_watchdog(unsigned long data)
+static void e1000_watchdog(struct work_struct *work)
 {
-	struct e1000_adapter *adapter = (struct e1000_adapter *)data;
+	struct e1000_adapter *adapter = container_of(work,
+						     struct e1000_adapter,
+						     watchdog_task.work);
 	struct e1000_hw *hw = &adapter->hw;
 	struct net_device *netdev = adapter->netdev;
 	struct e1000_tx_ring *txdr = adapter->tx_ring;
@@ -2493,8 +2469,8 @@ static void e1000_watchdog(unsigned long data)
 
 			netif_carrier_on(netdev);
 			if (!test_bit(__E1000_DOWN, &adapter->flags))
-				mod_timer(&adapter->phy_info_timer,
-				          round_jiffies(jiffies + 2 * HZ));
+				schedule_delayed_work(&adapter->phy_info_task,
+						      2 * HZ);
 			adapter->smartspeed = 0;
 		}
 	} else {
@@ -2506,8 +2482,8 @@ static void e1000_watchdog(unsigned long data)
 			netif_carrier_off(netdev);
 
 			if (!test_bit(__E1000_DOWN, &adapter->flags))
-				mod_timer(&adapter->phy_info_timer,
-				          round_jiffies(jiffies + 2 * HZ));
+				schedule_delayed_work(&adapter->phy_info_task,
+						      2 * HZ);
 		}
 
 		e1000_smartspeed(adapter);
@@ -2563,10 +2539,9 @@ link_up:
 	/* Force detection of hung controller every watchdog period */
 	adapter->detect_tx_hung = true;
 
-	/* Reset the timer */
+	/* Reschedule the task */
 	if (!test_bit(__E1000_DOWN, &adapter->flags))
-		mod_timer(&adapter->watchdog_timer,
-		          round_jiffies(jiffies + 2 * HZ));
+		schedule_delayed_work(&adapter->watchdog_task, 2 * HZ);
 }
 
 enum latency_range {
@@ -3206,14 +3181,12 @@ static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb,
 	if (unlikely(e1000_maybe_stop_tx(netdev, tx_ring, count + 2)))
 		return NETDEV_TX_BUSY;
 
-	if (unlikely(hw->mac_type == e1000_82547)) {
-		if (unlikely(e1000_82547_fifo_workaround(adapter, skb))) {
-			netif_stop_queue(netdev);
-			if (!test_bit(__E1000_DOWN, &adapter->flags))
-				mod_timer(&adapter->tx_fifo_stall_timer,
-				          jiffies + 1);
-			return NETDEV_TX_BUSY;
-		}
+	if (unlikely((hw->mac_type == e1000_82547) &&
+		     (e1000_82547_fifo_workaround(adapter, skb)))) {
+		netif_stop_queue(netdev);
+		if (!test_bit(__E1000_DOWN, &adapter->flags))
+			schedule_delayed_work(&adapter->fifo_stall_task, 1);
+		return NETDEV_TX_BUSY;
 	}
 
 	if (vlan_tx_tag_present(skb)) {
@@ -3283,7 +3256,7 @@ static void e1000_reset_task(struct work_struct *work)
  * @netdev: network interface device structure
  *
  * Returns the address of the device statistics structure.
- * The statistics are actually updated from the timer callback.
+ * The statistics are actually updated from the watchdog.
  **/
 
 static struct net_device_stats *e1000_get_stats(struct net_device *netdev)
@@ -3551,7 +3524,7 @@ static irqreturn_t e1000_intr(int irq, void *data)
 		hw->get_link_status = 1;
 		/* guard against interrupt when we're going down */
 		if (!test_bit(__E1000_DOWN, &adapter->flags))
-			mod_timer(&adapter->watchdog_timer, jiffies + 1);
+			schedule_delayed_work(&adapter->watchdog_task, 1);
 	}
 
 	/* disable interrupts, without the synchronize_irq bit */
-- 
1.7.6.4

^ permalink raw reply related

* [net-next 03/13] e1000: convert mdelay to msleep
From: Jeff Kirsher @ 2011-10-07  7:18 UTC (permalink / raw)
  To: davem
  Cc: Jesse Brandeburg, netdev, gospo, sassmann, Thomas Gleixner,
	Tushar Dave, Jeff Kirsher
In-Reply-To: <1317971926-23145-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Jesse Brandeburg <jesse.brandeburg@intel.com>

With the previous commit, there are several functions
that are only ever called from thread context, and are
able to sleep with msleep instead of mdelay.

Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Tushar Dave <tushar.n.dave@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000/e1000_hw.c   |   22 +++++++++++-----------
 drivers/net/ethernet/intel/e1000/e1000_main.c |    2 +-
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000/e1000_hw.c b/drivers/net/ethernet/intel/e1000/e1000_hw.c
index a5a89ec..36ee76b 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_hw.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_hw.c
@@ -5385,7 +5385,7 @@ static s32 e1000_config_dsp_after_link_change(struct e1000_hw *hw, bool link_up)
 			if (ret_val)
 				return ret_val;
 
-			mdelay(20);
+			msleep(20);
 
 			ret_val = e1000_write_phy_reg(hw, 0x0000,
 						      IGP01E1000_IEEE_FORCE_GIGA);
@@ -5413,7 +5413,7 @@ static s32 e1000_config_dsp_after_link_change(struct e1000_hw *hw, bool link_up)
 			if (ret_val)
 				return ret_val;
 
-			mdelay(20);
+			msleep(20);
 
 			/* Now enable the transmitter */
 			ret_val =
@@ -5440,7 +5440,7 @@ static s32 e1000_config_dsp_after_link_change(struct e1000_hw *hw, bool link_up)
 			if (ret_val)
 				return ret_val;
 
-			mdelay(20);
+			msleep(20);
 
 			ret_val = e1000_write_phy_reg(hw, 0x0000,
 						      IGP01E1000_IEEE_FORCE_GIGA);
@@ -5457,7 +5457,7 @@ static s32 e1000_config_dsp_after_link_change(struct e1000_hw *hw, bool link_up)
 			if (ret_val)
 				return ret_val;
 
-			mdelay(20);
+			msleep(20);
 
 			/* Now enable the transmitter */
 			ret_val =
@@ -5750,26 +5750,26 @@ static s32 e1000_polarity_reversal_workaround(struct e1000_hw *hw)
 
 		if ((mii_status_reg & ~MII_SR_LINK_STATUS) == 0)
 			break;
-		mdelay(100);
+		msleep(100);
 	}
 
 	/* Recommended delay time after link has been lost */
-	mdelay(1000);
+	msleep(1000);
 
 	/* Now we will re-enable th transmitter on the PHY */
 
 	ret_val = e1000_write_phy_reg(hw, M88E1000_PHY_PAGE_SELECT, 0x0019);
 	if (ret_val)
 		return ret_val;
-	mdelay(50);
+	msleep(50);
 	ret_val = e1000_write_phy_reg(hw, M88E1000_PHY_GEN_CONTROL, 0xFFF0);
 	if (ret_val)
 		return ret_val;
-	mdelay(50);
+	msleep(50);
 	ret_val = e1000_write_phy_reg(hw, M88E1000_PHY_GEN_CONTROL, 0xFF00);
 	if (ret_val)
 		return ret_val;
-	mdelay(50);
+	msleep(50);
 	ret_val = e1000_write_phy_reg(hw, M88E1000_PHY_GEN_CONTROL, 0x0000);
 	if (ret_val)
 		return ret_val;
@@ -5794,7 +5794,7 @@ static s32 e1000_polarity_reversal_workaround(struct e1000_hw *hw)
 
 		if (mii_status_reg & MII_SR_LINK_STATUS)
 			break;
-		mdelay(100);
+		msleep(100);
 	}
 	return E1000_SUCCESS;
 }
@@ -5825,6 +5825,6 @@ static s32 e1000_get_auto_rd_done(struct e1000_hw *hw)
 static s32 e1000_get_phy_cfg_done(struct e1000_hw *hw)
 {
 	e_dbg("e1000_get_phy_cfg_done");
-	mdelay(10);
+	msleep(10);
 	return E1000_SUCCESS;
 }
diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
index a0c5ea0..6d03d76 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_main.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
@@ -485,7 +485,7 @@ static void e1000_power_down_phy(struct e1000_adapter *adapter)
 		e1000_read_phy_reg(hw, PHY_CTRL, &mii_reg);
 		mii_reg |= MII_CR_POWER_DOWN;
 		e1000_write_phy_reg(hw, PHY_CTRL, mii_reg);
-		mdelay(1);
+		msleep(1);
 	}
 out:
 	return;
-- 
1.7.6.4

^ permalink raw reply related

* [net-next 04/13] e1000: convert to private mutex from rtnl
From: Jeff Kirsher @ 2011-10-07  7:18 UTC (permalink / raw)
  To: davem
  Cc: Jesse Brandeburg, netdev, gospo, sassmann, Thomas Gleixner,
	Tushar Dave, Jeff Kirsher
In-Reply-To: <1317971926-23145-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Jesse Brandeburg <jesse.brandeburg@intel.com>

The e1000 driver when running with lockdep could run into
some possible deadlocks between the work items acquiring
rtnl and the rtnl lock being acquired before work items
were cancelled.

Use a private mutex to make sure lock ordering isn't violated.
The private mutex is only used to protect areas not generally
covered by the rtnl lock already.

Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Tushar Dave <tushar.n.dave@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000/e1000.h      |    2 +
 drivers/net/ethernet/intel/e1000/e1000_main.c |   38 +++++++++++++++++++------
 2 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000/e1000.h b/drivers/net/ethernet/intel/e1000/e1000.h
index fc6fbbd..1e15969 100644
--- a/drivers/net/ethernet/intel/e1000/e1000.h
+++ b/drivers/net/ethernet/intel/e1000/e1000.h
@@ -310,6 +310,8 @@ struct e1000_adapter {
 	struct delayed_work watchdog_task;
 	struct delayed_work fifo_stall_task;
 	struct delayed_work phy_info_task;
+
+	struct mutex mutex;
 };
 
 enum e1000_state_t {
diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
index 6d03d76..a42421f 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_main.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
@@ -546,10 +546,10 @@ static void e1000_reinit_safe(struct e1000_adapter *adapter)
 {
 	while (test_and_set_bit(__E1000_RESETTING, &adapter->flags))
 		msleep(1);
-	rtnl_lock();
+	mutex_lock(&adapter->mutex);
 	e1000_down(adapter);
 	e1000_up(adapter);
-	rtnl_unlock();
+	mutex_unlock(&adapter->mutex);
 	clear_bit(__E1000_RESETTING, &adapter->flags);
 }
 
@@ -1317,6 +1317,7 @@ static int __devinit e1000_sw_init(struct e1000_adapter *adapter)
 	e1000_irq_disable(adapter);
 
 	spin_lock_init(&adapter->stats_lock);
+	mutex_init(&adapter->mutex);
 
 	set_bit(__E1000_DOWN, &adapter->flags);
 
@@ -2331,9 +2332,11 @@ static void e1000_update_phy_info_task(struct work_struct *work)
 	struct e1000_adapter *adapter = container_of(work,
 						     struct e1000_adapter,
 						     phy_info_task.work);
-	rtnl_lock();
+	if (test_bit(__E1000_DOWN, &adapter->flags))
+		return;
+	mutex_lock(&adapter->mutex);
 	e1000_phy_get_info(&adapter->hw, &adapter->phy_info);
-	rtnl_unlock();
+	mutex_unlock(&adapter->mutex);
 }
 
 /**
@@ -2349,7 +2352,9 @@ static void e1000_82547_tx_fifo_stall_task(struct work_struct *work)
 	struct net_device *netdev = adapter->netdev;
 	u32 tctl;
 
-	rtnl_lock();
+	if (test_bit(__E1000_DOWN, &adapter->flags))
+		return;
+	mutex_lock(&adapter->mutex);
 	if (atomic_read(&adapter->tx_fifo_stall)) {
 		if ((er32(TDT) == er32(TDH)) &&
 		   (er32(TDFT) == er32(TDFH)) &&
@@ -2370,7 +2375,7 @@ static void e1000_82547_tx_fifo_stall_task(struct work_struct *work)
 			schedule_delayed_work(&adapter->fifo_stall_task, 1);
 		}
 	}
-	rtnl_unlock();
+	mutex_unlock(&adapter->mutex);
 }
 
 bool e1000_has_link(struct e1000_adapter *adapter)
@@ -2424,6 +2429,10 @@ static void e1000_watchdog(struct work_struct *work)
 	struct e1000_tx_ring *txdr = adapter->tx_ring;
 	u32 link, tctl;
 
+	if (test_bit(__E1000_DOWN, &adapter->flags))
+		return;
+
+	mutex_lock(&adapter->mutex);
 	link = e1000_has_link(adapter);
 	if ((netif_carrier_ok(netdev)) && link)
 		goto link_up;
@@ -2512,8 +2521,8 @@ link_up:
 			 * (Do the reset outside of interrupt context). */
 			adapter->tx_timeout_count++;
 			schedule_work(&adapter->reset_task);
-			/* return immediately since reset is imminent */
-			return;
+			/* exit immediately since reset is imminent */
+			goto unlock;
 		}
 	}
 
@@ -2542,6 +2551,9 @@ link_up:
 	/* Reschedule the task */
 	if (!test_bit(__E1000_DOWN, &adapter->flags))
 		schedule_delayed_work(&adapter->watchdog_task, 2 * HZ);
+
+unlock:
+	mutex_unlock(&adapter->mutex);
 }
 
 enum latency_range {
@@ -3248,6 +3260,8 @@ static void e1000_reset_task(struct work_struct *work)
 	struct e1000_adapter *adapter =
 		container_of(work, struct e1000_adapter, reset_task);
 
+	if (test_bit(__E1000_DOWN, &adapter->flags))
+		return;
 	e1000_reinit_safe(adapter);
 }
 
@@ -4702,6 +4716,8 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool *enable_wake)
 
 	netif_device_detach(netdev);
 
+	mutex_lock(&adapter->mutex);
+
 	if (netif_running(netdev)) {
 		WARN_ON(test_bit(__E1000_RESETTING, &adapter->flags));
 		e1000_down(adapter);
@@ -4709,8 +4725,10 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool *enable_wake)
 
 #ifdef CONFIG_PM
 	retval = pci_save_state(pdev);
-	if (retval)
+	if (retval) {
+		mutex_unlock(&adapter->mutex);
 		return retval;
+	}
 #endif
 
 	status = er32(STATUS);
@@ -4765,6 +4783,8 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool *enable_wake)
 	if (netif_running(netdev))
 		e1000_free_irq(adapter);
 
+	mutex_unlock(&adapter->mutex);
+
 	pci_disable_device(pdev);
 
 	return 0;
-- 
1.7.6.4

^ permalink raw reply related

* [net-next 05/13] e1000e: bad short packets received when jumbos enabled on 82579
From: Jeff Kirsher @ 2011-10-07  7:18 UTC (permalink / raw)
  To: davem; +Cc: Bruce Allan, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1317971926-23145-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Bruce Allan <bruce.w.allan@intel.com>

When short packets are received with jumbos enabled on 82579, they can be
interpreted to have a receive address that does not match any configured
address.  This is due to a hardware bug that can be worked around by
reducing the number of IPG octets added when the packet is transferred from
the PHY to the MAC.

Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
Tested-by: Jeff Pieper <jeffrey.e.pieper@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000e/ich8lan.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c
index ad34de0..4f70974 100644
--- a/drivers/net/ethernet/intel/e1000e/ich8lan.c
+++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c
@@ -1578,7 +1578,7 @@ s32 e1000_lv_jumbo_workaround_ich8lan(struct e1000_hw *hw, bool enable)
 		ret_val = e1e_wphy(hw, PHY_REG(776, 20), data);
 		if (ret_val)
 			goto out;
-		ret_val = e1e_wphy(hw, PHY_REG(776, 23), 0xFE00);
+		ret_val = e1e_wphy(hw, PHY_REG(776, 23), 0xF100);
 		if (ret_val)
 			goto out;
 		e1e_rphy(hw, HV_PM_CTRL, &data);
-- 
1.7.6.4

^ permalink raw reply related

* [net-next 06/13] igb: Make Tx budget for NAPI user adjustable
From: Jeff Kirsher @ 2011-10-07  7:18 UTC (permalink / raw)
  To: davem; +Cc: Alexander Duyck, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1317971926-23145-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This change is to make the NAPI budget limits for transmit
adjustable.  Currently they are only set to 128, and when
the changes/improvements to NAPI occur to allow for adjustability,
it would be possible to tune the value for optimal
performance with applications such as routing.

v2: remove tie between NAPI and interrupt moderation
    fix work limit define name (s/IXGBE/IGB/)
    Update patch description to better reflect patch

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
---
 drivers/net/ethernet/intel/igb/igb.h         |    3 +
 drivers/net/ethernet/intel/igb/igb_ethtool.c |    1 +
 drivers/net/ethernet/intel/igb/igb_main.c    |  136 ++++++++++++++++----------
 3 files changed, 87 insertions(+), 53 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index b725937..beab918 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -47,6 +47,7 @@ struct igb_adapter;
 
 /* TX/RX descriptor defines */
 #define IGB_DEFAULT_TXD                  256
+#define IGB_DEFAULT_TX_WORK		 128
 #define IGB_MIN_TXD                       80
 #define IGB_MAX_TXD                     4096
 
@@ -177,6 +178,7 @@ struct igb_q_vector {
 
 	u32 eims_value;
 	u16 cpu;
+	u16 tx_work_limit;
 
 	u16 itr_val;
 	u8 set_itr;
@@ -266,6 +268,7 @@ struct igb_adapter {
 	u16 rx_itr;
 
 	/* TX */
+	u16 tx_work_limit;
 	u32 tx_timeout_count;
 	int num_tx_queues;
 	struct igb_ring *tx_ring[16];
diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index f231d82..a445c4f 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -2011,6 +2011,7 @@ static int igb_set_coalesce(struct net_device *netdev,
 
 	for (i = 0; i < adapter->num_q_vectors; i++) {
 		struct igb_q_vector *q_vector = adapter->q_vector[i];
+		q_vector->tx_work_limit = adapter->tx_work_limit;
 		if (q_vector->rx_ring)
 			q_vector->itr_val = adapter->rx_itr_setting;
 		else
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 7ad25e8..12faa99 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -136,8 +136,8 @@ static irqreturn_t igb_msix_ring(int irq, void *);
 static void igb_update_dca(struct igb_q_vector *);
 static void igb_setup_dca(struct igb_adapter *);
 #endif /* CONFIG_IGB_DCA */
-static bool igb_clean_tx_irq(struct igb_q_vector *);
 static int igb_poll(struct napi_struct *, int);
+static bool igb_clean_tx_irq(struct igb_q_vector *);
 static bool igb_clean_rx_irq(struct igb_q_vector *, int);
 static int igb_ioctl(struct net_device *, struct ifreq *, int cmd);
 static void igb_tx_timeout(struct net_device *);
@@ -1120,6 +1120,7 @@ static void igb_map_tx_ring_to_vector(struct igb_adapter *adapter,
 	q_vector->tx_ring = adapter->tx_ring[ring_idx];
 	q_vector->tx_ring->q_vector = q_vector;
 	q_vector->itr_val = adapter->tx_itr_setting;
+	q_vector->tx_work_limit = adapter->tx_work_limit;
 	if (q_vector->itr_val && q_vector->itr_val <= 3)
 		q_vector->itr_val = IGB_START_ITR;
 }
@@ -2388,11 +2389,17 @@ static int __devinit igb_sw_init(struct igb_adapter *adapter)
 
 	pci_read_config_word(pdev, PCI_COMMAND, &hw->bus.pci_cmd_word);
 
+	/* set default ring sizes */
 	adapter->tx_ring_count = IGB_DEFAULT_TXD;
 	adapter->rx_ring_count = IGB_DEFAULT_RXD;
+
+	/* set default ITR values */
 	adapter->rx_itr_setting = IGB_DEFAULT_ITR;
 	adapter->tx_itr_setting = IGB_DEFAULT_ITR;
 
+	/* set default work limits */
+	adapter->tx_work_limit = IGB_DEFAULT_TX_WORK;
+
 	adapter->max_frame_size = netdev->mtu + ETH_HLEN + ETH_FCS_LEN +
 				  VLAN_HLEN;
 	adapter->min_frame_size = ETH_ZLEN + ETH_FCS_LEN;
@@ -5496,7 +5503,7 @@ static int igb_poll(struct napi_struct *napi, int budget)
 		igb_update_dca(q_vector);
 #endif
 	if (q_vector->tx_ring)
-		clean_complete = !!igb_clean_tx_irq(q_vector);
+		clean_complete = igb_clean_tx_irq(q_vector);
 
 	if (q_vector->rx_ring)
 		clean_complete &= igb_clean_rx_irq(q_vector, budget);
@@ -5578,64 +5585,69 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 {
 	struct igb_adapter *adapter = q_vector->adapter;
 	struct igb_ring *tx_ring = q_vector->tx_ring;
-	struct net_device *netdev = tx_ring->netdev;
-	struct e1000_hw *hw = &adapter->hw;
-	struct igb_buffer *buffer_info;
-	union e1000_adv_tx_desc *tx_desc, *eop_desc;
+	struct igb_buffer *tx_buffer;
+	union e1000_adv_tx_desc *tx_desc;
 	unsigned int total_bytes = 0, total_packets = 0;
-	unsigned int i, eop, count = 0;
-	bool cleaned = false;
+	unsigned int budget = q_vector->tx_work_limit;
+	u16 i = tx_ring->next_to_clean;
 
-	i = tx_ring->next_to_clean;
-	eop = tx_ring->buffer_info[i].next_to_watch;
-	eop_desc = IGB_TX_DESC(tx_ring, eop);
+	if (test_bit(__IGB_DOWN, &adapter->state))
+		return true;
 
-	while ((eop_desc->wb.status & cpu_to_le32(E1000_TXD_STAT_DD)) &&
-	       (count < tx_ring->count)) {
-		rmb();	/* read buffer_info after eop_desc status */
-		for (cleaned = false; !cleaned; count++) {
-			tx_desc = IGB_TX_DESC(tx_ring, i);
-			buffer_info = &tx_ring->buffer_info[i];
-			cleaned = (i == eop);
+	tx_buffer = &tx_ring->buffer_info[i];
+	tx_desc = IGB_TX_DESC(tx_ring, i);
 
-			if (buffer_info->skb) {
-				total_bytes += buffer_info->bytecount;
-				/* gso_segs is currently only valid for tcp */
-				total_packets += buffer_info->gso_segs;
-				igb_tx_hwtstamp(q_vector, buffer_info);
-			}
+	for (; budget; budget--) {
+		u16 eop = tx_buffer->next_to_watch;
+		union e1000_adv_tx_desc *eop_desc;
+
+		eop_desc = IGB_TX_DESC(tx_ring, eop);
+
+		/* if DD is not set pending work has not been completed */
+		if (!(eop_desc->wb.status & cpu_to_le32(E1000_TXD_STAT_DD)))
+			break;
+
+		/* prevent any other reads prior to eop_desc being verified */
+		rmb();
 
-			igb_unmap_and_free_tx_resource(tx_ring, buffer_info);
+		do {
 			tx_desc->wb.status = 0;
+			if (likely(tx_desc == eop_desc)) {
+				eop_desc = NULL;
+
+				total_bytes += tx_buffer->bytecount;
+				total_packets += tx_buffer->gso_segs;
+				igb_tx_hwtstamp(q_vector, tx_buffer);
+			}
+
+			igb_unmap_and_free_tx_resource(tx_ring, tx_buffer);
 
+			tx_buffer++;
+			tx_desc++;
 			i++;
-			if (i == tx_ring->count)
+			if (unlikely(i == tx_ring->count)) {
 				i = 0;
-		}
-		eop = tx_ring->buffer_info[i].next_to_watch;
-		eop_desc = IGB_TX_DESC(tx_ring, eop);
+				tx_buffer = tx_ring->buffer_info;
+				tx_desc = IGB_TX_DESC(tx_ring, 0);
+			}
+		} while (eop_desc);
 	}
 
 	tx_ring->next_to_clean = i;
+	u64_stats_update_begin(&tx_ring->tx_syncp);
+	tx_ring->tx_stats.bytes += total_bytes;
+	tx_ring->tx_stats.packets += total_packets;
+	u64_stats_update_end(&tx_ring->tx_syncp);
+	tx_ring->total_bytes += total_bytes;
+	tx_ring->total_packets += total_packets;
 
-	if (unlikely(count &&
-		     netif_carrier_ok(netdev) &&
-		     igb_desc_unused(tx_ring) >= IGB_TX_QUEUE_WAKE)) {
-		/* Make sure that anybody stopping the queue after this
-		 * sees the new next_to_clean.
-		 */
-		smp_mb();
-		if (__netif_subqueue_stopped(netdev, tx_ring->queue_index) &&
-		    !(test_bit(__IGB_DOWN, &adapter->state))) {
-			netif_wake_subqueue(netdev, tx_ring->queue_index);
+	if (tx_ring->detect_tx_hung) {
+		struct e1000_hw *hw = &adapter->hw;
+		u16 eop = tx_ring->buffer_info[i].next_to_watch;
+		union e1000_adv_tx_desc *eop_desc;
 
-			u64_stats_update_begin(&tx_ring->tx_syncp);
-			tx_ring->tx_stats.restart_queue++;
-			u64_stats_update_end(&tx_ring->tx_syncp);
-		}
-	}
+		eop_desc = IGB_TX_DESC(tx_ring, eop);
 
-	if (tx_ring->detect_tx_hung) {
 		/* Detect a transmit hang in hardware, this serializes the
 		 * check with the clearing of time_stamp and movement of i */
 		tx_ring->detect_tx_hung = false;
@@ -5666,16 +5678,34 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 				eop,
 				jiffies,
 				eop_desc->wb.status);
-			netif_stop_subqueue(netdev, tx_ring->queue_index);
+			netif_stop_subqueue(tx_ring->netdev,
+					    tx_ring->queue_index);
+
+			/* we are about to reset, no point in enabling stuff */
+			return true;
 		}
 	}
-	tx_ring->total_bytes += total_bytes;
-	tx_ring->total_packets += total_packets;
-	u64_stats_update_begin(&tx_ring->tx_syncp);
-	tx_ring->tx_stats.bytes += total_bytes;
-	tx_ring->tx_stats.packets += total_packets;
-	u64_stats_update_end(&tx_ring->tx_syncp);
-	return count < tx_ring->count;
+
+	if (unlikely(total_packets &&
+		     netif_carrier_ok(tx_ring->netdev) &&
+		     igb_desc_unused(tx_ring) >= IGB_TX_QUEUE_WAKE)) {
+		/* Make sure that anybody stopping the queue after this
+		 * sees the new next_to_clean.
+		 */
+		smp_mb();
+		if (__netif_subqueue_stopped(tx_ring->netdev,
+					     tx_ring->queue_index) &&
+		    !(test_bit(__IGB_DOWN, &adapter->state))) {
+			netif_wake_subqueue(tx_ring->netdev,
+					    tx_ring->queue_index);
+
+			u64_stats_update_begin(&tx_ring->tx_syncp);
+			tx_ring->tx_stats.restart_queue++;
+			u64_stats_update_end(&tx_ring->tx_syncp);
+		}
+	}
+
+	return !!budget;
 }
 
 static inline void igb_rx_checksum(struct igb_ring *ring,
-- 
1.7.6.4

^ permalink raw reply related

* [net-next 07/13] igb: split buffer_info into tx_buffer_info and rx_buffer_info
From: Jeff Kirsher @ 2011-10-07  7:18 UTC (permalink / raw)
  To: davem; +Cc: Alexander Duyck, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1317971926-23145-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>

In order to be able to improve the performance of the TX path it has been
necessary to add addition info to the tx_buffer_info structure.  However a
side effect is that the structure has gotten larger and this in turn has
also increased the size of the RX buffer info structure.  In order to avoid
this in the future I am splitting the single buffer_info structure into two
separate ones and instead I will join them by making the buffer_info
pointer in the ring a union of the two.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by:  Aaron Brown  <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb.h         |   42 +++++-----
 drivers/net/ethernet/intel/igb/igb_ethtool.c |   15 ++--
 drivers/net/ethernet/intel/igb/igb_main.c    |  123 +++++++++++++-------------
 3 files changed, 92 insertions(+), 88 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index beab918..56c68fc 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -132,27 +132,24 @@ struct vf_data_storage {
 
 /* wrapper around a pointer to a socket buffer,
  * so a DMA handle can be stored along with the buffer */
-struct igb_buffer {
+struct igb_tx_buffer {
+	u16 next_to_watch;
+	unsigned long time_stamp;
+	dma_addr_t dma;
+	u32 length;
+	u32 tx_flags;
+	struct sk_buff *skb;
+	unsigned int bytecount;
+	u16 gso_segs;
+	u8 mapped_as_page;
+};
+
+struct igb_rx_buffer {
 	struct sk_buff *skb;
 	dma_addr_t dma;
-	union {
-		/* TX */
-		struct {
-			unsigned long time_stamp;
-			u16 length;
-			u16 next_to_watch;
-			unsigned int bytecount;
-			u16 gso_segs;
-			u8 tx_flags;
-			u8 mapped_as_page;
-		};
-		/* RX */
-		struct {
-			struct page *page;
-			dma_addr_t page_dma;
-			u16 page_offset;
-		};
-	};
+	struct page *page;
+	dma_addr_t page_dma;
+	u32 page_offset;
 };
 
 struct igb_tx_queue_stats {
@@ -191,7 +188,10 @@ struct igb_ring {
 	struct igb_q_vector *q_vector;	/* backlink to q_vector */
 	struct net_device *netdev;	/* back pointer to net_device */
 	struct device *dev;		/* device pointer for dma mapping */
-	struct igb_buffer *buffer_info;	/* array of buffer info structs */
+	union {				/* array of buffer info structs */
+		struct igb_tx_buffer *tx_buffer_info;
+		struct igb_rx_buffer *rx_buffer_info;
+	};
 	void *desc;			/* descriptor ring memory */
 	unsigned long flags;		/* ring specific flags */
 	void __iomem *tail;		/* pointer to ring tail register */
@@ -377,7 +377,7 @@ extern void igb_setup_tctl(struct igb_adapter *);
 extern void igb_setup_rctl(struct igb_adapter *);
 extern netdev_tx_t igb_xmit_frame_ring(struct sk_buff *, struct igb_ring *);
 extern void igb_unmap_and_free_tx_resource(struct igb_ring *,
-					   struct igb_buffer *);
+					   struct igb_tx_buffer *);
 extern void igb_alloc_rx_buffers(struct igb_ring *, u16);
 extern void igb_update_stats(struct igb_adapter *, struct rtnl_link_stats64 *);
 extern bool igb_has_link(struct igb_adapter *adapter);
diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index a445c4f..f227fc5 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -1579,7 +1579,8 @@ static int igb_clean_test_rings(struct igb_ring *rx_ring,
                                 unsigned int size)
 {
 	union e1000_adv_rx_desc *rx_desc;
-	struct igb_buffer *buffer_info;
+	struct igb_rx_buffer *rx_buffer_info;
+	struct igb_tx_buffer *tx_buffer_info;
 	int rx_ntc, tx_ntc, count = 0;
 	u32 staterr;
 
@@ -1591,22 +1592,22 @@ static int igb_clean_test_rings(struct igb_ring *rx_ring,
 
 	while (staterr & E1000_RXD_STAT_DD) {
 		/* check rx buffer */
-		buffer_info = &rx_ring->buffer_info[rx_ntc];
+		rx_buffer_info = &rx_ring->rx_buffer_info[rx_ntc];
 
 		/* unmap rx buffer, will be remapped by alloc_rx_buffers */
 		dma_unmap_single(rx_ring->dev,
-		                 buffer_info->dma,
+				 rx_buffer_info->dma,
 				 IGB_RX_HDR_LEN,
 				 DMA_FROM_DEVICE);
-		buffer_info->dma = 0;
+		rx_buffer_info->dma = 0;
 
 		/* verify contents of skb */
-		if (!igb_check_lbtest_frame(buffer_info->skb, size))
+		if (!igb_check_lbtest_frame(rx_buffer_info->skb, size))
 			count++;
 
 		/* unmap buffer on tx side */
-		buffer_info = &tx_ring->buffer_info[tx_ntc];
-		igb_unmap_and_free_tx_resource(tx_ring, buffer_info);
+		tx_buffer_info = &tx_ring->tx_buffer_info[tx_ntc];
+		igb_unmap_and_free_tx_resource(tx_ring, tx_buffer_info);
 
 		/* increment rx/tx next to clean counters */
 		rx_ntc++;
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 12faa99..2bdc783 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -339,7 +339,6 @@ static void igb_dump(struct igb_adapter *adapter)
 	struct igb_ring *tx_ring;
 	union e1000_adv_tx_desc *tx_desc;
 	struct my_u0 { u64 a; u64 b; } *u0;
-	struct igb_buffer *buffer_info;
 	struct igb_ring *rx_ring;
 	union e1000_adv_rx_desc *rx_desc;
 	u32 staterr;
@@ -376,8 +375,9 @@ static void igb_dump(struct igb_adapter *adapter)
 	printk(KERN_INFO "Queue [NTU] [NTC] [bi(ntc)->dma  ]"
 		" leng ntw timestamp\n");
 	for (n = 0; n < adapter->num_tx_queues; n++) {
+		struct igb_tx_buffer *buffer_info;
 		tx_ring = adapter->tx_ring[n];
-		buffer_info = &tx_ring->buffer_info[tx_ring->next_to_clean];
+		buffer_info = &tx_ring->tx_buffer_info[tx_ring->next_to_clean];
 		printk(KERN_INFO " %5d %5X %5X %016llX %04X %3X %016llX\n",
 			   n, tx_ring->next_to_use, tx_ring->next_to_clean,
 			   (u64)buffer_info->dma,
@@ -413,8 +413,9 @@ static void igb_dump(struct igb_adapter *adapter)
 			"leng  ntw timestamp        bi->skb\n");
 
 		for (i = 0; tx_ring->desc && (i < tx_ring->count); i++) {
+			struct igb_tx_buffer *buffer_info;
 			tx_desc = IGB_TX_DESC(tx_ring, i);
-			buffer_info = &tx_ring->buffer_info[i];
+			buffer_info = &tx_ring->tx_buffer_info[i];
 			u0 = (struct my_u0 *)tx_desc;
 			printk(KERN_INFO "T [0x%03X]    %016llX %016llX %016llX"
 				" %04X  %3X %016llX %p", i,
@@ -493,7 +494,8 @@ rx_ring_summary:
 			"<-- Adv Rx Write-Back format\n");
 
 		for (i = 0; i < rx_ring->count; i++) {
-			buffer_info = &rx_ring->buffer_info[i];
+			struct igb_rx_buffer *buffer_info;
+			buffer_info = &rx_ring->rx_buffer_info[i];
 			rx_desc = IGB_RX_DESC(rx_ring, i);
 			u0 = (struct my_u0 *)rx_desc;
 			staterr = le32_to_cpu(rx_desc->wb.upper.status_error);
@@ -2576,9 +2578,9 @@ int igb_setup_tx_resources(struct igb_ring *tx_ring)
 	struct device *dev = tx_ring->dev;
 	int size;
 
-	size = sizeof(struct igb_buffer) * tx_ring->count;
-	tx_ring->buffer_info = vzalloc(size);
-	if (!tx_ring->buffer_info)
+	size = sizeof(struct igb_tx_buffer) * tx_ring->count;
+	tx_ring->tx_buffer_info = vzalloc(size);
+	if (!tx_ring->tx_buffer_info)
 		goto err;
 
 	/* round up to nearest 4K */
@@ -2598,7 +2600,7 @@ int igb_setup_tx_resources(struct igb_ring *tx_ring)
 	return 0;
 
 err:
-	vfree(tx_ring->buffer_info);
+	vfree(tx_ring->tx_buffer_info);
 	dev_err(dev,
 		"Unable to allocate memory for the transmit descriptor ring\n");
 	return -ENOMEM;
@@ -2719,9 +2721,9 @@ int igb_setup_rx_resources(struct igb_ring *rx_ring)
 	struct device *dev = rx_ring->dev;
 	int size, desc_len;
 
-	size = sizeof(struct igb_buffer) * rx_ring->count;
-	rx_ring->buffer_info = vzalloc(size);
-	if (!rx_ring->buffer_info)
+	size = sizeof(struct igb_rx_buffer) * rx_ring->count;
+	rx_ring->rx_buffer_info = vzalloc(size);
+	if (!rx_ring->rx_buffer_info)
 		goto err;
 
 	desc_len = sizeof(union e1000_adv_rx_desc);
@@ -2744,8 +2746,8 @@ int igb_setup_rx_resources(struct igb_ring *rx_ring)
 	return 0;
 
 err:
-	vfree(rx_ring->buffer_info);
-	rx_ring->buffer_info = NULL;
+	vfree(rx_ring->rx_buffer_info);
+	rx_ring->rx_buffer_info = NULL;
 	dev_err(dev, "Unable to allocate memory for the receive descriptor"
 		" ring\n");
 	return -ENOMEM;
@@ -3107,8 +3109,8 @@ void igb_free_tx_resources(struct igb_ring *tx_ring)
 {
 	igb_clean_tx_ring(tx_ring);
 
-	vfree(tx_ring->buffer_info);
-	tx_ring->buffer_info = NULL;
+	vfree(tx_ring->tx_buffer_info);
+	tx_ring->tx_buffer_info = NULL;
 
 	/* if not set, then don't free */
 	if (!tx_ring->desc)
@@ -3135,7 +3137,7 @@ static void igb_free_all_tx_resources(struct igb_adapter *adapter)
 }
 
 void igb_unmap_and_free_tx_resource(struct igb_ring *tx_ring,
-				    struct igb_buffer *buffer_info)
+				    struct igb_tx_buffer *buffer_info)
 {
 	if (buffer_info->dma) {
 		if (buffer_info->mapped_as_page)
@@ -3166,21 +3168,21 @@ void igb_unmap_and_free_tx_resource(struct igb_ring *tx_ring,
  **/
 static void igb_clean_tx_ring(struct igb_ring *tx_ring)
 {
-	struct igb_buffer *buffer_info;
+	struct igb_tx_buffer *buffer_info;
 	unsigned long size;
 	unsigned int i;
 
-	if (!tx_ring->buffer_info)
+	if (!tx_ring->tx_buffer_info)
 		return;
 	/* Free all the Tx ring sk_buffs */
 
 	for (i = 0; i < tx_ring->count; i++) {
-		buffer_info = &tx_ring->buffer_info[i];
+		buffer_info = &tx_ring->tx_buffer_info[i];
 		igb_unmap_and_free_tx_resource(tx_ring, buffer_info);
 	}
 
-	size = sizeof(struct igb_buffer) * tx_ring->count;
-	memset(tx_ring->buffer_info, 0, size);
+	size = sizeof(struct igb_tx_buffer) * tx_ring->count;
+	memset(tx_ring->tx_buffer_info, 0, size);
 
 	/* Zero out the descriptor ring */
 	memset(tx_ring->desc, 0, tx_ring->size);
@@ -3211,8 +3213,8 @@ void igb_free_rx_resources(struct igb_ring *rx_ring)
 {
 	igb_clean_rx_ring(rx_ring);
 
-	vfree(rx_ring->buffer_info);
-	rx_ring->buffer_info = NULL;
+	vfree(rx_ring->rx_buffer_info);
+	rx_ring->rx_buffer_info = NULL;
 
 	/* if not set, then don't free */
 	if (!rx_ring->desc)
@@ -3247,12 +3249,12 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring)
 	unsigned long size;
 	u16 i;
 
-	if (!rx_ring->buffer_info)
+	if (!rx_ring->rx_buffer_info)
 		return;
 
 	/* Free all the Rx ring sk_buffs */
 	for (i = 0; i < rx_ring->count; i++) {
-		struct igb_buffer *buffer_info = &rx_ring->buffer_info[i];
+		struct igb_rx_buffer *buffer_info = &rx_ring->rx_buffer_info[i];
 		if (buffer_info->dma) {
 			dma_unmap_single(rx_ring->dev,
 			                 buffer_info->dma,
@@ -3279,8 +3281,8 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring)
 		}
 	}
 
-	size = sizeof(struct igb_buffer) * rx_ring->count;
-	memset(rx_ring->buffer_info, 0, size);
+	size = sizeof(struct igb_rx_buffer) * rx_ring->count;
+	memset(rx_ring->rx_buffer_info, 0, size);
 
 	/* Zero out the descriptor ring */
 	memset(rx_ring->desc, 0, rx_ring->size);
@@ -3964,7 +3966,7 @@ static inline int igb_tso(struct igb_ring *tx_ring,
 	struct e1000_adv_tx_context_desc *context_desc;
 	unsigned int i;
 	int err;
-	struct igb_buffer *buffer_info;
+	struct igb_tx_buffer *buffer_info;
 	u32 info = 0, tu_cmd = 0;
 	u32 mss_l4len_idx;
 	u8 l4len;
@@ -3995,7 +3997,7 @@ static inline int igb_tso(struct igb_ring *tx_ring,
 
 	i = tx_ring->next_to_use;
 
-	buffer_info = &tx_ring->buffer_info[i];
+	buffer_info = &tx_ring->tx_buffer_info[i];
 	context_desc = IGB_TX_CTXTDESC(tx_ring, i);
 	/* VLAN MACLEN IPLEN */
 	if (tx_flags & IGB_TX_FLAGS_VLAN)
@@ -4043,14 +4045,14 @@ static inline bool igb_tx_csum(struct igb_ring *tx_ring,
 {
 	struct e1000_adv_tx_context_desc *context_desc;
 	struct device *dev = tx_ring->dev;
-	struct igb_buffer *buffer_info;
+	struct igb_tx_buffer *buffer_info;
 	u32 info = 0, tu_cmd = 0;
 	unsigned int i;
 
 	if ((skb->ip_summed == CHECKSUM_PARTIAL) ||
 	    (tx_flags & IGB_TX_FLAGS_VLAN)) {
 		i = tx_ring->next_to_use;
-		buffer_info = &tx_ring->buffer_info[i];
+		buffer_info = &tx_ring->tx_buffer_info[i];
 		context_desc = IGB_TX_CTXTDESC(tx_ring, i);
 
 		if (tx_flags & IGB_TX_FLAGS_VLAN)
@@ -4126,7 +4128,7 @@ static inline bool igb_tx_csum(struct igb_ring *tx_ring,
 static inline int igb_tx_map(struct igb_ring *tx_ring, struct sk_buff *skb,
 			     unsigned int first)
 {
-	struct igb_buffer *buffer_info;
+	struct igb_tx_buffer *buffer_info;
 	struct device *dev = tx_ring->dev;
 	unsigned int hlen = skb_headlen(skb);
 	unsigned int count = 0, i;
@@ -4135,7 +4137,7 @@ static inline int igb_tx_map(struct igb_ring *tx_ring, struct sk_buff *skb,
 
 	i = tx_ring->next_to_use;
 
-	buffer_info = &tx_ring->buffer_info[i];
+	buffer_info = &tx_ring->tx_buffer_info[i];
 	BUG_ON(hlen >= IGB_MAX_DATA_PER_TXD);
 	buffer_info->length = hlen;
 	/* set time_stamp *before* dma to help avoid a possible race */
@@ -4155,7 +4157,7 @@ static inline int igb_tx_map(struct igb_ring *tx_ring, struct sk_buff *skb,
 		if (i == tx_ring->count)
 			i = 0;
 
-		buffer_info = &tx_ring->buffer_info[i];
+		buffer_info = &tx_ring->tx_buffer_info[i];
 		BUG_ON(len >= IGB_MAX_DATA_PER_TXD);
 		buffer_info->length = len;
 		buffer_info->time_stamp = jiffies;
@@ -4168,12 +4170,12 @@ static inline int igb_tx_map(struct igb_ring *tx_ring, struct sk_buff *skb,
 
 	}
 
-	tx_ring->buffer_info[i].skb = skb;
-	tx_ring->buffer_info[i].tx_flags = skb_shinfo(skb)->tx_flags;
+	buffer_info->skb = skb;
+	buffer_info->tx_flags = skb_shinfo(skb)->tx_flags;
 	/* multiply data chunks by size of headers */
-	tx_ring->buffer_info[i].bytecount = ((gso_segs - 1) * hlen) + skb->len;
-	tx_ring->buffer_info[i].gso_segs = gso_segs;
-	tx_ring->buffer_info[first].next_to_watch = i;
+	buffer_info->bytecount = ((gso_segs - 1) * hlen) + skb->len;
+	buffer_info->gso_segs = gso_segs;
+	tx_ring->tx_buffer_info[first].next_to_watch = i;
 
 	return ++count;
 
@@ -4192,7 +4194,7 @@ dma_error:
 		if (i == 0)
 			i = tx_ring->count;
 		i--;
-		buffer_info = &tx_ring->buffer_info[i];
+		buffer_info = &tx_ring->tx_buffer_info[i];
 		igb_unmap_and_free_tx_resource(tx_ring, buffer_info);
 	}
 
@@ -4204,7 +4206,7 @@ static inline void igb_tx_queue(struct igb_ring *tx_ring,
 				u8 hdr_len)
 {
 	union e1000_adv_tx_desc *tx_desc;
-	struct igb_buffer *buffer_info;
+	struct igb_tx_buffer *buffer_info;
 	u32 olinfo_status = 0, cmd_type_len;
 	unsigned int i = tx_ring->next_to_use;
 
@@ -4240,7 +4242,7 @@ static inline void igb_tx_queue(struct igb_ring *tx_ring,
 	olinfo_status |= ((paylen - hdr_len) << E1000_ADVTXD_PAYLEN_SHIFT);
 
 	do {
-		buffer_info = &tx_ring->buffer_info[i];
+		buffer_info = &tx_ring->tx_buffer_info[i];
 		tx_desc = IGB_TX_DESC(tx_ring, i);
 		tx_desc->read.buffer_addr = cpu_to_le64(buffer_info->dma);
 		tx_desc->read.cmd_type_len =
@@ -4353,7 +4355,7 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
 	count = igb_tx_map(tx_ring, skb, first);
 	if (!count) {
 		dev_kfree_skb_any(skb);
-		tx_ring->buffer_info[first].time_stamp = 0;
+		tx_ring->tx_buffer_info[first].time_stamp = 0;
 		tx_ring->next_to_use = first;
 		return NETDEV_TX_OK;
 	}
@@ -5551,13 +5553,14 @@ static void igb_systim_to_hwtstamp(struct igb_adapter *adapter,
 /**
  * igb_tx_hwtstamp - utility function which checks for TX time stamp
  * @q_vector: pointer to q_vector containing needed info
- * @buffer: pointer to igb_buffer structure
+ * @buffer: pointer to igb_tx_buffer structure
  *
  * If we were asked to do hardware stamping and such a time stamp is
  * available, then it must have been for this skb here because we only
  * allow only one such packet into the queue.
  */
-static void igb_tx_hwtstamp(struct igb_q_vector *q_vector, struct igb_buffer *buffer_info)
+static void igb_tx_hwtstamp(struct igb_q_vector *q_vector,
+			    struct igb_tx_buffer *buffer_info)
 {
 	struct igb_adapter *adapter = q_vector->adapter;
 	struct e1000_hw *hw = &adapter->hw;
@@ -5585,7 +5588,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 {
 	struct igb_adapter *adapter = q_vector->adapter;
 	struct igb_ring *tx_ring = q_vector->tx_ring;
-	struct igb_buffer *tx_buffer;
+	struct igb_tx_buffer *tx_buffer;
 	union e1000_adv_tx_desc *tx_desc;
 	unsigned int total_bytes = 0, total_packets = 0;
 	unsigned int budget = q_vector->tx_work_limit;
@@ -5594,7 +5597,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 	if (test_bit(__IGB_DOWN, &adapter->state))
 		return true;
 
-	tx_buffer = &tx_ring->buffer_info[i];
+	tx_buffer = &tx_ring->tx_buffer_info[i];
 	tx_desc = IGB_TX_DESC(tx_ring, i);
 
 	for (; budget; budget--) {
@@ -5627,7 +5630,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 			i++;
 			if (unlikely(i == tx_ring->count)) {
 				i = 0;
-				tx_buffer = tx_ring->buffer_info;
+				tx_buffer = tx_ring->tx_buffer_info;
 				tx_desc = IGB_TX_DESC(tx_ring, 0);
 			}
 		} while (eop_desc);
@@ -5643,7 +5646,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 
 	if (tx_ring->detect_tx_hung) {
 		struct e1000_hw *hw = &adapter->hw;
-		u16 eop = tx_ring->buffer_info[i].next_to_watch;
+		u16 eop = tx_ring->tx_buffer_info[i].next_to_watch;
 		union e1000_adv_tx_desc *eop_desc;
 
 		eop_desc = IGB_TX_DESC(tx_ring, eop);
@@ -5651,8 +5654,8 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 		/* Detect a transmit hang in hardware, this serializes the
 		 * check with the clearing of time_stamp and movement of i */
 		tx_ring->detect_tx_hung = false;
-		if (tx_ring->buffer_info[i].time_stamp &&
-		    time_after(jiffies, tx_ring->buffer_info[i].time_stamp +
+		if (tx_ring->tx_buffer_info[i].time_stamp &&
+		    time_after(jiffies, tx_ring->tx_buffer_info[i].time_stamp +
 			       (adapter->tx_timeout_factor * HZ)) &&
 		    !(rd32(E1000_STATUS) & E1000_STATUS_TXOFF)) {
 
@@ -5674,7 +5677,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 				readl(tx_ring->tail),
 				tx_ring->next_to_use,
 				tx_ring->next_to_clean,
-				tx_ring->buffer_info[eop].time_stamp,
+				tx_ring->tx_buffer_info[eop].time_stamp,
 				eop,
 				jiffies,
 				eop_desc->wb.status);
@@ -5802,7 +5805,7 @@ static bool igb_clean_rx_irq(struct igb_q_vector *q_vector, int budget)
 	staterr = le32_to_cpu(rx_desc->wb.upper.status_error);
 
 	while (staterr & E1000_RXD_STAT_DD) {
-		struct igb_buffer *buffer_info = &rx_ring->buffer_info[i];
+		struct igb_rx_buffer *buffer_info = &rx_ring->rx_buffer_info[i];
 		struct sk_buff *skb = buffer_info->skb;
 		union e1000_adv_rx_desc *next_rxd;
 
@@ -5855,8 +5858,8 @@ static bool igb_clean_rx_irq(struct igb_q_vector *q_vector, int budget)
 		}
 
 		if (!(staterr & E1000_RXD_STAT_EOP)) {
-			struct igb_buffer *next_buffer;
-			next_buffer = &rx_ring->buffer_info[i];
+			struct igb_rx_buffer *next_buffer;
+			next_buffer = &rx_ring->rx_buffer_info[i];
 			buffer_info->skb = next_buffer->skb;
 			buffer_info->dma = next_buffer->dma;
 			next_buffer->skb = skb;
@@ -5917,7 +5920,7 @@ next_desc:
 }
 
 static bool igb_alloc_mapped_skb(struct igb_ring *rx_ring,
-				 struct igb_buffer *bi)
+				 struct igb_rx_buffer *bi)
 {
 	struct sk_buff *skb = bi->skb;
 	dma_addr_t dma = bi->dma;
@@ -5951,7 +5954,7 @@ static bool igb_alloc_mapped_skb(struct igb_ring *rx_ring,
 }
 
 static bool igb_alloc_mapped_page(struct igb_ring *rx_ring,
-				  struct igb_buffer *bi)
+				  struct igb_rx_buffer *bi)
 {
 	struct page *page = bi->page;
 	dma_addr_t page_dma = bi->page_dma;
@@ -5990,11 +5993,11 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring,
 void igb_alloc_rx_buffers(struct igb_ring *rx_ring, u16 cleaned_count)
 {
 	union e1000_adv_rx_desc *rx_desc;
-	struct igb_buffer *bi;
+	struct igb_rx_buffer *bi;
 	u16 i = rx_ring->next_to_use;
 
 	rx_desc = IGB_RX_DESC(rx_ring, i);
-	bi = &rx_ring->buffer_info[i];
+	bi = &rx_ring->rx_buffer_info[i];
 	i -= rx_ring->count;
 
 	while (cleaned_count--) {
@@ -6015,7 +6018,7 @@ void igb_alloc_rx_buffers(struct igb_ring *rx_ring, u16 cleaned_count)
 		i++;
 		if (unlikely(!i)) {
 			rx_desc = IGB_RX_DESC(rx_ring, 0);
-			bi = rx_ring->buffer_info;
+			bi = rx_ring->rx_buffer_info;
 			i -= rx_ring->count;
 		}
 
-- 
1.7.6.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox