Netdev List
 help / color / mirror / Atom feed
* [PATCH 1/1] ethtool: add ETHTOOL_{G,S}CHANNEL support.
From: Sucheta Chakraborty @ 2011-10-07 11:58 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: netdev, Dept_NX_Linux_NIC_Driver
In-Reply-To: <1317988730-4197-1-git-send-email-sucheta.chakraborty@qlogic.com>

Used to configure number of tx/ rx/ other channels.
Reqd. man page changes are included.

Signed-off-by: Sucheta Chakraborty <sucheta.chakraborty@qlogic.com>
---
 ethtool.8.in |   29 ++++++++++++++
 ethtool.c    |  120 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 149 insertions(+), 0 deletions(-)

diff --git a/ethtool.8.in b/ethtool.8.in
index efc6098..f9d3633 100644
--- a/ethtool.8.in
+++ b/ethtool.8.in
@@ -307,6 +307,16 @@ ethtool \- query or control network driver and hardware settings
 .BN action
 .BN loc
 .RB ]
+.HP
+.B ethtool \-l|\-\-show\-channels
+.I ethX
+.HP
+.B ethtool \-L|\-\-set\-channels
+.I ethX
+.BN rx
+.BN tx
+.BN other
+.BN combined
 .
 .\" Adjust lines (i.e. full justification) and hyphenate.
 .ad
@@ -754,6 +764,25 @@ lB	l.
 Specify the location/ID to insert the rule. This will overwrite
 any rule present in that location and will not go through any
 of the rule ordering process.
+.TP
+.B \-l \-\-show\-channels
+Queries the specified network device for the numbers of channels it has.
+A channel is an IRQ and the set of queues that can trigger that IRQ.
+.TP
+.B \-L \-\-set\-channels
+Changes the numbers of channels of the specified network device.
+.TP
+.BI rx \ N
+Changes the number of channels with only receive queues.
+.TP
+.BI tx \ N
+Changes the number of channels with only transmit queues.
+.TP
+.BI other \ N
+Changes the number of channels used only for other purposes e.g. link interrupts or SR-IOV co-ordination.
+.TP
+.BI combined \ N
+Changes the number of multi-purpose channels.
 .SH BUGS
 Not supported (in part or whole) on all network drivers.
 .SH AUTHOR
diff --git a/ethtool.c b/ethtool.c
index d7d2d58..c533a48 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -80,6 +80,8 @@ static int do_gpause(int fd, struct ifreq *ifr);
 static int do_spause(int fd, struct ifreq *ifr);
 static int do_gring(int fd, struct ifreq *ifr);
 static int do_sring(int fd, struct ifreq *ifr);
+static int do_schannels(int fd, struct ifreq *ifr);
+static int do_gchannels(int fd, struct ifreq *ifr);
 static int do_gcoalesce(int fd, struct ifreq *ifr);
 static int do_scoalesce(int fd, struct ifreq *ifr);
 static int do_goffload(int fd, struct ifreq *ifr);
@@ -133,6 +135,8 @@ static enum {
 	MODE_PERMADDR,
 	MODE_SET_DUMP,
 	MODE_GET_DUMP,
+	MODE_GCHANNELS,
+	MODE_SCHANNELS
 } mode = MODE_GSET;
 
 static struct option {
@@ -266,6 +270,12 @@ static struct option {
     { "-W", "--set-dump", MODE_SET_DUMP,
 		"Set dump flag of the device",
 		"		N\n"},
+    { "-l", "--show-channels", MODE_GCHANNELS, "Query Channels" },
+    { "-L", "--set-channels", MODE_SCHANNELS, "Set Channels",
+		"               [ rx N ]\n"
+		"               [ tx N ]\n"
+		"               [ other N ]\n"
+		"               [ combined N ]\n" },
     { "-h", "--help", MODE_HELP, "Show this help" },
     { NULL, "--version", MODE_VERSION, "Show version number" },
     {}
@@ -331,6 +341,13 @@ static s32 ring_rx_mini_wanted = -1;
 static s32 ring_rx_jumbo_wanted = -1;
 static s32 ring_tx_wanted = -1;
 
+static struct ethtool_channels echannels;
+static int gchannels_changed;
+static s32 channels_rx_wanted = -1;
+static s32 channels_tx_wanted = -1;
+static s32 channels_other_wanted = -1;
+static s32 channels_combined_wanted = -1;
+
 static struct ethtool_coalesce ecoal;
 static int gcoalesce_changed = 0;
 static s32 coal_stats_wanted = -1;
@@ -495,6 +512,13 @@ static struct cmdline_info cmdline_ring[] = {
 	{ "tx", CMDL_S32, &ring_tx_wanted, &ering.tx_pending },
 };
 
+static struct cmdline_info cmdline_channels[] = {
+	{ "rx", CMDL_S32, &channels_rx_wanted, &echannels.rx_count },
+	{ "tx", CMDL_S32, &channels_tx_wanted, &echannels.tx_count },
+	{ "other", CMDL_S32, &channels_other_wanted, &echannels.other_count },
+	{ "combined", CMDL_S32, &channels_combined_wanted, &echannels.combined_count },
+};
+
 static struct cmdline_info cmdline_coalesce[] = {
 	{ "adaptive-rx", CMDL_BOOL, &coal_adaptive_rx_wanted, &ecoal.use_adaptive_rx_coalesce },
 	{ "adaptive-tx", CMDL_BOOL, &coal_adaptive_tx_wanted, &ecoal.use_adaptive_tx_coalesce },
@@ -787,6 +811,8 @@ static void parse_cmdline(int argc, char **argp)
 			    (mode == MODE_GCOALESCE) ||
 			    (mode == MODE_SCOALESCE) ||
 			    (mode == MODE_GRING) ||
+			    (mode == MODE_GCHANNELS) ||
+			    (mode == MODE_SCHANNELS) ||
 			    (mode == MODE_SRING) ||
 			    (mode == MODE_GOFFLOAD) ||
 			    (mode == MODE_SOFFLOAD) ||
@@ -871,6 +897,14 @@ static void parse_cmdline(int argc, char **argp)
 				i = argc;
 				break;
 			}
+			if (mode == MODE_SCHANNELS) {
+				parse_generic_cmdline(argc, argp, i,
+					&gchannels_changed,
+					cmdline_channels,
+					ARRAY_SIZE(cmdline_ring));
+				i = argc;
+				break;
+			}
 			if (mode == MODE_SCOALESCE) {
 				parse_generic_cmdline(argc, argp, i,
 					&gcoalesce_changed,
@@ -1751,6 +1785,32 @@ static int dump_ring(void)
 	return 0;
 }
 
+static int dump_channels(void)
+{
+	fprintf(stdout,
+		"Pre-set maximums:\n"
+		"RX:		%u\n"
+		"TX:		%u\n"
+		"Other:		%u\n"
+		"Combined:	%u\n",
+		echannels.max_rx, echannels.max_tx,
+		echannels.max_other,
+		echannels.max_combined);
+
+	fprintf(stdout,
+		"Current hardware settings:\n"
+		"RX:		%u\n"
+		"TX:		%u\n"
+		"Other:		%u\n"
+		"Combined:	%u\n",
+		echannels.rx_count, echannels.tx_count,
+		echannels.other_count,
+		echannels.combined_count);
+
+	fprintf(stdout, "\n");
+	return 0;
+}
+
 static int dump_coalesce(void)
 {
 	fprintf(stdout, "Adaptive RX: %s  TX: %s\n",
@@ -1937,6 +1997,10 @@ static int doit(void)
 		return do_gring(fd, &ifr);
 	} else if (mode == MODE_SRING) {
 		return do_sring(fd, &ifr);
+	} else if (mode == MODE_GCHANNELS) {
+		return do_gchannels(fd, &ifr);
+	} else if (mode == MODE_SCHANNELS) {
+		return do_schannels(fd, &ifr);
 	} else if (mode == MODE_GOFFLOAD) {
 		return do_goffload(fd, &ifr);
 	} else if (mode == MODE_SOFFLOAD) {
@@ -2114,6 +2178,62 @@ static int do_gring(int fd, struct ifreq *ifr)
 	return 0;
 }
 
+static int do_schannels(int fd, struct ifreq *ifr)
+{
+	int err, changed = 0;
+
+	echannels.cmd = ETHTOOL_GCHANNELS;
+	ifr->ifr_data = (caddr_t)&echannels;
+	err = send_ioctl(fd, ifr);
+	if (err) {
+		perror("Cannot get device channel parameters");
+		return 1;
+	}
+
+	do_generic_set(cmdline_channels, ARRAY_SIZE(cmdline_channels),
+			&changed);
+
+	if (!changed) {
+		fprintf(stderr, "no channel parameters changed, aborting\n");
+		fprintf(stderr, "current values: tx %u rx %u other %u"
+			"combined %u\n", echannels.rx_count,
+			echannels.tx_count, echannels.other_count,
+			echannels.combined_count);
+		return 1;
+	}
+
+	echannels.cmd = ETHTOOL_SCHANNELS;
+	ifr->ifr_data = (caddr_t)&echannels;
+	err = send_ioctl(fd, ifr);
+	if (err) {
+		perror("Cannot set device channel parameters");
+		return 1;
+	}
+
+	return 0;
+}
+
+static int do_gchannels(int fd, struct ifreq *ifr)
+{
+	int err;
+
+	fprintf(stdout, "Channel parameters for %s:\n", devname);
+
+	echannels.cmd = ETHTOOL_GCHANNELS;
+	ifr->ifr_data = (caddr_t)&echannels;
+	err = send_ioctl(fd, ifr);
+	if (err == 0) {
+		err = dump_channels();
+		if (err)
+			return err;
+	} else {
+		perror("Cannot get device channel parameters\n");
+		return 1;
+	}
+	return 0;
+
+}
+
 static int do_gcoalesce(int fd, struct ifreq *ifr)
 {
 	int err;
-- 
1.7.3.3

^ permalink raw reply related

* [PATCH 0/1] ethtool: {G, S}-CHANNEL support added
From: Sucheta Chakraborty @ 2011-10-07 11:58 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: netdev, Dept_NX_Linux_NIC_Driver

Thanks Ben for reviewing and applying the patches.
I have incorporated all the changes suggested by you for this patch.

^ permalink raw reply

* Re: loopback IP alias breaks tftp?
From: Eric Dumazet @ 2011-10-07 12:04 UTC (permalink / raw)
  To: Josh Boyer; +Cc: Joel Sing, Julian Anastasov, netdev
In-Reply-To: <20111007114017.GA1165@zod.bos.redhat.com>

Le vendredi 07 octobre 2011 à 07:40 -0400, Josh Boyer a écrit :

> The original bug reporter explained the loopback testcase is simply a
> trivial example to illustrate the problem.  The real setup seems to be:
> 
> "I have a hardware (Dell PC) booting over the network with PXE, this
> hardware try to load it system from a tftp server. If I used the real
> ip of the tftp server I have no problem, if I used an alias I have the
> problem (I have to use an alias, because I have a Cluster).
> 
> What I think, the bios tftp client talk to the server with the IP alias
> but the server reply with the real IP and the client reject the reply."
> 
> So in this case, because the alias is being used in a cluster, should he
> also setup the local routing table as you suggested?
> 
> I apologize for my lack of depth here.  I'm at the moment somewhat of a
> go-between.  I do greatly appreciate the help!

Its a completely different problem IMHO : You describe a tftp server
bug.

Say your tftp server is multihomed with 3 different IPS : 

192.168.20.21, 192.168.20.22, 192.168.20.23

And tftp server listens to any address (UDP port 69) : 0.0.0.0:69

When receiving a request on 192.168.20.22, it should use same source
address, not let the system chose a "random or whatever policy" one.



So I would suggest to check/fix if TFTP server uses the correct socket
API to get both the client IP and its own IP in each UDP datagram

-> setsockopt(fd, IPPROTO_IP, &on, sizeof(on))

       IP_PKTINFO
              Pass  an  IP_PKTINFO  ancillary message that contains a pktinfo structure that supplies some
              information about the incoming packet.  This only works for datagram oriented sockets.   The
              argument  is a flag that tells the socket whether the IP_PKTINFO message should be passed or
              not.  The message itself can only be sent/retrieved as control message with a  packet  using
              recvmsg(2) or sendmsg(2).

                  struct in_pktinfo {
                      unsigned int   ipi_ifindex;  /* Interface index */
                      struct in_addr ipi_spec_dst; /* Local address */
                      struct in_addr ipi_addr;     /* Header Destination
                                                      address */
                  };

              ipi_ifindex  is  the unique index of the interface the packet was received on.  ipi_spec_dst
              is the local address of the packet and ipi_addr is the destination  address  in  the  packet
              header.  If IP_PKTINFO is passed to sendmsg(2) and ipi_spec_dst is not zero, then it is used
              as the local source address for the routing table lookup and for setting up IP source  route
              options.  When ipi_ifindex is not zero, the primary local address of the interface specified
              by the index overwrites ipi_spec_dst for the routing table lookup.



This permits tftp server to use the same "struct in_pktinfo" for replies, forcing a correct source address.

^ permalink raw reply

* Re: loopback IP alias breaks tftp?
From: Josh Boyer @ 2011-10-07 11:40 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Joel Sing, Julian Anastasov, netdev
In-Reply-To: <1317972573.3457.55.camel@edumazet-laptop>

On Fri, Oct 07, 2011 at 09:29:33AM +0200, Eric Dumazet wrote:
> Le vendredi 07 octobre 2011 à 18:02 +1100, Joel Sing a écrit :
> > > > > We've had a report [1] of a change in behavior when trying to use an IP
> > > > > alias to tftp from a loopback device.  Apparently the steps outlined in
> > > > > [1] https://bugzilla.redhat.com/show_bug.cgi?id=739534
> > >
> > > Yep.  That is exactly what my git bisect said too.
> > >
> > > So now we have a change in behavior since that commit for the usecase
> > > described in the bug above, but I'm unsure if that usecase was ever
> > > really valid or if the commit had unintentional side effects.
> > >
> > > Joel (or anyone else) can you take a look and comment?
> > 
> > Prior to this commit the src address in the local routing table was
> > completely ignored, so connecting to a local address always used the
> > same source and destination addresses. However, this is not what was
> > configured in the local routing table:
> > 
> >   $ ifconfig lo:0 127.0.0.2
> >   $ ip route show table local | grep 127.0.0.2
> >   local 127.0.0.2 dev lo  proto kernel  scope host  src 127.0.0.1
> > 
> > When an interface has an alias configured, the source address
> > installed in the local routing table is always the primary address for
> > the interface. The tftp use case you've described now breaks due to
> > the way that in.tftpd is determining the reply address (as documented
> > in the in.tftpd manual page). This means that it is now responding
> > from 127.0.0.1 even though the client connected to 127.0.0.2. Binding
> > inetd to a specific address will avoid this issue.
> > 
> > I have not yet looked to see if there is a specific reason for the
> > source address selection, however one way of restoring the previous
> > behaviour (whilst still respecting the configured source address)
> > would be to use a default source address which matches the configured
> > address (as is done for primary addresses). I cannot immediately think
> > of a reason not to do this, but I've not gone looking at the code.
> > 
> > Worst case scenario if you specifically need the original behaviour
> > then the source address can be changed in the local routing table:
> > 
> >   $ ip route change table local local 127.0.0.2 dev lo:0 proto kernel
> > scope host src 127.0.0.2
> >   $ ip route show table local | grep 127.0.0.2
> >   local 127.0.0.2 dev lo  proto kernel  scope host  src 127.0.0.2
> 
> Agreed.
> 
> The "table local" bit is really the key, because following naive setup
> doesnt work.
> 
> # ip addr add 127.0.0.11/8 dev lo
> # ip route add 127.0.0.11 dev lo src 127.0.0.11
> 
> since 127.0.0.1 will still be the src address selected for connections.
> 
> # ip ro show table local | grep 127.0.0.11
> local 127.0.0.11 dev lo  proto kernel  scope host  src 127.0.0.1 

The original bug reporter explained the loopback testcase is simply a
trivial example to illustrate the problem.  The real setup seems to be:

"I have a hardware (Dell PC) booting over the network with PXE, this
hardware try to load it system from a tftp server. If I used the real
ip of the tftp server I have no problem, if I used an alias I have the
problem (I have to use an alias, because I have a Cluster).

What I think, the bios tftp client talk to the server with the IP alias
but the server reply with the real IP and the client reject the reply."

So in this case, because the alias is being used in a cluster, should he
also setup the local routing table as you suggested?

I apologize for my lack of depth here.  I'm at the moment somewhat of a
go-between.  I do greatly appreciate the help!

josh

^ permalink raw reply

* Re: [PATCH] IPv6: DAD from bonding iface is treated as dup address from others
From: Neil Horman @ 2011-10-07 11:10 UTC (permalink / raw)
  To: Yinglin Sun
  Cc: Jay Vosburgh, David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev
In-Reply-To: <CAN17JHUkeCarKf3U-0EMEyL+RJvZGV3GSQWG7+TZFy7ZvD8jYA@mail.gmail.com>

On Thu, Oct 06, 2011 at 06:24:36PM -0700, Yinglin Sun wrote:
> On Thu, Oct 6, 2011 at 5:59 PM, Jay Vosburgh <fubar@us.ibm.com> wrote:
> > Yinglin Sun <Yinglin.Sun@emc.com> wrote:
> >
> >>On Thu, Oct 6, 2011 at 3:17 PM, Yinglin Sun <Yinglin.Sun@emc.com> wrote:
> >>>
> >>> On Thu, Oct 6, 2011 at 12:05 PM, Jay Vosburgh <fubar@us.ibm.com> wrote:
> >>> >
> >>> > Neil Horman <nhorman@tuxdriver.com> wrote:
> >>> >
> >>> > >On Wed, Oct 05, 2011 at 08:59:10PM -0700, Yinglin Sun wrote:
> >>> > >> Steps to reproduce this issue:
> >>> > >> 1. create bond0 over eth0 and eth1, set the mode to balance-xor
> >>> > >> 2. add an IPv6 address to bond0
> >>> > >> 3. DAD packet is sent out from one slave and then is looped back from
> >>> > >> the other slave. Therefore, it is treated as a duplicate address and
> >>> > >> stays tentative afterwards:
> >>> > >>    kern.info:
> >>> > >>        Oct  5 11:50:18 testvm1 kernel: [  129.224353] bond0: IPv6 duplicate address 1234::1 detected!
> >
> > [...]
> >
> >>> > >Nack, This seems like it will just completely break DAD.  What if theres another
> >>> > >system out there with the same mac address.  A response from that system would
> >>> > >get dropped by this filter, instead of causing The local system to stop using
> >>> > >the address.  What you really want to do is modify
> >>> > >bond_should_deliver_exact_match to detect this frame on the inactive slave or
> >>> > >some such, and drop the frame there.
> >>> >
> >>> >        Also NACK; and adding a bit of information.  The balance-xor
> >>> > mode is nominally expecting to interact with a switch whose ports are
> >>> > set for etherchannel ("static link aggregation"), in which case the
> >>> > switch will not loop the packet back around.
> >>> >
> >>> >        If your switch can do etherchannel, then enable it and the
> >>> > problem should go away.  If your switch cannot do this, then you may
> >>> > have other issues, because all of the multicast or broadcast packets
> >>> > going out any bonding slave will loop around to another slave.  You
> >>> > could also use 802.3ad / LACP if you switch supports that.
> >>> >
> >>> >        For balance-xor (or balance-rr, for that matter) mode to a
> >>> > non-etherchannel switch, it's going to be difficult, if not impossible,
> >>> > to modify bond_should_deliver_exact_match, because there are no inactive
> >>> > slaves.  In this mode, bonding is expecting the switch to balance
> >>> > incoming traffic across the ports, and not deliver looped back packets
> >>> > or duplicates.  There are no restrictions on what type of traffic
> >>> > (mcast, bcast, ucast) may arrive on any given port.
> >>> >
> >>> >        I can't think of a way to make the non-etherchannel case work
> >>> > for balance-xor (or balance-rr) without breaking the DAD functionality
> >>> > in the case of an actual duplicate.  I'm not aware of a way to
> >>> > distinguish a looped back DAD probe from an actual duplicate address
> >>> > probe elsewhere on the network.
> >>> >
> >>>
> >>> Hi Neil & Jay,
> >>>
> >>> Thanks a lot for the comments.
> >>>
> >>> The use case is to add IPv6 address on the bonding interface first,
> >>> and then set up port channel on switch. We'll hit this issue and the
> >>> new address will stay tentative and unusable after port channel is set
> >>> up on switch. This patch is for this valid use case.
> >>>
> >>> Except failover mode, all slaves are active on receiving packets, so
> >>> we are receiving such looped back DAD and the bonding driver cannot
> >>> ignore them. I cannot think of a way to distinguish if a DAD is looped
> >>> back or from someone else having the same mac address. They look the
> >>> same to the host. If there is another machine having the same mac
> >>> address, this code path gets executed if both are doing DAD at the
> >>> same time for the same IPv6 address. Maybe we should find out what the
> >>> specification defines for this case?
> >>>
> >>
> >>RFC4862 has a discussion about this issue:
> >>http://tools.ietf.org/html/rfc4862#appendix-A
> >>The better solution could be to record the number of DAD sent out. If
> >>we received more DAD packets than we sent out, there is someone else
> >>on the network who has the same mac address and sent DAD for the same
> >>IPv6 address. However, this solution doesn't work with bonding
> >>interface, since all other active slaves but the one sending out DAD
> >>will receive packet looped back. It doesn't seem there is a simple
> >>solution for this issue.
> >
> >        Why are you setting up the port channel after configuring the
> > bond?
> >
> >        As a possible workaround, if you have control over the setup
> > process (perhaps it's some sort of manual process), adding one slave to
> > the bond, leaving the other soon-to-be slaves down, then setting up the
> > switch, and finally adding the remaining slaves should work around the
> > issue, since if the bond has only one slave it won't see any looped
> > packets.
> >
> >        Or you could bring the bond up as active-backup, then change the
> > mode to balance-xor once the switch is configured.
> >
> >        Ultimately, though, the problem stems from the settings mismatch
> > between the switch and the bonding system; balance-xor is meant to
> > interoperate with etherchannel, and when the switch is not configured
> > properly, correct behavior is difficult to guarantee.
> >
> 
> Jay,
> 
> Thanks a lot for the suggestion.
> 
> It's mainly about usability. We would like to provide customers with
> consistent IPv6 configuration procedures as IPv4.  Such workarounds
> could be confusing and generate customer calls.
> 
Its not a workaround, its the way it has to be done.  You can't just drop dad
packets because you can't tell the difference between those that are looped back
and those that are legitimaely from other hosts, so you need to do something
like what Jay is suggesting.

> Yinglin
> 

^ permalink raw reply

* Re: [PATCH 1/2 net-next] r6040: invoke phy_{start,stop} when appropriate
From: Eric Dumazet @ 2011-10-07 10:24 UTC (permalink / raw)
  To: Florian Fainelli; +Cc: davem, netdev
In-Reply-To: <201110071136.22107.florian@openwrt.org>

Le vendredi 07 octobre 2011 à 11:36 +0200, Florian Fainelli a écrit :
> Joe reported to me that right after a bring up of a r6040 interface
> the ethtool output had no consistent output with respect to link duplex
> and speed. Fix this by adding a missing phy_start call in r6040_up and
> conversely a phy_stop call in r6040_down to properly initialize phy states.
> 
> Reported-by: Joe Chou <Joe.Chou@rdc.com.tw>
> Signed-off-by: Florian Fainelli <florian@openwrt.org>
> ---
> diff --git a/drivers/net/ethernet/rdc/r6040.c b/drivers/net/ethernet/rdc/r6040.c
> index 2bbadc0..a128c3d 100644
> --- a/drivers/net/ethernet/rdc/r6040.c
> +++ b/drivers/net/ethernet/rdc/r6040.c
> @@ -470,6 +470,8 @@ static void r6040_down(struct net_device *dev)
>  	iowrite16(adrp[0], ioaddr + MID_0L);
>  	iowrite16(adrp[1], ioaddr + MID_0M);
>  	iowrite16(adrp[2], ioaddr + MID_0H);
> +
> +	phy_stop(lp->phydev);
>  }
>  
>  static int r6040_close(struct net_device *dev)
> @@ -727,6 +729,8 @@ static int r6040_up(struct net_device *dev)
>  	/* Initialize all MAC registers */
>  	r6040_init_mac_regs(dev);
>  
> +	phy_start(lp->phydev);
> +
>  	return 0;
>  }
>  

You dont need to submit your patches twice (net and net-next)

If its a bug fix, base your patch against net tree

If its a new feature, against net-next

^ permalink raw reply

* [PATCH 2/2 net] r6040: bump version to 0.28 and date to 07Oct2011.
From: Florian Fainelli @ 2011-10-07  9:41 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, stable


Signed-off-by: Florian Fainelli <florian@openwrt.org>
CC: stable@kernel.org # 2.6.38+
---
diff --git a/drivers/net/r6040.c b/drivers/net/r6040.c
index ab59538..6b21565 100644
--- a/drivers/net/r6040.c
+++ b/drivers/net/r6040.c
@@ -49,8 +49,8 @@
 #include <asm/processor.h>
 
 #define DRV_NAME	"r6040"
-#define DRV_VERSION	"0.27"
-#define DRV_RELDATE	"23Feb2011"
+#define DRV_VERSION	"0.28"
+#define DRV_RELDATE	"07Oct2011"
 
 /* PHY CHIP Address */
 #define PHY1_ADDR	1	/* For MAC1 */
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 1/2 net] r6040: invoke phy_{start,stop} when appropriate
From: Florian Fainelli @ 2011-10-07  9:40 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, stable

Joe reported to me that right after a bring up of a r6040 interface
the ethtool output had no consistent output with respect to link duplex
and speed. Fix this by adding a missing phy_start call in r6040_up and
conversely a phy_stop call in r6040_down to properly initialize phy states.

Reported-by: Joe Chou <Joe.Chou@rdc.com.tw>
Signed-off-by: Florian Fainelli <florian@openwrt.org>
CC: stable@kernel.org # 2.6.38+
---
diff --git a/drivers/net/r6040.c b/drivers/net/r6040.c
index b64fcee..ab59538 100644
--- a/drivers/net/r6040.c
+++ b/drivers/net/r6040.c
@@ -470,6 +470,8 @@ static void r6040_down(struct net_device *dev)
 	iowrite16(adrp[0], ioaddr + MID_0L);
 	iowrite16(adrp[1], ioaddr + MID_0M);
 	iowrite16(adrp[2], ioaddr + MID_0H);
+
+	phy_stop(lp->phydev);
 }
 
 static int r6040_close(struct net_device *dev)
@@ -727,6 +729,8 @@ static int r6040_up(struct net_device *dev)
 	/* Initialize all MAC registers */
 	r6040_init_mac_regs(dev);
 
+	phy_start(lp->phydev);
+
 	return 0;
 }
 
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 2/2 net-next] r6040: bump version to 0.28 and date to 07Oct2011.
From: Florian Fainelli @ 2011-10-07  9:36 UTC (permalink / raw)
  To: David Miller; +Cc: netdev


Signed-off-by: Florian Fainelli <florian@openwrt.org>
---
diff --git a/drivers/net/ethernet/rdc/r6040.c b/drivers/net/ethernet/rdc/r6040.c
index a128c3d..1fc01ca 100644
--- a/drivers/net/ethernet/rdc/r6040.c
+++ b/drivers/net/ethernet/rdc/r6040.c
@@ -49,8 +49,8 @@
 #include <asm/processor.h>
 
 #define DRV_NAME	"r6040"
-#define DRV_VERSION	"0.27"
-#define DRV_RELDATE	"23Feb2011"
+#define DRV_VERSION	"0.28"
+#define DRV_RELDATE	"07Oct2011"
 
 /* PHY CHIP Address */
 #define PHY1_ADDR	1	/* For MAC1 */
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 1/2 net-next] r6040: invoke phy_{start,stop} when appropriate
From: Florian Fainelli @ 2011-10-07  9:36 UTC (permalink / raw)
  To: davem; +Cc: netdev

Joe reported to me that right after a bring up of a r6040 interface
the ethtool output had no consistent output with respect to link duplex
and speed. Fix this by adding a missing phy_start call in r6040_up and
conversely a phy_stop call in r6040_down to properly initialize phy states.

Reported-by: Joe Chou <Joe.Chou@rdc.com.tw>
Signed-off-by: Florian Fainelli <florian@openwrt.org>
---
diff --git a/drivers/net/ethernet/rdc/r6040.c b/drivers/net/ethernet/rdc/r6040.c
index 2bbadc0..a128c3d 100644
--- a/drivers/net/ethernet/rdc/r6040.c
+++ b/drivers/net/ethernet/rdc/r6040.c
@@ -470,6 +470,8 @@ static void r6040_down(struct net_device *dev)
 	iowrite16(adrp[0], ioaddr + MID_0L);
 	iowrite16(adrp[1], ioaddr + MID_0M);
 	iowrite16(adrp[2], ioaddr + MID_0H);
+
+	phy_stop(lp->phydev);
 }
 
 static int r6040_close(struct net_device *dev)
@@ -727,6 +729,8 @@ static int r6040_up(struct net_device *dev)
 	/* Initialize all MAC registers */
 	r6040_init_mac_regs(dev);
 
+	phy_start(lp->phydev);
+
 	return 0;
 }
 
-- 
1.7.4.1

^ permalink raw reply related

* Re: [PATCH v5 0/8] per-cgroup tcp buffer pressure settings
From: KAMEZAWA Hiroyuki @ 2011-10-07  8:55 UTC (permalink / raw)
  To: Glauber Costa
  Cc: linux-kernel, paul, lizf, ebiederm, davem, gthelen, netdev,
	linux-mm, kirill, avagin, devel
In-Reply-To: <4E8EB634.9090208@parallels.com>

On Fri, 7 Oct 2011 12:20:04 +0400
Glauber Costa <glommer@parallels.com> wrote:

> >
> >> So what I really mean here with "will integrate later", is that I think
> >> that we'd be better off tracking the allocations themselves at the slab
> >> level.
> >>
> >>>      Can't tcp-limit-code borrows some amount of charges in batch from kmem_limit
> >>>      and use it ?
> >> Sorry, I don't know what exactly do you mean. Can you clarify?
> >>
> > Now, tcp-usage is independent from kmem-usage.
> >
> > My idea is
> >
> >    1. when you account tcp usage, charge kmem, too.
> 
> Absolutely.
> >    Now, your work is
> >       a) tcp use new xxxx bytes.
> >       b) account it to tcp.uage and check tcp limit
> >
> >    To ingegrate kmem,
> >       a) tcp use new xxxx bytes.
> >       b) account it to tcp.usage and check tcp limit
> >       c) account it to kmem.usage
> >
> > ? 2 counters may be slow ?
> 
> Well, the way I see it, 1 counter is slow already =)
> I honestly think we need some optimizations here. But
> that is a side issue.
> 
> To begin with: The new patchset that I intend to spin
> today or Monday, depending on my progress, uses res_counters,
> as you and Kirill requested.
> 
> So what makes res_counters slow IMHO, is two things:
> 
> 1) interrupts are always disabled.
> 2) All is done under a lock.
> 
> Now, we are starting to have resources that are billed to multiple
> counters. One simple way to work around it, is to have child counters
> that has to be accounted for as well everytime a resource is counted.
> 
> Like this:
> 
> 1) tcp has kmem as child. When we bill to tcp, we bill to kmem as well.
>     For protocols that do memory pressure, we then don't bill kmem from
>     the slab.
> 2) When kmem_independent_account is set to 0, kmem has mem as child.
> 

Seems reasonable.


> >
> >
> >>>    - Don't you need a stat file to indicate "tcp memory pressure works!" ?
> >>>      It can be obtained already ?
> >>
> >> Not 100 % clear as well. We can query the amount of buffer used, and the
> >> amount of buffer allowed. What else do we need?
> >>
> >
> > IIUC, we can see the fact tcp.usage is near to tcp.limit but never can see it
> > got memory pressure and how many numbers of failure happens.
> > I'm sorry if I don't read codes correctly.
> 
> IIUC, With res_counters being used, we get at least failcnt for free, right?
> 

Right. you can get failcnt and max_usage and can have soft_limit base
implemenation at the same time.

Thank you.
-Kame



 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH v5 0/8] per-cgroup tcp buffer pressure settings
From: Glauber Costa @ 2011-10-07  8:20 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-kernel, paul, lizf, ebiederm, davem, gthelen, netdev,
	linux-mm, kirill, avagin, devel
In-Reply-To: <20111007170522.624fab3d.kamezawa.hiroyu@jp.fujitsu.com>

On 10/07/2011 12:05 PM, KAMEZAWA Hiroyuki wrote:
>
>
> Sorry for lazy answer.
Hi Kame,

Now matter how hard you try, you'll never be as lazy as I am. So that's 
okay.

>
> On Wed, 5 Oct 2011 11:25:50 +0400
> Glauber Costa<glommer@parallels.com>  wrote:
>
>> On 10/05/2011 04:29 AM, KAMEZAWA Hiroyuki wrote:
>>> On Tue,  4 Oct 2011 16:17:52 +0400
>>> Glauber Costa<glommer@parallels.com>   wrote:
>>>
>
>>> At this stage, my concern is view of interfaces and documenation, and future plans.
>>
>> Okay. I will try to address them as well as I can.
>>
>>> * memory.independent_kmem_limit
>>>    If 1, kmem_limit_in_bytes/kmem_usage_in_bytes works.
>>>    If 0, kmem_limit_in_bytes/kmem_usage_in_bytes doesn't work and all kmem
>>>       usages are controlled under memory.limit_in_bytes.
>>
>> Correct. For the questions below, I won't even look at the code not to
>> get misguided. Let's settle on the desired behavior, and everything that
>> deviates from it, is a bug.
>>
>>> Question:
>>>    - What happens when parent/chidlren cgroup has different indepedent_kmem_limit ?
>> I think it should be forbidden. It was raised by Kirill before, and
>> IIRC, he specifically requested it to be. (Okay: Saying it now, makes me
>> realizes that the child can have set it to 1 while parent was 1. But
>> then parent sets it to 0... I don't think I am handling this case).
>>
>
> ok, please put it into TODO list ;)

Done.

>
>
>>>    In future plan, kmem.usage_in_bytes should includes tcp.kmem_usage_in_bytes.
>>>    And kmem.limit_in_bytes should be the limiation of sum of all kmem.xxxx.limit_in_bytes.
>>
>> I am not sure there will be others xxx.limit_in_bytes. (see below)
>>
>
> ok.
>
>
>>>
>>> Question:
>>>    - Why this integration is difficult ?
>> It is not that it is difficult.
>> What happens is that there are two things taking place here:
>> One of them is allocation.
>> The other, is tcp-specific pressure thresholds. Bear with me with the
>> following example code: (from sk_stream_alloc_skb, net/ipv4/tcp.c)
>>
>> 1:      skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
>>           if (skb) {
>> 3:              if (sk_wmem_schedule(sk, skb->truesize)) {
>>                           /*
>>                            * Make sure that we have exactly size bytes
>>                            * available to the caller, no more, no less.
>>                            */
>>                           skb_reserve(skb, skb_tailroom(skb) - size);
>>                           return skb;
>>                   }
>>                   __kfree_skb(skb);
>>           } else {
>>                   sk->sk_prot->enter_memory_pressure(sk);
>>                   sk_stream_moderate_sndbuf(sk);
>>           }
>>
>> In line 1, an allocation takes place. This allocs memory from the skbuff
>> slab cache.
>> But then, pressure thresholds are applied in 3. If it fails, we drop the
>> memory buffer even if the allocation succeeded.
>>
>
> Sure.
>
>
>> So this patchset, as I've stated already, cares about pressure
>> conditions only. It is enough to guarantee that no more memory will be
>> pinned that we specified, because we'll free the allocation in case
>> pressure is reached.
>>
>> There is work in progress from guys at google (and I have my very own
>> PoCs as well), to include all slab allocations in kmem.usage_in_bytes.
>>
>
> ok.
>
>
>> So what I really mean here with "will integrate later", is that I think
>> that we'd be better off tracking the allocations themselves at the slab
>> level.
>>
>>>      Can't tcp-limit-code borrows some amount of charges in batch from kmem_limit
>>>      and use it ?
>> Sorry, I don't know what exactly do you mean. Can you clarify?
>>
> Now, tcp-usage is independent from kmem-usage.
>
> My idea is
>
>    1. when you account tcp usage, charge kmem, too.

Absolutely.
>    Now, your work is
>       a) tcp use new xxxx bytes.
>       b) account it to tcp.uage and check tcp limit
>
>    To ingegrate kmem,
>       a) tcp use new xxxx bytes.
>       b) account it to tcp.usage and check tcp limit
>       c) account it to kmem.usage
>
> ? 2 counters may be slow ?

Well, the way I see it, 1 counter is slow already =)
I honestly think we need some optimizations here. But
that is a side issue.

To begin with: The new patchset that I intend to spin
today or Monday, depending on my progress, uses res_counters,
as you and Kirill requested.

So what makes res_counters slow IMHO, is two things:

1) interrupts are always disabled.
2) All is done under a lock.

Now, we are starting to have resources that are billed to multiple
counters. One simple way to work around it, is to have child counters
that has to be accounted for as well everytime a resource is counted.

Like this:

1) tcp has kmem as child. When we bill to tcp, we bill to kmem as well.
    For protocols that do memory pressure, we then don't bill kmem from
    the slab.
2) When kmem_independent_account is set to 0, kmem has mem as child.

>
>
>>>    - Don't you need a stat file to indicate "tcp memory pressure works!" ?
>>>      It can be obtained already ?
>>
>> Not 100 % clear as well. We can query the amount of buffer used, and the
>> amount of buffer allowed. What else do we need?
>>
>
> IIUC, we can see the fact tcp.usage is near to tcp.limit but never can see it
> got memory pressure and how many numbers of failure happens.
> I'm sorry if I don't read codes correctly.

IIUC, With res_counters being used, we get at least failcnt for free, right?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH v5 0/8] per-cgroup tcp buffer pressure settings
From: KAMEZAWA Hiroyuki @ 2011-10-07  8:05 UTC (permalink / raw)
  To: Glauber Costa
  Cc: linux-kernel, paul, lizf, ebiederm, davem, gthelen, netdev,
	linux-mm, kirill, avagin, devel
In-Reply-To: <4E8C067E.6040203@parallels.com>



Sorry for lazy answer.

On Wed, 5 Oct 2011 11:25:50 +0400
Glauber Costa <glommer@parallels.com> wrote:

> On 10/05/2011 04:29 AM, KAMEZAWA Hiroyuki wrote:
> > On Tue,  4 Oct 2011 16:17:52 +0400
> > Glauber Costa<glommer@parallels.com>  wrote:
> >

> > At this stage, my concern is view of interfaces and documenation, and future plans.
> 
> Okay. I will try to address them as well as I can.
> 
> > * memory.independent_kmem_limit
> >   If 1, kmem_limit_in_bytes/kmem_usage_in_bytes works.
> >   If 0, kmem_limit_in_bytes/kmem_usage_in_bytes doesn't work and all kmem
> >      usages are controlled under memory.limit_in_bytes.
> 
> Correct. For the questions below, I won't even look at the code not to 
> get misguided. Let's settle on the desired behavior, and everything that 
> deviates from it, is a bug.
> 
> > Question:
> >   - What happens when parent/chidlren cgroup has different indepedent_kmem_limit ?
> I think it should be forbidden. It was raised by Kirill before, and 
> IIRC, he specifically requested it to be. (Okay: Saying it now, makes me 
> realizes that the child can have set it to 1 while parent was 1. But 
> then parent sets it to 0... I don't think I am handling this case).
> 

ok, please put it into TODO list ;)



> >   In future plan, kmem.usage_in_bytes should includes tcp.kmem_usage_in_bytes.
> >   And kmem.limit_in_bytes should be the limiation of sum of all kmem.xxxx.limit_in_bytes.
> 
> I am not sure there will be others xxx.limit_in_bytes. (see below)
> 

ok.


> >
> > Question:
> >   - Why this integration is difficult ?
> It is not that it is difficult.
> What happens is that there are two things taking place here:
> One of them is allocation.
> The other, is tcp-specific pressure thresholds. Bear with me with the 
> following example code: (from sk_stream_alloc_skb, net/ipv4/tcp.c)
> 
> 1:      skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
>          if (skb) {
> 3:              if (sk_wmem_schedule(sk, skb->truesize)) {
>                          /*
>                           * Make sure that we have exactly size bytes
>                           * available to the caller, no more, no less.
>                           */
>                          skb_reserve(skb, skb_tailroom(skb) - size);
>                          return skb;
>                  }
>                  __kfree_skb(skb);
>          } else {
>                  sk->sk_prot->enter_memory_pressure(sk);
>                  sk_stream_moderate_sndbuf(sk);
>          }
> 
> In line 1, an allocation takes place. This allocs memory from the skbuff 
> slab cache.
> But then, pressure thresholds are applied in 3. If it fails, we drop the 
> memory buffer even if the allocation succeeded.
> 

Sure.


> So this patchset, as I've stated already, cares about pressure 
> conditions only. It is enough to guarantee that no more memory will be 
> pinned that we specified, because we'll free the allocation in case 
> pressure is reached.
> 
> There is work in progress from guys at google (and I have my very own 
> PoCs as well), to include all slab allocations in kmem.usage_in_bytes.
> 

ok.


> So what I really mean here with "will integrate later", is that I think 
> that we'd be better off tracking the allocations themselves at the slab 
> level.
> 
> >     Can't tcp-limit-code borrows some amount of charges in batch from kmem_limit
> >     and use it ?
> Sorry, I don't know what exactly do you mean. Can you clarify?
> 
Now, tcp-usage is independent from kmem-usage.

My idea is
  
  1. when you account tcp usage, charge kmem, too.

  Now, your work is
     a) tcp use new xxxx bytes.
     b) account it to tcp.uage and check tcp limit
 
  To ingegrate kmem,
     a) tcp use new xxxx bytes.
     b) account it to tcp.usage and check tcp limit
     c) account it to kmem.usage

? 2 counters may be slow ?


> >   - Don't you need a stat file to indicate "tcp memory pressure works!" ?
> >     It can be obtained already ?
> 
> Not 100 % clear as well. We can query the amount of buffer used, and the 
> amount of buffer allowed. What else do we need?
> 

IIUC, we can see the fact tcp.usage is near to tcp.limit but never can see it
got memory pressure and how many numbers of failure happens.
I'm sorry if I don't read codes correctly. 

Thanks,
-Kame



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH] pch_gbe: compilation warning in pch_gbe_setup_rctl() fixed
From: Vasily Averin @ 2011-10-07  7:59 UTC (permalink / raw)
  To: netdev; +Cc: David S. Miller, Toshiharu Okada, Vasily Averin

From: Vasily Averin <vvs@sw.ru>

compilation warning fixed
drivers/net/pch_gbe/pch_gbe_main.c: In function ‘pch_gbe_setup_rctl’:
drivers/net/pch_gbe/pch_gbe_main.c:701:21: warning: unused variable ‘netdev’

Signed-off-by: Vasily Averin <vvs@sw.ru>
---
 drivers/net/pch_gbe/pch_gbe_main.c |    1 -
 1 files changed, 0 insertions(+), 1 deletions(-)

diff --git a/drivers/net/pch_gbe/pch_gbe_main.c b/drivers/net/pch_gbe/pch_gbe_main.c
index b8b4ba2..7659324 100644
--- a/drivers/net/pch_gbe/pch_gbe_main.c
+++ b/drivers/net/pch_gbe/pch_gbe_main.c
@@ -698,7 +698,6 @@ static void pch_gbe_configure_tx(struct pch_gbe_adapter *adapter)
  */
 static void pch_gbe_setup_rctl(struct pch_gbe_adapter *adapter)
 {
-	struct net_device *netdev = adapter->netdev;
 	struct pch_gbe_hw *hw = &adapter->hw;
 	u32 rx_mode, tcpip;
 
-- 
1.7.4.1

^ permalink raw reply related

* Re: loopback IP alias breaks tftp?
From: Eric Dumazet @ 2011-10-07  7:29 UTC (permalink / raw)
  To: Joel Sing; +Cc: Josh Boyer, Julian Anastasov, netdev
In-Reply-To: <CAA6X_dP_LDO0hQYpixi-DtsKVJZQEgAY5wa4y_wYpEqXVyn7TQ@mail.gmail.com>

Le vendredi 07 octobre 2011 à 18:02 +1100, Joel Sing a écrit :
> On 7 October 2011 00:23, Josh Boyer <jwboyer@redhat.com> wrote:
> >
> > On Thu, Oct 06, 2011 at 12:18:44AM +0300, Julian Anastasov wrote:
> > >
> > >       Hello,
> > >
> > > On Wed, 5 Oct 2011, Josh Boyer wrote:
> > >
> > > > Hi All,
> > > >
> > > > We've had a report [1] of a change in behavior when trying to use an IP
> > > > alias to tftp from a loopback device.  Apparently the steps outlined in
> > > > the bug worked in 2.6.35, and broke somewhere before 2.6.38.6.
> > > >
> > > > I can confirm the steps fail on a 3.0 based kernel and I'm trying to do
> > > > a git bisect to find the commit involved, but I thought I would send
> > > > this along to see if anyone might have an idea.  (Also, I'm not really
> > > > sure how valid of a usecase this was to begin with.)
> > >
> > >       What about commit 9fc3bbb4a752f108cf096d96640f3b548bbbce6c ?
> > >
> > > ipv4/route.c: respect prefsrc for local routes
> > >
> > > http://marc.info/?t=129412232500001&r=1&w=2
> > >
> > > > [1] https://bugzilla.redhat.com/show_bug.cgi?id=739534
> >
> > Yep.  That is exactly what my git bisect said too.
> >
> > So now we have a change in behavior since that commit for the usecase
> > described in the bug above, but I'm unsure if that usecase was ever
> > really valid or if the commit had unintentional side effects.
> >
> > Joel (or anyone else) can you take a look and comment?
> 
> Prior to this commit the src address in the local routing table was
> completely ignored, so connecting to a local address always used the
> same source and destination addresses. However, this is not what was
> configured in the local routing table:
> 
>   $ ifconfig lo:0 127.0.0.2
>   $ ip route show table local | grep 127.0.0.2
>   local 127.0.0.2 dev lo  proto kernel  scope host  src 127.0.0.1
> 
> When an interface has an alias configured, the source address
> installed in the local routing table is always the primary address for
> the interface. The tftp use case you've described now breaks due to
> the way that in.tftpd is determining the reply address (as documented
> in the in.tftpd manual page). This means that it is now responding
> from 127.0.0.1 even though the client connected to 127.0.0.2. Binding
> inetd to a specific address will avoid this issue.
> 
> I have not yet looked to see if there is a specific reason for the
> source address selection, however one way of restoring the previous
> behaviour (whilst still respecting the configured source address)
> would be to use a default source address which matches the configured
> address (as is done for primary addresses). I cannot immediately think
> of a reason not to do this, but I've not gone looking at the code.
> 
> Worst case scenario if you specifically need the original behaviour
> then the source address can be changed in the local routing table:
> 
>   $ ip route change table local local 127.0.0.2 dev lo:0 proto kernel
> scope host src 127.0.0.2
>   $ ip route show table local | grep 127.0.0.2
>   local 127.0.0.2 dev lo  proto kernel  scope host  src 127.0.0.2

Agreed.

The "table local" bit is really the key, because following naive setup
doesnt work.

# ip addr add 127.0.0.11/8 dev lo
# ip route add 127.0.0.11 dev lo src 127.0.0.11

since 127.0.0.1 will still be the src address selected for connections.

# ip ro show table local | grep 127.0.0.11
local 127.0.0.11 dev lo  proto kernel  scope host  src 127.0.0.1 

^ permalink raw reply

* Re: [net-next] cs89x0: Move the driver into the Cirrus dir
From: Jeff Kirsher @ 2011-10-07  7:21 UTC (permalink / raw)
  To: Jeff Kirsher
  Cc: davem, netdev, gospo, sassmann, Russell Nelson, Andrew Morton
In-Reply-To: <1317971926-23145-2-git-send-email-jeffrey.t.kirsher@intel.com>

[-- Attachment #1: Type: text/plain, Size: 573 bytes --]

On 10/07/2011 12:18 AM, Jeff Kirsher wrote:
> The cs89x0 driver was initial placed in the apple/ when it
> should have been placed in the cirrus/.  This resolves the
> issue by moving the dirver and fixing up the respective
> Kconfig(s) and Makefile(s).
>
> Thanks to Sascha for reporting the issue.
>
> CC: Russell Nelson <nelson@crynwr.com>
> CC: Andrew Morton <akpm@linux-foundation.org>
> Reported-by: Sascha Hauer <s.hauer@pengutronix.de>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Sorry, I did not mean to send this out a second time.


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 900 bytes --]

^ permalink raw reply

* [net-next 13/13] igb: consolidate creation of Tx buffer info and data descriptor
From: Jeff Kirsher @ 2011-10-07  7:18 UTC (permalink / raw)
  To: davem; +Cc: Alexander Duyck, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1317971926-23145-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This change will combine the writes of tx_buffer_info and the Tx data
descriptors into a single function. The advantage of this is that we can
avoid needless memory reads from the buffer info struct and speed things up
by keeping the accesses to the local registers.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by:  Aaron Brown  <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb.h      |    8 +-
 drivers/net/ethernet/intel/igb/igb_main.c |  318 ++++++++++++++++-------------
 2 files changed, 184 insertions(+), 142 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index b71d186..77793a9 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -135,7 +135,6 @@ struct vf_data_storage {
 #define IGB_TX_FLAGS_TSO		0x00000004
 #define IGB_TX_FLAGS_IPV4		0x00000008
 #define IGB_TX_FLAGS_TSTAMP		0x00000010
-#define IGB_TX_FLAGS_MAPPED_AS_PAGE	0x00000020
 #define IGB_TX_FLAGS_VLAN_MASK		0xffff0000
 #define IGB_TX_FLAGS_VLAN_SHIFT	16
 
@@ -144,13 +143,12 @@ struct vf_data_storage {
 struct igb_tx_buffer {
 	union e1000_adv_tx_desc *next_to_watch;
 	unsigned long time_stamp;
-	dma_addr_t dma;
-	u32 length;
-	u32 tx_flags;
 	struct sk_buff *skb;
 	unsigned int bytecount;
 	u16 gso_segs;
-	u8 mapped_as_page;
+	dma_addr_t dma;
+	u32 length;
+	u32 tx_flags;
 };
 
 struct igb_rx_buffer {
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index dc93d64..862dd7c 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -3139,29 +3139,26 @@ static void igb_free_all_tx_resources(struct igb_adapter *adapter)
 		igb_free_tx_resources(adapter->tx_ring[i]);
 }
 
-void igb_unmap_and_free_tx_resource(struct igb_ring *tx_ring,
-				    struct igb_tx_buffer *buffer_info)
-{
-	if (buffer_info->dma) {
-		if (buffer_info->tx_flags & IGB_TX_FLAGS_MAPPED_AS_PAGE)
-			dma_unmap_page(tx_ring->dev,
-					buffer_info->dma,
-					buffer_info->length,
-					DMA_TO_DEVICE);
-		else
-			dma_unmap_single(tx_ring->dev,
-					buffer_info->dma,
-					buffer_info->length,
-					DMA_TO_DEVICE);
-		buffer_info->dma = 0;
-	}
-	if (buffer_info->skb) {
-		dev_kfree_skb_any(buffer_info->skb);
-		buffer_info->skb = NULL;
-	}
-	buffer_info->time_stamp = 0;
-	buffer_info->length = 0;
-	buffer_info->next_to_watch = NULL;
+void igb_unmap_and_free_tx_resource(struct igb_ring *ring,
+				    struct igb_tx_buffer *tx_buffer)
+{
+	if (tx_buffer->skb) {
+		dev_kfree_skb_any(tx_buffer->skb);
+		if (tx_buffer->dma)
+			dma_unmap_single(ring->dev,
+					 tx_buffer->dma,
+					 tx_buffer->length,
+					 DMA_TO_DEVICE);
+	} else if (tx_buffer->dma) {
+		dma_unmap_page(ring->dev,
+			       tx_buffer->dma,
+			       tx_buffer->length,
+			       DMA_TO_DEVICE);
+	}
+	tx_buffer->next_to_watch = NULL;
+	tx_buffer->skb = NULL;
+	tx_buffer->dma = 0;
+	/* buffer_info must be completely set up in the transmit path */
 }
 
 /**
@@ -4138,124 +4135,153 @@ static __le32 igb_tx_olinfo_status(u32 tx_flags, unsigned int paylen,
 	return cpu_to_le32(olinfo_status);
 }
 
-#define IGB_MAX_TXD_PWR	16
-#define IGB_MAX_DATA_PER_TXD	(1<<IGB_MAX_TXD_PWR)
+/*
+ * The largest size we can write to the descriptor is 65535.  In order to
+ * maintain a power of two alignment we have to limit ourselves to 32K.
+ */
+#define IGB_MAX_TXD_PWR	15
+#define IGB_MAX_DATA_PER_TXD	(1 << IGB_MAX_TXD_PWR)
 
-static inline int igb_tx_map(struct igb_ring *tx_ring, struct sk_buff *skb,
-			     struct igb_tx_buffer *first, u32 tx_flags)
+static void igb_tx_map(struct igb_ring *tx_ring, struct sk_buff *skb,
+		       struct igb_tx_buffer *first, u32 tx_flags,
+		       const u8 hdr_len)
 {
-	struct igb_tx_buffer *buffer_info;
-	struct device *dev = tx_ring->dev;
-	unsigned int hlen = skb_headlen(skb);
-	unsigned int count = 0, i;
-	unsigned int f;
-	u16 gso_segs = skb_shinfo(skb)->gso_segs ?: 1;
-
-	i = tx_ring->next_to_use;
-
-	buffer_info = &tx_ring->tx_buffer_info[i];
-	BUG_ON(hlen >= IGB_MAX_DATA_PER_TXD);
-	buffer_info->length = hlen;
-	buffer_info->tx_flags = tx_flags;
-	buffer_info->dma = dma_map_single(dev, skb->data, hlen,
-					  DMA_TO_DEVICE);
-	if (dma_mapping_error(dev, buffer_info->dma))
+	struct igb_tx_buffer *tx_buffer_info;
+	union e1000_adv_tx_desc *tx_desc;
+	dma_addr_t dma;
+	struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0];
+	unsigned int data_len = skb->data_len;
+	unsigned int size = skb_headlen(skb);
+	unsigned int paylen = skb->len - hdr_len;
+	__le32 cmd_type;
+	u16 i = tx_ring->next_to_use;
+	u16 gso_segs;
+
+	if (tx_flags & IGB_TX_FLAGS_TSO)
+		gso_segs = skb_shinfo(skb)->gso_segs;
+	else
+		gso_segs = 1;
+
+	/* multiply data chunks by size of headers */
+	first->bytecount = paylen + (gso_segs * hdr_len);
+	first->gso_segs = gso_segs;
+	first->skb = skb;
+
+	tx_desc = IGB_TX_DESC(tx_ring, i);
+
+	tx_desc->read.olinfo_status =
+		igb_tx_olinfo_status(tx_flags, paylen, tx_ring);
+
+	cmd_type = igb_tx_cmd_type(tx_flags);
+
+	dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
+	if (dma_mapping_error(tx_ring->dev, dma))
 		goto dma_error;
 
-	tx_flags |= IGB_TX_FLAGS_MAPPED_AS_PAGE;
+	/* record length, and DMA address */
+	first->length = size;
+	first->dma = dma;
+	first->tx_flags = tx_flags;
+	tx_desc->read.buffer_addr = cpu_to_le64(dma);
 
-	for (f = 0; f < skb_shinfo(skb)->nr_frags; f++) {
-		struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[f];
-		unsigned int len = frag->size;
+	for (;;) {
+		while (unlikely(size > IGB_MAX_DATA_PER_TXD)) {
+			tx_desc->read.cmd_type_len =
+				cmd_type | cpu_to_le32(IGB_MAX_DATA_PER_TXD);
+
+			i++;
+			tx_desc++;
+			if (i == tx_ring->count) {
+				tx_desc = IGB_TX_DESC(tx_ring, 0);
+				i = 0;
+			}
+
+			dma += IGB_MAX_DATA_PER_TXD;
+			size -= IGB_MAX_DATA_PER_TXD;
+
+			tx_desc->read.olinfo_status = 0;
+			tx_desc->read.buffer_addr = cpu_to_le64(dma);
+		}
+
+		if (likely(!data_len))
+			break;
+
+		tx_desc->read.cmd_type_len = cmd_type | cpu_to_le32(size);
 
-		count++;
 		i++;
-		if (i == tx_ring->count)
+		tx_desc++;
+		if (i == tx_ring->count) {
+			tx_desc = IGB_TX_DESC(tx_ring, 0);
 			i = 0;
+		}
 
-		buffer_info = &tx_ring->tx_buffer_info[i];
-		BUG_ON(len >= IGB_MAX_DATA_PER_TXD);
-		buffer_info->length = len;
-		buffer_info->tx_flags = tx_flags;
-		buffer_info->dma = skb_frag_dma_map(dev, frag, 0, len,
-						DMA_TO_DEVICE);
-		if (dma_mapping_error(dev, buffer_info->dma))
+		size = frag->size;
+		data_len -= size;
+
+		dma = skb_frag_dma_map(tx_ring->dev, frag, 0,
+				   size, DMA_TO_DEVICE);
+		if (dma_mapping_error(tx_ring->dev, dma))
 			goto dma_error;
 
+		tx_buffer_info = &tx_ring->tx_buffer_info[i];
+		tx_buffer_info->length = size;
+		tx_buffer_info->dma = dma;
+
+		tx_desc->read.olinfo_status = 0;
+		tx_desc->read.buffer_addr = cpu_to_le64(dma);
+
+		frag++;
 	}
 
-	buffer_info->skb = skb;
-	/* multiply data chunks by size of headers */
-	buffer_info->bytecount = ((gso_segs - 1) * hlen) + skb->len;
-	buffer_info->gso_segs = gso_segs;
+	/* write last descriptor with RS and EOP bits */
+	cmd_type |= cpu_to_le32(size) | cpu_to_le32(IGB_TXD_DCMD);
+	tx_desc->read.cmd_type_len = cmd_type;
 
 	/* set the timestamp */
 	first->time_stamp = jiffies;
 
+	/*
+	 * Force memory writes to complete before letting h/w know there
+	 * are new descriptors to fetch.  (Only applicable for weak-ordered
+	 * memory model archs, such as IA-64).
+	 *
+	 * We also need this memory barrier to make certain all of the
+	 * status bits have been updated before next_to_watch is written.
+	 */
+	wmb();
+
 	/* set next_to_watch value indicating a packet is present */
-	first->next_to_watch = IGB_TX_DESC(tx_ring, i);
+	first->next_to_watch = tx_desc;
 
-	return ++count;
+	i++;
+	if (i == tx_ring->count)
+		i = 0;
 
-dma_error:
-	dev_err(dev, "TX DMA map failed\n");
+	tx_ring->next_to_use = i;
+
+	writel(i, tx_ring->tail);
 
-	/* clear timestamp and dma mappings for failed buffer_info mapping */
-	buffer_info->dma = 0;
-	buffer_info->time_stamp = 0;
-	buffer_info->length = 0;
+	/* we need this if more than one processor can write to our tail
+	 * at a time, it syncronizes IO on IA64/Altix systems */
+	mmiowb();
+
+	return;
+
+dma_error:
+	dev_err(tx_ring->dev, "TX DMA map failed\n");
 
-	/* clear timestamp and dma mappings for remaining portion of packet */
-	while (count--) {
+	/* clear dma mappings for failed tx_buffer_info map */
+	for (;;) {
+		tx_buffer_info = &tx_ring->tx_buffer_info[i];
+		igb_unmap_and_free_tx_resource(tx_ring, tx_buffer_info);
+		if (tx_buffer_info == first)
+			break;
 		if (i == 0)
 			i = tx_ring->count;
 		i--;
-		buffer_info = &tx_ring->tx_buffer_info[i];
-		igb_unmap_and_free_tx_resource(tx_ring, buffer_info);
 	}
 
-	return 0;
-}
-
-static inline void igb_tx_queue(struct igb_ring *tx_ring,
-				u32 tx_flags, int count, u32 paylen,
-				u8 hdr_len)
-{
-	union e1000_adv_tx_desc *tx_desc;
-	struct igb_tx_buffer *buffer_info;
-	__le32 olinfo_status, cmd_type;
-	unsigned int i = tx_ring->next_to_use;
-
-	cmd_type = igb_tx_cmd_type(tx_flags);
-	olinfo_status = igb_tx_olinfo_status(tx_flags,
-					     paylen - hdr_len,
-					     tx_ring);
-
-	do {
-		buffer_info = &tx_ring->tx_buffer_info[i];
-		tx_desc = IGB_TX_DESC(tx_ring, i);
-		tx_desc->read.buffer_addr = cpu_to_le64(buffer_info->dma);
-		tx_desc->read.cmd_type_len = cmd_type |
-					     cpu_to_le32(buffer_info->length);
-		tx_desc->read.olinfo_status = olinfo_status;
-		count--;
-		i++;
-		if (i == tx_ring->count)
-			i = 0;
-	} while (count > 0);
-
-	tx_desc->read.cmd_type_len |= cpu_to_le32(IGB_TXD_DCMD);
-	/* Force memory writes to complete before letting h/w
-	 * know there are new descriptors to fetch.  (Only
-	 * applicable for weak-ordered memory model archs,
-	 * such as IA-64). */
-	wmb();
-
 	tx_ring->next_to_use = i;
-	writel(i, tx_ring->tail);
-	/* we need this if more than one processor can write to our tail
-	 * at a time, it syncronizes IO on IA64/Altix systems */
-	mmiowb();
 }
 
 static int __igb_maybe_stop_tx(struct igb_ring *tx_ring, int size)
@@ -4295,7 +4321,7 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
 				struct igb_ring *tx_ring)
 {
 	struct igb_tx_buffer *first;
-	int tso, count;
+	int tso;
 	u32 tx_flags = 0;
 	__be16 protocol = vlan_get_protocol(skb);
 	u8 hdr_len = 0;
@@ -4335,19 +4361,7 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
 		tx_flags |= IGB_TX_FLAGS_CSUM;
 	}
 
-	/*
-	 * count reflects descriptors mapped, if 0 or less then mapping error
-	 * has occurred and we need to rewind the descriptor queue
-	 */
-	count = igb_tx_map(tx_ring, skb, first, tx_flags);
-	if (!count) {
-		dev_kfree_skb_any(skb);
-		first->time_stamp = 0;
-		tx_ring->next_to_use = first - tx_ring->tx_buffer_info;
-		return NETDEV_TX_OK;
-	}
-
-	igb_tx_queue(tx_ring, tx_flags, count, skb->len, hdr_len);
+	igb_tx_map(tx_ring, skb, first, tx_flags, hdr_len);
 
 	/* Make sure there is space in the ring for the next send. */
 	igb_maybe_stop_tx(tx_ring, MAX_SKB_FRAGS + 4);
@@ -5609,17 +5623,26 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 		/* clear next_to_watch to prevent false hangs */
 		tx_buffer->next_to_watch = NULL;
 
-		do {
-			tx_desc->wb.status = 0;
-			if (likely(tx_desc == eop_desc)) {
-				eop_desc = NULL;
+		/* update the statistics for this packet */
+		total_bytes += tx_buffer->bytecount;
+		total_packets += tx_buffer->gso_segs;
 
-				total_bytes += tx_buffer->bytecount;
-				total_packets += tx_buffer->gso_segs;
-				igb_tx_hwtstamp(q_vector, tx_buffer);
-			}
+		/* retrieve hardware timestamp */
+		igb_tx_hwtstamp(q_vector, tx_buffer);
+
+		/* free the skb */
+		dev_kfree_skb_any(tx_buffer->skb);
+		tx_buffer->skb = NULL;
+
+		/* unmap skb header data */
+		dma_unmap_single(tx_ring->dev,
+				 tx_buffer->dma,
+				 tx_buffer->length,
+				 DMA_TO_DEVICE);
 
-			igb_unmap_and_free_tx_resource(tx_ring, tx_buffer);
+		/* clear last DMA location and unmap remaining buffers */
+		while (tx_desc != eop_desc) {
+			tx_buffer->dma = 0;
 
 			tx_buffer++;
 			tx_desc++;
@@ -5629,7 +5652,28 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 				tx_buffer = tx_ring->tx_buffer_info;
 				tx_desc = IGB_TX_DESC(tx_ring, 0);
 			}
-		} while (eop_desc);
+
+			/* unmap any remaining paged data */
+			if (tx_buffer->dma) {
+				dma_unmap_page(tx_ring->dev,
+					       tx_buffer->dma,
+					       tx_buffer->length,
+					       DMA_TO_DEVICE);
+			}
+		}
+
+		/* clear last DMA location */
+		tx_buffer->dma = 0;
+
+		/* move us one more past the eop_desc for start of next pkt */
+		tx_buffer++;
+		tx_desc++;
+		i++;
+		if (unlikely(!i)) {
+			i -= tx_ring->count;
+			tx_buffer = tx_ring->tx_buffer_info;
+			tx_desc = IGB_TX_DESC(tx_ring, 0);
+		}
 	}
 
 	i += tx_ring->count;
-- 
1.7.6.4

^ permalink raw reply related

* [net-next 12/13] igb: Combine all flag info fields into a single tx_flags structure
From: Jeff Kirsher @ 2011-10-07  7:18 UTC (permalink / raw)
  To: davem; +Cc: Alexander Duyck, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1317971926-23145-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This change is meant to combine all of the TX flags fields into one u32
flags field so that it can be stored into the tx_buffer_info structure.
This includes the time stamp flag as well as mapped_as_page flag info.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by:  Aaron Brown  <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb.h      |    9 +++++++++
 drivers/net/ethernet/intel/igb/igb_main.c |   24 ++++++++----------------
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index 1608110..b71d186 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -130,6 +130,15 @@ struct vf_data_storage {
 
 #define IGB_MNG_VLAN_NONE -1
 
+#define IGB_TX_FLAGS_CSUM		0x00000001
+#define IGB_TX_FLAGS_VLAN		0x00000002
+#define IGB_TX_FLAGS_TSO		0x00000004
+#define IGB_TX_FLAGS_IPV4		0x00000008
+#define IGB_TX_FLAGS_TSTAMP		0x00000010
+#define IGB_TX_FLAGS_MAPPED_AS_PAGE	0x00000020
+#define IGB_TX_FLAGS_VLAN_MASK		0xffff0000
+#define IGB_TX_FLAGS_VLAN_SHIFT	16
+
 /* wrapper around a pointer to a socket buffer,
  * so a DMA handle can be stored along with the buffer */
 struct igb_tx_buffer {
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 3ebeb3e..dc93d64 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -3143,7 +3143,7 @@ void igb_unmap_and_free_tx_resource(struct igb_ring *tx_ring,
 				    struct igb_tx_buffer *buffer_info)
 {
 	if (buffer_info->dma) {
-		if (buffer_info->mapped_as_page)
+		if (buffer_info->tx_flags & IGB_TX_FLAGS_MAPPED_AS_PAGE)
 			dma_unmap_page(tx_ring->dev,
 					buffer_info->dma,
 					buffer_info->length,
@@ -3162,7 +3162,6 @@ void igb_unmap_and_free_tx_resource(struct igb_ring *tx_ring,
 	buffer_info->time_stamp = 0;
 	buffer_info->length = 0;
 	buffer_info->next_to_watch = NULL;
-	buffer_info->mapped_as_page = false;
 }
 
 /**
@@ -3955,14 +3954,6 @@ set_itr_now:
 	}
 }
 
-#define IGB_TX_FLAGS_CSUM		0x00000001
-#define IGB_TX_FLAGS_VLAN		0x00000002
-#define IGB_TX_FLAGS_TSO		0x00000004
-#define IGB_TX_FLAGS_IPV4		0x00000008
-#define IGB_TX_FLAGS_TSTAMP		0x00000010
-#define IGB_TX_FLAGS_VLAN_MASK		0xffff0000
-#define IGB_TX_FLAGS_VLAN_SHIFT		        16
-
 void igb_tx_ctxtdesc(struct igb_ring *tx_ring, u32 vlan_macip_lens,
 		     u32 type_tucmd, u32 mss_l4len_idx)
 {
@@ -4151,7 +4142,7 @@ static __le32 igb_tx_olinfo_status(u32 tx_flags, unsigned int paylen,
 #define IGB_MAX_DATA_PER_TXD	(1<<IGB_MAX_TXD_PWR)
 
 static inline int igb_tx_map(struct igb_ring *tx_ring, struct sk_buff *skb,
-			     struct igb_tx_buffer *first)
+			     struct igb_tx_buffer *first, u32 tx_flags)
 {
 	struct igb_tx_buffer *buffer_info;
 	struct device *dev = tx_ring->dev;
@@ -4165,11 +4156,14 @@ static inline int igb_tx_map(struct igb_ring *tx_ring, struct sk_buff *skb,
 	buffer_info = &tx_ring->tx_buffer_info[i];
 	BUG_ON(hlen >= IGB_MAX_DATA_PER_TXD);
 	buffer_info->length = hlen;
+	buffer_info->tx_flags = tx_flags;
 	buffer_info->dma = dma_map_single(dev, skb->data, hlen,
 					  DMA_TO_DEVICE);
 	if (dma_mapping_error(dev, buffer_info->dma))
 		goto dma_error;
 
+	tx_flags |= IGB_TX_FLAGS_MAPPED_AS_PAGE;
+
 	for (f = 0; f < skb_shinfo(skb)->nr_frags; f++) {
 		struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[f];
 		unsigned int len = frag->size;
@@ -4182,7 +4176,7 @@ static inline int igb_tx_map(struct igb_ring *tx_ring, struct sk_buff *skb,
 		buffer_info = &tx_ring->tx_buffer_info[i];
 		BUG_ON(len >= IGB_MAX_DATA_PER_TXD);
 		buffer_info->length = len;
-		buffer_info->mapped_as_page = true;
+		buffer_info->tx_flags = tx_flags;
 		buffer_info->dma = skb_frag_dma_map(dev, frag, 0, len,
 						DMA_TO_DEVICE);
 		if (dma_mapping_error(dev, buffer_info->dma))
@@ -4191,7 +4185,6 @@ static inline int igb_tx_map(struct igb_ring *tx_ring, struct sk_buff *skb,
 	}
 
 	buffer_info->skb = skb;
-	buffer_info->tx_flags = skb_shinfo(skb)->tx_flags;
 	/* multiply data chunks by size of headers */
 	buffer_info->bytecount = ((gso_segs - 1) * hlen) + skb->len;
 	buffer_info->gso_segs = gso_segs;
@@ -4211,7 +4204,6 @@ dma_error:
 	buffer_info->dma = 0;
 	buffer_info->time_stamp = 0;
 	buffer_info->length = 0;
-	buffer_info->mapped_as_page = false;
 
 	/* clear timestamp and dma mappings for remaining portion of packet */
 	while (count--) {
@@ -4347,7 +4339,7 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
 	 * count reflects descriptors mapped, if 0 or less then mapping error
 	 * has occurred and we need to rewind the descriptor queue
 	 */
-	count = igb_tx_map(tx_ring, skb, first);
+	count = igb_tx_map(tx_ring, skb, first, tx_flags);
 	if (!count) {
 		dev_kfree_skb_any(skb);
 		first->time_stamp = 0;
@@ -5567,7 +5559,7 @@ static void igb_tx_hwtstamp(struct igb_q_vector *q_vector,
 	u64 regval;
 
 	/* if skb does not support hw timestamp or TX stamp not valid exit */
-	if (likely(!(buffer_info->tx_flags & SKBTX_HW_TSTAMP)) ||
+	if (likely(!(buffer_info->tx_flags & IGB_TX_FLAGS_TSTAMP)) ||
 	    !(rd32(E1000_TSYNCTXCTL) & E1000_TSYNCTXCTL_VALID))
 		return;
 
-- 
1.7.6.4

^ permalink raw reply related

* [net-next 11/13] igb: Cleanup protocol handling in transmit path
From: Jeff Kirsher @ 2011-10-07  7:18 UTC (permalink / raw)
  To: davem; +Cc: Alexander Duyck, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1317971926-23145-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This change is meant to cleanup the protocol handling in the transmit path
so that it correctly offloads software VLAN tagged frames.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by:  Aaron Brown  <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c |   23 +++++++++++------------
 1 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 2c61ec4..3ebeb3e 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -3987,8 +3987,8 @@ void igb_tx_ctxtdesc(struct igb_ring *tx_ring, u32 vlan_macip_lens,
 	context_desc->mss_l4len_idx	= cpu_to_le32(mss_l4len_idx);
 }
 
-static inline int igb_tso(struct igb_ring *tx_ring,
-			  struct sk_buff *skb, u32 tx_flags, u8 *hdr_len)
+static inline int igb_tso(struct igb_ring *tx_ring, struct sk_buff *skb,
+			  u32 tx_flags, __be16 protocol, u8 *hdr_len)
 {
 	int err;
 	u32 vlan_macip_lens, type_tucmd;
@@ -4006,7 +4006,7 @@ static inline int igb_tso(struct igb_ring *tx_ring,
 	/* ADV DTYP TUCMD MKRLOC/ISCSIHEDLEN */
 	type_tucmd = E1000_ADVTXD_TUCMD_L4T_TCP;
 
-	if (skb->protocol == htons(ETH_P_IP)) {
+	if (protocol == __constant_htons(ETH_P_IP)) {
 		struct iphdr *iph = ip_hdr(skb);
 		iph->tot_len = 0;
 		iph->check = 0;
@@ -4039,8 +4039,8 @@ static inline int igb_tso(struct igb_ring *tx_ring,
 	return 1;
 }
 
-static inline bool igb_tx_csum(struct igb_ring *tx_ring,
-			       struct sk_buff *skb, u32 tx_flags)
+static inline bool igb_tx_csum(struct igb_ring *tx_ring, struct sk_buff *skb,
+			       u32 tx_flags, __be16 protocol)
 {
 	u32 vlan_macip_lens = 0;
 	u32 mss_l4len_idx = 0;
@@ -4051,7 +4051,7 @@ static inline bool igb_tx_csum(struct igb_ring *tx_ring,
 			return false;
 	} else {
 		u8 l4_hdr = 0;
-		switch (skb->protocol) {
+		switch (protocol) {
 		case __constant_htons(ETH_P_IP):
 			vlan_macip_lens |= skb_network_header_len(skb);
 			type_tucmd |= E1000_ADVTXD_TUCMD_IPV4;
@@ -4065,7 +4065,7 @@ static inline bool igb_tx_csum(struct igb_ring *tx_ring,
 			if (unlikely(net_ratelimit())) {
 				dev_warn(tx_ring->dev,
 				 "partial checksum but proto=%x!\n",
-				 skb->protocol);
+				 protocol);
 			}
 			break;
 		}
@@ -4305,6 +4305,7 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
 	struct igb_tx_buffer *first;
 	int tso, count;
 	u32 tx_flags = 0;
+	__be16 protocol = vlan_get_protocol(skb);
 	u8 hdr_len = 0;
 
 	/* need: 1 descriptor per page,
@@ -4330,16 +4331,14 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
 	/* record the location of the first descriptor for this packet */
 	first = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
 
-	tso = igb_tso(tx_ring, skb, tx_flags, &hdr_len);
-
+	tso = igb_tso(tx_ring, skb, tx_flags, protocol, &hdr_len);
 	if (tso < 0) {
 		goto out_drop;
 	} else if (tso) {
 		tx_flags |= IGB_TX_FLAGS_TSO | IGB_TX_FLAGS_CSUM;
-		if (skb->protocol == htons(ETH_P_IP))
+		if (protocol == htons(ETH_P_IP))
 			tx_flags |= IGB_TX_FLAGS_IPV4;
-
-	} else if (igb_tx_csum(tx_ring, skb, tx_flags) &&
+	} else if (igb_tx_csum(tx_ring, skb, tx_flags, protocol) &&
 		   (skb->ip_summed == CHECKSUM_PARTIAL)) {
 		tx_flags |= IGB_TX_FLAGS_CSUM;
 	}
-- 
1.7.6.4

^ permalink raw reply related

* [net-next 08/13] igb: Consolidate creation of Tx context descriptors into a single function
From: Jeff Kirsher @ 2011-10-07  7:18 UTC (permalink / raw)
  To: davem; +Cc: Alexander Duyck, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1317971926-23145-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch is meant to simplify the transmit path by reducing the overhead
for creating a transmit context descriptor.  The current implementation is
split with igb_tso and igb_tx_csum doing two separate implementations on
how to setup the tx_buffer_info structure and the tx_desc.  By combining
them it is possible to reduce code and simplify things since now only one
function will create context descriptors.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by:  Aaron Brown  <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c |  233 +++++++++++++----------------
 1 files changed, 106 insertions(+), 127 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 2bdc783..a0bb81d 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -45,6 +45,9 @@
 #include <linux/pci-aspm.h>
 #include <linux/delay.h>
 #include <linux/interrupt.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/sctp.h>
 #include <linux/if_ether.h>
 #include <linux/aer.h>
 #include <linux/prefetch.h>
@@ -3960,16 +3963,39 @@ set_itr_now:
 #define IGB_TX_FLAGS_VLAN_MASK		0xffff0000
 #define IGB_TX_FLAGS_VLAN_SHIFT		        16
 
+void igb_tx_ctxtdesc(struct igb_ring *tx_ring, u32 vlan_macip_lens,
+		     u32 type_tucmd, u32 mss_l4len_idx)
+{
+	struct e1000_adv_tx_context_desc *context_desc;
+	u16 i = tx_ring->next_to_use;
+
+	context_desc = IGB_TX_CTXTDESC(tx_ring, i);
+
+	i++;
+	tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;
+
+	/* set bits to identify this as an advanced context descriptor */
+	type_tucmd |= E1000_TXD_CMD_DEXT | E1000_ADVTXD_DTYP_CTXT;
+
+	/* For 82575, context index must be unique per ring. */
+	if (tx_ring->flags & IGB_RING_FLAG_TX_CTX_IDX)
+		mss_l4len_idx |= tx_ring->reg_idx << 4;
+
+	context_desc->vlan_macip_lens	= cpu_to_le32(vlan_macip_lens);
+	context_desc->seqnum_seed	= 0;
+	context_desc->type_tucmd_mlhl	= cpu_to_le32(type_tucmd);
+	context_desc->mss_l4len_idx	= cpu_to_le32(mss_l4len_idx);
+}
+
 static inline int igb_tso(struct igb_ring *tx_ring,
 			  struct sk_buff *skb, u32 tx_flags, u8 *hdr_len)
 {
-	struct e1000_adv_tx_context_desc *context_desc;
-	unsigned int i;
 	int err;
-	struct igb_tx_buffer *buffer_info;
-	u32 info = 0, tu_cmd = 0;
-	u32 mss_l4len_idx;
-	u8 l4len;
+	u32 vlan_macip_lens, type_tucmd;
+	u32 mss_l4len_idx, l4len;
+
+	if (!skb_is_gso(skb))
+		return 0;
 
 	if (skb_header_cloned(skb)) {
 		err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
@@ -3977,8 +4003,8 @@ static inline int igb_tso(struct igb_ring *tx_ring,
 			return err;
 	}
 
-	l4len = tcp_hdrlen(skb);
-	*hdr_len += l4len;
+	/* ADV DTYP TUCMD MKRLOC/ISCSIHEDLEN */
+	type_tucmd = E1000_ADVTXD_TUCMD_L4T_TCP;
 
 	if (skb->protocol == htons(ETH_P_IP)) {
 		struct iphdr *iph = ip_hdr(skb);
@@ -3988,6 +4014,7 @@ static inline int igb_tso(struct igb_ring *tx_ring,
 							 iph->daddr, 0,
 							 IPPROTO_TCP,
 							 0);
+		type_tucmd |= E1000_ADVTXD_TUCMD_IPV4;
 	} else if (skb_is_gso_v6(skb)) {
 		ipv6_hdr(skb)->payload_len = 0;
 		tcp_hdr(skb)->check = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
@@ -3995,131 +4022,85 @@ static inline int igb_tso(struct igb_ring *tx_ring,
 						       0, IPPROTO_TCP, 0);
 	}
 
-	i = tx_ring->next_to_use;
-
-	buffer_info = &tx_ring->tx_buffer_info[i];
-	context_desc = IGB_TX_CTXTDESC(tx_ring, i);
-	/* VLAN MACLEN IPLEN */
-	if (tx_flags & IGB_TX_FLAGS_VLAN)
-		info |= (tx_flags & IGB_TX_FLAGS_VLAN_MASK);
-	info |= (skb_network_offset(skb) << E1000_ADVTXD_MACLEN_SHIFT);
-	*hdr_len += skb_network_offset(skb);
-	info |= skb_network_header_len(skb);
-	*hdr_len += skb_network_header_len(skb);
-	context_desc->vlan_macip_lens = cpu_to_le32(info);
-
-	/* ADV DTYP TUCMD MKRLOC/ISCSIHEDLEN */
-	tu_cmd |= (E1000_TXD_CMD_DEXT | E1000_ADVTXD_DTYP_CTXT);
-
-	if (skb->protocol == htons(ETH_P_IP))
-		tu_cmd |= E1000_ADVTXD_TUCMD_IPV4;
-	tu_cmd |= E1000_ADVTXD_TUCMD_L4T_TCP;
-
-	context_desc->type_tucmd_mlhl = cpu_to_le32(tu_cmd);
+	l4len = tcp_hdrlen(skb);
+	*hdr_len = skb_transport_offset(skb) + l4len;
 
 	/* MSS L4LEN IDX */
-	mss_l4len_idx = (skb_shinfo(skb)->gso_size << E1000_ADVTXD_MSS_SHIFT);
-	mss_l4len_idx |= (l4len << E1000_ADVTXD_L4LEN_SHIFT);
+	mss_l4len_idx = l4len << E1000_ADVTXD_L4LEN_SHIFT;
+	mss_l4len_idx |= skb_shinfo(skb)->gso_size << E1000_ADVTXD_MSS_SHIFT;
 
-	/* For 82575, context index must be unique per ring. */
-	if (tx_ring->flags & IGB_RING_FLAG_TX_CTX_IDX)
-		mss_l4len_idx |= tx_ring->reg_idx << 4;
-
-	context_desc->mss_l4len_idx = cpu_to_le32(mss_l4len_idx);
-	context_desc->seqnum_seed = 0;
-
-	buffer_info->time_stamp = jiffies;
-	buffer_info->next_to_watch = i;
-	buffer_info->dma = 0;
-	i++;
-	if (i == tx_ring->count)
-		i = 0;
+	/* VLAN MACLEN IPLEN */
+	vlan_macip_lens = skb_network_header_len(skb);
+	vlan_macip_lens |= skb_network_offset(skb) << E1000_ADVTXD_MACLEN_SHIFT;
+	vlan_macip_lens |= tx_flags & IGB_TX_FLAGS_VLAN_MASK;
 
-	tx_ring->next_to_use = i;
+	igb_tx_ctxtdesc(tx_ring, vlan_macip_lens, type_tucmd, mss_l4len_idx);
 
-	return true;
+	return 1;
 }
 
 static inline bool igb_tx_csum(struct igb_ring *tx_ring,
 			       struct sk_buff *skb, u32 tx_flags)
 {
-	struct e1000_adv_tx_context_desc *context_desc;
-	struct device *dev = tx_ring->dev;
-	struct igb_tx_buffer *buffer_info;
-	u32 info = 0, tu_cmd = 0;
-	unsigned int i;
-
-	if ((skb->ip_summed == CHECKSUM_PARTIAL) ||
-	    (tx_flags & IGB_TX_FLAGS_VLAN)) {
-		i = tx_ring->next_to_use;
-		buffer_info = &tx_ring->tx_buffer_info[i];
-		context_desc = IGB_TX_CTXTDESC(tx_ring, i);
-
-		if (tx_flags & IGB_TX_FLAGS_VLAN)
-			info |= (tx_flags & IGB_TX_FLAGS_VLAN_MASK);
-
-		info |= (skb_network_offset(skb) << E1000_ADVTXD_MACLEN_SHIFT);
-		if (skb->ip_summed == CHECKSUM_PARTIAL)
-			info |= skb_network_header_len(skb);
-
-		context_desc->vlan_macip_lens = cpu_to_le32(info);
-
-		tu_cmd |= (E1000_TXD_CMD_DEXT | E1000_ADVTXD_DTYP_CTXT);
-
-		if (skb->ip_summed == CHECKSUM_PARTIAL) {
-			__be16 protocol;
+	u32 vlan_macip_lens = 0;
+	u32 mss_l4len_idx = 0;
+	u32 type_tucmd = 0;
 
-			if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
-				const struct vlan_ethhdr *vhdr =
-				          (const struct vlan_ethhdr*)skb->data;
-
-				protocol = vhdr->h_vlan_encapsulated_proto;
-			} else {
-				protocol = skb->protocol;
+	if (skb->ip_summed != CHECKSUM_PARTIAL) {
+		if (!(tx_flags & IGB_TX_FLAGS_VLAN))
+			return false;
+	} else {
+		u8 l4_hdr = 0;
+		switch (skb->protocol) {
+		case __constant_htons(ETH_P_IP):
+			vlan_macip_lens |= skb_network_header_len(skb);
+			type_tucmd |= E1000_ADVTXD_TUCMD_IPV4;
+			l4_hdr = ip_hdr(skb)->protocol;
+			break;
+		case __constant_htons(ETH_P_IPV6):
+			vlan_macip_lens |= skb_network_header_len(skb);
+			l4_hdr = ipv6_hdr(skb)->nexthdr;
+			break;
+		default:
+			if (unlikely(net_ratelimit())) {
+				dev_warn(tx_ring->dev,
+				 "partial checksum but proto=%x!\n",
+				 skb->protocol);
 			}
+			break;
+		}
 
-			switch (protocol) {
-			case cpu_to_be16(ETH_P_IP):
-				tu_cmd |= E1000_ADVTXD_TUCMD_IPV4;
-				if (ip_hdr(skb)->protocol == IPPROTO_TCP)
-					tu_cmd |= E1000_ADVTXD_TUCMD_L4T_TCP;
-				else if (ip_hdr(skb)->protocol == IPPROTO_SCTP)
-					tu_cmd |= E1000_ADVTXD_TUCMD_L4T_SCTP;
-				break;
-			case cpu_to_be16(ETH_P_IPV6):
-				/* XXX what about other V6 headers?? */
-				if (ipv6_hdr(skb)->nexthdr == IPPROTO_TCP)
-					tu_cmd |= E1000_ADVTXD_TUCMD_L4T_TCP;
-				else if (ipv6_hdr(skb)->nexthdr == IPPROTO_SCTP)
-					tu_cmd |= E1000_ADVTXD_TUCMD_L4T_SCTP;
-				break;
-			default:
-				if (unlikely(net_ratelimit()))
-					dev_warn(dev,
-					    "partial checksum but proto=%x!\n",
-					    skb->protocol);
-				break;
+		switch (l4_hdr) {
+		case IPPROTO_TCP:
+			type_tucmd |= E1000_ADVTXD_TUCMD_L4T_TCP;
+			mss_l4len_idx = tcp_hdrlen(skb) <<
+					E1000_ADVTXD_L4LEN_SHIFT;
+			break;
+		case IPPROTO_SCTP:
+			type_tucmd |= E1000_ADVTXD_TUCMD_L4T_SCTP;
+			mss_l4len_idx = sizeof(struct sctphdr) <<
+					E1000_ADVTXD_L4LEN_SHIFT;
+			break;
+		case IPPROTO_UDP:
+			mss_l4len_idx = sizeof(struct udphdr) <<
+					E1000_ADVTXD_L4LEN_SHIFT;
+			break;
+		default:
+			if (unlikely(net_ratelimit())) {
+				dev_warn(tx_ring->dev,
+				 "partial checksum but l4 proto=%x!\n",
+				 l4_hdr);
 			}
+			break;
 		}
+	}
 
-		context_desc->type_tucmd_mlhl = cpu_to_le32(tu_cmd);
-		context_desc->seqnum_seed = 0;
-		if (tx_ring->flags & IGB_RING_FLAG_TX_CTX_IDX)
-			context_desc->mss_l4len_idx =
-				cpu_to_le32(tx_ring->reg_idx << 4);
+	vlan_macip_lens |= skb_network_offset(skb) << E1000_ADVTXD_MACLEN_SHIFT;
+	vlan_macip_lens |= tx_flags & IGB_TX_FLAGS_VLAN_MASK;
 
-		buffer_info->time_stamp = jiffies;
-		buffer_info->next_to_watch = i;
-		buffer_info->dma = 0;
+	igb_tx_ctxtdesc(tx_ring, vlan_macip_lens, type_tucmd, mss_l4len_idx);
 
-		i++;
-		if (i == tx_ring->count)
-			i = 0;
-		tx_ring->next_to_use = i;
-
-		return true;
-	}
-	return false;
+	return (skb->ip_summed == CHECKSUM_PARTIAL);
 }
 
 #define IGB_MAX_TXD_PWR	16
@@ -4140,8 +4121,6 @@ static inline int igb_tx_map(struct igb_ring *tx_ring, struct sk_buff *skb,
 	buffer_info = &tx_ring->tx_buffer_info[i];
 	BUG_ON(hlen >= IGB_MAX_DATA_PER_TXD);
 	buffer_info->length = hlen;
-	/* set time_stamp *before* dma to help avoid a possible race */
-	buffer_info->time_stamp = jiffies;
 	buffer_info->next_to_watch = i;
 	buffer_info->dma = dma_map_single(dev, skb->data, hlen,
 					  DMA_TO_DEVICE);
@@ -4160,7 +4139,6 @@ static inline int igb_tx_map(struct igb_ring *tx_ring, struct sk_buff *skb,
 		buffer_info = &tx_ring->tx_buffer_info[i];
 		BUG_ON(len >= IGB_MAX_DATA_PER_TXD);
 		buffer_info->length = len;
-		buffer_info->time_stamp = jiffies;
 		buffer_info->next_to_watch = i;
 		buffer_info->mapped_as_page = true;
 		buffer_info->dma = skb_frag_dma_map(dev, frag, 0, len,
@@ -4176,6 +4154,7 @@ static inline int igb_tx_map(struct igb_ring *tx_ring, struct sk_buff *skb,
 	buffer_info->bytecount = ((gso_segs - 1) * hlen) + skb->len;
 	buffer_info->gso_segs = gso_segs;
 	tx_ring->tx_buffer_info[first].next_to_watch = i;
+	tx_ring->tx_buffer_info[first].time_stamp = jiffies;
 
 	return ++count;
 
@@ -4304,7 +4283,7 @@ static inline int igb_maybe_stop_tx(struct igb_ring *tx_ring, int size)
 netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
 				struct igb_ring *tx_ring)
 {
-	int tso = 0, count;
+	int tso, count;
 	u32 tx_flags = 0;
 	u16 first;
 	u8 hdr_len = 0;
@@ -4333,16 +4312,12 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
 		tx_flags |= IGB_TX_FLAGS_IPV4;
 
 	first = tx_ring->next_to_use;
-	if (skb_is_gso(skb)) {
-		tso = igb_tso(tx_ring, skb, tx_flags, &hdr_len);
 
-		if (tso < 0) {
-			dev_kfree_skb_any(skb);
-			return NETDEV_TX_OK;
-		}
-	}
+	tso = igb_tso(tx_ring, skb, tx_flags, &hdr_len);
 
-	if (tso)
+	if (tso < 0)
+		goto out_drop;
+	else if (tso)
 		tx_flags |= IGB_TX_FLAGS_TSO;
 	else if (igb_tx_csum(tx_ring, skb, tx_flags) &&
 	         (skb->ip_summed == CHECKSUM_PARTIAL))
@@ -4366,6 +4341,10 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
 	igb_maybe_stop_tx(tx_ring, MAX_SKB_FRAGS + 4);
 
 	return NETDEV_TX_OK;
+
+out_drop:
+	dev_kfree_skb_any(skb);
+	return NETDEV_TX_OK;
 }
 
 static inline struct igb_ring *igb_tx_queue_mapping(struct igb_adapter *adapter,
-- 
1.7.6.4

^ permalink raw reply related

* [net-next 09/13] igb: Make first and tx_buffer_info->next_to_watch into pointers
From: Jeff Kirsher @ 2011-10-07  7:18 UTC (permalink / raw)
  To: davem; +Cc: Alexander Duyck, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1317971926-23145-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This change converts two tx_buffer_info index values into pointers.  The
advantage to this is that we reduce unnecessary computations and in the case
of next_to_watch we get an added bonus of the value being able to provide
additional information as a NULL value indicates it is unset versus a 0 not
having any meaning for the index value.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by:  Aaron Brown  <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb.h      |    2 +-
 drivers/net/ethernet/intel/igb/igb_main.c |   66 ++++++++++++++++-------------
 2 files changed, 37 insertions(+), 31 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index 56c68fc..7185667 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -133,7 +133,7 @@ struct vf_data_storage {
 /* wrapper around a pointer to a socket buffer,
  * so a DMA handle can be stored along with the buffer */
 struct igb_tx_buffer {
-	u16 next_to_watch;
+	union e1000_adv_tx_desc *next_to_watch;
 	unsigned long time_stamp;
 	dma_addr_t dma;
 	u32 length;
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index a0bb81d..edc2cae 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -381,7 +381,7 @@ static void igb_dump(struct igb_adapter *adapter)
 		struct igb_tx_buffer *buffer_info;
 		tx_ring = adapter->tx_ring[n];
 		buffer_info = &tx_ring->tx_buffer_info[tx_ring->next_to_clean];
-		printk(KERN_INFO " %5d %5X %5X %016llX %04X %3X %016llX\n",
+		printk(KERN_INFO " %5d %5X %5X %016llX %04X %p %016llX\n",
 			   n, tx_ring->next_to_use, tx_ring->next_to_clean,
 			   (u64)buffer_info->dma,
 			   buffer_info->length,
@@ -421,7 +421,7 @@ static void igb_dump(struct igb_adapter *adapter)
 			buffer_info = &tx_ring->tx_buffer_info[i];
 			u0 = (struct my_u0 *)tx_desc;
 			printk(KERN_INFO "T [0x%03X]    %016llX %016llX %016llX"
-				" %04X  %3X %016llX %p", i,
+				" %04X  %p %016llX %p", i,
 				le64_to_cpu(u0->a),
 				le64_to_cpu(u0->b),
 				(u64)buffer_info->dma,
@@ -3161,7 +3161,7 @@ void igb_unmap_and_free_tx_resource(struct igb_ring *tx_ring,
 	}
 	buffer_info->time_stamp = 0;
 	buffer_info->length = 0;
-	buffer_info->next_to_watch = 0;
+	buffer_info->next_to_watch = NULL;
 	buffer_info->mapped_as_page = false;
 }
 
@@ -4107,7 +4107,7 @@ static inline bool igb_tx_csum(struct igb_ring *tx_ring,
 #define IGB_MAX_DATA_PER_TXD	(1<<IGB_MAX_TXD_PWR)
 
 static inline int igb_tx_map(struct igb_ring *tx_ring, struct sk_buff *skb,
-			     unsigned int first)
+			     struct igb_tx_buffer *first)
 {
 	struct igb_tx_buffer *buffer_info;
 	struct device *dev = tx_ring->dev;
@@ -4121,7 +4121,6 @@ static inline int igb_tx_map(struct igb_ring *tx_ring, struct sk_buff *skb,
 	buffer_info = &tx_ring->tx_buffer_info[i];
 	BUG_ON(hlen >= IGB_MAX_DATA_PER_TXD);
 	buffer_info->length = hlen;
-	buffer_info->next_to_watch = i;
 	buffer_info->dma = dma_map_single(dev, skb->data, hlen,
 					  DMA_TO_DEVICE);
 	if (dma_mapping_error(dev, buffer_info->dma))
@@ -4139,7 +4138,6 @@ static inline int igb_tx_map(struct igb_ring *tx_ring, struct sk_buff *skb,
 		buffer_info = &tx_ring->tx_buffer_info[i];
 		BUG_ON(len >= IGB_MAX_DATA_PER_TXD);
 		buffer_info->length = len;
-		buffer_info->next_to_watch = i;
 		buffer_info->mapped_as_page = true;
 		buffer_info->dma = skb_frag_dma_map(dev, frag, 0, len,
 						DMA_TO_DEVICE);
@@ -4153,8 +4151,12 @@ static inline int igb_tx_map(struct igb_ring *tx_ring, struct sk_buff *skb,
 	/* multiply data chunks by size of headers */
 	buffer_info->bytecount = ((gso_segs - 1) * hlen) + skb->len;
 	buffer_info->gso_segs = gso_segs;
-	tx_ring->tx_buffer_info[first].next_to_watch = i;
-	tx_ring->tx_buffer_info[first].time_stamp = jiffies;
+
+	/* set the timestamp */
+	first->time_stamp = jiffies;
+
+	/* set next_to_watch value indicating a packet is present */
+	first->next_to_watch = IGB_TX_DESC(tx_ring, i);
 
 	return ++count;
 
@@ -4165,7 +4167,6 @@ dma_error:
 	buffer_info->dma = 0;
 	buffer_info->time_stamp = 0;
 	buffer_info->length = 0;
-	buffer_info->next_to_watch = 0;
 	buffer_info->mapped_as_page = false;
 
 	/* clear timestamp and dma mappings for remaining portion of packet */
@@ -4283,9 +4284,9 @@ static inline int igb_maybe_stop_tx(struct igb_ring *tx_ring, int size)
 netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
 				struct igb_ring *tx_ring)
 {
+	struct igb_tx_buffer *first;
 	int tso, count;
 	u32 tx_flags = 0;
-	u16 first;
 	u8 hdr_len = 0;
 
 	/* need: 1 descriptor per page,
@@ -4311,7 +4312,8 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
 	if (skb->protocol == htons(ETH_P_IP))
 		tx_flags |= IGB_TX_FLAGS_IPV4;
 
-	first = tx_ring->next_to_use;
+	/* record the location of the first descriptor for this packet */
+	first = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
 
 	tso = igb_tso(tx_ring, skb, tx_flags, &hdr_len);
 
@@ -4330,8 +4332,8 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
 	count = igb_tx_map(tx_ring, skb, first);
 	if (!count) {
 		dev_kfree_skb_any(skb);
-		tx_ring->tx_buffer_info[first].time_stamp = 0;
-		tx_ring->next_to_use = first;
+		first->time_stamp = 0;
+		tx_ring->next_to_use = first - tx_ring->tx_buffer_info;
 		return NETDEV_TX_OK;
 	}
 
@@ -5568,29 +5570,34 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 	struct igb_adapter *adapter = q_vector->adapter;
 	struct igb_ring *tx_ring = q_vector->tx_ring;
 	struct igb_tx_buffer *tx_buffer;
-	union e1000_adv_tx_desc *tx_desc;
+	union e1000_adv_tx_desc *tx_desc, *eop_desc;
 	unsigned int total_bytes = 0, total_packets = 0;
 	unsigned int budget = q_vector->tx_work_limit;
-	u16 i = tx_ring->next_to_clean;
+	unsigned int i = tx_ring->next_to_clean;
 
 	if (test_bit(__IGB_DOWN, &adapter->state))
 		return true;
 
 	tx_buffer = &tx_ring->tx_buffer_info[i];
 	tx_desc = IGB_TX_DESC(tx_ring, i);
+	i -= tx_ring->count;
 
 	for (; budget; budget--) {
-		u16 eop = tx_buffer->next_to_watch;
-		union e1000_adv_tx_desc *eop_desc;
+		eop_desc = tx_buffer->next_to_watch;
 
-		eop_desc = IGB_TX_DESC(tx_ring, eop);
+		/* prevent any other reads prior to eop_desc */
+		rmb();
+
+		/* if next_to_watch is not set then there is no work pending */
+		if (!eop_desc)
+			break;
 
 		/* if DD is not set pending work has not been completed */
 		if (!(eop_desc->wb.status & cpu_to_le32(E1000_TXD_STAT_DD)))
 			break;
 
-		/* prevent any other reads prior to eop_desc being verified */
-		rmb();
+		/* clear next_to_watch to prevent false hangs */
+		tx_buffer->next_to_watch = NULL;
 
 		do {
 			tx_desc->wb.status = 0;
@@ -5607,14 +5614,15 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 			tx_buffer++;
 			tx_desc++;
 			i++;
-			if (unlikely(i == tx_ring->count)) {
-				i = 0;
+			if (unlikely(!i)) {
+				i -= tx_ring->count;
 				tx_buffer = tx_ring->tx_buffer_info;
 				tx_desc = IGB_TX_DESC(tx_ring, 0);
 			}
 		} while (eop_desc);
 	}
 
+	i += tx_ring->count;
 	tx_ring->next_to_clean = i;
 	u64_stats_update_begin(&tx_ring->tx_syncp);
 	tx_ring->tx_stats.bytes += total_bytes;
@@ -5625,16 +5633,14 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 
 	if (tx_ring->detect_tx_hung) {
 		struct e1000_hw *hw = &adapter->hw;
-		u16 eop = tx_ring->tx_buffer_info[i].next_to_watch;
-		union e1000_adv_tx_desc *eop_desc;
 
-		eop_desc = IGB_TX_DESC(tx_ring, eop);
+		eop_desc = tx_buffer->next_to_watch;
 
 		/* Detect a transmit hang in hardware, this serializes the
 		 * check with the clearing of time_stamp and movement of i */
 		tx_ring->detect_tx_hung = false;
-		if (tx_ring->tx_buffer_info[i].time_stamp &&
-		    time_after(jiffies, tx_ring->tx_buffer_info[i].time_stamp +
+		if (eop_desc &&
+		    time_after(jiffies, tx_buffer->time_stamp +
 			       (adapter->tx_timeout_factor * HZ)) &&
 		    !(rd32(E1000_STATUS) & E1000_STATUS_TXOFF)) {
 
@@ -5648,7 +5654,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 				"  next_to_clean        <%x>\n"
 				"buffer_info[next_to_clean]\n"
 				"  time_stamp           <%lx>\n"
-				"  next_to_watch        <%x>\n"
+				"  next_to_watch        <%p>\n"
 				"  jiffies              <%lx>\n"
 				"  desc.status          <%x>\n",
 				tx_ring->queue_index,
@@ -5656,8 +5662,8 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 				readl(tx_ring->tail),
 				tx_ring->next_to_use,
 				tx_ring->next_to_clean,
-				tx_ring->tx_buffer_info[eop].time_stamp,
-				eop,
+				tx_buffer->time_stamp,
+				eop_desc,
 				jiffies,
 				eop_desc->wb.status);
 			netif_stop_subqueue(tx_ring->netdev,
-- 
1.7.6.4

^ permalink raw reply related

* [net-next 10/13] igb: Create separate functions for generating cmd_type and olinfo
From: Jeff Kirsher @ 2011-10-07  7:18 UTC (permalink / raw)
  To: davem; +Cc: Alexander Duyck, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1317971926-23145-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This change is meant to improve the readability of the driver by separating
out the cmd_type configuration and the olinfo configuration into their own
functions.  By doing this it is much easier to determine which ingredients
go into setting up these to portions of the descriptor.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by:  Aaron Brown  <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/e1000_82575.h |    2 +
 drivers/net/ethernet/intel/igb/igb.h         |    2 +-
 drivers/net/ethernet/intel/igb/igb_main.c    |  105 +++++++++++++++-----------
 3 files changed, 65 insertions(+), 44 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/e1000_82575.h b/drivers/net/ethernet/intel/igb/e1000_82575.h
index 786e110..08a757e 100644
--- a/drivers/net/ethernet/intel/igb/e1000_82575.h
+++ b/drivers/net/ethernet/intel/igb/e1000_82575.h
@@ -130,7 +130,9 @@ union e1000_adv_tx_desc {
 #define E1000_ADVTXD_MAC_TSTAMP   0x00080000 /* IEEE1588 Timestamp packet */
 #define E1000_ADVTXD_DTYP_CTXT    0x00200000 /* Advanced Context Descriptor */
 #define E1000_ADVTXD_DTYP_DATA    0x00300000 /* Advanced Data Descriptor */
+#define E1000_ADVTXD_DCMD_EOP     0x01000000 /* End of Packet */
 #define E1000_ADVTXD_DCMD_IFCS    0x02000000 /* Insert FCS (Ethernet CRC) */
+#define E1000_ADVTXD_DCMD_RS      0x08000000 /* Report Status */
 #define E1000_ADVTXD_DCMD_DEXT    0x20000000 /* Descriptor extension (1=Adv) */
 #define E1000_ADVTXD_DCMD_VLE     0x40000000 /* VLAN pkt enable */
 #define E1000_ADVTXD_DCMD_TSE     0x80000000 /* TCP Seg enable */
diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index 7185667..1608110 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -231,7 +231,7 @@ struct igb_ring {
 
 #define IGB_RING_FLAG_TX_CTX_IDX     0x00000001 /* HW requires context index */
 
-#define IGB_ADVTXD_DCMD (E1000_TXD_CMD_EOP | E1000_TXD_CMD_RS)
+#define IGB_TXD_DCMD (E1000_ADVTXD_DCMD_EOP | E1000_ADVTXD_DCMD_RS)
 
 #define IGB_RX_DESC(R, i)	    \
 	(&(((union e1000_adv_rx_desc *)((R)->desc))[i]))
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index edc2cae..2c61ec4 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -4103,6 +4103,50 @@ static inline bool igb_tx_csum(struct igb_ring *tx_ring,
 	return (skb->ip_summed == CHECKSUM_PARTIAL);
 }
 
+static __le32 igb_tx_cmd_type(u32 tx_flags)
+{
+	/* set type for advanced descriptor with frame checksum insertion */
+	__le32 cmd_type = cpu_to_le32(E1000_ADVTXD_DTYP_DATA |
+				      E1000_ADVTXD_DCMD_IFCS |
+				      E1000_ADVTXD_DCMD_DEXT);
+
+	/* set HW vlan bit if vlan is present */
+	if (tx_flags & IGB_TX_FLAGS_VLAN)
+		cmd_type |= cpu_to_le32(E1000_ADVTXD_DCMD_VLE);
+
+	/* set timestamp bit if present */
+	if (tx_flags & IGB_TX_FLAGS_TSTAMP)
+		cmd_type |= cpu_to_le32(E1000_ADVTXD_MAC_TSTAMP);
+
+	/* set segmentation bits for TSO */
+	if (tx_flags & IGB_TX_FLAGS_TSO)
+		cmd_type |= cpu_to_le32(E1000_ADVTXD_DCMD_TSE);
+
+	return cmd_type;
+}
+
+static __le32 igb_tx_olinfo_status(u32 tx_flags, unsigned int paylen,
+				   struct igb_ring *tx_ring)
+{
+	u32 olinfo_status = paylen << E1000_ADVTXD_PAYLEN_SHIFT;
+
+	/* 82575 requires a unique index per ring if any offload is enabled */
+	if ((tx_flags & (IGB_TX_FLAGS_CSUM | IGB_TX_FLAGS_VLAN)) &&
+	    (tx_ring->flags & IGB_RING_FLAG_TX_CTX_IDX))
+		olinfo_status |= tx_ring->reg_idx << 4;
+
+	/* insert L4 checksum */
+	if (tx_flags & IGB_TX_FLAGS_CSUM) {
+		olinfo_status |= E1000_TXD_POPTS_TXSM << 8;
+
+		/* insert IPv4 checksum */
+		if (tx_flags & IGB_TX_FLAGS_IPV4)
+			olinfo_status |= E1000_TXD_POPTS_IXSM << 8;
+	}
+
+	return cpu_to_le32(olinfo_status);
+}
+
 #define IGB_MAX_TXD_PWR	16
 #define IGB_MAX_DATA_PER_TXD	(1<<IGB_MAX_TXD_PWR)
 
@@ -4187,54 +4231,28 @@ static inline void igb_tx_queue(struct igb_ring *tx_ring,
 {
 	union e1000_adv_tx_desc *tx_desc;
 	struct igb_tx_buffer *buffer_info;
-	u32 olinfo_status = 0, cmd_type_len;
+	__le32 olinfo_status, cmd_type;
 	unsigned int i = tx_ring->next_to_use;
 
-	cmd_type_len = (E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_IFCS |
-			E1000_ADVTXD_DCMD_DEXT);
-
-	if (tx_flags & IGB_TX_FLAGS_VLAN)
-		cmd_type_len |= E1000_ADVTXD_DCMD_VLE;
-
-	if (tx_flags & IGB_TX_FLAGS_TSTAMP)
-		cmd_type_len |= E1000_ADVTXD_MAC_TSTAMP;
-
-	if (tx_flags & IGB_TX_FLAGS_TSO) {
-		cmd_type_len |= E1000_ADVTXD_DCMD_TSE;
-
-		/* insert tcp checksum */
-		olinfo_status |= E1000_TXD_POPTS_TXSM << 8;
-
-		/* insert ip checksum */
-		if (tx_flags & IGB_TX_FLAGS_IPV4)
-			olinfo_status |= E1000_TXD_POPTS_IXSM << 8;
-
-	} else if (tx_flags & IGB_TX_FLAGS_CSUM) {
-		olinfo_status |= E1000_TXD_POPTS_TXSM << 8;
-	}
-
-	if ((tx_ring->flags & IGB_RING_FLAG_TX_CTX_IDX) &&
-	    (tx_flags & (IGB_TX_FLAGS_CSUM |
-	                 IGB_TX_FLAGS_TSO |
-			 IGB_TX_FLAGS_VLAN)))
-		olinfo_status |= tx_ring->reg_idx << 4;
-
-	olinfo_status |= ((paylen - hdr_len) << E1000_ADVTXD_PAYLEN_SHIFT);
+	cmd_type = igb_tx_cmd_type(tx_flags);
+	olinfo_status = igb_tx_olinfo_status(tx_flags,
+					     paylen - hdr_len,
+					     tx_ring);
 
 	do {
 		buffer_info = &tx_ring->tx_buffer_info[i];
 		tx_desc = IGB_TX_DESC(tx_ring, i);
 		tx_desc->read.buffer_addr = cpu_to_le64(buffer_info->dma);
-		tx_desc->read.cmd_type_len =
-			cpu_to_le32(cmd_type_len | buffer_info->length);
-		tx_desc->read.olinfo_status = cpu_to_le32(olinfo_status);
+		tx_desc->read.cmd_type_len = cmd_type |
+					     cpu_to_le32(buffer_info->length);
+		tx_desc->read.olinfo_status = olinfo_status;
 		count--;
 		i++;
 		if (i == tx_ring->count)
 			i = 0;
 	} while (count > 0);
 
-	tx_desc->read.cmd_type_len |= cpu_to_le32(IGB_ADVTXD_DCMD);
+	tx_desc->read.cmd_type_len |= cpu_to_le32(IGB_TXD_DCMD);
 	/* Force memory writes to complete before letting h/w
 	 * know there are new descriptors to fetch.  (Only
 	 * applicable for weak-ordered memory model archs,
@@ -4309,21 +4327,22 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
 		tx_flags |= (vlan_tx_tag_get(skb) << IGB_TX_FLAGS_VLAN_SHIFT);
 	}
 
-	if (skb->protocol == htons(ETH_P_IP))
-		tx_flags |= IGB_TX_FLAGS_IPV4;
-
 	/* record the location of the first descriptor for this packet */
 	first = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
 
 	tso = igb_tso(tx_ring, skb, tx_flags, &hdr_len);
 
-	if (tso < 0)
+	if (tso < 0) {
 		goto out_drop;
-	else if (tso)
-		tx_flags |= IGB_TX_FLAGS_TSO;
-	else if (igb_tx_csum(tx_ring, skb, tx_flags) &&
-	         (skb->ip_summed == CHECKSUM_PARTIAL))
+	} else if (tso) {
+		tx_flags |= IGB_TX_FLAGS_TSO | IGB_TX_FLAGS_CSUM;
+		if (skb->protocol == htons(ETH_P_IP))
+			tx_flags |= IGB_TX_FLAGS_IPV4;
+
+	} else if (igb_tx_csum(tx_ring, skb, tx_flags) &&
+		   (skb->ip_summed == CHECKSUM_PARTIAL)) {
 		tx_flags |= IGB_TX_FLAGS_CSUM;
+	}
 
 	/*
 	 * count reflects descriptors mapped, if 0 or less then mapping error
-- 
1.7.6.4

^ permalink raw reply related

* [net-next 07/13] igb: split buffer_info into tx_buffer_info and rx_buffer_info
From: Jeff Kirsher @ 2011-10-07  7:18 UTC (permalink / raw)
  To: davem; +Cc: Alexander Duyck, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1317971926-23145-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>

In order to be able to improve the performance of the TX path it has been
necessary to add addition info to the tx_buffer_info structure.  However a
side effect is that the structure has gotten larger and this in turn has
also increased the size of the RX buffer info structure.  In order to avoid
this in the future I am splitting the single buffer_info structure into two
separate ones and instead I will join them by making the buffer_info
pointer in the ring a union of the two.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by:  Aaron Brown  <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb.h         |   42 +++++-----
 drivers/net/ethernet/intel/igb/igb_ethtool.c |   15 ++--
 drivers/net/ethernet/intel/igb/igb_main.c    |  123 +++++++++++++-------------
 3 files changed, 92 insertions(+), 88 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index beab918..56c68fc 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -132,27 +132,24 @@ struct vf_data_storage {
 
 /* wrapper around a pointer to a socket buffer,
  * so a DMA handle can be stored along with the buffer */
-struct igb_buffer {
+struct igb_tx_buffer {
+	u16 next_to_watch;
+	unsigned long time_stamp;
+	dma_addr_t dma;
+	u32 length;
+	u32 tx_flags;
+	struct sk_buff *skb;
+	unsigned int bytecount;
+	u16 gso_segs;
+	u8 mapped_as_page;
+};
+
+struct igb_rx_buffer {
 	struct sk_buff *skb;
 	dma_addr_t dma;
-	union {
-		/* TX */
-		struct {
-			unsigned long time_stamp;
-			u16 length;
-			u16 next_to_watch;
-			unsigned int bytecount;
-			u16 gso_segs;
-			u8 tx_flags;
-			u8 mapped_as_page;
-		};
-		/* RX */
-		struct {
-			struct page *page;
-			dma_addr_t page_dma;
-			u16 page_offset;
-		};
-	};
+	struct page *page;
+	dma_addr_t page_dma;
+	u32 page_offset;
 };
 
 struct igb_tx_queue_stats {
@@ -191,7 +188,10 @@ struct igb_ring {
 	struct igb_q_vector *q_vector;	/* backlink to q_vector */
 	struct net_device *netdev;	/* back pointer to net_device */
 	struct device *dev;		/* device pointer for dma mapping */
-	struct igb_buffer *buffer_info;	/* array of buffer info structs */
+	union {				/* array of buffer info structs */
+		struct igb_tx_buffer *tx_buffer_info;
+		struct igb_rx_buffer *rx_buffer_info;
+	};
 	void *desc;			/* descriptor ring memory */
 	unsigned long flags;		/* ring specific flags */
 	void __iomem *tail;		/* pointer to ring tail register */
@@ -377,7 +377,7 @@ extern void igb_setup_tctl(struct igb_adapter *);
 extern void igb_setup_rctl(struct igb_adapter *);
 extern netdev_tx_t igb_xmit_frame_ring(struct sk_buff *, struct igb_ring *);
 extern void igb_unmap_and_free_tx_resource(struct igb_ring *,
-					   struct igb_buffer *);
+					   struct igb_tx_buffer *);
 extern void igb_alloc_rx_buffers(struct igb_ring *, u16);
 extern void igb_update_stats(struct igb_adapter *, struct rtnl_link_stats64 *);
 extern bool igb_has_link(struct igb_adapter *adapter);
diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index a445c4f..f227fc5 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -1579,7 +1579,8 @@ static int igb_clean_test_rings(struct igb_ring *rx_ring,
                                 unsigned int size)
 {
 	union e1000_adv_rx_desc *rx_desc;
-	struct igb_buffer *buffer_info;
+	struct igb_rx_buffer *rx_buffer_info;
+	struct igb_tx_buffer *tx_buffer_info;
 	int rx_ntc, tx_ntc, count = 0;
 	u32 staterr;
 
@@ -1591,22 +1592,22 @@ static int igb_clean_test_rings(struct igb_ring *rx_ring,
 
 	while (staterr & E1000_RXD_STAT_DD) {
 		/* check rx buffer */
-		buffer_info = &rx_ring->buffer_info[rx_ntc];
+		rx_buffer_info = &rx_ring->rx_buffer_info[rx_ntc];
 
 		/* unmap rx buffer, will be remapped by alloc_rx_buffers */
 		dma_unmap_single(rx_ring->dev,
-		                 buffer_info->dma,
+				 rx_buffer_info->dma,
 				 IGB_RX_HDR_LEN,
 				 DMA_FROM_DEVICE);
-		buffer_info->dma = 0;
+		rx_buffer_info->dma = 0;
 
 		/* verify contents of skb */
-		if (!igb_check_lbtest_frame(buffer_info->skb, size))
+		if (!igb_check_lbtest_frame(rx_buffer_info->skb, size))
 			count++;
 
 		/* unmap buffer on tx side */
-		buffer_info = &tx_ring->buffer_info[tx_ntc];
-		igb_unmap_and_free_tx_resource(tx_ring, buffer_info);
+		tx_buffer_info = &tx_ring->tx_buffer_info[tx_ntc];
+		igb_unmap_and_free_tx_resource(tx_ring, tx_buffer_info);
 
 		/* increment rx/tx next to clean counters */
 		rx_ntc++;
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 12faa99..2bdc783 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -339,7 +339,6 @@ static void igb_dump(struct igb_adapter *adapter)
 	struct igb_ring *tx_ring;
 	union e1000_adv_tx_desc *tx_desc;
 	struct my_u0 { u64 a; u64 b; } *u0;
-	struct igb_buffer *buffer_info;
 	struct igb_ring *rx_ring;
 	union e1000_adv_rx_desc *rx_desc;
 	u32 staterr;
@@ -376,8 +375,9 @@ static void igb_dump(struct igb_adapter *adapter)
 	printk(KERN_INFO "Queue [NTU] [NTC] [bi(ntc)->dma  ]"
 		" leng ntw timestamp\n");
 	for (n = 0; n < adapter->num_tx_queues; n++) {
+		struct igb_tx_buffer *buffer_info;
 		tx_ring = adapter->tx_ring[n];
-		buffer_info = &tx_ring->buffer_info[tx_ring->next_to_clean];
+		buffer_info = &tx_ring->tx_buffer_info[tx_ring->next_to_clean];
 		printk(KERN_INFO " %5d %5X %5X %016llX %04X %3X %016llX\n",
 			   n, tx_ring->next_to_use, tx_ring->next_to_clean,
 			   (u64)buffer_info->dma,
@@ -413,8 +413,9 @@ static void igb_dump(struct igb_adapter *adapter)
 			"leng  ntw timestamp        bi->skb\n");
 
 		for (i = 0; tx_ring->desc && (i < tx_ring->count); i++) {
+			struct igb_tx_buffer *buffer_info;
 			tx_desc = IGB_TX_DESC(tx_ring, i);
-			buffer_info = &tx_ring->buffer_info[i];
+			buffer_info = &tx_ring->tx_buffer_info[i];
 			u0 = (struct my_u0 *)tx_desc;
 			printk(KERN_INFO "T [0x%03X]    %016llX %016llX %016llX"
 				" %04X  %3X %016llX %p", i,
@@ -493,7 +494,8 @@ rx_ring_summary:
 			"<-- Adv Rx Write-Back format\n");
 
 		for (i = 0; i < rx_ring->count; i++) {
-			buffer_info = &rx_ring->buffer_info[i];
+			struct igb_rx_buffer *buffer_info;
+			buffer_info = &rx_ring->rx_buffer_info[i];
 			rx_desc = IGB_RX_DESC(rx_ring, i);
 			u0 = (struct my_u0 *)rx_desc;
 			staterr = le32_to_cpu(rx_desc->wb.upper.status_error);
@@ -2576,9 +2578,9 @@ int igb_setup_tx_resources(struct igb_ring *tx_ring)
 	struct device *dev = tx_ring->dev;
 	int size;
 
-	size = sizeof(struct igb_buffer) * tx_ring->count;
-	tx_ring->buffer_info = vzalloc(size);
-	if (!tx_ring->buffer_info)
+	size = sizeof(struct igb_tx_buffer) * tx_ring->count;
+	tx_ring->tx_buffer_info = vzalloc(size);
+	if (!tx_ring->tx_buffer_info)
 		goto err;
 
 	/* round up to nearest 4K */
@@ -2598,7 +2600,7 @@ int igb_setup_tx_resources(struct igb_ring *tx_ring)
 	return 0;
 
 err:
-	vfree(tx_ring->buffer_info);
+	vfree(tx_ring->tx_buffer_info);
 	dev_err(dev,
 		"Unable to allocate memory for the transmit descriptor ring\n");
 	return -ENOMEM;
@@ -2719,9 +2721,9 @@ int igb_setup_rx_resources(struct igb_ring *rx_ring)
 	struct device *dev = rx_ring->dev;
 	int size, desc_len;
 
-	size = sizeof(struct igb_buffer) * rx_ring->count;
-	rx_ring->buffer_info = vzalloc(size);
-	if (!rx_ring->buffer_info)
+	size = sizeof(struct igb_rx_buffer) * rx_ring->count;
+	rx_ring->rx_buffer_info = vzalloc(size);
+	if (!rx_ring->rx_buffer_info)
 		goto err;
 
 	desc_len = sizeof(union e1000_adv_rx_desc);
@@ -2744,8 +2746,8 @@ int igb_setup_rx_resources(struct igb_ring *rx_ring)
 	return 0;
 
 err:
-	vfree(rx_ring->buffer_info);
-	rx_ring->buffer_info = NULL;
+	vfree(rx_ring->rx_buffer_info);
+	rx_ring->rx_buffer_info = NULL;
 	dev_err(dev, "Unable to allocate memory for the receive descriptor"
 		" ring\n");
 	return -ENOMEM;
@@ -3107,8 +3109,8 @@ void igb_free_tx_resources(struct igb_ring *tx_ring)
 {
 	igb_clean_tx_ring(tx_ring);
 
-	vfree(tx_ring->buffer_info);
-	tx_ring->buffer_info = NULL;
+	vfree(tx_ring->tx_buffer_info);
+	tx_ring->tx_buffer_info = NULL;
 
 	/* if not set, then don't free */
 	if (!tx_ring->desc)
@@ -3135,7 +3137,7 @@ static void igb_free_all_tx_resources(struct igb_adapter *adapter)
 }
 
 void igb_unmap_and_free_tx_resource(struct igb_ring *tx_ring,
-				    struct igb_buffer *buffer_info)
+				    struct igb_tx_buffer *buffer_info)
 {
 	if (buffer_info->dma) {
 		if (buffer_info->mapped_as_page)
@@ -3166,21 +3168,21 @@ void igb_unmap_and_free_tx_resource(struct igb_ring *tx_ring,
  **/
 static void igb_clean_tx_ring(struct igb_ring *tx_ring)
 {
-	struct igb_buffer *buffer_info;
+	struct igb_tx_buffer *buffer_info;
 	unsigned long size;
 	unsigned int i;
 
-	if (!tx_ring->buffer_info)
+	if (!tx_ring->tx_buffer_info)
 		return;
 	/* Free all the Tx ring sk_buffs */
 
 	for (i = 0; i < tx_ring->count; i++) {
-		buffer_info = &tx_ring->buffer_info[i];
+		buffer_info = &tx_ring->tx_buffer_info[i];
 		igb_unmap_and_free_tx_resource(tx_ring, buffer_info);
 	}
 
-	size = sizeof(struct igb_buffer) * tx_ring->count;
-	memset(tx_ring->buffer_info, 0, size);
+	size = sizeof(struct igb_tx_buffer) * tx_ring->count;
+	memset(tx_ring->tx_buffer_info, 0, size);
 
 	/* Zero out the descriptor ring */
 	memset(tx_ring->desc, 0, tx_ring->size);
@@ -3211,8 +3213,8 @@ void igb_free_rx_resources(struct igb_ring *rx_ring)
 {
 	igb_clean_rx_ring(rx_ring);
 
-	vfree(rx_ring->buffer_info);
-	rx_ring->buffer_info = NULL;
+	vfree(rx_ring->rx_buffer_info);
+	rx_ring->rx_buffer_info = NULL;
 
 	/* if not set, then don't free */
 	if (!rx_ring->desc)
@@ -3247,12 +3249,12 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring)
 	unsigned long size;
 	u16 i;
 
-	if (!rx_ring->buffer_info)
+	if (!rx_ring->rx_buffer_info)
 		return;
 
 	/* Free all the Rx ring sk_buffs */
 	for (i = 0; i < rx_ring->count; i++) {
-		struct igb_buffer *buffer_info = &rx_ring->buffer_info[i];
+		struct igb_rx_buffer *buffer_info = &rx_ring->rx_buffer_info[i];
 		if (buffer_info->dma) {
 			dma_unmap_single(rx_ring->dev,
 			                 buffer_info->dma,
@@ -3279,8 +3281,8 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring)
 		}
 	}
 
-	size = sizeof(struct igb_buffer) * rx_ring->count;
-	memset(rx_ring->buffer_info, 0, size);
+	size = sizeof(struct igb_rx_buffer) * rx_ring->count;
+	memset(rx_ring->rx_buffer_info, 0, size);
 
 	/* Zero out the descriptor ring */
 	memset(rx_ring->desc, 0, rx_ring->size);
@@ -3964,7 +3966,7 @@ static inline int igb_tso(struct igb_ring *tx_ring,
 	struct e1000_adv_tx_context_desc *context_desc;
 	unsigned int i;
 	int err;
-	struct igb_buffer *buffer_info;
+	struct igb_tx_buffer *buffer_info;
 	u32 info = 0, tu_cmd = 0;
 	u32 mss_l4len_idx;
 	u8 l4len;
@@ -3995,7 +3997,7 @@ static inline int igb_tso(struct igb_ring *tx_ring,
 
 	i = tx_ring->next_to_use;
 
-	buffer_info = &tx_ring->buffer_info[i];
+	buffer_info = &tx_ring->tx_buffer_info[i];
 	context_desc = IGB_TX_CTXTDESC(tx_ring, i);
 	/* VLAN MACLEN IPLEN */
 	if (tx_flags & IGB_TX_FLAGS_VLAN)
@@ -4043,14 +4045,14 @@ static inline bool igb_tx_csum(struct igb_ring *tx_ring,
 {
 	struct e1000_adv_tx_context_desc *context_desc;
 	struct device *dev = tx_ring->dev;
-	struct igb_buffer *buffer_info;
+	struct igb_tx_buffer *buffer_info;
 	u32 info = 0, tu_cmd = 0;
 	unsigned int i;
 
 	if ((skb->ip_summed == CHECKSUM_PARTIAL) ||
 	    (tx_flags & IGB_TX_FLAGS_VLAN)) {
 		i = tx_ring->next_to_use;
-		buffer_info = &tx_ring->buffer_info[i];
+		buffer_info = &tx_ring->tx_buffer_info[i];
 		context_desc = IGB_TX_CTXTDESC(tx_ring, i);
 
 		if (tx_flags & IGB_TX_FLAGS_VLAN)
@@ -4126,7 +4128,7 @@ static inline bool igb_tx_csum(struct igb_ring *tx_ring,
 static inline int igb_tx_map(struct igb_ring *tx_ring, struct sk_buff *skb,
 			     unsigned int first)
 {
-	struct igb_buffer *buffer_info;
+	struct igb_tx_buffer *buffer_info;
 	struct device *dev = tx_ring->dev;
 	unsigned int hlen = skb_headlen(skb);
 	unsigned int count = 0, i;
@@ -4135,7 +4137,7 @@ static inline int igb_tx_map(struct igb_ring *tx_ring, struct sk_buff *skb,
 
 	i = tx_ring->next_to_use;
 
-	buffer_info = &tx_ring->buffer_info[i];
+	buffer_info = &tx_ring->tx_buffer_info[i];
 	BUG_ON(hlen >= IGB_MAX_DATA_PER_TXD);
 	buffer_info->length = hlen;
 	/* set time_stamp *before* dma to help avoid a possible race */
@@ -4155,7 +4157,7 @@ static inline int igb_tx_map(struct igb_ring *tx_ring, struct sk_buff *skb,
 		if (i == tx_ring->count)
 			i = 0;
 
-		buffer_info = &tx_ring->buffer_info[i];
+		buffer_info = &tx_ring->tx_buffer_info[i];
 		BUG_ON(len >= IGB_MAX_DATA_PER_TXD);
 		buffer_info->length = len;
 		buffer_info->time_stamp = jiffies;
@@ -4168,12 +4170,12 @@ static inline int igb_tx_map(struct igb_ring *tx_ring, struct sk_buff *skb,
 
 	}
 
-	tx_ring->buffer_info[i].skb = skb;
-	tx_ring->buffer_info[i].tx_flags = skb_shinfo(skb)->tx_flags;
+	buffer_info->skb = skb;
+	buffer_info->tx_flags = skb_shinfo(skb)->tx_flags;
 	/* multiply data chunks by size of headers */
-	tx_ring->buffer_info[i].bytecount = ((gso_segs - 1) * hlen) + skb->len;
-	tx_ring->buffer_info[i].gso_segs = gso_segs;
-	tx_ring->buffer_info[first].next_to_watch = i;
+	buffer_info->bytecount = ((gso_segs - 1) * hlen) + skb->len;
+	buffer_info->gso_segs = gso_segs;
+	tx_ring->tx_buffer_info[first].next_to_watch = i;
 
 	return ++count;
 
@@ -4192,7 +4194,7 @@ dma_error:
 		if (i == 0)
 			i = tx_ring->count;
 		i--;
-		buffer_info = &tx_ring->buffer_info[i];
+		buffer_info = &tx_ring->tx_buffer_info[i];
 		igb_unmap_and_free_tx_resource(tx_ring, buffer_info);
 	}
 
@@ -4204,7 +4206,7 @@ static inline void igb_tx_queue(struct igb_ring *tx_ring,
 				u8 hdr_len)
 {
 	union e1000_adv_tx_desc *tx_desc;
-	struct igb_buffer *buffer_info;
+	struct igb_tx_buffer *buffer_info;
 	u32 olinfo_status = 0, cmd_type_len;
 	unsigned int i = tx_ring->next_to_use;
 
@@ -4240,7 +4242,7 @@ static inline void igb_tx_queue(struct igb_ring *tx_ring,
 	olinfo_status |= ((paylen - hdr_len) << E1000_ADVTXD_PAYLEN_SHIFT);
 
 	do {
-		buffer_info = &tx_ring->buffer_info[i];
+		buffer_info = &tx_ring->tx_buffer_info[i];
 		tx_desc = IGB_TX_DESC(tx_ring, i);
 		tx_desc->read.buffer_addr = cpu_to_le64(buffer_info->dma);
 		tx_desc->read.cmd_type_len =
@@ -4353,7 +4355,7 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
 	count = igb_tx_map(tx_ring, skb, first);
 	if (!count) {
 		dev_kfree_skb_any(skb);
-		tx_ring->buffer_info[first].time_stamp = 0;
+		tx_ring->tx_buffer_info[first].time_stamp = 0;
 		tx_ring->next_to_use = first;
 		return NETDEV_TX_OK;
 	}
@@ -5551,13 +5553,14 @@ static void igb_systim_to_hwtstamp(struct igb_adapter *adapter,
 /**
  * igb_tx_hwtstamp - utility function which checks for TX time stamp
  * @q_vector: pointer to q_vector containing needed info
- * @buffer: pointer to igb_buffer structure
+ * @buffer: pointer to igb_tx_buffer structure
  *
  * If we were asked to do hardware stamping and such a time stamp is
  * available, then it must have been for this skb here because we only
  * allow only one such packet into the queue.
  */
-static void igb_tx_hwtstamp(struct igb_q_vector *q_vector, struct igb_buffer *buffer_info)
+static void igb_tx_hwtstamp(struct igb_q_vector *q_vector,
+			    struct igb_tx_buffer *buffer_info)
 {
 	struct igb_adapter *adapter = q_vector->adapter;
 	struct e1000_hw *hw = &adapter->hw;
@@ -5585,7 +5588,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 {
 	struct igb_adapter *adapter = q_vector->adapter;
 	struct igb_ring *tx_ring = q_vector->tx_ring;
-	struct igb_buffer *tx_buffer;
+	struct igb_tx_buffer *tx_buffer;
 	union e1000_adv_tx_desc *tx_desc;
 	unsigned int total_bytes = 0, total_packets = 0;
 	unsigned int budget = q_vector->tx_work_limit;
@@ -5594,7 +5597,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 	if (test_bit(__IGB_DOWN, &adapter->state))
 		return true;
 
-	tx_buffer = &tx_ring->buffer_info[i];
+	tx_buffer = &tx_ring->tx_buffer_info[i];
 	tx_desc = IGB_TX_DESC(tx_ring, i);
 
 	for (; budget; budget--) {
@@ -5627,7 +5630,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 			i++;
 			if (unlikely(i == tx_ring->count)) {
 				i = 0;
-				tx_buffer = tx_ring->buffer_info;
+				tx_buffer = tx_ring->tx_buffer_info;
 				tx_desc = IGB_TX_DESC(tx_ring, 0);
 			}
 		} while (eop_desc);
@@ -5643,7 +5646,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 
 	if (tx_ring->detect_tx_hung) {
 		struct e1000_hw *hw = &adapter->hw;
-		u16 eop = tx_ring->buffer_info[i].next_to_watch;
+		u16 eop = tx_ring->tx_buffer_info[i].next_to_watch;
 		union e1000_adv_tx_desc *eop_desc;
 
 		eop_desc = IGB_TX_DESC(tx_ring, eop);
@@ -5651,8 +5654,8 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 		/* Detect a transmit hang in hardware, this serializes the
 		 * check with the clearing of time_stamp and movement of i */
 		tx_ring->detect_tx_hung = false;
-		if (tx_ring->buffer_info[i].time_stamp &&
-		    time_after(jiffies, tx_ring->buffer_info[i].time_stamp +
+		if (tx_ring->tx_buffer_info[i].time_stamp &&
+		    time_after(jiffies, tx_ring->tx_buffer_info[i].time_stamp +
 			       (adapter->tx_timeout_factor * HZ)) &&
 		    !(rd32(E1000_STATUS) & E1000_STATUS_TXOFF)) {
 
@@ -5674,7 +5677,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 				readl(tx_ring->tail),
 				tx_ring->next_to_use,
 				tx_ring->next_to_clean,
-				tx_ring->buffer_info[eop].time_stamp,
+				tx_ring->tx_buffer_info[eop].time_stamp,
 				eop,
 				jiffies,
 				eop_desc->wb.status);
@@ -5802,7 +5805,7 @@ static bool igb_clean_rx_irq(struct igb_q_vector *q_vector, int budget)
 	staterr = le32_to_cpu(rx_desc->wb.upper.status_error);
 
 	while (staterr & E1000_RXD_STAT_DD) {
-		struct igb_buffer *buffer_info = &rx_ring->buffer_info[i];
+		struct igb_rx_buffer *buffer_info = &rx_ring->rx_buffer_info[i];
 		struct sk_buff *skb = buffer_info->skb;
 		union e1000_adv_rx_desc *next_rxd;
 
@@ -5855,8 +5858,8 @@ static bool igb_clean_rx_irq(struct igb_q_vector *q_vector, int budget)
 		}
 
 		if (!(staterr & E1000_RXD_STAT_EOP)) {
-			struct igb_buffer *next_buffer;
-			next_buffer = &rx_ring->buffer_info[i];
+			struct igb_rx_buffer *next_buffer;
+			next_buffer = &rx_ring->rx_buffer_info[i];
 			buffer_info->skb = next_buffer->skb;
 			buffer_info->dma = next_buffer->dma;
 			next_buffer->skb = skb;
@@ -5917,7 +5920,7 @@ next_desc:
 }
 
 static bool igb_alloc_mapped_skb(struct igb_ring *rx_ring,
-				 struct igb_buffer *bi)
+				 struct igb_rx_buffer *bi)
 {
 	struct sk_buff *skb = bi->skb;
 	dma_addr_t dma = bi->dma;
@@ -5951,7 +5954,7 @@ static bool igb_alloc_mapped_skb(struct igb_ring *rx_ring,
 }
 
 static bool igb_alloc_mapped_page(struct igb_ring *rx_ring,
-				  struct igb_buffer *bi)
+				  struct igb_rx_buffer *bi)
 {
 	struct page *page = bi->page;
 	dma_addr_t page_dma = bi->page_dma;
@@ -5990,11 +5993,11 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring,
 void igb_alloc_rx_buffers(struct igb_ring *rx_ring, u16 cleaned_count)
 {
 	union e1000_adv_rx_desc *rx_desc;
-	struct igb_buffer *bi;
+	struct igb_rx_buffer *bi;
 	u16 i = rx_ring->next_to_use;
 
 	rx_desc = IGB_RX_DESC(rx_ring, i);
-	bi = &rx_ring->buffer_info[i];
+	bi = &rx_ring->rx_buffer_info[i];
 	i -= rx_ring->count;
 
 	while (cleaned_count--) {
@@ -6015,7 +6018,7 @@ void igb_alloc_rx_buffers(struct igb_ring *rx_ring, u16 cleaned_count)
 		i++;
 		if (unlikely(!i)) {
 			rx_desc = IGB_RX_DESC(rx_ring, 0);
-			bi = rx_ring->buffer_info;
+			bi = rx_ring->rx_buffer_info;
 			i -= rx_ring->count;
 		}
 
-- 
1.7.6.4

^ permalink raw reply related

* [net-next 06/13] igb: Make Tx budget for NAPI user adjustable
From: Jeff Kirsher @ 2011-10-07  7:18 UTC (permalink / raw)
  To: davem; +Cc: Alexander Duyck, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1317971926-23145-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This change is to make the NAPI budget limits for transmit
adjustable.  Currently they are only set to 128, and when
the changes/improvements to NAPI occur to allow for adjustability,
it would be possible to tune the value for optimal
performance with applications such as routing.

v2: remove tie between NAPI and interrupt moderation
    fix work limit define name (s/IXGBE/IGB/)
    Update patch description to better reflect patch

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
---
 drivers/net/ethernet/intel/igb/igb.h         |    3 +
 drivers/net/ethernet/intel/igb/igb_ethtool.c |    1 +
 drivers/net/ethernet/intel/igb/igb_main.c    |  136 ++++++++++++++++----------
 3 files changed, 87 insertions(+), 53 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index b725937..beab918 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -47,6 +47,7 @@ struct igb_adapter;
 
 /* TX/RX descriptor defines */
 #define IGB_DEFAULT_TXD                  256
+#define IGB_DEFAULT_TX_WORK		 128
 #define IGB_MIN_TXD                       80
 #define IGB_MAX_TXD                     4096
 
@@ -177,6 +178,7 @@ struct igb_q_vector {
 
 	u32 eims_value;
 	u16 cpu;
+	u16 tx_work_limit;
 
 	u16 itr_val;
 	u8 set_itr;
@@ -266,6 +268,7 @@ struct igb_adapter {
 	u16 rx_itr;
 
 	/* TX */
+	u16 tx_work_limit;
 	u32 tx_timeout_count;
 	int num_tx_queues;
 	struct igb_ring *tx_ring[16];
diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index f231d82..a445c4f 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -2011,6 +2011,7 @@ static int igb_set_coalesce(struct net_device *netdev,
 
 	for (i = 0; i < adapter->num_q_vectors; i++) {
 		struct igb_q_vector *q_vector = adapter->q_vector[i];
+		q_vector->tx_work_limit = adapter->tx_work_limit;
 		if (q_vector->rx_ring)
 			q_vector->itr_val = adapter->rx_itr_setting;
 		else
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 7ad25e8..12faa99 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -136,8 +136,8 @@ static irqreturn_t igb_msix_ring(int irq, void *);
 static void igb_update_dca(struct igb_q_vector *);
 static void igb_setup_dca(struct igb_adapter *);
 #endif /* CONFIG_IGB_DCA */
-static bool igb_clean_tx_irq(struct igb_q_vector *);
 static int igb_poll(struct napi_struct *, int);
+static bool igb_clean_tx_irq(struct igb_q_vector *);
 static bool igb_clean_rx_irq(struct igb_q_vector *, int);
 static int igb_ioctl(struct net_device *, struct ifreq *, int cmd);
 static void igb_tx_timeout(struct net_device *);
@@ -1120,6 +1120,7 @@ static void igb_map_tx_ring_to_vector(struct igb_adapter *adapter,
 	q_vector->tx_ring = adapter->tx_ring[ring_idx];
 	q_vector->tx_ring->q_vector = q_vector;
 	q_vector->itr_val = adapter->tx_itr_setting;
+	q_vector->tx_work_limit = adapter->tx_work_limit;
 	if (q_vector->itr_val && q_vector->itr_val <= 3)
 		q_vector->itr_val = IGB_START_ITR;
 }
@@ -2388,11 +2389,17 @@ static int __devinit igb_sw_init(struct igb_adapter *adapter)
 
 	pci_read_config_word(pdev, PCI_COMMAND, &hw->bus.pci_cmd_word);
 
+	/* set default ring sizes */
 	adapter->tx_ring_count = IGB_DEFAULT_TXD;
 	adapter->rx_ring_count = IGB_DEFAULT_RXD;
+
+	/* set default ITR values */
 	adapter->rx_itr_setting = IGB_DEFAULT_ITR;
 	adapter->tx_itr_setting = IGB_DEFAULT_ITR;
 
+	/* set default work limits */
+	adapter->tx_work_limit = IGB_DEFAULT_TX_WORK;
+
 	adapter->max_frame_size = netdev->mtu + ETH_HLEN + ETH_FCS_LEN +
 				  VLAN_HLEN;
 	adapter->min_frame_size = ETH_ZLEN + ETH_FCS_LEN;
@@ -5496,7 +5503,7 @@ static int igb_poll(struct napi_struct *napi, int budget)
 		igb_update_dca(q_vector);
 #endif
 	if (q_vector->tx_ring)
-		clean_complete = !!igb_clean_tx_irq(q_vector);
+		clean_complete = igb_clean_tx_irq(q_vector);
 
 	if (q_vector->rx_ring)
 		clean_complete &= igb_clean_rx_irq(q_vector, budget);
@@ -5578,64 +5585,69 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 {
 	struct igb_adapter *adapter = q_vector->adapter;
 	struct igb_ring *tx_ring = q_vector->tx_ring;
-	struct net_device *netdev = tx_ring->netdev;
-	struct e1000_hw *hw = &adapter->hw;
-	struct igb_buffer *buffer_info;
-	union e1000_adv_tx_desc *tx_desc, *eop_desc;
+	struct igb_buffer *tx_buffer;
+	union e1000_adv_tx_desc *tx_desc;
 	unsigned int total_bytes = 0, total_packets = 0;
-	unsigned int i, eop, count = 0;
-	bool cleaned = false;
+	unsigned int budget = q_vector->tx_work_limit;
+	u16 i = tx_ring->next_to_clean;
 
-	i = tx_ring->next_to_clean;
-	eop = tx_ring->buffer_info[i].next_to_watch;
-	eop_desc = IGB_TX_DESC(tx_ring, eop);
+	if (test_bit(__IGB_DOWN, &adapter->state))
+		return true;
 
-	while ((eop_desc->wb.status & cpu_to_le32(E1000_TXD_STAT_DD)) &&
-	       (count < tx_ring->count)) {
-		rmb();	/* read buffer_info after eop_desc status */
-		for (cleaned = false; !cleaned; count++) {
-			tx_desc = IGB_TX_DESC(tx_ring, i);
-			buffer_info = &tx_ring->buffer_info[i];
-			cleaned = (i == eop);
+	tx_buffer = &tx_ring->buffer_info[i];
+	tx_desc = IGB_TX_DESC(tx_ring, i);
 
-			if (buffer_info->skb) {
-				total_bytes += buffer_info->bytecount;
-				/* gso_segs is currently only valid for tcp */
-				total_packets += buffer_info->gso_segs;
-				igb_tx_hwtstamp(q_vector, buffer_info);
-			}
+	for (; budget; budget--) {
+		u16 eop = tx_buffer->next_to_watch;
+		union e1000_adv_tx_desc *eop_desc;
+
+		eop_desc = IGB_TX_DESC(tx_ring, eop);
+
+		/* if DD is not set pending work has not been completed */
+		if (!(eop_desc->wb.status & cpu_to_le32(E1000_TXD_STAT_DD)))
+			break;
+
+		/* prevent any other reads prior to eop_desc being verified */
+		rmb();
 
-			igb_unmap_and_free_tx_resource(tx_ring, buffer_info);
+		do {
 			tx_desc->wb.status = 0;
+			if (likely(tx_desc == eop_desc)) {
+				eop_desc = NULL;
+
+				total_bytes += tx_buffer->bytecount;
+				total_packets += tx_buffer->gso_segs;
+				igb_tx_hwtstamp(q_vector, tx_buffer);
+			}
+
+			igb_unmap_and_free_tx_resource(tx_ring, tx_buffer);
 
+			tx_buffer++;
+			tx_desc++;
 			i++;
-			if (i == tx_ring->count)
+			if (unlikely(i == tx_ring->count)) {
 				i = 0;
-		}
-		eop = tx_ring->buffer_info[i].next_to_watch;
-		eop_desc = IGB_TX_DESC(tx_ring, eop);
+				tx_buffer = tx_ring->buffer_info;
+				tx_desc = IGB_TX_DESC(tx_ring, 0);
+			}
+		} while (eop_desc);
 	}
 
 	tx_ring->next_to_clean = i;
+	u64_stats_update_begin(&tx_ring->tx_syncp);
+	tx_ring->tx_stats.bytes += total_bytes;
+	tx_ring->tx_stats.packets += total_packets;
+	u64_stats_update_end(&tx_ring->tx_syncp);
+	tx_ring->total_bytes += total_bytes;
+	tx_ring->total_packets += total_packets;
 
-	if (unlikely(count &&
-		     netif_carrier_ok(netdev) &&
-		     igb_desc_unused(tx_ring) >= IGB_TX_QUEUE_WAKE)) {
-		/* Make sure that anybody stopping the queue after this
-		 * sees the new next_to_clean.
-		 */
-		smp_mb();
-		if (__netif_subqueue_stopped(netdev, tx_ring->queue_index) &&
-		    !(test_bit(__IGB_DOWN, &adapter->state))) {
-			netif_wake_subqueue(netdev, tx_ring->queue_index);
+	if (tx_ring->detect_tx_hung) {
+		struct e1000_hw *hw = &adapter->hw;
+		u16 eop = tx_ring->buffer_info[i].next_to_watch;
+		union e1000_adv_tx_desc *eop_desc;
 
-			u64_stats_update_begin(&tx_ring->tx_syncp);
-			tx_ring->tx_stats.restart_queue++;
-			u64_stats_update_end(&tx_ring->tx_syncp);
-		}
-	}
+		eop_desc = IGB_TX_DESC(tx_ring, eop);
 
-	if (tx_ring->detect_tx_hung) {
 		/* Detect a transmit hang in hardware, this serializes the
 		 * check with the clearing of time_stamp and movement of i */
 		tx_ring->detect_tx_hung = false;
@@ -5666,16 +5678,34 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 				eop,
 				jiffies,
 				eop_desc->wb.status);
-			netif_stop_subqueue(netdev, tx_ring->queue_index);
+			netif_stop_subqueue(tx_ring->netdev,
+					    tx_ring->queue_index);
+
+			/* we are about to reset, no point in enabling stuff */
+			return true;
 		}
 	}
-	tx_ring->total_bytes += total_bytes;
-	tx_ring->total_packets += total_packets;
-	u64_stats_update_begin(&tx_ring->tx_syncp);
-	tx_ring->tx_stats.bytes += total_bytes;
-	tx_ring->tx_stats.packets += total_packets;
-	u64_stats_update_end(&tx_ring->tx_syncp);
-	return count < tx_ring->count;
+
+	if (unlikely(total_packets &&
+		     netif_carrier_ok(tx_ring->netdev) &&
+		     igb_desc_unused(tx_ring) >= IGB_TX_QUEUE_WAKE)) {
+		/* Make sure that anybody stopping the queue after this
+		 * sees the new next_to_clean.
+		 */
+		smp_mb();
+		if (__netif_subqueue_stopped(tx_ring->netdev,
+					     tx_ring->queue_index) &&
+		    !(test_bit(__IGB_DOWN, &adapter->state))) {
+			netif_wake_subqueue(tx_ring->netdev,
+					    tx_ring->queue_index);
+
+			u64_stats_update_begin(&tx_ring->tx_syncp);
+			tx_ring->tx_stats.restart_queue++;
+			u64_stats_update_end(&tx_ring->tx_syncp);
+		}
+	}
+
+	return !!budget;
 }
 
 static inline void igb_rx_checksum(struct igb_ring *ring,
-- 
1.7.6.4

^ permalink raw reply related

* [net-next 05/13] e1000e: bad short packets received when jumbos enabled on 82579
From: Jeff Kirsher @ 2011-10-07  7:18 UTC (permalink / raw)
  To: davem; +Cc: Bruce Allan, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1317971926-23145-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Bruce Allan <bruce.w.allan@intel.com>

When short packets are received with jumbos enabled on 82579, they can be
interpreted to have a receive address that does not match any configured
address.  This is due to a hardware bug that can be worked around by
reducing the number of IPG octets added when the packet is transferred from
the PHY to the MAC.

Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
Tested-by: Jeff Pieper <jeffrey.e.pieper@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000e/ich8lan.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c
index ad34de0..4f70974 100644
--- a/drivers/net/ethernet/intel/e1000e/ich8lan.c
+++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c
@@ -1578,7 +1578,7 @@ s32 e1000_lv_jumbo_workaround_ich8lan(struct e1000_hw *hw, bool enable)
 		ret_val = e1e_wphy(hw, PHY_REG(776, 20), data);
 		if (ret_val)
 			goto out;
-		ret_val = e1e_wphy(hw, PHY_REG(776, 23), 0xFE00);
+		ret_val = e1e_wphy(hw, PHY_REG(776, 23), 0xF100);
 		if (ret_val)
 			goto out;
 		e1e_rphy(hw, HV_PM_CTRL, &data);
-- 
1.7.6.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox