Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: UDP regression with packets rates < 10k per sec
From: Christoph Lameter @ 2009-09-14 21:10 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <alpine.DEB.1.10.0909101741520.9964@V090114053VZO-1>


Where are we on this? Definitely a regression?

^ permalink raw reply

* [PATCH iproute2] ip: print "temporary" for IPv6 temp addresses
From: Brian Haley @ 2009-09-14 21:01 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev@vger.kernel.org

IPv6 addresses that have IFA_F_SECONDARY set are actually temporary addresses,
hence the IFA_F_TEMPORARY equivalent.  Change the output in this case and
allow filtering on the word "temporary".

Signed-off-by: Brian Haley <brian.haley@hp.com>
---

diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 32c9008..8492fa8 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -74,7 +74,7 @@ static void usage(void)
 	fprintf(stderr, "SCOPE-ID := [ host | link | global | NUMBER ]\n");
 	fprintf(stderr, "FLAG-LIST := [ FLAG-LIST ] FLAG\n");
 	fprintf(stderr, "FLAG  := [ permanent | dynamic | secondary | primary |\n");
-	fprintf(stderr, "           tentative | deprecated | dadfailed |\n");
+	fprintf(stderr, "           tentative | deprecated | dadfailed | temporary |\n");
 	fprintf(stderr, "           CONFFLAG-LIST ]\n");
 	fprintf(stderr, "CONFFLAG-LIST := [ CONFFLAG-LIST ] CONFFLAG\n");
 	fprintf(stderr, "CONFFLAG  := [ home | nodad ]\n");
@@ -486,7 +486,10 @@ int print_addrinfo(const struct sockaddr_nl *who, struct nlmsghdr *n,
 	fprintf(fp, "scope %s ", rtnl_rtscope_n2a(ifa->ifa_scope, b1, sizeof(b1)));
 	if (ifa->ifa_flags&IFA_F_SECONDARY) {
 		ifa->ifa_flags &= ~IFA_F_SECONDARY;
-		fprintf(fp, "secondary ");
+		if (ifa->ifa_family == AF_INET6)
+			fprintf(fp, "temporary ");
+		else
+			fprintf(fp, "secondary ");
 	}
 	if (ifa->ifa_flags&IFA_F_TENTATIVE) {
 		ifa->ifa_flags &= ~IFA_F_TENTATIVE;
@@ -642,7 +645,8 @@ static int ipaddr_list_or_flush(int argc, char **argv, int flush)
 		} else if (strcmp(*argv, "permanent") == 0) {
 			filter.flags |= IFA_F_PERMANENT;
 			filter.flagmask |= IFA_F_PERMANENT;
-		} else if (strcmp(*argv, "secondary") == 0) {
+		} else if (strcmp(*argv, "secondary") == 0 ||
+			   strcmp(*argv, "temporary") == 0) {
 			filter.flags |= IFA_F_SECONDARY;
 			filter.flagmask |= IFA_F_SECONDARY;
 		} else if (strcmp(*argv, "primary") == 0) {
diff --git a/man/man8/ip.8 b/man/man8/ip.8
index 1145b28..24a3812 100644
--- a/man/man8/ip.8
+++ b/man/man8/ip.8
@@ -97,7 +97,7 @@ ip \- show / manipulate routing, devices, policy routing and tunnels
 .ti -8
 .IR FLAG " := "
 .RB "[ " permanent " | " dynamic " | " secondary " | " primary " | "\
-tentative " | " deprecated " | " dadfailed " ]"
+tentative " | " deprecated " | " dadfailed " | " temporary " ]"
 
 .ti -8
 .BR "ip addrlabel" " { " add " | " del " } " prefix
@@ -1043,6 +1043,10 @@ address detection.
 address detection.
 
 .TP
+.B temporary
+(IPv6 only) only list temporary addresses.
+
+.TP
 .BR primary " and " secondary
 only list primary (or secondary) addresses.
 

^ permalink raw reply related

* [PATCH iproute2] ip: add support for IFA_F_DADFAILED
From: Brian Haley @ 2009-09-14 21:00 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev@vger.kernel.org

[I'm not sure if I needed to update if_addr.h, or if that will happen
 automatically when you update the headers to 2.6.32]


Add support for IFA_F_DADFAILED and update ip.8 man page.

Signed-off-by: Brian Haley <brian.haley@hp.com>
---

diff --git a/include/linux/if_addr.h b/include/linux/if_addr.h
index 08ea980..41b0193 100644
--- a/include/linux/if_addr.h
+++ b/include/linux/if_addr.h
@@ -41,6 +41,7 @@ enum
 
 #define	IFA_F_NODAD		0x02
 #define IFA_F_OPTIMISTIC	0x04
+#define IFA_F_DADFAILED		0x08
 #define	IFA_F_HOMEADDRESS	0x10
 #define IFA_F_DEPRECATED	0x20
 #define IFA_F_TENTATIVE		0x40
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 267ecb3..32c9008 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -64,7 +64,7 @@ static void usage(void)
 		iplink_usage();
 	}
 	fprintf(stderr, "Usage: ip addr {add|change|replace} IFADDR dev STRING [ LIFETIME ]\n");
-	fprintf(stderr, "                                                      [ CONFFLAG-LIST]\n");
+	fprintf(stderr, "                                                      [ CONFFLAG-LIST ]\n");
 	fprintf(stderr, "       ip addr del IFADDR dev STRING\n");
 	fprintf(stderr, "       ip addr {show|flush} [ dev STRING ] [ scope SCOPE-ID ]\n");
 	fprintf(stderr, "                            [ to PREFIX ] [ FLAG-LIST ] [ label PATTERN ]\n");
@@ -74,7 +74,8 @@ static void usage(void)
 	fprintf(stderr, "SCOPE-ID := [ host | link | global | NUMBER ]\n");
 	fprintf(stderr, "FLAG-LIST := [ FLAG-LIST ] FLAG\n");
 	fprintf(stderr, "FLAG  := [ permanent | dynamic | secondary | primary |\n");
-	fprintf(stderr, "           tentative | deprecated | CONFFLAG-LIST ]\n");
+	fprintf(stderr, "           tentative | deprecated | dadfailed |\n");
+	fprintf(stderr, "           CONFFLAG-LIST ]\n");
 	fprintf(stderr, "CONFFLAG-LIST := [ CONFFLAG-LIST ] CONFFLAG\n");
 	fprintf(stderr, "CONFFLAG  := [ home | nodad ]\n");
 	fprintf(stderr, "LIFETIME := [ valid_lft LFT ] [ preferred_lft LFT ]\n");
@@ -508,6 +509,10 @@ int print_addrinfo(const struct sockaddr_nl *who, struct nlmsghdr *n,
 		fprintf(fp, "dynamic ");
 	} else
 		ifa->ifa_flags &= ~IFA_F_PERMANENT;
+	if (ifa->ifa_flags&IFA_F_DADFAILED) {
+		ifa->ifa_flags &= ~IFA_F_DADFAILED;
+		fprintf(fp, "dadfailed ");
+	}
 	if (ifa->ifa_flags)
 		fprintf(fp, "flags %02x ", ifa->ifa_flags);
 	if (rta_tb[IFA_LABEL])
@@ -655,6 +660,9 @@ static int ipaddr_list_or_flush(int argc, char **argv, int flush)
 		} else if (strcmp(*argv, "nodad") == 0) {
 			filter.flags |= IFA_F_NODAD;
 			filter.flagmask |= IFA_F_NODAD;
+		} else if (strcmp(*argv, "dadfailed") == 0) {
+			filter.flags |= IFA_F_DADFAILED;
+			filter.flagmask |= IFA_F_DADFAILED;
 		} else if (strcmp(*argv, "label") == 0) {
 			NEXT_ARG();
 			filter.label = *argv;
diff --git a/man/man8/ip.8 b/man/man8/ip.8
index a8fccc4..1145b28 100644
--- a/man/man8/ip.8
+++ b/man/man8/ip.8
@@ -97,7 +97,7 @@ ip \- show / manipulate routing, devices, policy routing and tunnels
 .ti -8
 .IR FLAG " := "
 .RB "[ " permanent " | " dynamic " | " secondary " | " primary " | "\
-tentative " | " deprecated " ]"
+tentative " | " deprecated " | " dadfailed " ]"
 
 .ti -8
 .BR "ip addrlabel" " { " add " | " del " } " prefix
@@ -1030,7 +1030,7 @@ addresses.
 
 .TP
 .B tentative
-(IPv6 only) only list addresses which did not pass duplicate
+(IPv6 only) only list addresses which have not yet passed duplicate
 address detection.
 
 .TP
@@ -1038,6 +1038,11 @@ address detection.
 (IPv6 only) only list deprecated addresses.
 
 .TP
+.B dadfailed
+(IPv6 only) only list addresses which have failed duplicate
+address detection.
+
+.TP
 .BR primary " and " secondary
 only list primary (or secondary) addresses.
 

^ permalink raw reply related

* Re: [Bonding-devel] [PATCH 4/4] bonding: add sysfs files to display tlb and alb hash table contents
From: Nicolas de Pesloüan @ 2009-09-14 19:37 UTC (permalink / raw)
  To: Andy Gospodarek; +Cc: Jay Vosburgh, netdev, bonding-devel
In-Reply-To: <20090914144532.GU8515@gospo.rdu.redhat.com>

Andy Gospodarek wrote:
> On Fri, Sep 11, 2009 at 02:48:17PM -0700, Jay Vosburgh wrote:
>> Andy Gospodarek <andy@greyhouse.net> wrote:
>>
>>> bonding: add sysfs files to display tlb and alb hash table contents
>> 	Isn't it considered bad form to have sysfs files that kick out
>> large amounts of data like this?  Not that I think this is a bad
>> facility to have, just checking on the mechanism.
>>
> 
> I'm not aware of such a restriction -- though I'm sure at least one
> person out there doesn't like it.

In Documentation/filesystems/sysfs.txt:

"Attributes should be ASCII text files, preferably with only one value
per file. It is noted that it may not be efficient to contain only one
value per file, so it is socially acceptable to express an array of
values of the same type.

Mixing types, expressing multiple lines of data, and doing fancy
formatting of data is heavily frowned upon. Doing these things may get
you publically humiliated and your code rewritten without notice."

Apparently, thinks are becoming more relaxed these days.

	Nicolas.

> If that's the case, there are certainly a few files that should be
> cleaned up:
> 
> # find -type f -exec wc -l {} 2> /dev/null \; | sort -r -n | head -10
> 1657 ./firmware/acpi/tables/SSDT
> 132 ./firmware/acpi/tables/dynamic/SSDT2
> 128 ./devices/pci0000:00/0000:00:1c.5/0000:3f:00.0/vpd
> 27 ./devices/system/node/node0/meminfo
> 24 ./devices/pnp0/00:08/options
> 24 ./devices/pnp0/00:07/options
> 12 ./devices/pci0000:00/0000:00:1e.0/resource
> 12 ./devices/pci0000:00/0000:00:1c.5/resource
> 12 ./devices/pci0000:00/0000:00:1c.4/resource
> 12 ./devices/pci0000:00/0000:00:1c.0/resource


^ permalink raw reply

* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server
From: Gregory Haskins @ 2009-09-14 19:28 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo,
	linux-mm, akpm, hpa, Rusty Russell, s.hetze
In-Reply-To: <20090914165320.GA3851@redhat.com>

[-- Attachment #1: Type: text/plain, Size: 2169 bytes --]

Michael S. Tsirkin wrote:
> On Mon, Sep 14, 2009 at 12:08:55PM -0400, Gregory Haskins wrote:
>> Michael S. Tsirkin wrote:
>>> On Fri, Sep 11, 2009 at 12:00:21PM -0400, Gregory Haskins wrote:
>>>> FWIW: VBUS handles this situation via the "memctx" abstraction.  IOW,
>>>> the memory is not assumed to be a userspace address.  Rather, it is a
>>>> memctx-specific address, which can be userspace, or any other type
>>>> (including hardware, dma-engine, etc).  As long as the memctx knows how
>>>> to translate it, it will work.
>>> How would permissions be handled?
>> Same as anything else, really.  Read on for details.
>>
>>> it's easy to allow an app to pass in virtual addresses in its own address space.
>> Agreed, and this is what I do.
>>
>> The guest always passes its own physical addresses (using things like
>> __pa() in linux).  This address passed is memctx specific, but generally
>> would fall into the category of "virtual-addresses" from the hosts
>> perspective.
>>
>> For a KVM/AlacrityVM guest example, the addresses are GPAs, accessed
>> internally to the context via a gfn_to_hva conversion (you can see this
>> occuring in the citation links I sent)
>>
>> For Ira's example, the addresses would represent a physical address on
>> the PCI boards, and would follow any kind of relevant rules for
>> converting a "GPA" to a host accessible address (even if indirectly, via
>> a dma controller).
> 
> So vbus can let an application

"application" means KVM guest, or ppc board, right?

> access either its own virtual memory or a physical memory on a PCI device.

To reiterate from the last reply: the model is the "guest" owns the
memory.  The host is granted access to that memory by means of a memctx
object, which must be admitted to the host kernel and accessed according
 to standard access-policy mechanisms.  Generally the "application" or
guest would never be accessing anything other than its own memory.

> My question is, is any application
> that's allowed to do the former also granted rights to do the later?

If I understand your question, no.  Can you elaborate?

Kind Regards,
-Greg


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 267 bytes --]

^ permalink raw reply

* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server
From: Gregory Haskins @ 2009-09-14 19:14 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo,
	linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel
In-Reply-To: <20090914164750.GB3745@redhat.com>

[-- Attachment #1: Type: text/plain, Size: 1921 bytes --]

Michael S. Tsirkin wrote:
> On Mon, Sep 14, 2009 at 12:08:55PM -0400, Gregory Haskins wrote:
>> For Ira's example, the addresses would represent a physical address on
>> the PCI boards, and would follow any kind of relevant rules for
>> converting a "GPA" to a host accessible address (even if indirectly, via
>> a dma controller).
> 
> I don't think limiting addresses to PCI physical addresses will work
> well.

The only "limit" is imposed by the memctx.  If a given context needs to
meet certain requirements beyond PCI physical addresses, it would
presumably be designed that way.

>  From what I rememeber, Ira's x86 can not initiate burst
> transactions on PCI, and it's the ppc that initiates all DMA.

The only requirement is that the "guest" "owns" the memory.  IOW: As
with virtio/vhost, the guest can access the pointers in the ring
directly but the host must pass through a translation function.

Your translation is direct: you use a slots/hva scheme.  My translation
is abstracted, which means it can support slots/hva (such as in
alacrityvm) or some other scheme as long as the general model of "guest
owned" holds true.

> 
>>>  But we can't let the guest specify physical addresses.
>> Agreed.  Neither your proposal nor mine operate this way afaict.
> 
> But this seems to be what Ira needs.

So what he could do then is implement the memctx to integrate with the
ppc side dma controller.  E.g. "translation" in his box means a protocol
from the x86 to the ppc to initiate the dma cycle.  This could be
exposed as a dma facility in the register file of the ppc boards, for
instance.

To reiterate, as long as the model is such that the ppc boards are
considered the "owner" (direct access, no translation needed) I believe
it will work.  If the pointers are expected to be owned by the host,
then my model doesn't work well either.

Kind Regards,
-Greg

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 267 bytes --]

^ permalink raw reply

* fanotify as syscalls
From: Eric Paris @ 2009-09-14 19:08 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Jamie Lokier, David Miller, linux-kernel, linux-fsdevel, netdev,
	viro, alan, hch, torvalds
In-Reply-To: <20090914140720.GA8564@ioremap.net>

Long ago I implemented fanotify as basically a /dev interface using
ioctls().  Alan suggested I use a socket protocol and could then make
use of get/setsockopt() which although still not great is light years
better than ioctl.  Currently the fanotify interface as I want to push
it to Linus and as I've been requesting comments on for the last 1.25
years is just that.  It really makes no use of the networking system
other than bind() and setsockopt() and everyone tends to agree the
things I want to do can't reasonably be done using network hooks and a
'real' socket protocol.  I like this interface, setsockopt() makes it so
easy to add new functionality as we flush out other users.  fanotify as
it stands today has a number of groups who will port to it, has a nubmer
of advantages over inotify, and I have been told privately meets the
needs of the original group of people who paid me to work on it (two
very large anti-malware companies who currently unprotect and hack the
syscall table of their users)

Just this week I got another request to look at syscalls.  So I did, I
haven't prototyped it, but I can do it with syscalls, they would look
like this:

int fanotify_init(int flags, int f_flags, __u64 mask, unsigned int priority);

int fanotify_add_mark(int fanotify_fd, char *path, __u64 mask, __u64 ignored_mask);
int fanotify_add_mark_fd(int fanotify_fd, int fd, __u64 mask, __u64 ignored_mask);
int fanotify_rm_mark(int fanotify_fd, char *path, __u64 mask);
int fanotify_rm_mark_fd(int fanotify_fd, int fd, __u64 mask);
	Those above 4 could probably be squashed into 2 syscalls with an extra
flags field.

int fanotify_clear_marks(int fanotify_fd);

int fanotify_perm_response(int fanotify_fd, __u64 cookie, int response);

int fanotify_ignore_sb(int fanotify_fd, long f_type);
int fanotify_ignore_fsid(int fanotify_fd, fsid_t f_fsid);
	These 2 are the most questionable, they would honestly only be used for
things that wanted system wide notification, I can't imagine that being
many things other than AV vendors.  But they really need a way to
exclude notification when people open/close/read/write to /proc (which
is the point of the ignore_sb.)

Since I don't have a solution to subtree notification I don't know if it
will work in this syscall framework.  I know people want subtree
notification and I'm willing to take a stab at it after the fscking all
notification is accepted.  That's one of the main reasons I like
setsockopt over tons of syscalls.  I can add a new one very easily.  I
also can easily expand arguments by just creating a new sockopt.  No
userspace headaches.

Are there demands that I convert to syscalls?  Do I really gain anything
using 9 new inextensible syscalls over socket(), bind(), and 8
setsockopt() calls?

I'd like to send these patches along, so a ruling from on high would be
great....

-Eric

^ permalink raw reply

* Re: [PATCH] pkt_sched: Fix tx queue selection in tc_modify_qdisc
From: Jarek Poplawski @ 2009-09-14 19:03 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: David Miller, netdev
In-Reply-To: <4AAE85F9.2060600@trash.net>

On Mon, Sep 14, 2009 at 08:05:45PM +0200, Patrick McHardy wrote:
> Jarek Poplawski wrote:
> > After the recent mq change there is the new select_queue qdisc class
> > method used in tc_modify_qdisc, but it works OK only for direct child
> > qdiscs of mq qdisc. Grandchildren always get the first tx queue, which
> > would give wrong qdisc_root etc. results (e.g. for sch_htb as child of
> > sch_prio). This patch fixes it by using parent's dev_queue for such
> > grandchildren qdiscs. The select_queue method is replaced BTW with the
> > static qdisc_select_tx_queue function (it's used only in one place).
> 
> Thanks, this looks correct. My assumption was that we shouldn't
> be using the locks of grandchildren anyways, but we do need the
> proper root lock for estimators.

Actually, in the above example I was mainly concerned with a watchdog
parameter. But of course there should more (etc.).

> > diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> > index 88eb9de..865120c 100644
> > --- a/include/net/sch_generic.h
> > +++ b/include/net/sch_generic.h
> > @@ -81,7 +81,6 @@ struct Qdisc
> >  struct Qdisc_class_ops
> >  {
> >  	/* Child qdisc manipulation */
> > -	unsigned int		(*select_queue)(struct Qdisc *, struct tcmsg *);
> >  	int			(*graft)(struct Qdisc *, unsigned long cl,
> >  					struct Qdisc *, struct Qdisc **);
> >  	struct Qdisc *		(*leaf)(struct Qdisc *, unsigned long cl);
> > diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
> > index 3af1061..223a6bc 100644
> > --- a/net/sched/sch_api.c
> > +++ b/net/sched/sch_api.c
> > @@ -990,6 +990,24 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
> >  	return 0;
> >  }
> >  
> > +static struct netdev_queue *qdisc_select_tx_queue(struct net_device *dev,
> > +						  struct Qdisc *p, u32 clid)
> > +{
> > +	unsigned long ntx;
> > +
> > +	if (!p)
> > +		return netdev_get_tx_queue(dev, 0);
> > +
> > +	if (!(p->flags & TCQ_F_MQROOT))
> > +		return p->dev_queue;
> > +
> > +	ntx = TC_H_MIN(clid) - 1;
> 
> I didn't want to expose the numbering scheme used by sch_mq internally,
> but fine, I see you really don't like the callback :)

I only don't like the callback just for one exceptional qdisc. On the
other hand it would look more sensible to me if implemented at least
by all classful qdiscs to return parent's dev_queue always; so I could
re-do it like this, or simply mix this fix with the current
implementation, no problem (I don't "don't like it" too much).

Jarek P.

^ permalink raw reply

* [net-next PATCH 2/2] igb: do not allow phy sw reset code to make calls to null pointers
From: Jeff Kirsher @ 2009-09-14 18:23 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Alexander Duyck, Don Skidmore
In-Reply-To: <20090914182253.4859.11176.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

In the case of fiber and serdes adapters we were seeing issues with ethtool
-t causing kernel panics due to null function pointers.  To prevent this we
need to exit out of the phy reset code in the event that we do not have a
valid phy.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Don Skidmore <donald.c.skidmore@intel.com>
---

 drivers/net/igb/e1000_phy.c |    5 ++++-
 1 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/drivers/net/igb/e1000_phy.c b/drivers/net/igb/e1000_phy.c
index c1f4da6..ee46060 100644
--- a/drivers/net/igb/e1000_phy.c
+++ b/drivers/net/igb/e1000_phy.c
@@ -1565,9 +1565,12 @@ out:
  **/
 s32 igb_phy_sw_reset(struct e1000_hw *hw)
 {
-	s32 ret_val;
+	s32 ret_val = 0;
 	u16 phy_ctrl;
 
+	if (!(hw->phy.ops.read_reg))
+		goto out;
+
 	ret_val = hw->phy.ops.read_reg(hw, PHY_CONTROL, &phy_ctrl);
 	if (ret_val)
 		goto out;


^ permalink raw reply related

* [net-next PATCH 1/2] igb: reset sgmii phy at start of init
From: Jeff Kirsher @ 2009-09-14 18:22 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Alexander Duyck, Don Skidmore

From: Alexander Duyck <alexander.h.duyck@intel.com>

Our SGMII phy code was incomplete in that it was not actually placing the
phy in SGMII mode and as a result the PHY was not able to establish a link
when connected to a non serdes link partner.  This patch updates the code
to combine the SGMII/serdes PCS init and to add the necessary reset.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Don Skidmore <donald.c.skidmore@intel.com>
---

 drivers/net/igb/e1000_82575.c   |  198 ++++++++++++++++++---------------------
 drivers/net/igb/e1000_82575.h   |    2 
 drivers/net/igb/e1000_defines.h |    2 
 drivers/net/igb/igb_main.c      |    2 
 4 files changed, 95 insertions(+), 109 deletions(-)

diff --git a/drivers/net/igb/e1000_82575.c b/drivers/net/igb/e1000_82575.c
index 6158c0f..f8f5772 100644
--- a/drivers/net/igb/e1000_82575.c
+++ b/drivers/net/igb/e1000_82575.c
@@ -49,11 +49,10 @@ static s32  igb_read_phy_reg_sgmii_82575(struct e1000_hw *, u32, u16 *);
 static s32  igb_reset_hw_82575(struct e1000_hw *);
 static s32  igb_set_d0_lplu_state_82575(struct e1000_hw *, bool);
 static s32  igb_setup_copper_link_82575(struct e1000_hw *);
-static s32  igb_setup_fiber_serdes_link_82575(struct e1000_hw *);
+static s32  igb_setup_serdes_link_82575(struct e1000_hw *);
 static s32  igb_write_phy_reg_sgmii_82575(struct e1000_hw *, u32, u16);
 static void igb_clear_hw_cntrs_82575(struct e1000_hw *);
 static s32  igb_acquire_swfw_sync_82575(struct e1000_hw *, u16);
-static void igb_configure_pcs_link_82575(struct e1000_hw *);
 static s32  igb_get_pcs_speed_and_duplex_82575(struct e1000_hw *, u16 *,
 						 u16 *);
 static s32  igb_get_phy_id_82575(struct e1000_hw *);
@@ -105,16 +104,20 @@ static s32 igb_get_invariants_82575(struct e1000_hw *hw)
 	dev_spec->sgmii_active = false;
 
 	ctrl_ext = rd32(E1000_CTRL_EXT);
-	if ((ctrl_ext & E1000_CTRL_EXT_LINK_MODE_MASK) ==
-	    E1000_CTRL_EXT_LINK_MODE_PCIE_SERDES) {
-		hw->phy.media_type = e1000_media_type_internal_serdes;
-		ctrl_ext |= E1000_CTRL_I2C_ENA;
-	} else if (ctrl_ext & E1000_CTRL_EXT_LINK_MODE_SGMII) {
+	switch (ctrl_ext & E1000_CTRL_EXT_LINK_MODE_MASK) {
+	case E1000_CTRL_EXT_LINK_MODE_SGMII:
 		dev_spec->sgmii_active = true;
 		ctrl_ext |= E1000_CTRL_I2C_ENA;
-	} else {
+		break;
+	case E1000_CTRL_EXT_LINK_MODE_PCIE_SERDES:
+		hw->phy.media_type = e1000_media_type_internal_serdes;
+		ctrl_ext |= E1000_CTRL_I2C_ENA;
+		break;
+	default:
 		ctrl_ext &= ~E1000_CTRL_I2C_ENA;
+		break;
 	}
+
 	wr32(E1000_CTRL_EXT, ctrl_ext);
 
 	/* Set mta register count */
@@ -134,7 +137,7 @@ static s32 igb_get_invariants_82575(struct e1000_hw *hw)
 	mac->ops.setup_physical_interface =
 		(hw->phy.media_type == e1000_media_type_copper)
 			? igb_setup_copper_link_82575
-			: igb_setup_fiber_serdes_link_82575;
+			: igb_setup_serdes_link_82575;
 
 	/* NVM initialization */
 	eecd = rd32(E1000_EECD);
@@ -379,6 +382,7 @@ static s32 igb_get_phy_id_82575(struct e1000_hw *hw)
 	struct e1000_phy_info *phy = &hw->phy;
 	s32  ret_val = 0;
 	u16 phy_id;
+	u32 ctrl_ext;
 
 	/*
 	 * For SGMII PHYs, we try the list of possible addresses until
@@ -393,6 +397,12 @@ static s32 igb_get_phy_id_82575(struct e1000_hw *hw)
 		goto out;
 	}
 
+	/* Power on sgmii phy if it is disabled */
+	ctrl_ext = rd32(E1000_CTRL_EXT);
+	wr32(E1000_CTRL_EXT, ctrl_ext & ~E1000_CTRL_EXT_SDP3_DATA);
+	wrfl();
+	msleep(300);
+
 	/*
 	 * The address field in the I2CCMD register is 3 bits and 0 is invalid.
 	 * Therefore, we need to test 1-7
@@ -418,9 +428,12 @@ static s32 igb_get_phy_id_82575(struct e1000_hw *hw)
 		phy->addr = 0;
 		ret_val = -E1000_ERR_PHY;
 		goto out;
+	} else {
+		ret_val = igb_get_phy_id(hw);
 	}
 
-	ret_val = igb_get_phy_id(hw);
+	/* restore previous sfp cage power state */
+	wr32(E1000_CTRL_EXT, ctrl_ext);
 
 out:
 	return ret_val;
@@ -766,17 +779,18 @@ static s32 igb_get_pcs_speed_and_duplex_82575(struct e1000_hw *hw, u16 *speed,
 }
 
 /**
- *  igb_shutdown_fiber_serdes_link_82575 - Remove link during power down
+ *  igb_shutdown_serdes_link_82575 - Remove link during power down
  *  @hw: pointer to the HW structure
  *
  *  In the case of fiber serdes, shut down optics and PCS on driver unload
  *  when management pass thru is not enabled.
  **/
-void igb_shutdown_fiber_serdes_link_82575(struct e1000_hw *hw)
+void igb_shutdown_serdes_link_82575(struct e1000_hw *hw)
 {
 	u32 reg;
 
-	if (hw->phy.media_type != e1000_media_type_internal_serdes)
+	if (hw->phy.media_type != e1000_media_type_internal_serdes ||
+	    igb_sgmii_active_82575(hw))
 		return;
 
 	/* if the management interface is not enabled, then power down */
@@ -788,7 +802,7 @@ void igb_shutdown_fiber_serdes_link_82575(struct e1000_hw *hw)
 
 		/* shutdown the laser */
 		reg = rd32(E1000_CTRL_EXT);
-		reg |= E1000_CTRL_EXT_SDP7_DATA;
+		reg |= E1000_CTRL_EXT_SDP3_DATA;
 		wr32(E1000_CTRL_EXT, reg);
 
 		/* flush the write to verify completion */
@@ -927,6 +941,17 @@ static s32 igb_setup_copper_link_82575(struct e1000_hw *hw)
 	ctrl &= ~(E1000_CTRL_FRCSPD | E1000_CTRL_FRCDPX);
 	wr32(E1000_CTRL, ctrl);
 
+	ret_val = igb_setup_serdes_link_82575(hw);
+	if (ret_val)
+		goto out;
+
+	if (igb_sgmii_active_82575(hw) && !hw->phy.reset_disable) {
+		ret_val = hw->phy.ops.reset(hw);
+		if (ret_val) {
+			hw_dbg("Error resetting the PHY.\n");
+			goto out;
+		}
+	}
 	switch (hw->phy.type) {
 	case e1000_phy_m88:
 		ret_val = igb_copper_link_setup_m88(hw);
@@ -963,8 +988,6 @@ static s32 igb_setup_copper_link_82575(struct e1000_hw *hw)
 		}
 	}
 
-	igb_configure_pcs_link_82575(hw);
-
 	/*
 	 * Check link status. Wait up to 100 microseconds for link to become
 	 * valid.
@@ -987,14 +1010,18 @@ out:
 }
 
 /**
- *  igb_setup_fiber_serdes_link_82575 - Setup link for fiber/serdes
+ *  igb_setup_serdes_link_82575 - Setup link for fiber/serdes
  *  @hw: pointer to the HW structure
  *
  *  Configures speed and duplex for fiber and serdes links.
  **/
-static s32 igb_setup_fiber_serdes_link_82575(struct e1000_hw *hw)
+static s32 igb_setup_serdes_link_82575(struct e1000_hw *hw)
 {
-	u32 reg;
+	u32 ctrl_reg, reg;
+
+	if ((hw->phy.media_type != e1000_media_type_internal_serdes) &&
+	    !igb_sgmii_active_82575(hw))
+		return 0;
 
 	/*
 	 * On the 82575, SerDes loopback mode persists until it is
@@ -1004,26 +1031,38 @@ static s32 igb_setup_fiber_serdes_link_82575(struct e1000_hw *hw)
 	 */
 	wr32(E1000_SCTL, E1000_SCTL_DISABLE_SERDES_LOOPBACK);
 
-	/* Force link up, set 1gb, set both sw defined pins */
-	reg = rd32(E1000_CTRL);
-	reg |= E1000_CTRL_SLU |
-	       E1000_CTRL_SPD_1000 |
-	       E1000_CTRL_FRCSPD |
-	       E1000_CTRL_SWDPIN0 |
-	       E1000_CTRL_SWDPIN1;
-	wr32(E1000_CTRL, reg);
-
-	/* Power on phy for 82576 fiber adapters */
-	if (hw->mac.type == e1000_82576) {
-		reg = rd32(E1000_CTRL_EXT);
-		reg &= ~E1000_CTRL_EXT_SDP7_DATA;
-		wr32(E1000_CTRL_EXT, reg);
+	/* power on the sfp cage if present */
+	reg = rd32(E1000_CTRL_EXT);
+	reg &= ~E1000_CTRL_EXT_SDP3_DATA;
+	wr32(E1000_CTRL_EXT, reg);
+
+	ctrl_reg = rd32(E1000_CTRL);
+	ctrl_reg |= E1000_CTRL_SLU;
+
+	if (hw->mac.type == e1000_82575 || hw->mac.type == e1000_82576) {
+		/* set both sw defined pins */
+		ctrl_reg |= E1000_CTRL_SWDPIN0 | E1000_CTRL_SWDPIN1;
+
+		/* Set switch control to serdes energy detect */
+		reg = rd32(E1000_CONNSW);
+		reg |= E1000_CONNSW_ENRGSRC;
+		wr32(E1000_CONNSW, reg);
+	}
+
+	reg = rd32(E1000_PCS_LCTL);
+
+	if (igb_sgmii_active_82575(hw)) {
+		/* allow time for SFP cage to power up phy */
+		msleep(300);
+
+		/* AN time out should be disabled for SGMII mode */
+		reg &= ~(E1000_PCS_LCTL_AN_TIMEOUT);
+	} else {
+		ctrl_reg |= E1000_CTRL_SPD_1000 | E1000_CTRL_FRCSPD |
+		            E1000_CTRL_FD | E1000_CTRL_FRCDPX;
 	}
 
-	/* Set switch control to serdes energy detect */
-	reg = rd32(E1000_CONNSW);
-	reg |= E1000_CONNSW_ENRGSRC;
-	wr32(E1000_CONNSW, reg);
+	wr32(E1000_CTRL, ctrl_reg);
 
 	/*
 	 * New SerDes mode allows for forcing speed or autonegotiating speed
@@ -1031,12 +1070,21 @@ static s32 igb_setup_fiber_serdes_link_82575(struct e1000_hw *hw)
 	 * mode that will be compatible with older link partners and switches.
 	 * However, both are supported by the hardware and some drivers/tools.
 	 */
-	reg = rd32(E1000_PCS_LCTL);
 
 	reg &= ~(E1000_PCS_LCTL_AN_ENABLE | E1000_PCS_LCTL_FLV_LINK_UP |
 		E1000_PCS_LCTL_FSD | E1000_PCS_LCTL_FORCE_LINK);
 
-	if (hw->mac.autoneg) {
+	/*
+	 * We force flow control to prevent the CTRL register values from being
+	 * overwritten by the autonegotiated flow control values
+	 */
+	reg |= E1000_PCS_LCTL_FORCE_FCTRL;
+
+	/*
+	 * we always set sgmii to autoneg since it is the phy that will be
+	 * forcing the link and the serdes is just a go-between
+	 */
+	if (hw->mac.autoneg || igb_sgmii_active_82575(hw)) {
 		/* Set PCS register for autoneg */
 		reg |= E1000_PCS_LCTL_FSV_1000 |      /* Force 1000    */
 		       E1000_PCS_LCTL_FDV_FULL |      /* SerDes Full duplex */
@@ -1053,75 +1101,12 @@ static s32 igb_setup_fiber_serdes_link_82575(struct e1000_hw *hw)
 		hw_dbg("Configuring Forced Link; PCS_LCTL = 0x%08X\n", reg);
 	}
 
-	if (hw->mac.type == e1000_82576) {
-		reg |= E1000_PCS_LCTL_FORCE_FCTRL;
-		igb_force_mac_fc(hw);
-	}
-
 	wr32(E1000_PCS_LCTL, reg);
 
-	return 0;
-}
-
-/**
- *  igb_configure_pcs_link_82575 - Configure PCS link
- *  @hw: pointer to the HW structure
- *
- *  Configure the physical coding sub-layer (PCS) link.  The PCS link is
- *  only used on copper connections where the serialized gigabit media
- *  independent interface (sgmii) is being used.  Configures the link
- *  for auto-negotiation or forces speed/duplex.
- **/
-static void igb_configure_pcs_link_82575(struct e1000_hw *hw)
-{
-	struct e1000_mac_info *mac = &hw->mac;
-	u32 reg = 0;
-
-	if (hw->phy.media_type != e1000_media_type_copper ||
-	    !(igb_sgmii_active_82575(hw)))
-		return;
-
-	/* For SGMII, we need to issue a PCS autoneg restart */
-	reg = rd32(E1000_PCS_LCTL);
-
-	/* AN time out should be disabled for SGMII mode */
-	reg &= ~(E1000_PCS_LCTL_AN_TIMEOUT);
-
-	if (mac->autoneg) {
-		/* Make sure forced speed and force link are not set */
-		reg &= ~(E1000_PCS_LCTL_FSD | E1000_PCS_LCTL_FORCE_LINK);
-
-		/*
-		 * The PHY should be setup prior to calling this function.
-		 * All we need to do is restart autoneg and enable autoneg.
-		 */
-		reg |= E1000_PCS_LCTL_AN_RESTART | E1000_PCS_LCTL_AN_ENABLE;
-	} else {
-		/* Set PCS register for forced speed */
-
-		/* Turn off bits for full duplex, speed, and autoneg */
-		reg &= ~(E1000_PCS_LCTL_FSV_1000 |
-			 E1000_PCS_LCTL_FSV_100 |
-			 E1000_PCS_LCTL_FDV_FULL |
-			 E1000_PCS_LCTL_AN_ENABLE);
-
-		/* Check for duplex first */
-		if (mac->forced_speed_duplex & E1000_ALL_FULL_DUPLEX)
-			reg |= E1000_PCS_LCTL_FDV_FULL;
-
-		/* Now set speed */
-		if (mac->forced_speed_duplex & E1000_ALL_100_SPEED)
-			reg |= E1000_PCS_LCTL_FSV_100;
-
-		/* Force speed and force link */
-		reg |= E1000_PCS_LCTL_FSD |
-		       E1000_PCS_LCTL_FORCE_LINK |
-		       E1000_PCS_LCTL_FLV_LINK_UP;
+	if (!igb_sgmii_active_82575(hw))
+		igb_force_mac_fc(hw);
 
-		hw_dbg("Wrote 0x%08X to PCS_LCTL to configure forced link\n",
-		       reg);
-	}
-	wr32(E1000_PCS_LCTL, reg);
+	return 0;
 }
 
 /**
@@ -1248,7 +1233,8 @@ static void igb_clear_hw_cntrs_82575(struct e1000_hw *hw)
 	temp = rd32(E1000_LENERRS);
 
 	/* This register should not be read in copper configurations */
-	if (hw->phy.media_type == e1000_media_type_internal_serdes)
+	if (hw->phy.media_type == e1000_media_type_internal_serdes ||
+	    igb_sgmii_active_82575(hw))
 		temp = rd32(E1000_SCVPC);
 }
 
diff --git a/drivers/net/igb/e1000_82575.h b/drivers/net/igb/e1000_82575.h
index 8a1e659..ebd146f 100644
--- a/drivers/net/igb/e1000_82575.h
+++ b/drivers/net/igb/e1000_82575.h
@@ -28,7 +28,7 @@
 #ifndef _E1000_82575_H_
 #define _E1000_82575_H_
 
-extern void igb_shutdown_fiber_serdes_link_82575(struct e1000_hw *hw);
+extern void igb_shutdown_serdes_link_82575(struct e1000_hw *hw);
 extern void igb_rx_fifo_flush_82575(struct e1000_hw *hw);
 
 #define ID_LED_DEFAULT_82575_SERDES ((ID_LED_DEF1_DEF2 << 12) | \
diff --git a/drivers/net/igb/e1000_defines.h b/drivers/net/igb/e1000_defines.h
index c858293..cb91683 100644
--- a/drivers/net/igb/e1000_defines.h
+++ b/drivers/net/igb/e1000_defines.h
@@ -44,7 +44,7 @@
 #define E1000_WUFC_BC   0x00000010 /* Broadcast Wakeup Enable */
 
 /* Extended Device Control */
-#define E1000_CTRL_EXT_SDP7_DATA 0x00000080 /* Value of SW Defineable Pin 7 */
+#define E1000_CTRL_EXT_SDP3_DATA 0x00000080 /* Value of SW Defineable Pin 3 */
 /* Physical Func Reset Done Indication */
 #define E1000_CTRL_EXT_PFRSTD    0x00004000
 #define E1000_CTRL_EXT_LINK_MODE_MASK 0x00C00000
diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index 290555c..8832c05 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -5322,7 +5322,7 @@ static int __igb_shutdown(struct pci_dev *pdev, bool *enable_wake)
 
 	*enable_wake = wufc || adapter->en_mng_pt;
 	if (!*enable_wake)
-		igb_shutdown_fiber_serdes_link_82575(hw);
+		igb_shutdown_serdes_link_82575(hw);
 
 	/* Release control of h/w to f/w.  If f/w is AMT enabled, this
 	 * would have already happened in close and is redundant. */


^ permalink raw reply related

* Re: [PATCH] pkt_sched: Fix tx queue selection in tc_modify_qdisc
From: Patrick McHardy @ 2009-09-14 18:05 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: David Miller, netdev
In-Reply-To: <20090914122226.GA14087@ff.dom.local>

Jarek Poplawski wrote:
> After the recent mq change there is the new select_queue qdisc class
> method used in tc_modify_qdisc, but it works OK only for direct child
> qdiscs of mq qdisc. Grandchildren always get the first tx queue, which
> would give wrong qdisc_root etc. results (e.g. for sch_htb as child of
> sch_prio). This patch fixes it by using parent's dev_queue for such
> grandchildren qdiscs. The select_queue method is replaced BTW with the
> static qdisc_select_tx_queue function (it's used only in one place).

Thanks, this looks correct. My assumption was that we shouldn't
be using the locks of grandchildren anyways, but we do need the
proper root lock for estimators.

> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index 88eb9de..865120c 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -81,7 +81,6 @@ struct Qdisc
>  struct Qdisc_class_ops
>  {
>  	/* Child qdisc manipulation */
> -	unsigned int		(*select_queue)(struct Qdisc *, struct tcmsg *);
>  	int			(*graft)(struct Qdisc *, unsigned long cl,
>  					struct Qdisc *, struct Qdisc **);
>  	struct Qdisc *		(*leaf)(struct Qdisc *, unsigned long cl);
> diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
> index 3af1061..223a6bc 100644
> --- a/net/sched/sch_api.c
> +++ b/net/sched/sch_api.c
> @@ -990,6 +990,24 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
>  	return 0;
>  }
>  
> +static struct netdev_queue *qdisc_select_tx_queue(struct net_device *dev,
> +						  struct Qdisc *p, u32 clid)
> +{
> +	unsigned long ntx;
> +
> +	if (!p)
> +		return netdev_get_tx_queue(dev, 0);
> +
> +	if (!(p->flags & TCQ_F_MQROOT))
> +		return p->dev_queue;
> +
> +	ntx = TC_H_MIN(clid) - 1;

I didn't want to expose the numbering scheme used by sch_mq internally,
but fine, I see you really don't like the callback :)

^ permalink raw reply

* [PATCH] netdev: smc91x: drop Blackfin cruft
From: Mike Frysinger @ 2009-09-14 18:03 UTC (permalink / raw)
  To: netdev, David S. Miller; +Cc: linux-kernel, Michael Hennerich

From: Michael Hennerich <michael.hennerich@analog.com>

Now that all Blackfin boards are using the board resources, we don't need
to keep the arch/board specific crap in the driver header.

Signed-off-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
---
 drivers/net/smc91x.h |   28 ----------------------------
 1 files changed, 0 insertions(+), 28 deletions(-)

diff --git a/drivers/net/smc91x.h b/drivers/net/smc91x.h
index 57a159f..9c8c6ed 100644
--- a/drivers/net/smc91x.h
+++ b/drivers/net/smc91x.h
@@ -83,34 +83,6 @@ static inline void SMC_outw(u16 val, void __iomem *ioaddr, int reg)
 	}
 }
 
-#elif defined(CONFIG_BLACKFIN)
-
-#define SMC_IRQ_FLAGS		IRQF_TRIGGER_HIGH
-#define RPC_LSA_DEFAULT		RPC_LED_100_10
-#define RPC_LSB_DEFAULT		RPC_LED_TX_RX
-
-#define SMC_CAN_USE_8BIT	0
-#define SMC_CAN_USE_16BIT	1
-# if defined(CONFIG_BF561)
-#define SMC_CAN_USE_32BIT	1
-# else
-#define SMC_CAN_USE_32BIT	0
-# endif
-#define SMC_IO_SHIFT		0
-#define SMC_NOWAIT      	1
-#define SMC_USE_BFIN_DMA	0
-
-#define SMC_inw(a, r)		readw((a) + (r))
-#define SMC_outw(v, a, r)	writew(v, (a) + (r))
-#define SMC_insw(a, r, p, l)	readsw((a) + (r), p, l)
-#define SMC_outsw(a, r, p, l)	writesw((a) + (r), p, l)
-# if SMC_CAN_USE_32BIT
-#define SMC_inl(a, r)		readl((a) + (r))
-#define SMC_outl(v, a, r)	writel(v, (a) + (r))
-#define SMC_insl(a, r, p, l)	readsl((a) + (r), p, l)
-#define SMC_outsl(a, r, p, l)	writesl((a) + (r), p, l)
-# endif
-
 #elif defined(CONFIG_REDWOOD_5) || defined(CONFIG_REDWOOD_6)
 
 /* We can only do 16-bit reads and writes in the static memory space. */
-- 
1.6.4.2

^ permalink raw reply related

* Re: [PATCH] pkt_sched: Fix qdisc_graft WRT ingress qdisc
From: Patrick McHardy @ 2009-09-14 17:58 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: David Miller, netdev
In-Reply-To: <20090914083544.GA10444@ff.dom.local>

Jarek Poplawski wrote:
> After the recent mq change using ingress qdisc overwrites dev->qdisc;
> there is also a wrong old qdisc pointer passed to notify_and_destroy.

Good catch, thanks.

^ permalink raw reply

* [net-next PATCH 3/3] ixgbe: Create separate media type for CX4 adapters
From: Jeff Kirsher @ 2009-09-14 17:48 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Peter P Waskiewicz Jr, Jeff Kirsher, Don Skidmore
In-Reply-To: <20090914174659.4560.28814.stgit@localhost.localdomain>

From: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>

Currently the media type detection for CX4 adapters lumps them into a
type of fiber.  This causes some strange fallout when firmware verification
is done on the NIC, and certain fiber NIC rules get enforced incorrectly.

This patch introduces a new media type for CX4, and puts both 82598 and
82599 CX4 adapters into this bucket.

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: Don Skidmore <donald.c.skidmore@intel.com>
---

 drivers/net/ixgbe/ixgbe_82598.c |    6 ++++--
 drivers/net/ixgbe/ixgbe_82599.c |    2 +-
 drivers/net/ixgbe/ixgbe_type.h  |    1 +
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_82598.c b/drivers/net/ixgbe/ixgbe_82598.c
index cb7f0c3..56b12f3 100644
--- a/drivers/net/ixgbe/ixgbe_82598.c
+++ b/drivers/net/ixgbe/ixgbe_82598.c
@@ -322,14 +322,16 @@ static enum ixgbe_media_type ixgbe_get_media_type_82598(struct ixgbe_hw *hw)
 		break;
 	case IXGBE_DEV_ID_82598AF_DUAL_PORT:
 	case IXGBE_DEV_ID_82598AF_SINGLE_PORT:
-	case IXGBE_DEV_ID_82598EB_CX4:
-	case IXGBE_DEV_ID_82598_CX4_DUAL_PORT:
 	case IXGBE_DEV_ID_82598_DA_DUAL_PORT:
 	case IXGBE_DEV_ID_82598_SR_DUAL_PORT_EM:
 	case IXGBE_DEV_ID_82598EB_XF_LR:
 	case IXGBE_DEV_ID_82598EB_SFP_LOM:
 		media_type = ixgbe_media_type_fiber;
 		break;
+	case IXGBE_DEV_ID_82598EB_CX4:
+	case IXGBE_DEV_ID_82598_CX4_DUAL_PORT:
+		media_type = ixgbe_media_type_cx4;
+		break;
 	case IXGBE_DEV_ID_82598AT:
 	case IXGBE_DEV_ID_82598AT2:
 		media_type = ixgbe_media_type_copper;
diff --git a/drivers/net/ixgbe/ixgbe_82599.c b/drivers/net/ixgbe/ixgbe_82599.c
index c9006bb..2ec58dc 100644
--- a/drivers/net/ixgbe/ixgbe_82599.c
+++ b/drivers/net/ixgbe/ixgbe_82599.c
@@ -338,7 +338,7 @@ static enum ixgbe_media_type ixgbe_get_media_type_82599(struct ixgbe_hw *hw)
 		media_type = ixgbe_media_type_fiber;
 		break;
 	case IXGBE_DEV_ID_82599_CX4:
-		media_type = ixgbe_media_type_fiber;
+		media_type = ixgbe_media_type_cx4;
 		break;
 	default:
 		media_type = ixgbe_media_type_unknown;
diff --git a/drivers/net/ixgbe/ixgbe_type.h b/drivers/net/ixgbe/ixgbe_type.h
index 37303a1..8761d78 100644
--- a/drivers/net/ixgbe/ixgbe_type.h
+++ b/drivers/net/ixgbe/ixgbe_type.h
@@ -2144,6 +2144,7 @@ enum ixgbe_media_type {
 	ixgbe_media_type_fiber,
 	ixgbe_media_type_copper,
 	ixgbe_media_type_backplane,
+	ixgbe_media_type_cx4,
 	ixgbe_media_type_virtual
 };
 


^ permalink raw reply related

* [net-next PATCH 2/3] ixgbe: Add support for 82599-based CX4 adapters
From: Jeff Kirsher @ 2009-09-14 17:47 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Peter P Waskiewicz Jr, Jeff Kirsher, Don Skidmore
In-Reply-To: <20090914174659.4560.28814.stgit@localhost.localdomain>

From: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>

This patch adds support for CX4 adapters based on 82599.

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: Don Skidmore <donald.c.skidmore@intel.com>
---

 drivers/net/ixgbe/ixgbe_82599.c |    3 +++
 drivers/net/ixgbe/ixgbe_main.c  |    2 ++
 drivers/net/ixgbe/ixgbe_type.h  |    1 +
 3 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_82599.c b/drivers/net/ixgbe/ixgbe_82599.c
index 61af47e..c9006bb 100644
--- a/drivers/net/ixgbe/ixgbe_82599.c
+++ b/drivers/net/ixgbe/ixgbe_82599.c
@@ -337,6 +337,9 @@ static enum ixgbe_media_type ixgbe_get_media_type_82599(struct ixgbe_hw *hw)
 	case IXGBE_DEV_ID_82599_SFP:
 		media_type = ixgbe_media_type_fiber;
 		break;
+	case IXGBE_DEV_ID_82599_CX4:
+		media_type = ixgbe_media_type_fiber;
+		break;
 	default:
 		media_type = ixgbe_media_type_unknown;
 		break;
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index b47aaa8..59ad959 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -97,6 +97,8 @@ static struct pci_device_id ixgbe_pci_tbl[] = {
 	 board_82599 },
 	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_SFP),
 	 board_82599 },
+	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_CX4),
+	 board_82599 },
 
 	/* required last entry */
 	{0, }
diff --git a/drivers/net/ixgbe/ixgbe_type.h b/drivers/net/ixgbe/ixgbe_type.h
index 8ba90ee..37303a1 100644
--- a/drivers/net/ixgbe/ixgbe_type.h
+++ b/drivers/net/ixgbe/ixgbe_type.h
@@ -49,6 +49,7 @@
 #define IXGBE_DEV_ID_82598_SR_DUAL_PORT_EM      0x10E1
 #define IXGBE_DEV_ID_82598EB_XF_LR       0x10F4
 #define IXGBE_DEV_ID_82599_KX4           0x10F7
+#define IXGBE_DEV_ID_82599_CX4           0x10F9
 #define IXGBE_DEV_ID_82599_SFP           0x10FB
 #define IXGBE_DEV_ID_82599_XAUI_LOM      0x10FC
 


^ permalink raw reply related

* [net-next PATCH 1/3] ixgbe: Properly disable packet split per-ring when globally disabled
From: Jeff Kirsher @ 2009-09-14 17:47 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Peter P Waskiewicz Jr, Jeff Kirsher, Don Skidmore

From: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>

The packet split feature was recently moved out of the adapter-wide flags
feature field and into a per-Rx ring feature field.  In the process, packet
split isn't properly disabled in the Rx ring if the adapter has it globally
disabled, followed by a device reset.

This won't impact the driver today, since it's always in packet split mode.
However, this will prevent any pitfalls if someone disables packet split on
the adapter in the future and doesn't disable it in each ring.

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: Don Skidmore <donald.c.skidmore@intel.com>
---

 drivers/net/ixgbe/ixgbe_main.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 45bf8b9..b47aaa8 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -2055,6 +2055,8 @@ static void ixgbe_configure_rx(struct ixgbe_adapter *adapter)

 		if (adapter->flags & IXGBE_FLAG_RX_PS_ENABLED)
 			rx_ring->flags |= IXGBE_RING_RX_PS_ENABLED;
+		else
+			rx_ring->flags &= ~IXGBE_RING_RX_PS_ENABLED;

 #ifdef IXGBE_FCOE
 		if (netdev->features & NETIF_F_FCOE_MTU) {

^ permalink raw reply related

* : Release of iptables-1.4.5
From: Patrick McHardy @ 2009-09-14 17:05 UTC (permalink / raw)
  To: Netfilter Development Mailinglist, Linux Netdev List,
	'netfilter@vger.kernel.org', netfilter

[-- Attachment #1: Type: text/plain, Size: 621 bytes --]

The netfilter coreteam presents:

    iptables version 1.4.5

the iptables release for the 2.6.31 kernel. Changes include:

- support for nfnetlink_queue queue balancing

- support for revision 2 of the conntrack match, which fixes a
  regression in state matching

- support for multiple IP address specifications with -s/-d

- various internal cleanups and improvements

- manpage updates

Version 1.4.5 can be obtained from:

http://www.netfilter.org/projects/iptables/downloads.html
ftp://ftp.netfilter.org/pub/iptables/
git://git.netfilter.org/iptables.git

On behalf of the Netfilter Core Team.
Happy firewalling!


[-- Attachment #2: changes-iptables-1.4.5.txt --]
[-- Type: text/plain, Size: 1335 bytes --]

Florian Westphal (1):
      libxt_NFQUEUE: add new v1 version with queue-balance option

Jan Engelhardt (18):
      xt_conntrack: revision 2 for enlarged state_mask member
      libxt_helper: fix invalid passed option to check_inverse
      libiptc: split v4 and v6
      extensions: collapse registration structures
      iptables: allow for parse-less extensions
      iptables: allow for help-less extensions
      extensions: remove empty help and parse functions
      xtables: add multi-registration functions
      extensions: collapse data variables to use multi-reg calls
      xtables: warn of missing version identifier in extensions
      COMMIT_NOTES: notice to check for soversion bumps
      build: order of dependent libs is sensitive
      multi binary: allow subcommand via argv[1]
      build: fix struct size mismatch
      build: combine iptables-multi and iptables-static
      build: build only iptables-multi
      Merge branch 'stable'
      manpages: more fixes to minuses, hyphens, dashes

Laurence J. Lane (1):
      manpage: fix lintian warnings

Michael Granzow (1):
      iptables: accept multiple IP address specifications for -s, -d

Patrick McHardy (2):
      man: fix incorrect plural in libipt_set.man
      Bump version number to 1.4.5

Trent W. Buck (1):
      ipt_set: fix a typo in the manpage


^ permalink raw reply

* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server
From: Michael S. Tsirkin @ 2009-09-14 16:53 UTC (permalink / raw)
  To: Gregory Haskins
  Cc: Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo,
	linux-mm, akpm, hpa, Rusty Russell, s.hetze
In-Reply-To: <4AAE6A97.7090808@gmail.com>

On Mon, Sep 14, 2009 at 12:08:55PM -0400, Gregory Haskins wrote:
> Michael S. Tsirkin wrote:
> > On Fri, Sep 11, 2009 at 12:00:21PM -0400, Gregory Haskins wrote:
> >> FWIW: VBUS handles this situation via the "memctx" abstraction.  IOW,
> >> the memory is not assumed to be a userspace address.  Rather, it is a
> >> memctx-specific address, which can be userspace, or any other type
> >> (including hardware, dma-engine, etc).  As long as the memctx knows how
> >> to translate it, it will work.
> > 
> > How would permissions be handled?
> 
> Same as anything else, really.  Read on for details.
> 
> > it's easy to allow an app to pass in virtual addresses in its own address space.
> 
> Agreed, and this is what I do.
> 
> The guest always passes its own physical addresses (using things like
> __pa() in linux).  This address passed is memctx specific, but generally
> would fall into the category of "virtual-addresses" from the hosts
> perspective.
> 
> For a KVM/AlacrityVM guest example, the addresses are GPAs, accessed
> internally to the context via a gfn_to_hva conversion (you can see this
> occuring in the citation links I sent)
> 
> For Ira's example, the addresses would represent a physical address on
> the PCI boards, and would follow any kind of relevant rules for
> converting a "GPA" to a host accessible address (even if indirectly, via
> a dma controller).

So vbus can let an application access either its own virtual memory or a
physical memory on a PCI device.  My question is, is any application
that's allowed to do the former also granted rights to do the later?

> >  But we can't let the guest specify physical addresses.
> 
> Agreed.  Neither your proposal nor mine operate this way afaict.
> 
> HTH
> 
> Kind Regards,
> -Greg
> 


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server
From: Michael S. Tsirkin @ 2009-09-14 16:47 UTC (permalink / raw)
  To: Gregory Haskins
  Cc: Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo,
	linux-mm, akpm, hpa, Rusty Russell, s.hetze
In-Reply-To: <4AAE6A97.7090808@gmail.com>

On Mon, Sep 14, 2009 at 12:08:55PM -0400, Gregory Haskins wrote:
> For Ira's example, the addresses would represent a physical address on
> the PCI boards, and would follow any kind of relevant rules for
> converting a "GPA" to a host accessible address (even if indirectly, via
> a dma controller).

I don't think limiting addresses to PCI physical addresses will work
well.  From what I rememeber, Ira's x86 can not initiate burst
transactions on PCI, and it's the ppc that initiates all DMA.

> 
> >  But we can't let the guest specify physical addresses.
> 
> Agreed.  Neither your proposal nor mine operate this way afaict.

But this seems to be what Ira needs.

> HTH
> 
> Kind Regards,
> -Greg
> 


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: INGO Why you remove  set_user_nice() from kernel/kthread.c
From: Mike Galbraith @ 2009-09-14 16:36 UTC (permalink / raw)
  To: Chris Friesen
  Cc: pavel, linux-kernel, Ingo Molnar, Peter Zijlstra,
	Linux Network Development list
In-Reply-To: <4AAE6FD7.5070401@nortel.com>

On Mon, 2009-09-14 at 10:31 -0600, Chris Friesen wrote:

> I've added netdev to the CC list.  Maybe some of the people there have
> an opinion on what the default priority should be for ksoftirqd.

Good idea.

	-Mike

^ permalink raw reply

* Re: INGO Why you remove  set_user_nice() from kernel/kthread.c
From: Chris Friesen @ 2009-09-14 16:31 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: pavel, linux-kernel, Ingo Molnar, Peter Zijlstra,
	Linux Network Development list
In-Reply-To: <1252943123.12986.70.camel@marge.simson.net>

On 09/14/2009 09:45 AM, Mike Galbraith wrote:
> On Mon, 2009-09-14 at 09:12 -0600, Chris Friesen wrote:
>> On 09/14/2009 08:05 AM, Mike Galbraith wrote:

>>> I did that, not Ingo, and did so because with kthreads that use
>>> diddly-spit CPU (every one I see), it's just a waste of math.  What
>>> kthreads are you seeing using so much CPU that their weight is a factor?
>>> They _should_ be able to preempt and get their work done just fine
>>> without a boost.
>>
>> Under heavy network load ksoftirqd can use significant amounts of cpu.
> 
> OK, that may justify a weight adjustment, since it is a proxy for many.
> Question is, does it really need it?

I guess it depends how we want it to behave by default.  Likely anyone
that really cares is going to fine-tune the ksoftirqd priority level
anyways.

I've added netdev to the CC list.  Maybe some of the people there have
an opinion on what the default priority should be for ksoftirqd.

Chris

^ permalink raw reply

* Re: ipv4 regression in 2.6.31 ?
From: Stephen Hemminger @ 2009-09-14 16:31 UTC (permalink / raw)
  To: Stephan von Krawczynski
  Cc: Eric Dumazet, linux-kernel, davem, Linux Netdev List
In-Reply-To: <20090914175505.a3f132ee.skraw@ithnet.com>

On Mon, 14 Sep 2009 17:55:05 +0200
Stephan von Krawczynski <skraw@ithnet.com> wrote:

> On Mon, 14 Sep 2009 15:57:03 +0200
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 
> > Stephan von Krawczynski a écrit :
> > > Hello all,
> > > 
> > > today we experienced some sort of regression in 2.6.31 ipv4 implementation, or
> > > at least some incompatibility with former 2.6.30.X kernels.
> > > 
> > > We have the following situation:
> > > 
> > >                                        ---------- vlan1@eth0 192.168.2.1/24
> > >                                       /
> > > host A 192.168.1.1/24 eth0  -------<router>            host B
> > >                                       \
> > >                                        ---------- eth1 192.168.3.1/24
> > > 
> > > 
> > > Now, if you route 192.168.1.0/24 via interface vlan1@eth0 on host B and let
> > > host A ping 192.168.2.1 everything works. But if you route 192.168.1.0/24 via
> > > interface eth1 on host B and let host A ping 192.168.2.1 you get no reply.
> > > With tcpdump we see the icmp packets arrive at vlan1@eth0, but no icmp echo
> > > reply being generated neither on vlan1 nor eth1.
> > > Kernels 2.6.30.X and below do not show this behaviour.
> > > Is this intended? Do we need to reconfigure something to restore the old
> > > behaviour?
> > > 
> > 
> > Asymetric routing ?
> > 
> > Check your rp_filter settings
> > 
> > grep . `find /proc/sys/net -name rp_filter`
> > 
> > rp_filter - INTEGER
> >         0 - No source validation.
> >         1 - Strict mode as defined in RFC3704 Strict Reverse Path
> >             Each incoming packet is tested against the FIB and if the interface
> >             is not the best reverse path the packet check will fail.
> >             By default failed packets are discarded.
> >         2 - Loose mode as defined in RFC3704 Loose Reverse Path
> >             Each incoming packet's source address is also tested against the FIB
> >             and if the source address is not reachable via any interface
> >             the packet check will fail.
> > 
> >         Current recommended practice in RFC3704 is to enable strict mode
> >         to prevent IP spoofing from DDos attacks. If using asymmetric routing
> >         or other complicated routing, then loose mode is recommended.
> > 
> >         conf/all/rp_filter must also be set to non-zero to do source validation
> >         on the interface
> > 
> >         Default value is 0. Note that some distributions enable it
> >         in startup scripts.
> 
> Ok, here you can see 2.6.31 values from the discussed box:
> (remember, no ping reply in this setup)
> 
> /proc/sys/net/ipv4/conf/all/rp_filter:1
> /proc/sys/net/ipv4/conf/default/rp_filter:0
> /proc/sys/net/ipv4/conf/lo/rp_filter:0
> /proc/sys/net/ipv4/conf/eth2/rp_filter:0
> /proc/sys/net/ipv4/conf/eth0/rp_filter:0
> /proc/sys/net/ipv4/conf/eth1/rp_filter:0
> /proc/sys/net/ipv4/conf/vlan1/rp_filter:0
> 
> 
> And these are from the same box with 2.6.30.5:
> (ping reply works)
> 
> /proc/sys/net/ipv4/conf/all/rp_filter:1
> /proc/sys/net/ipv4/conf/default/rp_filter:0
> /proc/sys/net/ipv4/conf/lo/rp_filter:0
> /proc/sys/net/ipv4/conf/eth2/rp_filter:0
> /proc/sys/net/ipv4/conf/eth0/rp_filter:0
> /proc/sys/net/ipv4/conf/eth1/rp_filter:0
> /proc/sys/net/ipv4/conf/vlan1/rp_filter:0
> 
> As you can see they're all the same. Does this mean that rp_filter never
> really worked as intended before 2.6.31 ? Or does it mean that rp_filter=0
> (eth1 and vlan1) gets overriden by all/rp_filter=1 in 2.6.31 and not before?

RP filter did not work correctly in 2.6.30. The code added to to the loose
mode caused a bug; the rp_filter value was being computed as:
  rp_filter = interface_value & all_value;
So in order to get reverse path filter both would have to be set.

In 2.6.31 this was change to:
   rp_filter = max(interface_value, all_value);

This was the intended behaviour, if user asks all interfaces to have rp
filtering turned on, then set /proc/sys/net/ipv4/conf/all/rp_filter = 1
or to turn on just one interface, set it for just that interface.

Sorry for any confusion this caused.



-- 

^ permalink raw reply

* [PATCH alt] sky2: Make sure both ports initialize correctly
From: Stephen Hemminger @ 2009-09-14 16:22 UTC (permalink / raw)
  To: Mike McCormack, David Miller; +Cc: netdev
In-Reply-To: <4AAD7B86.6060300@ring3k.org>

Sorry Mike, I sent you off the wrong way. The following is simpler and the
second port is diffrent enough in setup (because of NAPI), that the
following is simpler.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

--- a/drivers/net/sky2.c	2009-09-14 09:16:34.313731803 -0700
+++ b/drivers/net/sky2.c	2009-09-14 09:19:48.240726224 -0700
@@ -4550,16 +4550,18 @@ static int __devinit sky2_probe(struct p
 	if (hw->ports > 1) {
 		struct net_device *dev1;
 
+		err = -ENOMEM;
 		dev1 = sky2_init_netdev(hw, 1, using_dac, wol_default);
-		if (!dev1)
-			dev_warn(&pdev->dev, "allocation for second device failed\n");
-		else if ((err = register_netdev(dev1))) {
+		if (dev1 && (err = register_netdev(dev1)) == 0)
+			sky2_show_addr(dev1);
+		else {
 			dev_warn(&pdev->dev,
 				 "register of second port failed (%d)\n", err);
 			hw->dev[1] = NULL;
-			free_netdev(dev1);
-		} else
-			sky2_show_addr(dev1);
+			hw->ports = 1;
+			if (dev1)
+				free_netdev(dev1);
+		}
 	}
 
 	setup_timer(&hw->watchdog_timer, sky2_watchdog, (unsigned long) hw);



-- 

^ permalink raw reply

* [PATCH alt] sky2: transmit ring accounting
From: Stephen Hemminger @ 2009-09-14 16:12 UTC (permalink / raw)
  To: Mike McCormack, David Miller; +Cc: netdev
In-Reply-To: <4AAD7B3D.7010403@ring3k.org>

Be more accurate about number of transmit list elements required.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
Note: this is an optimization, the old code guaranteed more space
than necessary (MAX_SKB_TX_LE was always bigger than needed).

--- a/drivers/net/sky2.c	2009-09-14 08:45:55.730729606 -0700
+++ b/drivers/net/sky2.c	2009-09-14 09:09:42.395712810 -0700
@@ -65,8 +65,8 @@
 #define RX_DEF_PENDING		RX_MAX_PENDING
 
 /* This is the worst case number of transmit list elements for a single skb:
-   VLAN + TSO + CKSUM + Data + skb_frags * DMA */
-#define MAX_SKB_TX_LE	(4 + (sizeof(dma_addr_t)/sizeof(u32))*MAX_SKB_FRAGS)
+   VLAN:GSO + CKSUM + Data + skb_frags * DMA */
+#define MAX_SKB_TX_LE	(2 + (sizeof(dma_addr_t)/sizeof(u32))*(MAX_SKB_FRAGS+1))
 #define TX_MIN_PENDING		(MAX_SKB_TX_LE+1)
 #define TX_MAX_PENDING		4096
 #define TX_DEF_PENDING		127
@@ -1567,11 +1567,13 @@ static unsigned tx_le_req(const struct s
 {
 	unsigned count;
 
-	count = sizeof(dma_addr_t) / sizeof(u32);
-	count += skb_shinfo(skb)->nr_frags * count;
+	count = (skb_shinfo(skb)->nr_frags + 1)
+		* (sizeof(dma_addr_t) / sizeof(u32));
 
 	if (skb_is_gso(skb))
 		++count;
+	else if (sizeof(dma_addr_t) == sizeof(u32))
+		++count;	/* possible vlan */
 
 	if (skb->ip_summed == CHECKSUM_PARTIAL)
 		++count;

^ permalink raw reply

* Re: ipv4 regression in 2.6.31 ?
From: Eric Dumazet @ 2009-09-14 16:10 UTC (permalink / raw)
  To: Stephan von Krawczynski
  Cc: Stephen Hemminger, linux-kernel, davem, Linux Netdev List
In-Reply-To: <20090914175505.a3f132ee.skraw@ithnet.com>

Stephan von Krawczynski a écrit :
> On Mon, 14 Sep 2009 15:57:03 +0200
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 
>> Stephan von Krawczynski a écrit :
>>> Hello all,
>>>
>>> today we experienced some sort of regression in 2.6.31 ipv4 implementation, or
>>> at least some incompatibility with former 2.6.30.X kernels.
>>>
>>> We have the following situation:
>>>
>>>                                        ---------- vlan1@eth0 192.168.2.1/24
>>>                                       /
>>> host A 192.168.1.1/24 eth0  -------<router>            host B
>>>                                       \
>>>                                        ---------- eth1 192.168.3.1/24
>>>
>>>
>>> Now, if you route 192.168.1.0/24 via interface vlan1@eth0 on host B and let
>>> host A ping 192.168.2.1 everything works. But if you route 192.168.1.0/24 via
>>> interface eth1 on host B and let host A ping 192.168.2.1 you get no reply.
>>> With tcpdump we see the icmp packets arrive at vlan1@eth0, but no icmp echo
>>> reply being generated neither on vlan1 nor eth1.
>>> Kernels 2.6.30.X and below do not show this behaviour.
>>> Is this intended? Do we need to reconfigure something to restore the old
>>> behaviour?
>>>
>> Asymetric routing ?
>>
>> Check your rp_filter settings
>>
>> grep . `find /proc/sys/net -name rp_filter`
>>
>> rp_filter - INTEGER
>>         0 - No source validation.
>>         1 - Strict mode as defined in RFC3704 Strict Reverse Path
>>             Each incoming packet is tested against the FIB and if the interface
>>             is not the best reverse path the packet check will fail.
>>             By default failed packets are discarded.
>>         2 - Loose mode as defined in RFC3704 Loose Reverse Path
>>             Each incoming packet's source address is also tested against the FIB
>>             and if the source address is not reachable via any interface
>>             the packet check will fail.
>>
>>         Current recommended practice in RFC3704 is to enable strict mode
>>         to prevent IP spoofing from DDos attacks. If using asymmetric routing
>>         or other complicated routing, then loose mode is recommended.
>>
>>         conf/all/rp_filter must also be set to non-zero to do source validation
>>         on the interface
>>
>>         Default value is 0. Note that some distributions enable it
>>         in startup scripts.
> 
> Ok, here you can see 2.6.31 values from the discussed box:
> (remember, no ping reply in this setup)
> 
> /proc/sys/net/ipv4/conf/all/rp_filter:1
> /proc/sys/net/ipv4/conf/default/rp_filter:0
> /proc/sys/net/ipv4/conf/lo/rp_filter:0
> /proc/sys/net/ipv4/conf/eth2/rp_filter:0
> /proc/sys/net/ipv4/conf/eth0/rp_filter:0
> /proc/sys/net/ipv4/conf/eth1/rp_filter:0
> /proc/sys/net/ipv4/conf/vlan1/rp_filter:0
> 
> 
> And these are from the same box with 2.6.30.5:
> (ping reply works)
> 
> /proc/sys/net/ipv4/conf/all/rp_filter:1
> /proc/sys/net/ipv4/conf/default/rp_filter:0
> /proc/sys/net/ipv4/conf/lo/rp_filter:0
> /proc/sys/net/ipv4/conf/eth2/rp_filter:0
> /proc/sys/net/ipv4/conf/eth0/rp_filter:0
> /proc/sys/net/ipv4/conf/eth1/rp_filter:0
> /proc/sys/net/ipv4/conf/vlan1/rp_filter:0
> 
> As you can see they're all the same. Does this mean that rp_filter never
> really worked as intended before 2.6.31 ? Or does it mean that rp_filter=0
> (eth1 and vlan1) gets overriden by all/rp_filter=1 in 2.6.31 and not before?
>

Yes, previous kernels ignored /proc/sys/net/ipv4/conf/all/rp_filter value, it was a bug.

commit 27fed4175acf81ddd91d9a4ee2fd298981f60295
Author: Stephen Hemminger <shemminger@vyatta.com>
Date:   Mon Jul 27 18:39:45 2009 -0700

    ip: fix logic of reverse path filter sysctl

    Even though reverse path filter was changed from simple boolean to
    trinary control, the loose mode only works if both all and device are
    configured because of this logic error.

    Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>


In your case, you *need*
echo 0 >/proc/sys/net/ipv4/conf/all/rp_filter
or
echo 2 >/proc/sys/net/ipv4/conf/all/rp_filter


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox