Netdev List
 help / color / mirror / Atom feed
* [PATCH 3/4] [XFRM]: Kill some bloat
From: Ilpo Järvinen @ 2008-01-05 13:39 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Arnaldo Carvalho de Melo, Ilpo Järvinen
In-Reply-To: <11995403483512-git-send-email-ilpo.jarvinen@helsinki.fi>

From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= <ilpo.jarvinen@helsinki.fi>

net/xfrm/xfrm_state.c:
  xfrm_audit_state_delete          | -589
  xfrm_replay_check                | -542
  xfrm_audit_state_icvfail         | -520
  xfrm_audit_state_add             | -589
  xfrm_audit_state_replay_overflow | -523
  xfrm_audit_state_notfound_simple | -509
  xfrm_audit_state_notfound        | -521
 7 functions changed, 3793 bytes removed, diff: -3793

net/xfrm/xfrm_state.c:
  xfrm_audit_helper_pktinfo | +522
  xfrm_audit_helper_sainfo  | +598
 2 functions changed, 1120 bytes added, diff: +1120

net/xfrm/xfrm_state.o:
 9 functions changed, 1120 bytes added, 3793 bytes removed, diff: -2673

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
---
 net/xfrm/xfrm_state.c |    8 ++++----
 1 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 4dec655..81b0fce 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -2008,8 +2008,8 @@ void __init xfrm_state_init(void)
 }
 
 #ifdef CONFIG_AUDITSYSCALL
-static inline void xfrm_audit_helper_sainfo(struct xfrm_state *x,
-					    struct audit_buffer *audit_buf)
+static void xfrm_audit_helper_sainfo(struct xfrm_state *x,
+				     struct audit_buffer *audit_buf)
 {
 	struct xfrm_sec_ctx *ctx = x->security;
 	u32 spi = ntohl(x->id.spi);
@@ -2036,8 +2036,8 @@ static inline void xfrm_audit_helper_sainfo(struct xfrm_state *x,
 	audit_log_format(audit_buf, " spi=%u(0x%x)", spi, spi);
 }
 
-static inline void xfrm_audit_helper_pktinfo(struct sk_buff *skb, u16 family,
-					     struct audit_buffer *audit_buf)
+static void xfrm_audit_helper_pktinfo(struct sk_buff *skb, u16 family,
+				      struct audit_buffer *audit_buf)
 {
 	struct iphdr *iph4;
 	struct ipv6hdr *iph6;
-- 
1.5.0.6


^ permalink raw reply related

* [PATCH 1/4] [NETFILTER]: Kill some supper dupper bloatry
From: Ilpo Järvinen @ 2008-01-05 13:39 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Arnaldo Carvalho de Melo, Ilpo Järvinen
In-Reply-To: <11995403482366-git-send-email-ilpo.jarvinen@helsinki.fi>

From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= <ilpo.jarvinen@helsinki.fi>

/me awards the bloatiest-of-all-net/-.c-code award to
nf_conntrack_netlink.c, congratulations to all the authors :-/!

Hall of (unquestionable) fame (measured per inline, top 10 under
net/):
  -4496 ctnetlink_parse_tuple        netfilter/nf_conntrack_netlink.c
  -2165 ctnetlink_dump_tuples        netfilter/nf_conntrack_netlink.c
  -2115 __ip_vs_get_out_rt           ipv4/ipvs/ip_vs_xmit.c
  -1924 xfrm_audit_helper_pktinfo    xfrm/xfrm_state.c
  -1799 ctnetlink_parse_tuple_proto  netfilter/nf_conntrack_netlink.c
  -1268 ctnetlink_parse_tuple_ip     netfilter/nf_conntrack_netlink.c
  -1093 ctnetlink_exp_dump_expect    netfilter/nf_conntrack_netlink.c
  -1060 void ccid3_update_send_interval  dccp/ccids/ccid3.c
  -983  ctnetlink_dump_tuples_proto  netfilter/nf_conntrack_netlink.c
  -827  ctnetlink_exp_dump_tuple     netfilter/nf_conntrack_netlink.c

  (i386 / gcc (GCC) 4.1.2 20070626 (Red Hat 4.1.2-13) /
   allyesconfig except CONFIG_FORCED_INLINING)

...and I left < 200 byte gains as future work item.

After iterative inline removal, I finally have this:

net/netfilter/nf_conntrack_netlink.c:
  ctnetlink_exp_fill_info   | -1104
  ctnetlink_new_expect      | -1572
  ctnetlink_fill_info       | -1303
  ctnetlink_new_conntrack   | -2230
  ctnetlink_get_expect      | -341
  ctnetlink_del_expect      | -352
  ctnetlink_expect_event    | -1110
  ctnetlink_conntrack_event | -1548
  ctnetlink_del_conntrack   | -729
  ctnetlink_get_conntrack   | -728
 10 functions changed, 11017 bytes removed, diff: -11017

net/netfilter/nf_conntrack_netlink.c:
  ctnetlink_parse_tuple     | +419
  dump_nat_seq_adj          | +183
  ctnetlink_dump_counters   | +166
  ctnetlink_dump_tuples     | +261
  ctnetlink_exp_dump_expect | +633
  ctnetlink_change_status   | +460
 6 functions changed, 2122 bytes added, diff: +2122

net/netfilter/nf_conntrack_netlink.o:
 16 functions changed, 2122 bytes added, 11017 bytes removed, diff: -8895

Without a number of CONFIG.*DEBUGs, I got this:
net/netfilter/nf_conntrack_netlink.o:
 16 functions changed, 2122 bytes added, 11029 bytes removed, diff: -8907

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
---
 net/netfilter/nf_conntrack_netlink.c |   12 ++++++------
 1 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index d93d58d..38141f1 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -95,7 +95,7 @@ nla_put_failure:
 	return -1;
 }
 
-static inline int
+static int
 ctnetlink_dump_tuples(struct sk_buff *skb,
 		      const struct nf_conntrack_tuple *tuple)
 {
@@ -205,7 +205,7 @@ nla_put_failure:
 }
 
 #ifdef CONFIG_NF_CT_ACCT
-static inline int
+static int
 ctnetlink_dump_counters(struct sk_buff *skb, const struct nf_conn *ct,
 			enum ip_conntrack_dir dir)
 {
@@ -284,7 +284,7 @@ nla_put_failure:
 }
 
 #ifdef CONFIG_NF_NAT_NEEDED
-static inline int
+static int
 dump_nat_seq_adj(struct sk_buff *skb, const struct nf_nat_seq *natseq, int type)
 {
 	struct nlattr *nest_parms;
@@ -648,7 +648,7 @@ ctnetlink_parse_tuple_proto(struct nlattr *attr,
 	return ret;
 }
 
-static inline int
+static int
 ctnetlink_parse_tuple(struct nlattr *cda[], struct nf_conntrack_tuple *tuple,
 		      enum ctattr_tuple type, u_int8_t l3num)
 {
@@ -888,7 +888,7 @@ out:
 	return err;
 }
 
-static inline int
+static int
 ctnetlink_change_status(struct nf_conn *ct, struct nlattr *cda[])
 {
 	unsigned long d;
@@ -1349,7 +1349,7 @@ nla_put_failure:
 	return -1;
 }
 
-static inline int
+static int
 ctnetlink_exp_dump_expect(struct sk_buff *skb,
 			  const struct nf_conntrack_expect *exp)
 {
-- 
1.5.0.6


^ permalink raw reply related

* [PATCH 2/4] [IPVS]: Kill some bloat
From: Ilpo Järvinen @ 2008-01-05 13:39 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Arnaldo Carvalho de Melo, Ilpo Järvinen
In-Reply-To: <11995403481730-git-send-email-ilpo.jarvinen@helsinki.fi>

From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= <ilpo.jarvinen@helsinki.fi>

net/ipv4/ipvs/ip_vs_xmit.c:
  ip_vs_icmp_xmit   | -638
  ip_vs_tunnel_xmit | -674
  ip_vs_nat_xmit    | -716
  ip_vs_dr_xmit     | -682
 4 functions changed, 2710 bytes removed, diff: -2710

net/ipv4/ipvs/ip_vs_xmit.c:
  __ip_vs_get_out_rt | +595
 1 function changed, 595 bytes added, diff: +595

net/ipv4/ipvs/ip_vs_xmit.o:
 5 functions changed, 595 bytes added, 2710 bytes removed, diff: -2115

Without some CONFIG.*DEBUGs:

net/ipv4/ipvs/ip_vs_xmit.o:
 5 functions changed, 383 bytes added, 1513 bytes removed, diff: -1130

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
---
 net/ipv4/ipvs/ip_vs_xmit.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
index 1e96bf8..8436bf8 100644
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -59,7 +59,7 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
 	return dst;
 }
 
-static inline struct rtable *
+static struct rtable *
 __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
 {
 	struct rtable *rt;			/* Route to the other host */
-- 
1.5.0.6


^ permalink raw reply related

* [PATCH net-2.6.25 0/4] [NET]: Bloat, bloat and more bloat
From: Ilpo Järvinen @ 2008-01-05 13:39 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Arnaldo Carvalho de Melo

Hi Dave,

After Arnaldo got codiff's inline instrumentation bugs fixed
(thanks! :-)), I got my .c-inline-bloat-o-meter to power up
reliably after some tweaking and bug fixing on my behalf...
It shows some very high readings every now and then in the
code under net/.

...Aand... we've a sovereign winner, though it was only fifth
on a kernel wide list (arch/ excluded due to number of
reasons) :-/.

Hall of (unquestionable) fame (measured per inline, top 10 under
net/):
  -4496 ctnetlink_parse_tuple        netfilter/nf_conntrack_netlink.c
  -2165 ctnetlink_dump_tuples        netfilter/nf_conntrack_netlink.c
  -2115 __ip_vs_get_out_rt           ipv4/ipvs/ip_vs_xmit.c
  -1924 xfrm_audit_helper_pktinfo    xfrm/xfrm_state.c
  -1799 ctnetlink_parse_tuple_proto  netfilter/nf_conntrack_netlink.c
  -1268 ctnetlink_parse_tuple_ip     netfilter/nf_conntrack_netlink.c
  -1093 ctnetlink_exp_dump_expect    netfilter/nf_conntrack_netlink.c
  -1060 ccid3_update_send_interval   dccp/ccids/ccid3.c
  -983  ctnetlink_dump_tuples_proto  netfilter/nf_conntrack_netlink.c
  -827  ctnetlink_exp_dump_tuple     netfilter/nf_conntrack_netlink.c

Removing inlines is done iteratively because e.g., uninlining
ctnetlink_parse_tuple affected ..._{proto,ip} prices as well.

i386/gcc (GCC) 4.1.2 20070626 (Red Hat 4.1.2-13)/allyesconfig
except CONFIG_FORCED_INLINING. Tried without some CONFIG.*DEBUG
as well and got slightly better numbers for some functions, yet
the number don't differ enough to be that meaningful, ie., if  
there's bloat somewhere, removing DEBUGs won't make it go away.

...After this first aid we're at least below 1k :-).

--
 i.



^ permalink raw reply

* Re: NAPI poll behavior in various Intel drivers
From: Andi Kleen @ 2008-01-05 13:29 UTC (permalink / raw)
  To: David Miller; +Cc: jchapman, netdev, auke-jan.h.kok
In-Reply-To: <20080104.232504.238436937.davem@davemloft.net>

David Miller <davem@davemloft.net> writes:

> From: James Chapman <jchapman@katalix.com>
> Date: Sat, 05 Jan 2008 00:18:31 +0000
>
>> David Miller wrote:
>> > From: James Chapman <jchapman@katalix.com>
>> > Date: Fri, 04 Jan 2008 20:10:30 +0000
>> > 
>> >> With the latest NAPI, this code has to change. But rather than remove
>> >> the tx_cleaned logic completely, shouldn't transmit processing be
>> >> included in the work_done accounting when a driver does transmit cleanup
>> >> processing in the poll?
>> > 
>> > Most other NAPI drivers don't do this, they just process all the
>> > pending TX work unconditionally and do not account it into the NAPI
>> > poll work.
>> 
>> This will cause the interface to thrash in/out of polled mode very
>> quickly when it is doing almost all transmit work. That's something to
>> avoid, no?
>
> I see your point although I've never seen this in practice
> with tg3 or niu.

In 2.4 we used to have (haven't checked recently) performance regressions
with NAPI vs non NAPI (or versus the old BCM vendor driver) on tg3 for
some workloads that didn't fully fill the link. The theory was always
that the reason for that was something like the regular switching in
and out. So I think we saw that problem on tg3 too.

-Andi

^ permalink raw reply

* Re: [PATCH] ibm_newemac: Increase number of default rx-/tx-buffers
From: Stefan Roese @ 2008-01-05 12:38 UTC (permalink / raw)
  To: benh; +Cc: netdev, linuxppc-dev
In-Reply-To: <1199528388.7291.55.camel@pasglop>

On Saturday 05 January 2008, Benjamin Herrenschmidt wrote:
> On Sat, 2008-01-05 at 10:50 +0100, Stefan Roese wrote:
> > Performance tests done by AMCC have shown that 256 buffer increase the
> > performance of the Linux EMAC driver. So let's update the default
> > values to match this setup.
> >
> > Signed-off-by: Stefan Roese <sr@denx.de>
> > ---
>
> Do we have the numbers ? Did they also measure latency ?

I hoped this question would not come. ;) No, unfortunately I don't have any 
numbers. Just the recommendation from AMCC to always use 256 buffers.

Best regards,
Stefan

^ permalink raw reply

* [PATCH net-2.6.25] [NET]: Remove obsolete comment
From: Ilpo Järvinen @ 2008-01-05 12:21 UTC (permalink / raw)
  To: David Miller; +Cc: Netdev

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1017 bytes --]

[PATCH] [NET]: Remove obsolete comment

It seems that ip_build_xmit is no longer used in here and
ip_append_data is used.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
---

Hi Dave,

While reading some nearby code, I noticed this. I'm not 100%
sure if the removal is valid or not nor have interest to dig
it up from history but I suppose you know it better without
having to look it up from history :-).

 net/ipv4/ip_output.c |    2 --
 1 files changed, 0 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 3dc0c12..e57de0f 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1339,8 +1339,6 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
  *
  *	Should run single threaded per socket because it uses the sock
  *     	structure to pass arguments.
- *
- *	LATER: switch from ip_build_xmit to ip_append_*
  */
 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
 		   unsigned int len)
-- 
1.5.0.6

^ permalink raw reply related

* Re: [PATCH] ibm_newemac: Increase number of default rx-/tx-buffers
From: Benjamin Herrenschmidt @ 2008-01-05 10:19 UTC (permalink / raw)
  To: Stefan Roese; +Cc: netdev, linuxppc-dev
In-Reply-To: <1199526609-12290-1-git-send-email-sr@denx.de>


On Sat, 2008-01-05 at 10:50 +0100, Stefan Roese wrote:
> Performance tests done by AMCC have shown that 256 buffer increase the
> performance of the Linux EMAC driver. So let's update the default
> values to match this setup.
> 
> Signed-off-by: Stefan Roese <sr@denx.de>
> ---

Do we have the numbers ? Did they also measure latency ?

Ben.

>  drivers/net/ibm_newemac/Kconfig |    4 ++--
>  1 files changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/ibm_newemac/Kconfig b/drivers/net/ibm_newemac/Kconfig
> index 0d3e738..5a06727 100644
> --- a/drivers/net/ibm_newemac/Kconfig
> +++ b/drivers/net/ibm_newemac/Kconfig
> @@ -9,12 +9,12 @@ config IBM_NEW_EMAC
>  config IBM_NEW_EMAC_RXB
>  	int "Number of receive buffers"
>  	depends on IBM_NEW_EMAC
> -	default "128"
> +	default "256"
>  
>  config IBM_NEW_EMAC_TXB
>  	int "Number of transmit buffers"
>  	depends on IBM_NEW_EMAC
> -	default "64"
> +	default "256"
>  
>  config IBM_NEW_EMAC_POLL_WEIGHT
>  	int "MAL NAPI polling weight"


^ permalink raw reply

* Re: 2.6.24-rc6-mm1
From: Jarek Poplawski @ 2008-01-05 10:13 UTC (permalink / raw)
  To: Torsten Kaiser
  Cc: Herbert Xu, Andrew Morton, linux-kernel, Neil Brown,
	J. Bruce Fields, netdev, Tom Tucker
In-Reply-To: <64bb37e0801050001x65b104bdl5a68c731b3656d17@mail.gmail.com>

On Sat, Jan 05, 2008 at 09:01:02AM +0100, Torsten Kaiser wrote:
> On Jan 5, 2008 1:07 AM, Jarek Poplawski <jarkao2@gmail.com> wrote:
> > On Fri, Jan 04, 2008 at 04:21:26PM +0100, Torsten Kaiser wrote:
> > > On Jan 4, 2008 2:30 PM, Jarek Poplawski <jarkao2@gmail.com> wrote:
> > > The only thing that is sadly not practical is bisecting the borkenout
> > > mm-patches, as triggering this error is to unreliable /
> > > time-consuming.
> >
> > Right, but it seems there are these 2 main suspects here...
> >
> > > > - is it still vanilla -rc6-mm1; I've seen on kernel list you tried
> > > > some fixes around raid?
> > >
> > > Yes, without these fixes I can't boot.
> > > But they should only be run during starting the arrays, so I doubt
> > > that this is that cause.
> > > (Also -rc3-mm2 did not need this fix)
> >
> > You've written vanilla -rc6 is OK. Does it mean -rc6 with these fixes?
> 
> vanilla -rc6 is fine without these fixes.
> The raid-bugs from -rc6-mm1 are probably introduced by
> md-allow-devices-to-be-shared-between-md-arrays.patch and that patch
> is new in this mm-release.
> 
> > I think it would be easier just to start with this working -rc6 and
> > simply check if we have 'right' suspects, so: git-net.patch and
> > git-nfsd.patch from -mm1-broken-out, as suggested by Herbert (I hope,
> > can compile - otherwise you could try the other way: add the whole -mm
> > and revert these two). Using current gits could complicate this
> > "investigation".
> 
> OK, I will try this...

It seems that this last report gives the third one: ieee1394 to the pack,
so probably, you can hold on a "minute" - this all needs some rethinking.
(But, if you've begun with this already, let it be clear at last too.)

> > I didn't read all this thread, so probably I miss many points, but are
> > you sure there are no problems with filesystem corruption around these
> > packets or where you compile(?) them (e.g. after these raid problems)?
> 
> For my setup: It's a gentoo system, so compiling packages is the
> normal way of installing something.
> The compile itself is done on a tmpfs so a filesystem corruption there
> should be rather impossible. ;)
> (The system has 4Gb RAM, so it doesn't even need to swap)
> The sources are taken from a nfsv4 share that is served from a
> different system. Also gentoo checksums all sources it will use.

Yes, since this was no problem with vanilla 2.6.24-rc6, I've probably
gone astray...

> If you think some other slub_debug might catch it, I would try this...

OK! But, in the meantime could you send your current .config? I wonder
e.g. if there could be used this new ieee1394 code from
init_ohci1394_dma.c?
 
You are really helpful, thanks,
Jarek P.

^ permalink raw reply

* [PATCH] ibm_newemac: Increase number of default rx-/tx-buffers
From: Stefan Roese @ 2008-01-05  9:50 UTC (permalink / raw)
  To: netdev; +Cc: linuxppc-dev, benh

Performance tests done by AMCC have shown that 256 buffer increase the
performance of the Linux EMAC driver. So let's update the default
values to match this setup.

Signed-off-by: Stefan Roese <sr@denx.de>
---
 drivers/net/ibm_newemac/Kconfig |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ibm_newemac/Kconfig b/drivers/net/ibm_newemac/Kconfig
index 0d3e738..5a06727 100644
--- a/drivers/net/ibm_newemac/Kconfig
+++ b/drivers/net/ibm_newemac/Kconfig
@@ -9,12 +9,12 @@ config IBM_NEW_EMAC
 config IBM_NEW_EMAC_RXB
 	int "Number of receive buffers"
 	depends on IBM_NEW_EMAC
-	default "128"
+	default "256"
 
 config IBM_NEW_EMAC_TXB
 	int "Number of transmit buffers"
 	depends on IBM_NEW_EMAC
-	default "64"
+	default "256"
 
 config IBM_NEW_EMAC_POLL_WEIGHT
 	int "MAL NAPI polling weight"
-- 
1.5.4.rc2


^ permalink raw reply related

* Re: sparc oops in ip_fast_csum
From: Al Viro @ 2008-01-05  9:44 UTC (permalink / raw)
  To: Mariusz Kozlowski; +Cc: David Miller, sparclinux, netdev, linux-kernel
In-Reply-To: <200801041837.37650.m.kozlowski@tuxland.pl>

On Fri, Jan 04, 2008 at 06:37:36PM +0100, Mariusz Kozlowski wrote:
> Hello,
> 
>         This comes from the Linus latest linux-2.6 tree. Randomly happened.
> Can't reproduce that. More info below.

ip_fast_csum() called from raw_send_hdrinc() from raw_sendmsg() ran through
the page boundary into unmapped page...  Bloody odd, that, seeing that
we have checked iph->ihl * 4U <= length and had done
        err = memcpy_fromiovecend((void *)iph, from, 0, length);
before the
                iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
that had triggered that crap..

^ permalink raw reply

* Re: driver for et131x?
From: Christoph Hellwig @ 2008-01-05  8:58 UTC (permalink / raw)
  To: Manfred Schwarb; +Cc: netdev, Dean Adams
In-Reply-To: <20080104100253.63070@gmx.net>

On Fri, Jan 04, 2008 at 11:02:53AM +0100, Manfred Schwarb wrote:
> meanwhile the Agere et1310 chip has grown to a quite popular network
> chip, in laptops as well as for PCI-E cards. The release date of this chip was
> already in June 2004, and still no support in mainline linux.
> 
> There is apparently an opensource driver from Agere which is maintained by
> volunteers at http://sourceforge.net/projects/et131x/
> 
> It seems some distros (e.g. Ubuntu) even patch their kernels with this driver.
> 
> Please consider adding support for this chip in mainline linux.

The driver on the above site would need quite a bit of work to bring it
up to Linux standards.  That work shouldn't be difficult but enough to
keep someone busy for a while.  If you have the hardware I'd be happy yo
guide you through the required work.  Are you interested in taking up
this project?


^ permalink raw reply

* Question about arp_error_report() (IPV4): clarification
From: Andy Johnson @ 2008-01-05  8:43 UTC (permalink / raw)
  To: netdev

Hi,
In neigh_timer_handler( ),  (core/neighbour.c), there is a comment which says:

# It is very thin place. report_unreachable is very complicated
# routine. Particularly, it can hit the same neighbour entry!

#  So that, we try to be accurate and avoid dead loop. --ANK

I tried to figure exactly what this comment mean (when dealing with IPV4).
 Can someone elaborate in 3-4 sentences on it ?


Why  "hit the same neighbour entry" can cause any complications ?
Why is report_unreachable() considered complicated ?
It seems to be simple, or am I missing something?


As far as I understand, eventually  ipv4_link_failure()
(net/ipv4/route.c) is called when  report_unreachable() is invoked.

(The path is: arp_error_report() calls dst_link_failure(skb), which in
case of ipv4, calls ipv4_link_failure().)
So it seems simply a matter of calling eventually ipv4_link_failure(),
which ends up with generating ICMP.

Regards,
Andy

^ permalink raw reply

* Re: 2.6.24-rc6-mm1
From: Torsten Kaiser @ 2008-01-05  8:01 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Herbert Xu, Andrew Morton, linux-kernel, Neil Brown,
	J. Bruce Fields, netdev, Tom Tucker
In-Reply-To: <20080105000700.GA3224@ami.dom.local>

On Jan 5, 2008 1:07 AM, Jarek Poplawski <jarkao2@gmail.com> wrote:
> On Fri, Jan 04, 2008 at 04:21:26PM +0100, Torsten Kaiser wrote:
> > On Jan 4, 2008 2:30 PM, Jarek Poplawski <jarkao2@gmail.com> wrote:
> > The only thing that is sadly not practical is bisecting the borkenout
> > mm-patches, as triggering this error is to unreliable /
> > time-consuming.
>
> Right, but it seems there are these 2 main suspects here...
>
> > > - is it still vanilla -rc6-mm1; I've seen on kernel list you tried
> > > some fixes around raid?
> >
> > Yes, without these fixes I can't boot.
> > But they should only be run during starting the arrays, so I doubt
> > that this is that cause.
> > (Also -rc3-mm2 did not need this fix)
>
> You've written vanilla -rc6 is OK. Does it mean -rc6 with these fixes?

vanilla -rc6 is fine without these fixes.
The raid-bugs from -rc6-mm1 are probably introduced by
md-allow-devices-to-be-shared-between-md-arrays.patch and that patch
is new in this mm-release.

> I think it would be easier just to start with this working -rc6 and
> simply check if we have 'right' suspects, so: git-net.patch and
> git-nfsd.patch from -mm1-broken-out, as suggested by Herbert (I hope,
> can compile - otherwise you could try the other way: add the whole -mm
> and revert these two). Using current gits could complicate this
> "investigation".

OK, I will try this...

> > My skbuff-double-free-detector is still in there, but was never triggered.
> >
> > > - could you remind this lockdep warning; is it always and the same,
> > > always before crash, or no rules?
> >
> > ???
> > I see no lockdep warning before the crashes.
> > I have seen a warning about the dst->__refcnt in dst_release and
> > different warnings about list operations.
> >
> > I think I have always posted everything I have seen before the
> > crashes. (captured via serial console)
>
> So, you mean there are no more of these?:
>
> "looked into the log in question and the only other warning was a
>  circular locking dependency that lockdep detected around 1.5 hour
>  before this warning."
> ...
> "[ 7620.845168] INFO: lockdep is turned off."

Aha, I had forgotten about that one.
Looking at all the crashlogs, I do not find another one of this lockdep warning.
The only other lockdep related output was the bootup problem in vanilla -rc6.

> > (If you mean the lockdep-problem in -rc6: That is more or less a
> > missing annotation during early bootup. The only problem with that is,
> > that it will causes lockdep to be turned off and so it can not be used
> > to find any real problem. A fix for that is in -mm so I do have
> > lockdep on the mm-kernels)
> >
> > > - I've seen you looked after double freeing, but this last debug list
> > > warning could suggest locking problems during list modification too.
> >
> > Yes, but Herbert mentioned double freeing a skb explicit and so I
> > tried to catch this.
> > I do not know enough about the network core to verify the locking of
> > the involved lists.
>
> Right, the list corruption could be because of use after freeing too.

I had hoped that I could catch use-after-freeing by using
slub_debug=FZP, but that did not help.
(first oops in http://lkml.org/lkml/2007/12/28/159 )

I think that the main skb structs come from slub and should be
poisoned by this, so it might be some other data structure that is
allocated differently...

> > > - above git-nfsd and git-net tests should be probably repeated with
> > > -rc6-mm1 git versions: so vanilla rc6 plus both these -mm patches
> > > only, and if bug triggers, with one reversed; btw., since in previous
> > > message you mentioned that 50 packages could be not enough to trigger
> > > this, these 54 above could make too little margin yet.
> >
> > Yes, I think I really need to redo the git-nfsd-test.
> > With IOMMU_DEBUG enabled rc6-mm1worked for 52 packages, only a secound
> > run of kde-packages triggered it after only 5 packages.
> > I don't know what this bug hates about kdeartwork-wallpaper (triggered
> > it this time) or kdeartwork-styles.
>
> I didn't read all this thread, so probably I miss many points, but are
> you sure there are no problems with filesystem corruption around these
> packets or where you compile(?) them (e.g. after these raid problems)?

For my setup: It's a gentoo system, so compiling packages is the
normal way of installing something.
The compile itself is done on a tmpfs so a filesystem corruption there
should be rather impossible. ;)
(The system has 4Gb RAM, so it doesn't even need to swap)
The sources are taken from a nfsv4 share that is served from a
different system. Also gentoo checksums all sources it will use.

After the crashes I also did a checksum of the last installed
packages. Only in one instance there was corruption, all new files
where completely empty. Obviously XFS did not have the time to write
them back to disk before the system crashed.
Also as all crashes show network related traces and the system is
working fine otherwise, I doubt any permanent filesystem problems.

For the raid problems: I was just unable to even start the raid that
has / on it, because of a wrong check in the raid-autostart code.
( http://lkml.org/lkml/2007/12/27/45 )

> > Output from the crash with IOMMU_DEBUG (lockdep was enabled, but did
> > not trigger):
> > [15593.236374] Unable to handle kernel NULL pointer
> > dereference<3>list_add corruption. prev->next should be next
>
> Fine! I'll try to look at this. BTW, I guess/hope DEBUG_SLAB etc. are
> also on...

DEBUG_SLAB is off, because of:
 CONFIG_SLUB_DEBUG=y
# CONFIG_SLAB is not set
CONFIG_SLUB=y

But I'm currently did not have the slub_debug-option in my kernel
commandline, because:
a) slub_debug=FZP did not prevent the bug in -rc3-mm2
b) but it took a much longer time to trigger it
c) its a serious slowdown for these compiles

If you think some other slub_debug might catch it, I would try this...

Torsten

^ permalink raw reply

* Re: NAPI poll behavior in various Intel drivers
From: David Miller @ 2008-01-05  7:25 UTC (permalink / raw)
  To: jchapman; +Cc: netdev, auke-jan.h.kok
In-Reply-To: <477ECCD7.8090905@katalix.com>

From: James Chapman <jchapman@katalix.com>
Date: Sat, 05 Jan 2008 00:18:31 +0000

> David Miller wrote:
> > From: James Chapman <jchapman@katalix.com>
> > Date: Fri, 04 Jan 2008 20:10:30 +0000
> > 
> >> With the latest NAPI, this code has to change. But rather than remove
> >> the tx_cleaned logic completely, shouldn't transmit processing be
> >> included in the work_done accounting when a driver does transmit cleanup
> >> processing in the poll?
> > 
> > Most other NAPI drivers don't do this, they just process all the
> > pending TX work unconditionally and do not account it into the NAPI
> > poll work.
> 
> This will cause the interface to thrash in/out of polled mode very
> quickly when it is doing almost all transmit work. That's something to
> avoid, no?

I see your point although I've never seen this in practice
with tg3 or niu.

For a heavy transmit load with TCP, the cpu killer is the ACK receive
processing.

I guess for a datagram send situation it might start to edge up.

However, you're likely deferring TX events (say every 1/4 of the TX
ring or similar) which should make the effect matter much less.

But anyways, it's someone else's driver so if they want to take
TX work into account sure. :-)  I'll keep this in mind in my NAPI
changes.

Thanks.

^ permalink raw reply

* bridge, packet destination not found in fdb, how to match in ebtables?
From: Denys Fedoryshchenko @ 2008-01-05  6:51 UTC (permalink / raw)
  To: shemminger; +Cc: netdev

Hi

In few words: i have problem with bridge, that if some MAC address is not
known, even expired by age bridge will send packet as "flood".

If i am not wrong
in br_device.c
int br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
...
        if (dest[0] & 1)
                br_flood_deliver(br, skb);
        else if ((dst = __br_fdb_get(br, dest)) != NULL)
                br_deliver(dst->dst, skb);
        else
                br_flood_deliver(br, skb);
...

Probably, because packet becoming broadcast in bridge terms, it is useful to
set it  something skb->pkt_type = PACKET_BROADCAST; ?
But as i understand after looking in kernel code it can't be set
PACKET_BROADCAST, because it will mess with most of network drivers. So packet
is in ebtables pkttype is matching as "otherhost", same as "good"(non-flood)
packets.


But is there a way to set "flood packet" to some type, that i can match in
ebtables and block it as broadcast?

I had with PPPoE troubles with this, cause PPPoE acting differently than
IP+ARP, and situation when some buggy linksys router start flooding network by
PADT packet to address not existing in fdb, just killing network by "flood" of
this frames to all ports.

Here i see two solutions:
1)Change pkt_type
2)Develop additional module to handle "fdb attributes" (destination found in
fdb or unknown)

I think it is relatively trivial to implement, and it will be useful, as usual
settings available on modern switches, known as "flood limit", but i think i
am not enough programmer to do that. If possible guide me what to do.

--
Denys Fedoryshchenko
Technical Manager
Virtual ISP S.A.L.


^ permalink raw reply

* Re: forcedeth: MAC-address reversed on resume from suspend
From: Björn Steinbrink @ 2008-01-05  6:39 UTC (permalink / raw)
  To: Andreas Mohr
  Cc: Adrian Bunk, Richard Jonsson, linux-kernel, Ayaz Abdulla, jgarzik,
	netdev
In-Reply-To: <20080105061002.GA22739@atjola.homenet>

On 2008.01.05 07:10:02 +0100, Björn Steinbrink wrote:
> - nv_probe sees 00:11:22:00:00:01
> - doesn't match the vendor stuff
> - becomes 01:00:00:22:11:00
> 
> Oops, that's not quite the expected thing.

Haha, I just noticed that that even is a multicast address, so you'd get
a random MAC address in that case *lol.

Björn

^ permalink raw reply

* Re: forcedeth: MAC-address reversed on resume from suspend
From: Björn Steinbrink @ 2008-01-05  6:10 UTC (permalink / raw)
  To: Andreas Mohr
  Cc: Adrian Bunk, Richard Jonsson, linux-kernel, Ayaz Abdulla, jgarzik,
	netdev
In-Reply-To: <20080104224352.GA7853@rhlx01.hs-esslingen.de>

On 2008.01.04 23:43:52 +0100, Andreas Mohr wrote:
> On Fri, Jan 04, 2008 at 11:17:40AM +0100, Björn Steinbrink wrote:
> > On 2008.01.04 09:45:17 +0100, Andreas Mohr wrote:
> > > And then it needs these card I/O functions wrapped into two
> > > functions which interface with driver- and OS-related MAC
> > > variables (struct variables ALWAYS stored in usual system order,
> > > NOT H/W order!!!!!!) which optionally reverse the address (if
> > > needed for a particular card).
> > 
> > Hu? The MAC address is only ever reversed when the card is not in
> > use, why the hell would you want to reverse anything when the rest
> > of the OS is involved? This whole reversing stuff is purely before
> > and after the card is actually used. It's not that you need to
> > reverse the address to communicate with the card, it's just
> > initially wrong on the card.
> 
> Huhrmm? OK, let me ask this then: So what you're saying is that the
> address is only initially wrong (e.g. due to card EEPROM content
> somehow initializing the registers in wrong order on power-on), it's
> not _always_ supposed to be stored in wrong order during operation.
> 
> IOW, the default card state after power-on is _unusable_ (due to
> invalidly ordered MAC address) and has to be _corrected_ by the
> driver, _initially_ only?

Yeah. _But_, all deduced from the code.

> OTOH I know that at least acx100 has a _permanently_ wrong-ordered MAC
> address setup, i.e. it's required to have it in "wrong" order _during
> operation_. And I wouldn't be surprised to see several examples of
> other hardware having a permanently wrongly-ordered address
> requirement, given the amount of MAC reversal in Linux drivers.
>
> Couldn't it be by chance that it's _believed_ to be
> reversed-on-power-on only, whereas those cards should _actually_ have
> it reversed-during-lifetime instead? Such a misunderstanding might
> explain a lot of trouble...

Humm... Wouldn't that have made itself evident? The card's not running
in promisc mode all the time, so there should be some problems if the
card would expect the MAC in the reversed order, right? (At least I see
some code that can switch it into promisc mode, so I assume that it is
not enabled all the time).

There _is_ a comment about some cards not generating any TX interrupts
at all. But that seems to predate any card that stores the MAC address
in correct order (the patch for that came after git, the comment
predates git). So assuming that the card wants the address in reversed
order at runtime, would likely imply that _no_ card would generate TX
interrupts (unless promisc?), and the comment basically invalidates that
assumption.

> > > And then there will never be any confusion about any MAC address
> > > format anywhere any more, right?
> > 
> > At probing time, you'll have to know whether the address you read
> > from the hardware is reversed or not. Unless you write it back in
> > reversed order when you release the card, you need a flag that at
> > least survives until nv_probe is called the next time. kexec does
> > not write it back, so you do need such a flag. But the flag
> > apparently doesn't survive a suspend/resume cycle, so you need to
> > write back the reversed address then. But the flag will survive a
> > rmmod + modprobe cycle, so you need to reset it when writing back
> > the reversed address.
> 
> If it's indeed reversed-on-power-on only, then one probably cannot do
> much about it, thus I'd give up and simply check the MAC address for
> validity (should be very easy to check for a simple reversal by
> checking for the manufacturer-specific fingerprint in reverse order).
> Keeping _any_ sort of state to record the fact that it used to be
> reversed or now is reversed is futile (or rather: catastrophic) given
> the complexity of suspend / virtualisation or whichever other nice
> operations Linux may invent in the future ;)

There was once a patch that checked the vendor specific address part,
long ago. No idea what happened to it. But for kexec, I'd say that that
is broken, too.

- ifconfig eth0 hw ether 00:11:22:00:00:01
- $kexec_incantation

- nv_probe sees 00:11:22:00:00:01
- doesn't match the vendor stuff
- becomes 01:00:00:22:11:00

Oops, that's not quite the expected thing.

No way to tell whether we have to reverse from the address alone AFAICT.

Björn

^ permalink raw reply

* Re: NAPI poll behavior in various Intel drivers
From: James Chapman @ 2008-01-05  0:18 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, auke-jan.h.kok
In-Reply-To: <20080104.132443.25068269.davem@davemloft.net>

David Miller wrote:
> From: James Chapman <jchapman@katalix.com>
> Date: Fri, 04 Jan 2008 20:10:30 +0000
> 
>> With the latest NAPI, this code has to change. But rather than remove
>> the tx_cleaned logic completely, shouldn't transmit processing be
>> included in the work_done accounting when a driver does transmit cleanup
>> processing in the poll?
> 
> Most other NAPI drivers don't do this, they just process all the
> pending TX work unconditionally and do not account it into the NAPI
> poll work.

This will cause the interface to thrash in/out of polled mode very
quickly when it is doing almost all transmit work. That's something to
avoid, no?

> The logic is that, like link state handling, TX work is very cheap and
> not the cpu cycle eater that RX packet processing is.

The processing is cheap but it is being done in the poll so it is
scheduled by NAPI. The event rate for TX events can be just as high as
RX. For link state handling, the event rate is always low so won't cause
NAPI to schedule rapidly.

-- 
James Chapman
Katalix Systems Ltd
http://www.katalix.com
Catalysts for your Embedded Linux software development


^ permalink raw reply

* Re: 2.6.24-rc6-mm1
From: Jarek Poplawski @ 2008-01-05  0:07 UTC (permalink / raw)
  To: Torsten Kaiser
  Cc: Herbert Xu, Andrew Morton, linux-kernel, Neil Brown,
	J. Bruce Fields, netdev, Tom Tucker
In-Reply-To: <64bb37e0801040721p57ff3d54wc3de00546d1d2ff1@mail.gmail.com>

On Fri, Jan 04, 2008 at 04:21:26PM +0100, Torsten Kaiser wrote:
> On Jan 4, 2008 2:30 PM, Jarek Poplawski <jarkao2@gmail.com> wrote:
...
> I'm open for any suggestions and will try to answer any questions.

I'm very glad, thanks!

> The only thing that is sadly not practical is bisecting the borkenout
> mm-patches, as triggering this error is to unreliable /
> time-consuming.

Right, but it seems there are these 2 main suspects here...

> > - is it still vanilla -rc6-mm1; I've seen on kernel list you tried
> > some fixes around raid?
> 
> Yes, without these fixes I can't boot.
> But they should only be run during starting the arrays, so I doubt
> that this is that cause.
> (Also -rc3-mm2 did not need this fix)

You've written vanilla -rc6 is OK. Does it mean -rc6 with these fixes?
I think it would be easier just to start with this working -rc6 and
simply check if we have 'right' suspects, so: git-net.patch and
git-nfsd.patch from -mm1-broken-out, as suggested by Herbert (I hope,
can compile - otherwise you could try the other way: add the whole -mm
and revert these two). Using current gits could complicate this
"investigation". 

> My skbuff-double-free-detector is still in there, but was never triggered.
> 
> > - could you remind this lockdep warning; is it always and the same,
> > always before crash, or no rules?
> 
> ???
> I see no lockdep warning before the crashes.
> I have seen a warning about the dst->__refcnt in dst_release and
> different warnings about list operations.
> 
> I think I have always posted everything I have seen before the
> crashes. (captured via serial console)

So, you mean there are no more of these?:
 
"looked into the log in question and the only other warning was a
 circular locking dependency that lockdep detected around 1.5 hour
 before this warning."
...
"[ 7620.845168] INFO: lockdep is turned off."

> (If you mean the lockdep-problem in -rc6: That is more or less a
> missing annotation during early bootup. The only problem with that is,
> that it will causes lockdep to be turned off and so it can not be used
> to find any real problem. A fix for that is in -mm so I do have
> lockdep on the mm-kernels)
> 
> > - I've seen you looked after double freeing, but this last debug list
> > warning could suggest locking problems during list modification too.
> 
> Yes, but Herbert mentioned double freeing a skb explicit and so I
> tried to catch this.
> I do not know enough about the network core to verify the locking of
> the involved lists.

Right, the list corruption could be because of use after freeing too.

> > - above git-nfsd and git-net tests should be probably repeated with
> > -rc6-mm1 git versions: so vanilla rc6 plus both these -mm patches
> > only, and if bug triggers, with one reversed; btw., since in previous
> > message you mentioned that 50 packages could be not enough to trigger
> > this, these 54 above could make too little margin yet.
> 
> Yes, I think I really need to redo the git-nfsd-test.
> With IOMMU_DEBUG enabled rc6-mm1worked for 52 packages, only a secound
> run of kde-packages triggered it after only 5 packages.
> I don't know what this bug hates about kdeartwork-wallpaper (triggered
> it this time) or kdeartwork-styles.

I didn't read all this thread, so probably I miss many points, but are
you sure there are no problems with filesystem corruption around these
packets or where you compile(?) them (e.g. after these raid problems)?

> Output from the crash with IOMMU_DEBUG (lockdep was enabled, but did
> not trigger):
> [15593.236374] Unable to handle kernel NULL pointer
> dereference<3>list_add corruption. prev->next should be next

Fine! I'll try to look at this. BTW, I guess/hope DEBUG_SLAB etc. are
also on...

Thanks,
Jarek P.

^ permalink raw reply

* [ofa-general] Re: [RFC PATCH] IPoIB: improve IPv4/IPv6 to IB mcast mapping functions
From: Rolf Manderscheid @ 2008-01-04 23:59 UTC (permalink / raw)
  To: Roland Dreier; +Cc: netdev, general
In-Reply-To: <adair29mel2.fsf_-_@cisco.com>

Roland Dreier wrote:
> Any objection to merging the following for 2.6.25?
>
> [Rolf -- I think it makes more sense to delete the overwriting of the
> P_Key in ipoib_multicast.c in this patch rather than later in the
> series; do you agree?]
>   
Yes, the first patch is the logical place for it.  The only reason
I put it in the second patch was to keep the first patch as small
and simple as possible.

    Rolf

^ permalink raw reply

* Re: forcedeth: MAC-address reversed on resume from suspend
From: Andreas Mohr @ 2008-01-04 22:43 UTC (permalink / raw)
  To: Björn Steinbrink
  Cc: Andreas Mohr, Adrian Bunk, Richard Jonsson, linux-kernel,
	Ayaz Abdulla, jgarzik, netdev
In-Reply-To: <20080104101740.GA4406@atjola.homenet>

On Fri, Jan 04, 2008 at 11:17:40AM +0100, Björn Steinbrink wrote:
> On 2008.01.04 09:45:17 +0100, Andreas Mohr wrote:
> > And then it needs these card I/O functions wrapped into two functions which
> > interface with driver- and OS-related MAC variables
> > (struct variables ALWAYS stored in usual system order, NOT H/W order!!!!!!)
> > which optionally reverse the address (if needed for a particular card).
> 
> Hu? The MAC address is only ever reversed when the card is not in use,
> why the hell would you want to reverse anything when the rest of the OS
> is involved? This whole reversing stuff is purely before and after the
> card is actually used. It's not that you need to reverse the address to
> communicate with the card, it's just initially wrong on the card.

Huhrmm? OK, let me ask this then:
So what you're saying is that the address is only initially wrong
(e.g. due to card EEPROM content somehow initializing the registers
in wrong order on power-on), it's not _always_ supposed to be stored
in wrong order during operation.

IOW, the default card state after power-on is _unusable_ (due to
invalidly ordered MAC address) and has to be _corrected_ by the driver,
_initially_ only?

OTOH I know that at least acx100 has a _permanently_ wrong-ordered
MAC address setup, i.e. it's required to have it in "wrong" order
_during operation_. And I wouldn't be surprised to see several examples
of other hardware having a permanently wrongly-ordered address
requirement, given the amount of MAC reversal in Linux drivers.


Couldn't it be by chance that it's _believed_ to be reversed-on-power-on only,
whereas those cards should _actually_ have it reversed-during-lifetime
instead? Such a misunderstanding might explain a lot of trouble...

Obviously I was expecting the latter case, which my code layout proposal
was supposed to support in a clean way.

> > And then there will never be any confusion about any MAC address format
> > anywhere any more, right?
> 
> At probing time, you'll have to know whether the address you read from
> the hardware is reversed or not. Unless you write it back in reversed
> order when you release the card, you need a flag that at least survives
> until nv_probe is called the next time. kexec does not write it back, so
> you do need such a flag. But the flag apparently doesn't survive a
> suspend/resume cycle, so you need to write back the reversed address
> then. But the flag will survive a rmmod + modprobe cycle, so you need to
> reset it when writing back the reversed address.

If it's indeed reversed-on-power-on only, then one probably cannot do
much about it, thus I'd give up and simply check the MAC address for
validity (should be very easy to check for a simple reversal by
checking for the manufacturer-specific fingerprint in reverse order).
Keeping _any_ sort of state to record the fact that it used to be reversed
or now is reversed is futile (or rather: catastrophic) given the complexity of
suspend / virtualisation or whichever other nice operations Linux may invent
in the future ;)

> Well, let's see if my analysis was correct and the patch works, first. I
> saw the forcedeth code before, but kexec and suspend is totally new to
> me and I just made some assumptions based on the reported behaviour and
> the commit messages... ;-)

You most likely did more analysis than me, I just said
"this very obviously must be the problem" and replied ;)

Andreas Mohr

^ permalink raw reply

* Re: [PATCH 1/2] LSM: Add inet_sys_snd_skb() LSM hook
From: Paul Moore @ 2008-01-04 22:37 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20080104.130902.217217806.davem@davemloft.net>

On Friday 04 January 2008 4:09:02 pm David Miller wrote:
> From: Paul Moore <paul.moore@hp.com>
> Date: Fri, 4 Jan 2008 09:38:27 -0500
>
> > Unfortunately, it's not quite that easy at present.  The only field
> > we have in the skb where we could possibly set a flag is the
> > secmark field which is already taken.
>
> Herbert Xu added a "peeked" field in net-2.6.25 that is only used on
> input while processing socket receive queues.  You could use it on
> output.

Actually, I went back to the drawing board and I think I found a 
solution that _should_ work using the existing postroute hook.  It 
isn't as general but it is relatively simple.

Historically the problem has been with labeled IPsec and the fact that 
the postroute hook can get hit multiple times when it is in use.  While 
yes, the packet is different each time through the hook but the packet's 
security label never changes (the packet's security label is determined 
by the original sender).  From a security point of view we really only 
want to check the packet once on the way out and we want that check to 
happen at the very end, not while packet transforms are in progress.  
This was the motivation for the new LSM hook.

After the new hook was rejected I took a step back and thought about the 
problem a bit more.  The multi-hit postroute hook problem was really 
only an issue for IPsec; the other labeling protocols don't have this 
problem because they don't do any transformation of the packet.  If we 
could find a quick way to determine when all of the IPsec processing 
was finished would could use the existing postroute hook approach and 
simply fall through if the hook was hit when IPsec processing was still 
needed.

I still need to test this to make sure it does everything we need, but 
I'm pretty certain that using the we can key off the skb->dst->xfrm 
value as a way to determine if a packet is done with it's IPsec 
transformation, if any.  Basically we rewrite our postroute hook to 
look something like this:

 int hook(...)
 {
	/* stuff to do every time */

	if (skb->dst->xfrm != NULL)
		return NF_ACCEPT;

	/* stuff to do only on the last time we are called */

 }

If it doesn't end up meeting our needs I'll look into the 'peeked' 
field, thanks for the suggestion.

-- 
paul moore
linux security @ hp

^ permalink raw reply

* [PATCH] Fix forcedeth reversing the MAC address on suspend
From: Björn Steinbrink @ 2008-01-04 22:26 UTC (permalink / raw)
  To: Richard Jonsson
  Cc: linux-kernel, Adrian Bunk, Andreas Mohr, Ayaz Abdulla, jgarzik,
	netdev
In-Reply-To: <477E3DEA.4070001@coderworld.net>

For cards that initially have the MAC address stored in reverse order,
the forcedeth driver uses a flag to signal whether the address was
already corrected, so that it is not reversed again on a subsequent
probe.

Unfortunately this flag, which is stored in a register of the card,
seems to get lost during suspend, resulting in the MAC address being
reversed again. To fix that, the MAC address needs to be written back in
reversed order before we suspend and the flag needs to be reset.

The flag is still required because at least kexec will never write back
the reversed address and thus needs to know what state the card is in.

Signed-off-by: Björn Steinbrink <B.Steinbrink@gmx.de>
---
On 2008.01.04 15:08:42 +0100, Richard Jonsson wrote:
> Björn Steinbrink skrev:
>> Richard, could you give this a spin? And then we'd likely need someone
>> to test that with kexec...
>
> The patch you sent does the trick, works fine now, thanks!
> I cannot test this with kexec as I barely know what it is, I'll leave that 
> to someone else.

Thanks.

Ayaz, you originally wrote the kexec fix (IIRC), was my analysis of the
problem correct? If so, I'm quite sure that the patch DTRT. Still it
should be tested for the rmmod+modprobe and the kexec case. I'll try to
get my box free for some testing, but that's unlikely in the next few
days. Plus, I've never used kexec myself either. So I'd be grateful if
someone else would step up.

diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
index a96583c..f84c752 100644
--- a/drivers/net/forcedeth.c
+++ b/drivers/net/forcedeth.c
@@ -5199,10 +5199,6 @@ static int __devinit nv_probe(struct pci_dev *pci_dev, const struct pci_device_i
 		dev->dev_addr[3] = (np->orig_mac[0] >> 16) & 0xff;
 		dev->dev_addr[4] = (np->orig_mac[0] >>  8) & 0xff;
 		dev->dev_addr[5] = (np->orig_mac[0] >>  0) & 0xff;
-		/* set permanent address to be correct aswell */
-		np->orig_mac[0] = (dev->dev_addr[0] << 0) + (dev->dev_addr[1] << 8) +
-			(dev->dev_addr[2] << 16) + (dev->dev_addr[3] << 24);
-		np->orig_mac[1] = (dev->dev_addr[4] << 0) + (dev->dev_addr[5] << 8);
 		writel(txreg|NVREG_TRANSMITPOLL_MAC_ADDR_REV, base + NvRegTransmitPoll);
 	}
 	memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
@@ -5414,6 +5410,8 @@ static void __devexit nv_remove(struct pci_dev *pci_dev)
 	 */
 	writel(np->orig_mac[0], base + NvRegMacAddrA);
 	writel(np->orig_mac[1], base + NvRegMacAddrB);
+	writel(readl(base + NvRegTransmitPoll) & ~NVREG_TRANSMITPOLL_MAC_ADDR_REV,
+	       base + NvRegTransmitPoll);
 
 	/* free all structures */
 	free_rings(dev);

^ permalink raw reply related

* [ofa-general] [RFC PATCH] IPoIB: improve IPv4/IPv6 to IB mcast mapping functions
From: Roland Dreier @ 2008-01-04 22:05 UTC (permalink / raw)
  To: netdev; +Cc: general
In-Reply-To: <20071210203841.GJ30090@obsidianresearch.com>

Any objection to merging the following for 2.6.25?

[Rolf -- I think it makes more sense to delete the overwriting of the
P_Key in ipoib_multicast.c in this patch rather than later in the
series; do you agree?]

Thanks,
  Roland


From: Rolf Manderscheid <rvm@obsidianresearch.com>

An IPoIB subnet on an IB fabric that spans multiple IB subnets can't
use link-local scope in multicast GIDs.  The existing routines that
map IP/IPv6 multicast addresses into IB link-level addresses hard-code
the scope to link-local, and they also leave the partition key field
uninitialised.  This patch adds a parameter (the link-level broadcast
address) to the mapping routines, allowing them to initialise both the
scope and the P_Key appropriately, and fixes up the call sites.

The next step will be to add a way to configure the scope for an IPoIB
interface.

Signed-off-by: Rolf Manderscheid <rvm@obsidianresearch.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/core/cma.c                  |    4 +---
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c |    4 ----
 include/net/if_inet6.h                         |   11 +++++++----
 include/net/ip.h                               |   10 ++++++----
 net/ipv4/arp.c                                 |    2 +-
 net/ipv6/ndisc.c                               |    2 +-
 6 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 312ec74..982836e 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -2610,11 +2610,9 @@ static void cma_set_mgid(struct rdma_id_private *id_priv,
 		/* IPv6 address is an SA assigned MGID. */
 		memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
 	} else {
-		ip_ib_mc_map(sin->sin_addr.s_addr, mc_map);
+		ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map);
 		if (id_priv->id.ps == RDMA_PS_UDP)
 			mc_map[7] = 0x01;	/* Use RDMA CM signature */
-		mc_map[8] = ib_addr_get_pkey(dev_addr) >> 8;
-		mc_map[9] = (unsigned char) ib_addr_get_pkey(dev_addr);
 		*mgid = *(union ib_gid *) (mc_map + 4);
 	}
 }
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 858ada1..2628339 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -788,10 +788,6 @@ void ipoib_mcast_restart_task(struct work_struct *work)
 
 		memcpy(mgid.raw, mclist->dmi_addr + 4, sizeof mgid);
 
-		/* Add in the P_Key */
-		mgid.raw[4] = (priv->pkey >> 8) & 0xff;
-		mgid.raw[5] = priv->pkey & 0xff;
-
 		mcast = __ipoib_mcast_find(dev, &mgid);
 		if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
 			struct ipoib_mcast *nmcast;
diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index 448eccb..b24508a 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -269,18 +269,21 @@ static inline void ipv6_arcnet_mc_map(const struct in6_addr *addr, char *buf)
 	buf[0] = 0x00;
 }
 
-static inline void ipv6_ib_mc_map(struct in6_addr *addr, char *buf)
+static inline void ipv6_ib_mc_map(const struct in6_addr *addr,
+				  const unsigned char *broadcast, char *buf)
 {
+	unsigned char scope = broadcast[5] & 0xF;
+
 	buf[0]  = 0;		/* Reserved */
 	buf[1]  = 0xff;		/* Multicast QPN */
 	buf[2]  = 0xff;
 	buf[3]  = 0xff;
 	buf[4]  = 0xff;
-	buf[5]  = 0x12;		/* link local scope */
+	buf[5]  = 0x10 | scope;	/* scope from broadcast address */
 	buf[6]  = 0x60;		/* IPv6 signature */
 	buf[7]  = 0x1b;
-	buf[8]  = 0;		/* P_Key */
-	buf[9]  = 0;
+	buf[8]  = broadcast[8];	/* P_Key */
+	buf[9]  = broadcast[9];
 	memcpy(buf + 10, addr->s6_addr + 6, 10);
 }
 #endif
diff --git a/include/net/ip.h b/include/net/ip.h
index 840dd91..50c8889 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -266,20 +266,22 @@ static inline void ip_eth_mc_map(__be32 naddr, char *buf)
  *	Leave P_Key as 0 to be filled in by driver.
  */
 
-static inline void ip_ib_mc_map(__be32 naddr, char *buf)
+static inline void ip_ib_mc_map(__be32 naddr, const unsigned char *broadcast, char *buf)
 {
 	__u32 addr;
+	unsigned char scope = broadcast[5] & 0xF;
+
 	buf[0]  = 0;		/* Reserved */
 	buf[1]  = 0xff;		/* Multicast QPN */
 	buf[2]  = 0xff;
 	buf[3]  = 0xff;
 	addr    = ntohl(naddr);
 	buf[4]  = 0xff;
-	buf[5]  = 0x12;		/* link local scope */
+	buf[5]  = 0x10 | scope;	/* scope from broadcast address */
 	buf[6]  = 0x40;		/* IPv4 signature */
 	buf[7]  = 0x1b;
-	buf[8]  = 0;		/* P_Key */
-	buf[9]  = 0;
+	buf[8]  = broadcast[8];		/* P_Key */
+	buf[9]  = broadcast[9];
 	buf[10] = 0;
 	buf[11] = 0;
 	buf[12] = 0;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 08174a2..54a76b8 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -211,7 +211,7 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
 		ip_tr_mc_map(addr, haddr);
 		return 0;
 	case ARPHRD_INFINIBAND:
-		ip_ib_mc_map(addr, haddr);
+		ip_ib_mc_map(addr, dev->broadcast, haddr);
 		return 0;
 	default:
 		if (dir) {
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 777ed73..85947ea 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -337,7 +337,7 @@ int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int d
 		ipv6_arcnet_mc_map(addr, buf);
 		return 0;
 	case ARPHRD_INFINIBAND:
-		ipv6_ib_mc_map(addr, buf);
+		ipv6_ib_mc_map(addr, dev->broadcast, buf);
 		return 0;
 	default:
 		if (dir) {
-- 
1.5.4.rc2

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox