Netdev List
 help / color / mirror / Atom feed
* [PATCH 03/11] can: at91_can: don't copy data to rx'ed RTR frames
From: Marc Kleine-Budde @ 2011-06-06 13:42 UTC (permalink / raw)
  To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
  Cc: socketcan-core-0fE9KPoRgkgATYTw5x5z8w,
	netdev-u79uwXL29TY76Z2rM5mHXA, Marc Kleine-Budde
In-Reply-To: <1307367780-30715-1-git-send-email-mkl-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>

Signed-off-by: Marc Kleine-Budde <mkl-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>
Acked-by: Wolfgang Grandegger <wg-5Yr1BZd7O62+XT7JhA+gdA@public.gmane.org>
---
 drivers/net/can/at91_can.c |   10 ++++++----
 1 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/net/can/at91_can.c b/drivers/net/can/at91_can.c
index afd0f5d..5358d70 100644
--- a/drivers/net/can/at91_can.c
+++ b/drivers/net/can/at91_can.c
@@ -513,12 +513,14 @@ static void at91_read_mb(struct net_device *dev, unsigned int mb,
 		cf->can_id = (reg_mid >> 18) & CAN_SFF_MASK;
 
 	reg_msr = at91_read(priv, AT91_MSR(mb));
-	if (reg_msr & AT91_MSR_MRTR)
-		cf->can_id |= CAN_RTR_FLAG;
 	cf->can_dlc = get_can_dlc((reg_msr >> 16) & 0xf);
 
-	*(u32 *)(cf->data + 0) = at91_read(priv, AT91_MDL(mb));
-	*(u32 *)(cf->data + 4) = at91_read(priv, AT91_MDH(mb));
+	if (reg_msr & AT91_MSR_MRTR)
+		cf->can_id |= CAN_RTR_FLAG;
+	else {
+		*(u32 *)(cf->data + 0) = at91_read(priv, AT91_MDL(mb));
+		*(u32 *)(cf->data + 4) = at91_read(priv, AT91_MDH(mb));
+	}
 
 	/* allow RX of extended frames */
 	at91_write(priv, AT91_MID(mb), AT91_MID_MIDE);
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 02/11] can: at91_can: fix comment about priv->tx_next
From: Marc Kleine-Budde @ 2011-06-06 13:42 UTC (permalink / raw)
  To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
  Cc: socketcan-core-0fE9KPoRgkgATYTw5x5z8w,
	netdev-u79uwXL29TY76Z2rM5mHXA, Marc Kleine-Budde
In-Reply-To: <1307367780-30715-1-git-send-email-mkl-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>

Signed-off-by: Marc Kleine-Budde <mkl-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>
---
 drivers/net/can/at91_can.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/can/at91_can.c b/drivers/net/can/at91_can.c
index 8f15ae4..afd0f5d 100644
--- a/drivers/net/can/at91_can.c
+++ b/drivers/net/can/at91_can.c
@@ -375,7 +375,7 @@ static void at91_chip_stop(struct net_device *dev, enum can_state state)
  * mailbox, but without the offset AT91_MB_TX_FIRST. The lower bits
  * encode the mailbox number, the upper 4 bits the mailbox priority:
  *
- * priv->tx_next = (prio << AT91_NEXT_PRIO_SHIFT) ||
+ * priv->tx_next = (prio << AT91_NEXT_PRIO_SHIFT) |
  *                 (mb - AT91_MB_TX_FIRST);
  *
  */
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 01/11] can: at91_can: don't align struct definitions
From: Marc Kleine-Budde @ 2011-06-06 13:42 UTC (permalink / raw)
  To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
  Cc: socketcan-core-0fE9KPoRgkgATYTw5x5z8w,
	netdev-u79uwXL29TY76Z2rM5mHXA, Marc Kleine-Budde
In-Reply-To: <1307367780-30715-1-git-send-email-mkl-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>

Signed-off-by: Marc Kleine-Budde <mkl-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>
---
 drivers/net/can/at91_can.c |   34 +++++++++++++++++-----------------
 1 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/drivers/net/can/at91_can.c b/drivers/net/can/at91_can.c
index 74efb5a..8f15ae4 100644
--- a/drivers/net/can/at91_can.c
+++ b/drivers/net/can/at91_can.c
@@ -157,21 +157,21 @@ enum at91_mb_mode {
 #define AT91_IRQ_ALL		(0x1fffffff)
 
 struct at91_priv {
-	struct can_priv		can;	   /* must be the first member! */
-	struct net_device	*dev;
-	struct napi_struct	napi;
+	struct can_priv can;		/* must be the first member! */
+	struct net_device *dev;
+	struct napi_struct napi;
 
-	void __iomem		*reg_base;
+	void __iomem *reg_base;
 
-	u32			reg_sr;
-	unsigned int		tx_next;
-	unsigned int		tx_echo;
-	unsigned int		rx_next;
+	u32 reg_sr;
+	unsigned int tx_next;
+	unsigned int tx_echo;
+	unsigned int rx_next;
 
-	struct clk		*clk;
-	struct at91_can_data	*pdata;
+	struct clk *clk;
+	struct at91_can_data *pdata;
 
-	canid_t			mb0_id;
+	canid_t mb0_id;
 };
 
 static struct can_bittiming_const at91_bittiming_const = {
@@ -271,7 +271,7 @@ static void at91_setup_mailboxes(struct net_device *dev)
 
 	/* reset acceptance mask and id register */
 	for (i = AT91_MB_RX_FIRST; i <= AT91_MB_RX_LAST; i++) {
-		at91_write(priv, AT91_MAM(i), 0x0 );
+		at91_write(priv, AT91_MAM(i), 0x0);
 		at91_write(priv, AT91_MID(i), AT91_MID_MIDE);
 	}
 
@@ -1231,11 +1231,11 @@ static int __devexit at91_can_remove(struct platform_device *pdev)
 }
 
 static struct platform_driver at91_can_driver = {
-	.probe		= at91_can_probe,
-	.remove		= __devexit_p(at91_can_remove),
-	.driver		= {
-		.name	= KBUILD_MODNAME,
-		.owner	= THIS_MODULE,
+	.probe = at91_can_probe,
+	.remove = __devexit_p(at91_can_remove),
+	.driver = {
+		.name = KBUILD_MODNAME,
+		.owner = THIS_MODULE,
 	},
 };
 
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 00/11] can: at91_can: add support for AT91SAM9X5 series
From: Marc Kleine-Budde @ 2011-06-06 13:42 UTC (permalink / raw)
  To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
  Cc: socketcan-core-0fE9KPoRgkgATYTw5x5z8w,
	netdev-u79uwXL29TY76Z2rM5mHXA

Hello,

this patch series add support for the at91_can core on the upcoming
at91sam9x5 SOC series.

The original at91_can driver supports the at91sam9263 which has 16
mailboxes, but the CAN core on the at91sam9x5 has only 8. Most constants
used in the RX and TX path are derived from the number of mailboxes.

This patch series changes the driver from a static, compile time setup
of the mailboxes to a dynamic one. Patches 1-4 clean up the driver, 5+6
simplify the usage of the constants, patch 7 converts all derived constants
into functions. Patch 8 will add id_table support to the driver then all
remaining constants are converted into functions using the run time
selected mailbox constants. The next patch (9) takes care about an
at91sam9263 specific errata fix. Patch 10 and 11 will finally add supoprt
for the at91sam9x5 SOC.

This patch series applies to current net-next-2.6/master and has been tested
on a sam9263 and sam9x5ek.
(Allthough, the sam9x5 support is not mainline yet).

please review and consider to apply,
Marc

The following changes since commit e3cc055c18ab575291acf0af7622a2e97c4728fa:

  include/net: Remove unnecessary semicolons (2011-06-05 14:33:40 -0700)

are available in the git repository at:
  git://git.pengutronix.de/git/mkl/linux-2.6.git can/at91-sam9x5

Marc Kleine-Budde (10):
      can: at91_can: don't align struct definitions
      can: at91_can: fix comment about priv->tx_next
      can: at91_can: don't copy data to rx'ed RTR frames
      can: at91_can: let get_tx_* functions return unsigned int
      can: at91_can: directly define AT91_MB_RX_LAST
      can: at91_can: rename AT91_MB_RX_MASK to AT91_IRQ_MB_RX
      can: at91_can: convert derived mailbox constants into functions
      can: at91_can: add id_table and convert prime mailbox constats to functions
      can: at91_can: register mb0 sysfs entry only on at91sam9263
      can: at91_can: add support for the AT91SAM9X5 SOCs

Uwe Kleine-König (1):
      net/can: allow CAN_AT91 on AT91SAM9X5

 drivers/net/can/Kconfig    |    5 +-
 drivers/net/can/at91_can.c |  366 +++++++++++++++++++++++++++++++-------------
 2 files changed, 259 insertions(+), 112 deletions(-)


_______________________________________________
Socketcan-core mailing list
Socketcan-core@lists.berlios.de
https://lists.berlios.de/mailman/listinfo/socketcan-core

^ permalink raw reply

* Re: bridge/netfilter: regression in 2.6.39.1
From: Alexander Holler @ 2011-06-06 13:29 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Neil Horman, linux-kernel, David Miller, Herbert Xu, netdev
In-Reply-To: <1307366166.3098.15.camel@edumazet-laptop>


> Nice, now please submit a patch with 0972ddb237 as a guideline.
>
> BTW, you could also check other struct dst_ops methods for
> bridge/netfilter:

Sorry, but I prefer to submit patches I understand by myself and for 
stuff I know something about. The patch in my first mail was just meant 
as a quick fix (which seemed to work here).

So even if I might be able to construct a working patch using commit 
0972ddb237, I don't think I should do that.

Regards,

Alexander

^ permalink raw reply

* Re: bridge/netfilter: regression in 2.6.39.1
From: Eric Dumazet @ 2011-06-06 13:18 UTC (permalink / raw)
  To: Neil Horman
  Cc: Alexander Holler, linux-kernel, David Miller, Herbert Xu, netdev
In-Reply-To: <20110606131309.GB1000@hmsreliant.think-freely.org>

Le lundi 06 juin 2011 à 09:13 -0400, Neil Horman a écrit :

> Ok, this makes sense to me now, thanks.  The change to the dst initalization to
> mark our fake routing table as read only means we need a cow_metrics method to
> copy the metrics before we we can compete the dst_metric_set in br_change_mtu.
> Thats fine, but given that its really a fake routing table with only one dst
> entry which (I think) is only written under the rtnl lock, why not just modify
> the dst_init_metrics call so that its not marked as read-only?

It _is_ read-only, its even a "const", so if you do that you'll trap if
kernel const pages are RO

Please check commit 0972ddb237 for a proper way to deal with this.




^ permalink raw reply

* Re: bridge/netfilter: regression in 2.6.39.1
From: Eric Dumazet @ 2011-06-06 13:16 UTC (permalink / raw)
  To: Alexander Holler
  Cc: Neil Horman, linux-kernel, David Miller, Herbert Xu, netdev
In-Reply-To: <4DECCCC0.70905@ahsoftware.de>

Le lundi 06 juin 2011 à 14:49 +0200, Alexander Holler a écrit :
> Am 06.06.2011 14:12, schrieb Eric Dumazet:
> > Le lundi 06 juin 2011 à 13:48 +0200, Alexander Holler a écrit :
> >> Am 06.06.2011 13:15, schrieb Neil Horman:
> >>> On Fri, Jun 03, 2011 at 09:21:06PM +0200, Alexander Holler wrote:
> >>>> Hello,
> >>>>
> >>>> I'm getting a oops in the bridge code in br_change_mtu() with
> >>>> 2.6.39.1. The patch below seems to fix that.
> >>>>
> >>>> I'm not sure about the usage of dst_cow_metrics_generic() in
> >>>> fake_dst_ops, but after having a quick look at it seems to be ok to
> >>>> use that here.
> >>>>
> >>>> Regards,
> >>>>
> >>>> Alexander
> >>>>
> >>> How did the flags of the dst entry on which we're callnig dst_entry_write_ptr
> >>> wind up getting the  READ_ONLY flag set on it?  I don't see how we'er falling
> >>> into that clause in which we call cow_metrics when we call dst_metric_set.  It
> >>> seems like that flag is set erroneously.  perhaps we should just update
> >>> fake_rtable.dst to have the correct flags?
> >>> Neil
> >>
> >> It is set by that change:
> >>
> >> --------
> >> @@ -124,7 +128,7 @@ void br_netfilter_rtable_init(struct net_bridge *br)
> >>    	atomic_set(&rt->dst.__refcnt, 1);
> >>    	rt->dst.dev = br->dev;
> >>    	rt->dst.path =&rt->dst;
> >> -	dst_metric_set(&rt->dst, RTAX_MTU, 1500);
> >> +	dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
> >>    	rt->dst.flags	= DST_NOXFRM;
> >>    	rt->dst.ops =&fake_dst_ops;
> >>    }
> >> --------
> >>
> >> The true in dst_init_metrics() is responsible for that flag.
> >>
> >
> > You are aware this change fixed an oops ?
> 
> No, I'm not aware of this. I know almost nothing about what all that 
> stuff is doing. For me that change above just introduced an oops through 
> an immediate NULL pointer dereference in br_change_mtu().
> 
> > read_only in this context means : In case this must be written, we make
> > a COW first
> > (allocate a piece of memory, copy the source in it before applying any
> > change)
> >
> > It would be nice you send us the stack trace, so that we can have a clue
> > of whats going on.
> 
> Here is the text as found in my first mail:
> ----
> Commit 42923465fb8d025a2b5153f2e7ab1e6e1058bf00 does here what it
> should prevent, it introduces NULL a dereference.
> 

Hmm, this commit uncovers a previous bug, introduced in commit
62fa8a846d7d.

> The above commit uses dst_init_metrics() which sets the metrics as
> read only. As result br_change_mtu() dies in dst_metric_set()
> which calls dst_metrics_write_ptr() which calls
> dst->ops->cow_metrics() if the metrics are read only.
> 
> I don't have a system with 2.6.39.1 or 3.0-rcN ready which writes an 
> oops to somewhere else than the screen. Just set up a bridge and change 
> the MTU for the IF, that will trigger the oops.
> ----
> 
> Here is the oops:
> 
> ----
> [  136.546023] BUG: unable to handle kernel NULL pointer dereference at 
>    (null)
> [  136.546038] IP: [<  (null)>]   (null)
> [  136.546046] *pde = 00000000
> [  136.546052] Oops: 0000 [#1] SMP
> [  136.546059] last sysfs file: /sys/devices/virtual/net/mybridge/uevent
> [  136.546066] Modules linked in: sit tunnel4 tun iwlagn sch_sfq bridge 
> stp llc nfs lockd auth_rpcgss nfs_acl sunrpc ipt_LOG xt_recent xt_state 
> iptable_filter iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 
> xt_addrtype xt_dscp xt_iprange xt_DSCP xt_set ip_set nfnetlink ip6t_LOG 
> xt_limit ip6table_filter xt_string xt_owner xt_multiport xt_hashlimit 
> xt_conntrack xt_NFQUEUE xt_mark xt_connmark nf_conntrack ip6_tables 
> snd_seq_oss snd_seq_midi_event snd_seq snd_seq_device snd_pcm_oss 
> snd_mixer_oss uinput ipv6 dm_mod nvidiafb fb_ddc i2c_algo_bit vgastate 
> nvidia(P) arc4 uvcvideo videodev ecb snd_hda_codec_realtek iwl4965 
> snd_hda_intel iwl_legacy snd_hda_codec uhci_hcd firewire_ohci mac80211 
> sdhci_pci snd_hwdep snd_pcm sdhci ehci_hcd intel_agp firewire_core 
> cfg80211 snd_timer asus_laptop snd e1000e usbcore sparse_keymap rfkill 
> tpm_infineon crc_itu_t intel_gtt mmc_core iTCO_wdt tpm_tis ata_piix 
> agpgart tpm video soundcore sg tpm_bios joydev snd_page_alloc [last 
> unloaded: microcode]
> [  136.546235]
> [  136.546243] Pid: 8415, comm: ip Tainted: P 
> 2.6.39.1-00006-g40545b7 #103 ASUSTeK Computer Inc.         V1Sn 
>         /V1Sn
> [  136.546256] EIP: 0060:[<00000000>] EFLAGS: 00010202 CPU: 0
> [  136.546268] EIP is at 0x0
> [  136.546273] EAX: f14a389c EBX: 000005d4 ECX: f80d32c0 EDX: f80d1da1
> [  136.546279] ESI: f14a3000 EDI: f255bf10 EBP: f15c3b54 ESP: f15c3b48
> [  136.546285]  DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
> [  136.546293] Process ip (pid: 8415, ti=f15c2000 task=f4741f80 
> task.ti=f15c2000)
> [  136.546297] Stack:
> [  136.546301]  f80c658f f14a3000 ffffffed f15c3b64 c12cb9c8 f80d1b80 
> ffffffa1 f15c3bbc
> [  136.546315]  c12da347 c12d9c7d 00000000 f7670b00 00000000 f80d1b80 
> ffffffa6 f15c3be4
> [  136.546329]  00000004 f14a3000 f255bf20 00000008 f15c3bbc c11d6cae 
> 00000000 00000000
> [  136.546343] Call Trace:
> [  136.546359]  [<f80c658f>] ? br_change_mtu+0x5f/0x80 [bridge]
> [  136.546372]  [<c12cb9c8>] dev_set_mtu+0x38/0x80
> [  136.546381]  [<c12da347>] do_setlink+0x1a7/0x860
> [  136.546390]  [<c12d9c7d>] ? rtnl_fill_ifinfo+0x9bd/0xc70
> [  136.546400]  [<c11d6cae>] ? nla_parse+0x6e/0xb0
> [  136.546409]  [<c12db931>] rtnl_newlink+0x361/0x510
> [  136.546420]  [<c1023240>] ? vmalloc_sync_all+0x100/0x100
> [  136.546429]  [<c1362762>] ? error_code+0x5a/0x60
> [  136.546438]  [<c12db5d0>] ? rtnl_configure_link+0x80/0x80
> [  136.546446]  [<c12db27a>] rtnetlink_rcv_msg+0xfa/0x210
> [  136.546454]  [<c12db180>] ? __rtnl_unlock+0x20/0x20
> [  136.546463]  [<c12ee0fe>] netlink_rcv_skb+0x8e/0xb0
> [  136.546471]  [<c12daf1c>] rtnetlink_rcv+0x1c/0x30
> [  136.546479]  [<c12edafa>] netlink_unicast+0x23a/0x280
> [  136.546487]  [<c12ede6b>] netlink_sendmsg+0x26b/0x2f0
> [  136.546497]  [<c12bb828>] sock_sendmsg+0xc8/0x100
> [  136.546508]  [<c10adf61>] ? __alloc_pages_nodemask+0xe1/0x750
> [  136.546517]  [<c11d0602>] ? _copy_from_user+0x42/0x60
> [  136.546525]  [<c12c5e4c>] ? verify_iovec+0x4c/0xc0
> [  136.546534]  [<c12bd805>] sys_sendmsg+0x1c5/0x200
> [  136.546542]  [<c10c2150>] ? __do_fault+0x310/0x410
> [  136.546549]  [<c10c2c46>] ? do_wp_page+0x1d6/0x6b0
> [  136.546557]  [<c10c47d1>] ? handle_pte_fault+0xe1/0x720
> [  136.546565]  [<c12bd1af>] ? sys_getsockname+0x7f/0x90
> [  136.546574]  [<c10c4ec1>] ? handle_mm_fault+0xb1/0x180
> [  136.546582]  [<c1023240>] ? vmalloc_sync_all+0x100/0x100
> [  136.546589]  [<c10233b3>] ? do_page_fault+0x173/0x3d0
> [  136.546596]  [<c12bd87b>] ? sys_recvmsg+0x3b/0x60
> [  136.546605]  [<c12bdd83>] sys_socketcall+0x293/0x2d0
> [  136.546614]  [<c13629d0>] sysenter_do_call+0x12/0x26
> [  136.546619] Code:  Bad EIP value.
> [  136.546627] EIP: [<00000000>] 0x0 SS:ESP 0068:f15c3b48
> [  136.546645] CR2: 0000000000000000
> [  136.546652] ---[ end trace 6909b560e78934fa ]---
> ----
> 
> And if you want the oops on your machine (uisng 2.6.39.1 or 3.0-rcN):
> 
> ----
> brctl addbr mybridge
> ip link set mybridge mtu 1234
> oops
> ----

Nice, now please submit a patch with 0972ddb237 as a guideline.

BTW, you could also check other struct dst_ops methods for
bridge/netfilter:

- What about ADVMSS or things like that, see commit 214f45c9

^ permalink raw reply

* Re: bridge/netfilter: regression in 2.6.39.1
From: Neil Horman @ 2011-06-06 13:13 UTC (permalink / raw)
  To: Alexander Holler
  Cc: Eric Dumazet, linux-kernel, David Miller, Herbert Xu, netdev
In-Reply-To: <4DECCCC0.70905@ahsoftware.de>

On Mon, Jun 06, 2011 at 02:49:04PM +0200, Alexander Holler wrote:
> Am 06.06.2011 14:12, schrieb Eric Dumazet:
> >Le lundi 06 juin 2011 à 13:48 +0200, Alexander Holler a écrit :
> >>Am 06.06.2011 13:15, schrieb Neil Horman:
> >>>On Fri, Jun 03, 2011 at 09:21:06PM +0200, Alexander Holler wrote:
> >>>>Hello,
> >>>>
> >>>>I'm getting a oops in the bridge code in br_change_mtu() with
> >>>>2.6.39.1. The patch below seems to fix that.
> >>>>
> >>>>I'm not sure about the usage of dst_cow_metrics_generic() in
> >>>>fake_dst_ops, but after having a quick look at it seems to be ok to
> >>>>use that here.
> >>>>
> >>>>Regards,
> >>>>
> >>>>Alexander
> >>>>
> >>>How did the flags of the dst entry on which we're callnig dst_entry_write_ptr
> >>>wind up getting the  READ_ONLY flag set on it?  I don't see how we'er falling
> >>>into that clause in which we call cow_metrics when we call dst_metric_set.  It
> >>>seems like that flag is set erroneously.  perhaps we should just update
> >>>fake_rtable.dst to have the correct flags?
> >>>Neil
> >>
> >>It is set by that change:
> >>
> >>--------
> >>@@ -124,7 +128,7 @@ void br_netfilter_rtable_init(struct net_bridge *br)
> >>   	atomic_set(&rt->dst.__refcnt, 1);
> >>   	rt->dst.dev = br->dev;
> >>   	rt->dst.path =&rt->dst;
> >>-	dst_metric_set(&rt->dst, RTAX_MTU, 1500);
> >>+	dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
> >>   	rt->dst.flags	= DST_NOXFRM;
> >>   	rt->dst.ops =&fake_dst_ops;
> >>   }
> >>--------
> >>
> >>The true in dst_init_metrics() is responsible for that flag.
> >>
> >
> >You are aware this change fixed an oops ?
> 
> No, I'm not aware of this. I know almost nothing about what all that
> stuff is doing. For me that change above just introduced an oops
> through an immediate NULL pointer dereference in br_change_mtu().
> 
> >read_only in this context means : In case this must be written, we make
> >a COW first
> >(allocate a piece of memory, copy the source in it before applying any
> >change)
> >
> >It would be nice you send us the stack trace, so that we can have a clue
> >of whats going on.
> 
> Here is the text as found in my first mail:
> ----
> Commit 42923465fb8d025a2b5153f2e7ab1e6e1058bf00 does here what it
> should prevent, it introduces NULL a dereference.
> 
> The above commit uses dst_init_metrics() which sets the metrics as
> read only. As result br_change_mtu() dies in dst_metric_set()
> which calls dst_metrics_write_ptr() which calls
> dst->ops->cow_metrics() if the metrics are read only.
> 
> I don't have a system with 2.6.39.1 or 3.0-rcN ready which writes an
> oops to somewhere else than the screen. Just set up a bridge and
> change the MTU for the IF, that will trigger the oops.
> ----
> 
> Here is the oops:
> 
> ----
> [  136.546023] BUG: unable to handle kernel NULL pointer dereference
> at   (null)
> [  136.546038] IP: [<  (null)>]   (null)
> [  136.546046] *pde = 00000000
> [  136.546052] Oops: 0000 [#1] SMP
> [  136.546059] last sysfs file: /sys/devices/virtual/net/mybridge/uevent
> [  136.546066] Modules linked in: sit tunnel4 tun iwlagn sch_sfq
> bridge stp llc nfs lockd auth_rpcgss nfs_acl sunrpc ipt_LOG
> xt_recent xt_state iptable_filter iptable_nat nf_nat
> nf_conntrack_ipv4 nf_defrag_ipv4 xt_addrtype xt_dscp xt_iprange
> xt_DSCP xt_set ip_set nfnetlink ip6t_LOG xt_limit ip6table_filter
> xt_string xt_owner xt_multiport xt_hashlimit xt_conntrack xt_NFQUEUE
> xt_mark xt_connmark nf_conntrack ip6_tables snd_seq_oss
> snd_seq_midi_event snd_seq snd_seq_device snd_pcm_oss snd_mixer_oss
> uinput ipv6 dm_mod nvidiafb fb_ddc i2c_algo_bit vgastate nvidia(P)
> arc4 uvcvideo videodev ecb snd_hda_codec_realtek iwl4965
> snd_hda_intel iwl_legacy snd_hda_codec uhci_hcd firewire_ohci
> mac80211 sdhci_pci snd_hwdep snd_pcm sdhci ehci_hcd intel_agp
> firewire_core cfg80211 snd_timer asus_laptop snd e1000e usbcore
> sparse_keymap rfkill tpm_infineon crc_itu_t intel_gtt mmc_core
> iTCO_wdt tpm_tis ata_piix agpgart tpm video soundcore sg tpm_bios
> joydev snd_page_alloc [last unloaded: microcode]
> [  136.546235]
> [  136.546243] Pid: 8415, comm: ip Tainted: P
> 2.6.39.1-00006-g40545b7 #103 ASUSTeK Computer Inc.         V1Sn
> /V1Sn
> [  136.546256] EIP: 0060:[<00000000>] EFLAGS: 00010202 CPU: 0
> [  136.546268] EIP is at 0x0
> [  136.546273] EAX: f14a389c EBX: 000005d4 ECX: f80d32c0 EDX: f80d1da1
> [  136.546279] ESI: f14a3000 EDI: f255bf10 EBP: f15c3b54 ESP: f15c3b48
> [  136.546285]  DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
> [  136.546293] Process ip (pid: 8415, ti=f15c2000 task=f4741f80
> task.ti=f15c2000)
> [  136.546297] Stack:
> [  136.546301]  f80c658f f14a3000 ffffffed f15c3b64 c12cb9c8
> f80d1b80 ffffffa1 f15c3bbc
> [  136.546315]  c12da347 c12d9c7d 00000000 f7670b00 00000000
> f80d1b80 ffffffa6 f15c3be4
> [  136.546329]  00000004 f14a3000 f255bf20 00000008 f15c3bbc
> c11d6cae 00000000 00000000
> [  136.546343] Call Trace:
> [  136.546359]  [<f80c658f>] ? br_change_mtu+0x5f/0x80 [bridge]
> [  136.546372]  [<c12cb9c8>] dev_set_mtu+0x38/0x80
> [  136.546381]  [<c12da347>] do_setlink+0x1a7/0x860
> [  136.546390]  [<c12d9c7d>] ? rtnl_fill_ifinfo+0x9bd/0xc70
> [  136.546400]  [<c11d6cae>] ? nla_parse+0x6e/0xb0
> [  136.546409]  [<c12db931>] rtnl_newlink+0x361/0x510
> [  136.546420]  [<c1023240>] ? vmalloc_sync_all+0x100/0x100
> [  136.546429]  [<c1362762>] ? error_code+0x5a/0x60
> [  136.546438]  [<c12db5d0>] ? rtnl_configure_link+0x80/0x80
> [  136.546446]  [<c12db27a>] rtnetlink_rcv_msg+0xfa/0x210
> [  136.546454]  [<c12db180>] ? __rtnl_unlock+0x20/0x20
> [  136.546463]  [<c12ee0fe>] netlink_rcv_skb+0x8e/0xb0
> [  136.546471]  [<c12daf1c>] rtnetlink_rcv+0x1c/0x30
> [  136.546479]  [<c12edafa>] netlink_unicast+0x23a/0x280
> [  136.546487]  [<c12ede6b>] netlink_sendmsg+0x26b/0x2f0
> [  136.546497]  [<c12bb828>] sock_sendmsg+0xc8/0x100
> [  136.546508]  [<c10adf61>] ? __alloc_pages_nodemask+0xe1/0x750
> [  136.546517]  [<c11d0602>] ? _copy_from_user+0x42/0x60
> [  136.546525]  [<c12c5e4c>] ? verify_iovec+0x4c/0xc0
> [  136.546534]  [<c12bd805>] sys_sendmsg+0x1c5/0x200
> [  136.546542]  [<c10c2150>] ? __do_fault+0x310/0x410
> [  136.546549]  [<c10c2c46>] ? do_wp_page+0x1d6/0x6b0
> [  136.546557]  [<c10c47d1>] ? handle_pte_fault+0xe1/0x720
> [  136.546565]  [<c12bd1af>] ? sys_getsockname+0x7f/0x90
> [  136.546574]  [<c10c4ec1>] ? handle_mm_fault+0xb1/0x180
> [  136.546582]  [<c1023240>] ? vmalloc_sync_all+0x100/0x100
> [  136.546589]  [<c10233b3>] ? do_page_fault+0x173/0x3d0
> [  136.546596]  [<c12bd87b>] ? sys_recvmsg+0x3b/0x60
> [  136.546605]  [<c12bdd83>] sys_socketcall+0x293/0x2d0
> [  136.546614]  [<c13629d0>] sysenter_do_call+0x12/0x26
> [  136.546619] Code:  Bad EIP value.
> [  136.546627] EIP: [<00000000>] 0x0 SS:ESP 0068:f15c3b48
> [  136.546645] CR2: 0000000000000000
> [  136.546652] ---[ end trace 6909b560e78934fa ]---
> ----
> 
> And if you want the oops on your machine (uisng 2.6.39.1 or 3.0-rcN):
> 
> ----
> brctl addbr mybridge
> ip link set mybridge mtu 1234
> oops
> ----
> 
Ok, this makes sense to me now, thanks.  The change to the dst initalization to
mark our fake routing table as read only means we need a cow_metrics method to
copy the metrics before we we can compete the dst_metric_set in br_change_mtu.
Thats fine, but given that its really a fake routing table with only one dst
entry which (I think) is only written under the rtnl lock, why not just modify
the dst_init_metrics call so that its not marked as read-only?

Regards
Neil

> Regards,
> 
> Alexander
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* Re: bridge/netfilter: regression in 2.6.39.1
From: Eric Dumazet @ 2011-06-06 13:09 UTC (permalink / raw)
  To: Alexander Holler
  Cc: Neil Horman, linux-kernel, David Miller, Herbert Xu, netdev
In-Reply-To: <1307362358.3098.6.camel@edumazet-laptop>

Le lundi 06 juin 2011 à 14:12 +0200, Eric Dumazet a écrit :
> Le lundi 06 juin 2011 à 13:48 +0200, Alexander Holler a écrit :
> > Am 06.06.2011 13:15, schrieb Neil Horman:
> > > On Fri, Jun 03, 2011 at 09:21:06PM +0200, Alexander Holler wrote:
> > >> Hello,
> > >>
> > >> I'm getting a oops in the bridge code in br_change_mtu() with
> > >> 2.6.39.1. The patch below seems to fix that.
> > >>
> > >> I'm not sure about the usage of dst_cow_metrics_generic() in
> > >> fake_dst_ops, but after having a quick look at it seems to be ok to
> > >> use that here.
> > >>
> > >> Regards,
> > >>
> > >> Alexander
> > >>
> > > How did the flags of the dst entry on which we're callnig dst_entry_write_ptr
> > > wind up getting the  READ_ONLY flag set on it?  I don't see how we'er falling
> > > into that clause in which we call cow_metrics when we call dst_metric_set.  It
> > > seems like that flag is set erroneously.  perhaps we should just update
> > > fake_rtable.dst to have the correct flags?
> > > Neil
> > 
> > It is set by that change:
> > 
> > --------
> > @@ -124,7 +128,7 @@ void br_netfilter_rtable_init(struct net_bridge *br)
> >   	atomic_set(&rt->dst.__refcnt, 1);
> >   	rt->dst.dev = br->dev;
> >   	rt->dst.path = &rt->dst;
> > -	dst_metric_set(&rt->dst, RTAX_MTU, 1500);
> > +	dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
> >   	rt->dst.flags	= DST_NOXFRM;
> >   	rt->dst.ops = &fake_dst_ops;
> >   }
> > --------
> > 
> > The true in dst_init_metrics() is responsible for that flag.
> > 
> 
> You are aware this change fixed an oops ?
> 
> read_only in this context means : In case this must be written, we make
> a COW first
> (allocate a piece of memory, copy the source in it before applying any
> change)
> 
> It would be nice you send us the stack trace, so that we can have a clue
> of whats going on.
> 

Alexander, you should take a look at :

git show 0972ddb2

To get an idea of how to deal with this problem 
(See how Held Bernhard included a backtrace to help us make a
diagnostic)

We dont want to even allocate a piece of memory to copy
br_dst_default_metric for a fake dst.

^ permalink raw reply

* Re: bridge/netfilter: regression in 2.6.39.1
From: Alexander Holler @ 2011-06-06 12:49 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Neil Horman, linux-kernel, David Miller, Herbert Xu, netdev
In-Reply-To: <1307362358.3098.6.camel@edumazet-laptop>

Am 06.06.2011 14:12, schrieb Eric Dumazet:
> Le lundi 06 juin 2011 à 13:48 +0200, Alexander Holler a écrit :
>> Am 06.06.2011 13:15, schrieb Neil Horman:
>>> On Fri, Jun 03, 2011 at 09:21:06PM +0200, Alexander Holler wrote:
>>>> Hello,
>>>>
>>>> I'm getting a oops in the bridge code in br_change_mtu() with
>>>> 2.6.39.1. The patch below seems to fix that.
>>>>
>>>> I'm not sure about the usage of dst_cow_metrics_generic() in
>>>> fake_dst_ops, but after having a quick look at it seems to be ok to
>>>> use that here.
>>>>
>>>> Regards,
>>>>
>>>> Alexander
>>>>
>>> How did the flags of the dst entry on which we're callnig dst_entry_write_ptr
>>> wind up getting the  READ_ONLY flag set on it?  I don't see how we'er falling
>>> into that clause in which we call cow_metrics when we call dst_metric_set.  It
>>> seems like that flag is set erroneously.  perhaps we should just update
>>> fake_rtable.dst to have the correct flags?
>>> Neil
>>
>> It is set by that change:
>>
>> --------
>> @@ -124,7 +128,7 @@ void br_netfilter_rtable_init(struct net_bridge *br)
>>    	atomic_set(&rt->dst.__refcnt, 1);
>>    	rt->dst.dev = br->dev;
>>    	rt->dst.path =&rt->dst;
>> -	dst_metric_set(&rt->dst, RTAX_MTU, 1500);
>> +	dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
>>    	rt->dst.flags	= DST_NOXFRM;
>>    	rt->dst.ops =&fake_dst_ops;
>>    }
>> --------
>>
>> The true in dst_init_metrics() is responsible for that flag.
>>
>
> You are aware this change fixed an oops ?

No, I'm not aware of this. I know almost nothing about what all that 
stuff is doing. For me that change above just introduced an oops through 
an immediate NULL pointer dereference in br_change_mtu().

> read_only in this context means : In case this must be written, we make
> a COW first
> (allocate a piece of memory, copy the source in it before applying any
> change)
>
> It would be nice you send us the stack trace, so that we can have a clue
> of whats going on.

Here is the text as found in my first mail:
----
Commit 42923465fb8d025a2b5153f2e7ab1e6e1058bf00 does here what it
should prevent, it introduces NULL a dereference.

The above commit uses dst_init_metrics() which sets the metrics as
read only. As result br_change_mtu() dies in dst_metric_set()
which calls dst_metrics_write_ptr() which calls
dst->ops->cow_metrics() if the metrics are read only.

I don't have a system with 2.6.39.1 or 3.0-rcN ready which writes an 
oops to somewhere else than the screen. Just set up a bridge and change 
the MTU for the IF, that will trigger the oops.
----

Here is the oops:

----
[  136.546023] BUG: unable to handle kernel NULL pointer dereference at 
   (null)
[  136.546038] IP: [<  (null)>]   (null)
[  136.546046] *pde = 00000000
[  136.546052] Oops: 0000 [#1] SMP
[  136.546059] last sysfs file: /sys/devices/virtual/net/mybridge/uevent
[  136.546066] Modules linked in: sit tunnel4 tun iwlagn sch_sfq bridge 
stp llc nfs lockd auth_rpcgss nfs_acl sunrpc ipt_LOG xt_recent xt_state 
iptable_filter iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 
xt_addrtype xt_dscp xt_iprange xt_DSCP xt_set ip_set nfnetlink ip6t_LOG 
xt_limit ip6table_filter xt_string xt_owner xt_multiport xt_hashlimit 
xt_conntrack xt_NFQUEUE xt_mark xt_connmark nf_conntrack ip6_tables 
snd_seq_oss snd_seq_midi_event snd_seq snd_seq_device snd_pcm_oss 
snd_mixer_oss uinput ipv6 dm_mod nvidiafb fb_ddc i2c_algo_bit vgastate 
nvidia(P) arc4 uvcvideo videodev ecb snd_hda_codec_realtek iwl4965 
snd_hda_intel iwl_legacy snd_hda_codec uhci_hcd firewire_ohci mac80211 
sdhci_pci snd_hwdep snd_pcm sdhci ehci_hcd intel_agp firewire_core 
cfg80211 snd_timer asus_laptop snd e1000e usbcore sparse_keymap rfkill 
tpm_infineon crc_itu_t intel_gtt mmc_core iTCO_wdt tpm_tis ata_piix 
agpgart tpm video soundcore sg tpm_bios joydev snd_page_alloc [last 
unloaded: microcode]
[  136.546235]
[  136.546243] Pid: 8415, comm: ip Tainted: P 
2.6.39.1-00006-g40545b7 #103 ASUSTeK Computer Inc.         V1Sn 
        /V1Sn
[  136.546256] EIP: 0060:[<00000000>] EFLAGS: 00010202 CPU: 0
[  136.546268] EIP is at 0x0
[  136.546273] EAX: f14a389c EBX: 000005d4 ECX: f80d32c0 EDX: f80d1da1
[  136.546279] ESI: f14a3000 EDI: f255bf10 EBP: f15c3b54 ESP: f15c3b48
[  136.546285]  DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
[  136.546293] Process ip (pid: 8415, ti=f15c2000 task=f4741f80 
task.ti=f15c2000)
[  136.546297] Stack:
[  136.546301]  f80c658f f14a3000 ffffffed f15c3b64 c12cb9c8 f80d1b80 
ffffffa1 f15c3bbc
[  136.546315]  c12da347 c12d9c7d 00000000 f7670b00 00000000 f80d1b80 
ffffffa6 f15c3be4
[  136.546329]  00000004 f14a3000 f255bf20 00000008 f15c3bbc c11d6cae 
00000000 00000000
[  136.546343] Call Trace:
[  136.546359]  [<f80c658f>] ? br_change_mtu+0x5f/0x80 [bridge]
[  136.546372]  [<c12cb9c8>] dev_set_mtu+0x38/0x80
[  136.546381]  [<c12da347>] do_setlink+0x1a7/0x860
[  136.546390]  [<c12d9c7d>] ? rtnl_fill_ifinfo+0x9bd/0xc70
[  136.546400]  [<c11d6cae>] ? nla_parse+0x6e/0xb0
[  136.546409]  [<c12db931>] rtnl_newlink+0x361/0x510
[  136.546420]  [<c1023240>] ? vmalloc_sync_all+0x100/0x100
[  136.546429]  [<c1362762>] ? error_code+0x5a/0x60
[  136.546438]  [<c12db5d0>] ? rtnl_configure_link+0x80/0x80
[  136.546446]  [<c12db27a>] rtnetlink_rcv_msg+0xfa/0x210
[  136.546454]  [<c12db180>] ? __rtnl_unlock+0x20/0x20
[  136.546463]  [<c12ee0fe>] netlink_rcv_skb+0x8e/0xb0
[  136.546471]  [<c12daf1c>] rtnetlink_rcv+0x1c/0x30
[  136.546479]  [<c12edafa>] netlink_unicast+0x23a/0x280
[  136.546487]  [<c12ede6b>] netlink_sendmsg+0x26b/0x2f0
[  136.546497]  [<c12bb828>] sock_sendmsg+0xc8/0x100
[  136.546508]  [<c10adf61>] ? __alloc_pages_nodemask+0xe1/0x750
[  136.546517]  [<c11d0602>] ? _copy_from_user+0x42/0x60
[  136.546525]  [<c12c5e4c>] ? verify_iovec+0x4c/0xc0
[  136.546534]  [<c12bd805>] sys_sendmsg+0x1c5/0x200
[  136.546542]  [<c10c2150>] ? __do_fault+0x310/0x410
[  136.546549]  [<c10c2c46>] ? do_wp_page+0x1d6/0x6b0
[  136.546557]  [<c10c47d1>] ? handle_pte_fault+0xe1/0x720
[  136.546565]  [<c12bd1af>] ? sys_getsockname+0x7f/0x90
[  136.546574]  [<c10c4ec1>] ? handle_mm_fault+0xb1/0x180
[  136.546582]  [<c1023240>] ? vmalloc_sync_all+0x100/0x100
[  136.546589]  [<c10233b3>] ? do_page_fault+0x173/0x3d0
[  136.546596]  [<c12bd87b>] ? sys_recvmsg+0x3b/0x60
[  136.546605]  [<c12bdd83>] sys_socketcall+0x293/0x2d0
[  136.546614]  [<c13629d0>] sysenter_do_call+0x12/0x26
[  136.546619] Code:  Bad EIP value.
[  136.546627] EIP: [<00000000>] 0x0 SS:ESP 0068:f15c3b48
[  136.546645] CR2: 0000000000000000
[  136.546652] ---[ end trace 6909b560e78934fa ]---
----

And if you want the oops on your machine (uisng 2.6.39.1 or 3.0-rcN):

----
brctl addbr mybridge
ip link set mybridge mtu 1234
oops
----

Regards,

Alexander

^ permalink raw reply

* [PATCH net-next-2.6] be2net: Fix Rx pause counter for lancer
From: Padmanabh Ratnakar @ 2011-06-06 12:27 UTC (permalink / raw)
  To: davem, netdev; +Cc: Padmanabh Ratnakar, Selvin Xavier

From: Selvin Xavier <selvin.xavier@emulex.com>

Fixed Rx pause counter for Lancer. Swapping hi and lo words.

Signed-off-by: Selvin Xavier <selvin.xavier@emulex.com>
Signed-off-by: Padmanabh Ratnakar <padmanabh.ratnakar@emulex.com>
---
 drivers/net/benet/be_main.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/benet/be_main.c b/drivers/net/benet/be_main.c
index 5824c95..9ba197b 100644
--- a/drivers/net/benet/be_main.c
+++ b/drivers/net/benet/be_main.c
@@ -362,8 +362,8 @@ static void populate_lancer_stats(struct be_adapter *adapter)
 	drvs->rx_priority_pause_frames = 0;
 	drvs->pmem_fifo_overflow_drop = 0;
 	drvs->rx_pause_frames =
-		make_64bit_val(pport_stats->rx_pause_frames_lo,
-				 pport_stats->rx_pause_frames_hi);
+		make_64bit_val(pport_stats->rx_pause_frames_hi,
+				 pport_stats->rx_pause_frames_lo);
 	drvs->rx_crc_errors = make_64bit_val(pport_stats->rx_crc_errors_hi,
 						pport_stats->rx_crc_errors_lo);
 	drvs->rx_control_frames =
-- 
1.6.0.2


^ permalink raw reply related

* Re: [PATCH] usbnet/cdc_ncm: add missing .reset_resume hook
From: Stefan (metze) Metzmacher @ 2011-06-06 12:23 UTC (permalink / raw)
  To: David Miller; +Cc: oliver, gregkh, linux-usb, netdev, linux-kernel
In-Reply-To: <20110601.211142.557328157396419857.davem@davemloft.net>

[-- Attachment #1: Type: text/plain, Size: 835 bytes --]

Hi David,

> From: Stefan Metzmacher <metze@samba.org>
> Date: Wed,  1 Jun 2011 14:01:41 +0200
> 
>> This avoids messages like this after suspend:
>>
>>    cdc_ncm 2-1.4:1.6: no reset_resume for driver cdc_ncm?
>>    cdc_ncm 2-1.4:1.7: no reset_resume for driver cdc_ncm?
>>    cdc_ncm 2-1.4:1.6: usb0: unregister 'cdc_ncm' usb-0000:00:1d.0-1.4, CDC NCM
>>
>> This is important for the Ericsson F5521gw GSM/UMTS modem.
>> Otherwise modemmanager looses the fact that the cdc_ncm and cdc_acm devices
>> belong together.
>>
>> The cdc_ether module does the same.
>>
>> Signed-off-by: Stefan Metzmacher <metze@samba.org>
> 
> Applied and queued up for -stable, thanks.

It seems to be part of 3.0-rc2, but I'm not seeing it in any stable tree
yet...

When can I expect it in stable trees like 2.6.38.y?

metze


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 262 bytes --]

^ permalink raw reply

* Re: [RFC PATCH] ipv6: basic implementation of reverse path filtering
From: Eric Dumazet @ 2011-06-06 12:22 UTC (permalink / raw)
  To: Eric Leblond; +Cc: netdev
In-Reply-To: <1307360074-25473-1-git-send-email-eric@regit.org>

Le lundi 06 juin 2011 à 13:34 +0200, Eric Leblond a écrit :
> This patch provides a basic implementation of reverse path filtering
> for IPv6. Functionnality can be activatedor desactivated through an
> rp_filter entry similar to the IPv4 one.
> 
> The functionnality is disabled by default for backward compatibility
> but should be enable on all IPv6 routers/firewalls for security reason.
> 
> This implementation is heavily based on the patch Denis Semmau proposed
> in 2006.
> 
> Signed-off-by: Eric Leblond <eric@regit.org>
> ---
>  include/linux/ipv6.h   |    2 ++
>  include/linux/sysctl.h |    1 +
>  net/ipv6/addrconf.c    |   10 ++++++++++
>  net/ipv6/ip6_output.c  |   32 ++++++++++++++++++++++++++++++++
>  4 files changed, 45 insertions(+), 0 deletions(-)
> 

Hmm, is it matching ipv4 one really ?

vi +855 Documentation/networking/ip-sysctl.txt


rp_filter - INTEGER
        0 - No source validation.
        1 - Strict mode as defined in RFC3704 Strict Reverse Path
            Each incoming packet is tested against the FIB and if the interface
            is not the best reverse path the packet check will fail.
            By default failed packets are discarded.
        2 - Loose mode as defined in RFC3704 Loose Reverse Path
            Each incoming packet's source address is also tested against the FIB
            and if the source address is not reachable via any interface
            the packet check will fail.



^ permalink raw reply

* Re: bridge/netfilter: regression in 2.6.39.1
From: Eric Dumazet @ 2011-06-06 12:12 UTC (permalink / raw)
  To: Alexander Holler
  Cc: Neil Horman, linux-kernel, David Miller, Herbert Xu, netdev
In-Reply-To: <4DECBEA3.6070408@ahsoftware.de>

Le lundi 06 juin 2011 à 13:48 +0200, Alexander Holler a écrit :
> Am 06.06.2011 13:15, schrieb Neil Horman:
> > On Fri, Jun 03, 2011 at 09:21:06PM +0200, Alexander Holler wrote:
> >> Hello,
> >>
> >> I'm getting a oops in the bridge code in br_change_mtu() with
> >> 2.6.39.1. The patch below seems to fix that.
> >>
> >> I'm not sure about the usage of dst_cow_metrics_generic() in
> >> fake_dst_ops, but after having a quick look at it seems to be ok to
> >> use that here.
> >>
> >> Regards,
> >>
> >> Alexander
> >>
> > How did the flags of the dst entry on which we're callnig dst_entry_write_ptr
> > wind up getting the  READ_ONLY flag set on it?  I don't see how we'er falling
> > into that clause in which we call cow_metrics when we call dst_metric_set.  It
> > seems like that flag is set erroneously.  perhaps we should just update
> > fake_rtable.dst to have the correct flags?
> > Neil
> 
> It is set by that change:
> 
> --------
> @@ -124,7 +128,7 @@ void br_netfilter_rtable_init(struct net_bridge *br)
>   	atomic_set(&rt->dst.__refcnt, 1);
>   	rt->dst.dev = br->dev;
>   	rt->dst.path = &rt->dst;
> -	dst_metric_set(&rt->dst, RTAX_MTU, 1500);
> +	dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
>   	rt->dst.flags	= DST_NOXFRM;
>   	rt->dst.ops = &fake_dst_ops;
>   }
> --------
> 
> The true in dst_init_metrics() is responsible for that flag.
> 

You are aware this change fixed an oops ?

read_only in this context means : In case this must be written, we make
a COW first
(allocate a piece of memory, copy the source in it before applying any
change)

It would be nice you send us the stack trace, so that we can have a clue
of whats going on.

Thanks

^ permalink raw reply

* [RFC PATCH] ipv6: basic implementation of reverse path filtering
From: Eric Leblond @ 2011-06-06 11:34 UTC (permalink / raw)
  To: netdev; +Cc: Eric Leblond

This patch provides a basic implementation of reverse path filtering
for IPv6. Functionnality can be activatedor desactivated through an
rp_filter entry similar to the IPv4 one.

The functionnality is disabled by default for backward compatibility
but should be enable on all IPv6 routers/firewalls for security reason.

This implementation is heavily based on the patch Denis Semmau proposed
in 2006.

Signed-off-by: Eric Leblond <eric@regit.org>
---
 include/linux/ipv6.h   |    2 ++
 include/linux/sysctl.h |    1 +
 net/ipv6/addrconf.c    |   10 ++++++++++
 net/ipv6/ip6_output.c  |   32 ++++++++++++++++++++++++++++++++
 4 files changed, 45 insertions(+), 0 deletions(-)

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 0c99776..e61b88d 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -134,6 +134,7 @@ struct ipv6hdr {
  */
 struct ipv6_devconf {
 	__s32		forwarding;
+	__s32		rp_filter;
 	__s32		hop_limit;
 	__s32		mtu6;
 	__s32		accept_ra;
@@ -213,6 +214,7 @@ enum {
 	DEVCONF_DISABLE_IPV6,
 	DEVCONF_ACCEPT_DAD,
 	DEVCONF_FORCE_TLLAO,
+	DEVCONF_RP_FILTER,
 	DEVCONF_MAX
 };
 
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 11684d9..bdcb7f8 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -568,6 +568,7 @@ enum {
 	NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN=22,
 	NET_IPV6_PROXY_NDP=23,
 	NET_IPV6_ACCEPT_SOURCE_ROUTE=25,
+	NET_IPV6_RP_FILTER=26,
 	__NET_IPV6_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 498b927..ba1c574 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -196,6 +196,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	.accept_source_route	= 0,	/* we do not accept RH0 by default. */
 	.disable_ipv6		= 0,
 	.accept_dad		= 1,
+	.rp_filter		= 0,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -230,6 +231,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.accept_source_route	= 0,	/* we do not accept RH0 by default. */
 	.disable_ipv6		= 0,
 	.accept_dad		= 1,
+	.rp_filter		= 0,
 };
 
 /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */
@@ -3805,6 +3807,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_DISABLE_IPV6] = cnf->disable_ipv6;
 	array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad;
 	array[DEVCONF_FORCE_TLLAO] = cnf->force_tllao;
+	array[DEVCONF_RP_FILTER] = cnf->rp_filter;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -4459,6 +4462,13 @@ static struct addrconf_sysctl_table
 			.proc_handler   = proc_dointvec
 		},
 		{
+			.procname       = "rp_filter",
+			.data           = &ipv6_devconf.rp_filter,
+			.maxlen         = sizeof(int),
+			.mode           = 0644,
+			.proc_handler   = proc_dointvec
+		},
+		{
 			/* sentinel */
 		}
 	},
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 9d4b165..c035494 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -374,6 +374,18 @@ static int ip6_forward_proxy_check(struct sk_buff *skb)
 	return 0;
 }
 
+static int rt6_validate_source(struct sk_buff *skb)
+{
+	struct rt6_info *rt;
+	struct ipv6hdr *hdr = ipv6_hdr(skb);
+	rt = rt6_lookup(dev_net(skb->dev), &hdr->saddr, NULL, 0, 0);
+	if (rt != NULL) {
+		if (rt->rt6i_idev->dev == skb->dev)
+			return 0;
+	}
+	return -1;
+}
+
 static inline int ip6_forward_finish(struct sk_buff *skb)
 {
 	return dst_output(skb);
@@ -384,6 +396,7 @@ int ip6_forward(struct sk_buff *skb)
 	struct dst_entry *dst = skb_dst(skb);
 	struct ipv6hdr *hdr = ipv6_hdr(skb);
 	struct inet6_skb_parm *opt = IP6CB(skb);
+	struct inet6_dev *idev = NULL;
 	struct net *net = dev_net(dst->dev);
 	u32 mtu;
 
@@ -401,6 +414,25 @@ int ip6_forward(struct sk_buff *skb)
 	if (skb->pkt_type != PACKET_HOST)
 		goto drop;
 
+	idev = in6_dev_get(skb->dev);
+	if (!idev) {
+		printk(KERN_WARNING "idev error for rp_filter\n");
+		goto error;
+	}
+
+	if (net->ipv6.devconf_all->rp_filter & idev->cnf.rp_filter) {
+		if (rt6_validate_source(skb) < 0) {
+			printk(KERN_WARNING
+			       "rp_filter: packet refused on %s, invalid src %pI6 (dst: %pI6)",
+			       skb->dev->name,
+			       &hdr->saddr,
+			       &hdr->daddr
+			      );
+			goto drop;
+		}
+
+	}
+
 	skb_forward_csum(skb);
 
 	/*
-- 
1.7.5.3


^ permalink raw reply related

* Re: bridge/netfilter: regression in 2.6.39.1
From: Alexander Holler @ 2011-06-06 11:48 UTC (permalink / raw)
  To: Neil Horman; +Cc: linux-kernel, Eric Dumazet, David Miller, Herbert Xu, netdev
In-Reply-To: <20110606111507.GA1000@hmsreliant.think-freely.org>

Am 06.06.2011 13:15, schrieb Neil Horman:
> On Fri, Jun 03, 2011 at 09:21:06PM +0200, Alexander Holler wrote:
>> Hello,
>>
>> I'm getting a oops in the bridge code in br_change_mtu() with
>> 2.6.39.1. The patch below seems to fix that.
>>
>> I'm not sure about the usage of dst_cow_metrics_generic() in
>> fake_dst_ops, but after having a quick look at it seems to be ok to
>> use that here.
>>
>> Regards,
>>
>> Alexander
>>
> How did the flags of the dst entry on which we're callnig dst_entry_write_ptr
> wind up getting the  READ_ONLY flag set on it?  I don't see how we'er falling
> into that clause in which we call cow_metrics when we call dst_metric_set.  It
> seems like that flag is set erroneously.  perhaps we should just update
> fake_rtable.dst to have the correct flags?
> Neil

It is set by that change:

--------
@@ -124,7 +128,7 @@ void br_netfilter_rtable_init(struct net_bridge *br)
  	atomic_set(&rt->dst.__refcnt, 1);
  	rt->dst.dev = br->dev;
  	rt->dst.path = &rt->dst;
-	dst_metric_set(&rt->dst, RTAX_MTU, 1500);
+	dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
  	rt->dst.flags	= DST_NOXFRM;
  	rt->dst.ops = &fake_dst_ops;
  }
--------

The true in dst_init_metrics() is responsible for that flag.

Regards,

Alexander

^ permalink raw reply

* Re: bridge/netfilter: regression in 2.6.39.1
From: Neil Horman @ 2011-06-06 11:15 UTC (permalink / raw)
  To: Alexander Holler
  Cc: linux-kernel, Eric Dumazet, David Miller, Herbert Xu, netdev
In-Reply-To: <4DE93422.3070000@ahsoftware.de>

On Fri, Jun 03, 2011 at 09:21:06PM +0200, Alexander Holler wrote:
> Hello,
> 
> I'm getting a oops in the bridge code in br_change_mtu() with
> 2.6.39.1. The patch below seems to fix that.
> 
> I'm not sure about the usage of dst_cow_metrics_generic() in
> fake_dst_ops, but after having a quick look at it seems to be ok to
> use that here.
> 
> Regards,
> 
> Alexander
> 
How did the flags of the dst entry on which we're callnig dst_entry_write_ptr
wind up getting the  READ_ONLY flag set on it?  I don't see how we'er falling
into that clause in which we call cow_metrics when we call dst_metric_set.  It
seems like that flag is set erroneously.  perhaps we should just update
fake_rtable.dst to have the correct flags?
Neil

> -----
> From 3c1d5951af73389798afeea672ec224e195b8e8d Mon Sep 17 00:00:00 2001
> From: Alexander Holler <holler@ahsoftware.de>
> Date: Fri, 3 Jun 2011 20:43:06 +0200
> Subject: [PATCH] bridge: add dst_cow_metrics_generic to fake_dst_ops
> 
> Commit 42923465fb8d025a2b5153f2e7ab1e6e1058bf00 does here what it
> should prevent, it introduces NULL a dereference.
> 
> The above commit uses dst_init_metrics() which sets the metrics as
> read only. As result br_change_mtu() dies in dst_metric_set()
> which calls dst_metrics_write_ptr() which calls
> dst->ops->cow_metrics() if the metrics are read only.
> ---
>  net/bridge/br_netfilter.c |    1 +
>  1 files changed, 1 insertions(+), 0 deletions(-)
> 
> diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
> index 5f9c091..de982a1 100644
> --- a/net/bridge/br_netfilter.c
> +++ b/net/bridge/br_netfilter.c
> @@ -107,6 +107,7 @@ static void fake_update_pmtu(struct dst_entry
> *dst, u32 mtu)
>  static struct dst_ops fake_dst_ops = {
>         .family =               AF_INET,
>         .protocol =             cpu_to_be16(ETH_P_IP),
> +       .cow_metrics =          dst_cow_metrics_generic,
>         .update_pmtu =          fake_update_pmtu,
>  };
> 
> -- 
> 1.7.3.4
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* Re: ethtool -E rejects magic >= 80000000
From: Ben Hutchings @ 2011-06-06 10:50 UTC (permalink / raw)
  To: Jens Rottmann; +Cc: Martin Hein, Jeff Garzik, netdev
In-Reply-To: <1307355820.2765.15.camel@bwh-desktop>

[-- Attachment #1: Type: text/plain, Size: 347 bytes --]

Please can you test the attached patch on top of the git repository
<git://git.kernel.org/pub/scm/network/ethtool/ethtool.git> or ethtool
2.6.39.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

[-- Attachment #2: 0001-Correct-parameter-types-for-ethtool-e-and-ethtool-E.patch --]
[-- Type: text/x-patch, Size: 3887 bytes --]

>From ed7fba4e03c39365affbae4ec4f6401b55cfd80b Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Mon, 6 Jun 2011 11:35:19 +0100
Subject: [PATCH ethtool] Correct parameter types for ethtool -e and ethtool
 -E
To: netdev@vger.kernel.org
Cc: linux-net-drivers@solarflare.com

All parameters to the underlying ethtool commands are unsigned, not
signed.  In particular, the 'magic' parameter to ethtool -E often has
the most significant bit set and users should not have to provide it
as a negative number.

For ethtool -E, the value to be written is 8-bit, not 32-bit.

Reported-by: Jens Rottmann <JRottmann@LiPPERTEmbedded.de>
Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 ethtool.c |   40 ++++++++++++++++++++++++----------------
 1 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/ethtool.c b/ethtool.c
index edfbe3d..288b93f 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -365,13 +365,14 @@ static int gregs_dump_hex = 0;
 static char *gregs_dump_file = NULL;
 static int geeprom_changed = 0;
 static int geeprom_dump_raw = 0;
-static s32 geeprom_offset = 0;
-static s32 geeprom_length = -1;
+static u32 geeprom_offset = 0;
+static u32 geeprom_length = -1;
 static int seeprom_changed = 0;
-static s32 seeprom_magic = 0;
-static s32 seeprom_length = -1;
-static s32 seeprom_offset = 0;
-static s32 seeprom_value = EOF;
+static u32 seeprom_magic = 0;
+static u32 seeprom_length = -1;
+static u32 seeprom_offset = 0;
+static u8 seeprom_value = 0;
+static int seeprom_value_seen = 0;
 static int rx_fhash_get = 0;
 static int rx_fhash_set = 0;
 static u32 rx_fhash_val = 0;
@@ -400,6 +401,7 @@ typedef enum {
 	CMDL_NONE,
 	CMDL_BOOL,
 	CMDL_S32,
+	CMDL_U8,
 	CMDL_U16,
 	CMDL_U32,
 	CMDL_U64,
@@ -433,16 +435,17 @@ static struct cmdline_info cmdline_gregs[] = {
 };
 
 static struct cmdline_info cmdline_geeprom[] = {
-	{ "offset", CMDL_S32, &geeprom_offset, NULL },
-	{ "length", CMDL_S32, &geeprom_length, NULL },
+	{ "offset", CMDL_U32, &geeprom_offset, NULL },
+	{ "length", CMDL_U32, &geeprom_length, NULL },
 	{ "raw", CMDL_BOOL, &geeprom_dump_raw, NULL },
 };
 
 static struct cmdline_info cmdline_seeprom[] = {
-	{ "magic", CMDL_S32, &seeprom_magic, NULL },
-	{ "offset", CMDL_S32, &seeprom_offset, NULL },
-	{ "length", CMDL_S32, &seeprom_length, NULL },
-	{ "value", CMDL_S32, &seeprom_value, NULL },
+	{ "magic", CMDL_U32, &seeprom_magic, NULL },
+	{ "offset", CMDL_U32, &seeprom_offset, NULL },
+	{ "length", CMDL_U32, &seeprom_length, NULL },
+	{ "value", CMDL_U8, &seeprom_value, NULL,
+	  0, &seeprom_value_seen },
 };
 
 static struct cmdline_info cmdline_offload[] = {
@@ -614,6 +617,11 @@ static void parse_generic_cmdline(int argc, char **argp,
 							   0x7fffffff);
 					break;
 				}
+				case CMDL_U8: {
+					u8 *p = info[idx].wanted_val;
+					*p = get_uint_range(argp[i], 0, 0xff);
+					break;
+				}
 				case CMDL_U16: {
 					u16 *p = info[idx].wanted_val;
 					*p = get_uint_range(argp[i], 0, 0xffff);
@@ -2626,7 +2634,7 @@ static int do_geeprom(int fd, struct ifreq *ifr)
 		return 74;
 	}
 
-	if (geeprom_length <= 0)
+	if (geeprom_length == -1)
 		geeprom_length = drvinfo.eedump_len;
 
 	if (drvinfo.eedump_len < geeprom_offset + geeprom_length)
@@ -2667,10 +2675,10 @@ static int do_seeprom(int fd, struct ifreq *ifr)
 		return 74;
 	}
 
-	if (seeprom_value != EOF)
+	if (seeprom_value_seen)
 		seeprom_length = 1;
 
-	if (seeprom_length <= 0)
+	if (seeprom_length == -1)
 		seeprom_length = drvinfo.eedump_len;
 
 	if (drvinfo.eedump_len < seeprom_offset + seeprom_length)
@@ -2689,7 +2697,7 @@ static int do_seeprom(int fd, struct ifreq *ifr)
 	eeprom->data[0] = seeprom_value;
 
 	/* Multi-byte write: read input from stdin */
-	if (seeprom_value == EOF)
+	if (!seeprom_value_seen)
 		eeprom->len = fread(eeprom->data, 1, eeprom->len, stdin);
 
 	ifr->ifr_data = (caddr_t)eeprom;
-- 
1.7.4.4


^ permalink raw reply related

* Re: ethtool -E rejects magic >= 80000000
From: Ben Hutchings @ 2011-06-06 10:23 UTC (permalink / raw)
  To: Jens Rottmann; +Cc: Martin Hein, Jeff Garzik, netdev
In-Reply-To: <4DECA785.7070207@LiPPERTEmbedded.de>

On Mon, 2011-06-06 at 12:10 +0200, Jens Rottmann wrote:
> Hi,
> 
> we tripped over the fact that commands like e.g.
> 
> ethtool -E ... magic 0xCCCC8086 ...
> 
> are no longer accepted with a new Debian though the same worked on an
> ancient Slackware. Current ethtool now firmly insists on a -0x80000000
> .. +0x7FFFFFFF range for "magic", so instead of the PCI ID (usually) you
> have to provide its negative two's complement:
> 
> ethtool -E ... magic -0x33337F7A ...
> 
> This works, but is rather nonintuitive and akward.

Indeed.  I would think that the input format for the magic value on the
command line should match the output format from 'ethtool -e', except
that the magic value is never output!

> A bit of gitweb browsing led us to a commit dated 25 Jun 2010, which we
> think triggered the new behaviour:
> 
> ethtool: Parse integers into variables of different sizes and byte orders
> 
> The commit changed
> 
> { "magic", CMDL_INT, &seeprom_magic, NULL },
> ...
> case CMDL_INT: {
> 	*p = get_int(argp[i],0);
> 
> into
> 
> { "magic", CMDL_S32, &seeprom_magic, NULL },
> ...
> case CMDL_S32: {
> 	s32 *p = info[idx].wanted_val;
> 	*p = get_int_range(argp[i], 0, -0x80000000LL, 0x7fffffff);
> 
> which introduces a strict range check. The kernel sources' struct
> ethtool_eeprom defines __u32 magic, so probably ethtool's "magic" should
> be CMDL_U32 instead. (Meaning it should have been CMDL_UINT originally,
> i.e. the patch did not cause the problem, but only made it visible.)

Yes, I agree with this.

Of course, some users and scripts may now assume that this parameter is
signed.  So perhaps the range should be -0x80000000 to 0xffffffff (union
of s32 and u32 ranges)?  I'm not sure whether it's worth the trouble to
do this.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: [PATCH] ethtool: Add support for 20G link speed
From: Ben Hutchings @ 2011-06-06 10:12 UTC (permalink / raw)
  To: Yaniv Rosner; +Cc: netdev, eilong
In-Reply-To: <1307295583.20872.123.camel@lb-tlvb-dmitry>

On Sun, 2011-06-05 at 20:39 +0300, Yaniv Rosner wrote:
> Hi Ben,
> I'm resubmitting the addition of 20G with your corrections to ethtool.
> 
> Thanks,
> Yaniv

Applied, thanks.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: [PATCH 2/3] ipv4: Fix packet size calculation for IPsec packets in __ip_append_data
From: Steffen Klassert @ 2011-06-06  8:52 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, Herbert Xu, netdev
In-Reply-To: <1307345899.3098.3.camel@edumazet-laptop>

On Mon, Jun 06, 2011 at 09:38:19AM +0200, Eric Dumazet wrote:
> 
> Woh, I am afraid I wont have time in following days to check your
> assertion.

My test setup was the following:

I use an IPsec tunnel with tunnel endpoints 192.168.1.1 and 192.168.1.2

Then I do at 192.168.1.2

ping -c1 -M do -s 1410 192.168.1.1

PING 192.168.1.1 (192.168.1.1) 1410(1438) bytes of data.
>From 192.168.1.2 icmp_seq=1 Frag needed and DF set (mtu = 1438)

--- 192.168.1.1 ping statistics ---
0 packets transmitted, 0 received, +1 errors

So the packet matches the mtu but it is not send.
I used a kernel with your patch as head commit.

Reverting your patch (going one commit deeper in the history):

ping -c1 -M do -s 1410 192.168.1.1

PING 192.168.1.1 (192.168.1.1) 1410(1438) bytes of data.
1418 bytes from 192.168.1.1: icmp_seq=1 ttl=64 time=3.01 ms

--- 192.168.1.1 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 3.014/3.014/3.014/0.000 ms

> 
> What about original problem then, how should we fix it ?
> 

Hm, I don't know. I'll try to reproduce it here.

> We do have some cases where at least one fragment (the last one) is
> oversized.

trailer_len is used only on IPsec so the poroblem exists only when
using IPsec, right?

> 
> I remember I used Nick Bowler scripts at that time, I might find them
> again...

Would be nice if you could provide these scripts and some informations
on how to reproduce the problem.


^ permalink raw reply

* Re: [PATCH 2/3] ipv4: Fix packet size calculation for IPsec packets in __ip_append_data
From: Eric Dumazet @ 2011-06-06  7:38 UTC (permalink / raw)
  To: Steffen Klassert; +Cc: David Miller, Herbert Xu, netdev
In-Reply-To: <20110606064802.GC31505@secunet.com>

Le lundi 06 juin 2011 à 08:48 +0200, Steffen Klassert a écrit :
> Git commit 59104f06 (ip: take care of last fragment in ip_append_data)
> added a check to see if we exceed the mtu when we add trailer_len.
> However, the mtu is already subtracted by trailer_len when the
> xfrm transfomation bundles are set up. So IPsec packets with mtu
> size get fragmented, or if the DF bit is set the packets will not
> be send even though they match the mtu perfectly fine. This patch
> actually reverts commit 59104f06.
> 
> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
> ---

Woh, I am afraid I wont have time in following days to check your
assertion.

What about original problem then, how should we fix it ?

We do have some cases where at least one fragment (the last one) is
oversized.

I remember I used Nick Bowler scripts at that time, I might find them
again...

[PATCH] ip : take care of last fragment in ip_append_data

While investigating a bit, I found ip_fragment() slow path was taken
because ip_append_data() provides following layout for a send(MTU +
N*(MTU - 20)) syscall :

- one skb with 1500 (mtu) bytes
- N fragments of 1480 (mtu-20) bytes (before adding IP header)
last fragment gets 17 bytes of trail data because of following bit:

        if (datalen == length + fraggap)
                alloclen += rt->dst.trailer_len;

Then esp4 adds 16 bytes of data (while trailer_len is 17... hmm...
another bug ?)

In ip_fragment(), we notice last fragment is too big (1496 + 20) > mtu,
so we take slow path, building another skb chain.

In order to avoid taking slow path, we should correct ip_append_data()
to make sure last fragment has real trail space, under mtu...

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>



^ permalink raw reply

* Re: bridge/netfilter: regression in 2.6.39.1
From: Alexander Holler @ 2011-06-06  6:57 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: linux-kernel, David Miller, Herbert Xu, netdev
In-Reply-To: <4DEA1F5F.4070707@ahsoftware.de>

Hello,

Am 04.06.2011 14:04, schrieb Alexander Holler:

> Anyway, I've used the broken way of not using git send-email because I'm
> unsure if the solution I've offered is the best one. I'm lacking the
> knowledge about what cow_metrics() does, what it is used for, and what,
> if any, pre- or post-requisites are needed to use
> dst_cow_metrics_generic().
>
> I can only say that my solution seems to work here, the machine comes up
> without an oops and I didn't have seen any other problems while using
> the bridge (which surely doesn't say anything, e.g. I'm not sure if my
> use of dst_cow_metrics_generic() may introduce a mem leak or similiar,
> it doesn't look so, but who knows).

After having a second look at dst_cow_metrics_generic(), it seems my 
patch does introduce a small mem leak which occurs when the bridge will 
be build as a module and the module gets unloaded. Then the static 
struct dst_ops fake_dst_ops disappears and with that the memory 
allocated by dst_cow_metrics_generic() is lost. I'm not sure which of 
the dst*-functions should be called before the module gets unloaded. 
Maybe someone with a deeper knowledge of the that stuff could have a 
look at that.
Or maybe just changing the true in the call to dst_cow_metrics_generic() 
in br_netfilter_rtable_init() to false instead of adding 
dst_cow_metrics_generic() to fake_dst_ops() is an alternative. As I 
said, I don't know what that metric-stuff does and what it is used for.

Regards,

Alexander

^ permalink raw reply

* [PATCH 3/3] ipv4: Fix packet size calculation for raw IPsec packets in __ip_append_data
From: Steffen Klassert @ 2011-06-06  6:48 UTC (permalink / raw)
  To: David Miller, Herbert Xu; +Cc: netdev
In-Reply-To: <20110606064603.GB31505@secunet.com>

We assume that transhdrlen is positive on the first fragment
which is wrong for raw packets. So we don't add exthdrlen to the
packet size for raw packets. This leads to a reallocation on IPsec
because we have not enough headroom on the skb to place the IPsec
headers. This patch fixes this by adding exthdrlen to the packet
size whenever the send queue of the socket is empty. This issue was
introduced with git commit 1470ddf7 (inet: Remove explicit write
references to sk/inet in ip_append_data)

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/ip_output.c |    6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index a16859b..6b894d4 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -799,7 +799,9 @@ static int __ip_append_data(struct sock *sk,
 	int csummode = CHECKSUM_NONE;
 	struct rtable *rt = (struct rtable *)cork->dst;
 
-	exthdrlen = transhdrlen ? rt->dst.header_len : 0;
+	skb = skb_peek_tail(queue);
+
+	exthdrlen = !skb ? rt->dst.header_len : 0;
 	length += exthdrlen;
 	transhdrlen += exthdrlen;
 	mtu = cork->fragsize;
@@ -825,8 +827,6 @@ static int __ip_append_data(struct sock *sk,
 	    !exthdrlen)
 		csummode = CHECKSUM_PARTIAL;
 
-	skb = skb_peek_tail(queue);
-
 	cork->length += length;
 	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
 	    (sk->sk_protocol == IPPROTO_UDP) &&
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 2/3] ipv4: Fix packet size calculation for IPsec packets in __ip_append_data
From: Steffen Klassert @ 2011-06-06  6:48 UTC (permalink / raw)
  To: David Miller, Herbert Xu; +Cc: netdev, Eric Dumazet
In-Reply-To: <20110606064603.GB31505@secunet.com>

Git commit 59104f06 (ip: take care of last fragment in ip_append_data)
added a check to see if we exceed the mtu when we add trailer_len.
However, the mtu is already subtracted by trailer_len when the
xfrm transfomation bundles are set up. So IPsec packets with mtu
size get fragmented, or if the DF bit is set the packets will not
be send even though they match the mtu perfectly fine. This patch
actually reverts commit 59104f06.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/ip_output.c |    7 ++-----
 1 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 98af369..a16859b 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -888,12 +888,9 @@ alloc_new_skb:
 			 * because we have no idea what fragment will be
 			 * the last.
 			 */
-			if (datalen == length + fraggap) {
+			if (datalen == length + fraggap)
 				alloclen += rt->dst.trailer_len;
-				/* make sure mtu is not reached */
-				if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
-					datalen -= ALIGN(rt->dst.trailer_len, 8);
-			}
+
 			if (transhdrlen) {
 				skb = sock_alloc_send_skb(sk,
 						alloclen + hh_len + 15,
-- 
1.7.0.4


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox