Netdev List
 help / color / mirror / Atom feed
* [PATCH 2/9] net: pxa168_eth: Provide phy_interface mode on platform_data
From: Sebastian Hesselbarth @ 2014-10-21  8:53 UTC (permalink / raw)
  To: Sebastian Hesselbarth
  Cc: Florian Fainelli, Eric Miao, netdev, Antoine Ténart,
	linux-kernel, Haojian Zhuang, David S. Miller, linux-arm-kernel
In-Reply-To: <1413881627-21639-1-git-send-email-sebastian.hesselbarth@gmail.com>

The PXA168 Ethernet IP supports MII and RMII connections to its PHY.
Currently, pxa168 platform_data does not provide a way to pass that
and there is one user of pxa168 platform_data (mach-mmp/gplug).
Given the pinctrl settings of gplug it uses RMII, so add and pass
a corresponding phy_interface_t.

Signed-off-by: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
---
Cc: "David S. Miller" <davem@davemloft.net>
Cc: "Antoine Ténart" <antoine.tenart@free-electrons.com>
Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: Eric Miao <eric.y.miao@gmail.com>
Cc: Haojian Zhuang <haojian.zhuang@gmail.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: netdev@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/arm/mach-mmp/gplugd.c | 2 ++
 include/linux/pxa168_eth.h | 1 +
 2 files changed, 3 insertions(+)

diff --git a/arch/arm/mach-mmp/gplugd.c b/arch/arm/mach-mmp/gplugd.c
index d81b2475e67e..3b5794cd0357 100644
--- a/arch/arm/mach-mmp/gplugd.c
+++ b/arch/arm/mach-mmp/gplugd.c
@@ -12,6 +12,7 @@
 #include <linux/platform_device.h>
 #include <linux/gpio.h>
 #include <linux/gpio-pxa.h>
+#include <linux/phy.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach-types.h>
@@ -158,6 +159,7 @@ struct pxa168_eth_platform_data gplugd_eth_platform_data = {
 	.port_number = 0,
 	.phy_addr    = 0,
 	.speed       = 0, /* Autonagotiation */
+	.intf        = PHY_INTERFACE_MODE_RMII,
 	.init        = gplugd_eth_init,
 };
 
diff --git a/include/linux/pxa168_eth.h b/include/linux/pxa168_eth.h
index 18d75e795606..37c381120bc8 100644
--- a/include/linux/pxa168_eth.h
+++ b/include/linux/pxa168_eth.h
@@ -13,6 +13,7 @@ struct pxa168_eth_platform_data {
 	 */
 	int	speed;		/* 0, SPEED_10, SPEED_100 */
 	int	duplex;		/* DUPLEX_HALF or DUPLEX_FULL */
+	phy_interface_t intf;
 
 	/*
 	 * Override default RX/TX queue sizes if nonzero.
-- 
2.1.1


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related

* [PATCH 1/9] phy: marvell: Add support for 88E3016 FastEthernet PHY
From: Sebastian Hesselbarth @ 2014-10-21  8:53 UTC (permalink / raw)
  To: Sebastian Hesselbarth
  Cc: David S. Miller, Antoine Ténart, Florian Fainelli,
	linux-arm-kernel, netdev, linux-kernel
In-Reply-To: <1413881627-21639-1-git-send-email-sebastian.hesselbarth@gmail.com>

Marvell 88E3016 is a FastEthernet PHY that also can be found in Marvell
Berlin SoCs as integrated PHY.

Tested-by: Antoine Ténart <antoine.tenart@free-electrons.com>
Signed-off-by: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
---
Cc: "David S. Miller" <davem@davemloft.net>
Cc: "Antoine Ténart" <antoine.tenart@free-electrons.com>
Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: netdev@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/net/phy/marvell.c   | 46 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/marvell_phy.h |  1 +
 2 files changed, 47 insertions(+)

diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index bd37e45c89c0..d2b2f2f795d5 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -118,6 +118,9 @@
 
 #define MII_M1116R_CONTROL_REG_MAC	21
 
+#define MII_88E3016_PHY_SPEC_CTRL	0x10
+#define MII_88E3016_DISABLE_SCRAMBLER	0x0200
+#define MII_88E3016_AUTO_MDIX_CROSSOVER	0x0030
 
 MODULE_DESCRIPTION("Marvell PHY driver");
 MODULE_AUTHOR("Andy Fleming");
@@ -434,6 +437,25 @@ static int m88e1116r_config_init(struct phy_device *phydev)
 	return 0;
 }
 
+static int m88e3016_config_init(struct phy_device *phydev)
+{
+	int reg;
+
+	/* Enable Scrambler and Auto-Crossover */
+	reg = phy_read(phydev, MII_88E3016_PHY_SPEC_CTRL);
+	if (reg < 0)
+		return reg;
+
+	reg &= ~MII_88E3016_DISABLE_SCRAMBLER;
+	reg |= MII_88E3016_AUTO_MDIX_CROSSOVER;
+
+	reg = phy_write(phydev, MII_88E3016_PHY_SPEC_CTRL, reg);
+	if (reg < 0)
+		return reg;
+
+	return 0;
+}
+
 static int m88e1111_config_init(struct phy_device *phydev)
 {
 	int err;
@@ -770,6 +792,12 @@ static int marvell_read_status(struct phy_device *phydev)
 	return 0;
 }
 
+static int marvell_aneg_done(struct phy_device *phydev)
+{
+	int retval = phy_read(phydev, MII_M1011_PHY_STATUS);
+	return (retval < 0) ? retval : (retval & MII_M1011_PHY_STATUS_RESOLVED);
+}
+
 static int m88e1121_did_interrupt(struct phy_device *phydev)
 {
 	int imask;
@@ -1050,6 +1078,23 @@ static struct phy_driver marvell_drivers[] = {
 		.suspend = &genphy_suspend,
 		.driver = { .owner = THIS_MODULE },
 	},
+	{
+		.phy_id = MARVELL_PHY_ID_88E3016,
+		.phy_id_mask = MARVELL_PHY_ID_MASK,
+		.name = "Marvell 88E3016",
+		.features = PHY_BASIC_FEATURES,
+		.flags = PHY_HAS_INTERRUPT,
+		.config_aneg = &genphy_config_aneg,
+		.config_init = &m88e3016_config_init,
+		.aneg_done = &marvell_aneg_done,
+		.read_status = &marvell_read_status,
+		.ack_interrupt = &marvell_ack_interrupt,
+		.config_intr = &marvell_config_intr,
+		.did_interrupt = &m88e1121_did_interrupt,
+		.resume = &genphy_resume,
+		.suspend = &genphy_suspend,
+		.driver = { .owner = THIS_MODULE },
+	},
 };
 
 static int __init marvell_init(void)
@@ -1079,6 +1124,7 @@ static struct mdio_device_id __maybe_unused marvell_tbl[] = {
 	{ MARVELL_PHY_ID_88E1318S, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E1116R, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E1510, MARVELL_PHY_ID_MASK },
+	{ MARVELL_PHY_ID_88E3016, MARVELL_PHY_ID_MASK },
 	{ }
 };
 
diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h
index 8e9a029e093d..e6982ac3200d 100644
--- a/include/linux/marvell_phy.h
+++ b/include/linux/marvell_phy.h
@@ -16,6 +16,7 @@
 #define MARVELL_PHY_ID_88E1318S		0x01410e90
 #define MARVELL_PHY_ID_88E1116R		0x01410e40
 #define MARVELL_PHY_ID_88E1510		0x01410dd0
+#define MARVELL_PHY_ID_88E3016		0x01410e60
 
 /* struct phy_device dev_flags definitions */
 #define MARVELL_PHY_M1145_FLAGS_RESISTANCE	0x00000001
-- 
2.1.1

^ permalink raw reply related

* [PATCH 0/9] Marvell PXA168 libphy handling and Berlin Ethernet
From: Sebastian Hesselbarth @ 2014-10-21  8:53 UTC (permalink / raw)
  To: Sebastian Hesselbarth
  Cc: devicetree, Florian Fainelli, Eric Miao, netdev,
	Antoine Ténart, linux-kernel, Haojian Zhuang,
	David S. Miller, linux-arm-kernel

This patch series deals with removing an IP feature that can be found
on all currently supported Marvell Ethernet IP (pxa168_eth, mv643xx_eth,
mvneta). The MAC IP allows to automatically perform PHY auto-negotiation
without software interaction.

However, this feature (a) fundamentally clashes with the way libphy works
and (b) is unable to deal with quirky PHYs that require special treatment.
In this series, pxa168_eth driver is rewritten to completely disable that
feature and properly deal with libphy provided PHYs.

This is the real patch set after an RFT sent earlier. Unfortunately, there
was no testing from MMP/gplug but there was for BG2Q. However, it still
would be great to get a Tested-by on gplug. Also, this patch set is now
rebased on v3.18-rc1. As usual, a branch on top of v3.18-rc1 can be found at

git://git.infradead.org/users/hesselba/linux-berlin.git devel/bg2-bg2cd-eth-v1

Patches 1-5 should go through David's net tree, I'll pick up the DT patches
6-9.

Compared to the RFT, there have been some changes:
- added phy-connection-type property to BG2Q PHY DT node
- bail out from pxa168_eth_adjust_link when there is no change in
  PHY parameters. Also, add a call to phy_print_status.

Patch 1 adds support for Marvell 88E3016 FastEthernet PHY that is also
integrated in Marvell Berlin BG2/BG2CD SoCs.

Patch 2 allows to pass phy_interface_t on pxa168_eth platform_data that
is only used by mach-mmp/gplug. From the board setup, I guessed gplug's
PHY is connected via RMII. The patch still isn't even compile tested.

Patches 3-5 prepare proper libphy handling and finally remove all in-driver
PHY mangling related to the feature explained above.

Patches 6-9 add corresponding ethernet DT nodes to BG2, BG2CD, add a
phy-connection-type property to BG2Q and enable ethernet on BG2-based Sony
NSZ-GS7. I have tested all this on GS7 successfully with ip=dhcp on 100M FD.

Antoine Ténart (1):
  ARM: berlin: Add phy-connection-type to BG2Q PHY

Sebastian Hesselbarth (8):
  phy: marvell: Add support for 88E3016 FastEthernet PHY
  net: pxa168_eth: Provide phy_interface mode on platform_data
  net: pxa168_eth: Prepare proper libphy handling
  net: pxa168_eth: Remove HW auto-negotiaion
  net: pxa168_eth: Remove in-driver PHY mangling
  ARM: berlin: Add BG2 ethernet DT nodes
  ARM: berlin: Add BG2CD ethernet DT nodes
  ARM: berlin: Enable ethernet on Sony NSZ-GS7

 arch/arm/boot/dts/berlin2-sony-nsz-gs7.dts |   2 +
 arch/arm/boot/dts/berlin2.dtsi             |  36 +++++
 arch/arm/boot/dts/berlin2cd.dtsi           |  36 +++++
 arch/arm/boot/dts/berlin2q.dtsi            |   1 +
 arch/arm/mach-mmp/gplugd.c                 |   2 +
 drivers/net/ethernet/marvell/pxa168_eth.c  | 248 ++++++++++++-----------------
 drivers/net/phy/marvell.c                  |  46 ++++++
 include/linux/marvell_phy.h                |   1 +
 include/linux/pxa168_eth.h                 |   1 +
 9 files changed, 225 insertions(+), 148 deletions(-)

---
Cc: "David S. Miller" <davem@davemloft.net>
Cc: "Antoine Ténart" <antoine.tenart@free-electrons.com>
Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: Eric Miao <eric.y.miao@gmail.com>
Cc: Haojian Zhuang <haojian.zhuang@gmail.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: netdev@vger.kernel.org
Cc: devicetree@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
-- 
2.1.1


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply

* [net-next 2/2] ip6_tunnel: allow to change mode for the ip6tnl0
From: Alexey Andriyanov @ 2014-10-21  8:11 UTC (permalink / raw)
  To: netdev; +Cc: Alexey Andriyanov, David Miller, Eric Dumazet
In-Reply-To: <1413879088-13513-1-git-send-email-alan@al-an.info>

The fallback device is in ipv6 mode by default.
The mode cannot be changed at runtime, so there
is no way to decapsulate ip4in6 packets coming from
various sources without creating a specific tunnel
interface for each peer.

This patch allows updating the fallback tunnel device, but only
its mode can be changed.

Cc: David Miller <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Alexey Andriyanov <alan@al-an.info>
---
 net/ipv6/ip6_tunnel.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index a48f212..0e45ec9 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1285,6 +1285,14 @@ static int ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p)
 	return err;
 }
 
+static int ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p)
+{
+	/* for default tnl0 device allow to change only the proto */
+	t->parms.proto = p->proto;
+	netdev_state_change(t->dev);
+	return 0;
+}
+
 static void
 ip6_tnl_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm *u)
 {
@@ -1384,7 +1392,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 			break;
 		ip6_tnl_parm_from_user(&p1, &p);
 		t = ip6_tnl_locate(net, &p1, cmd == SIOCADDTUNNEL);
-		if (dev != ip6n->fb_tnl_dev && cmd == SIOCCHGTUNNEL) {
+		if (cmd == SIOCCHGTUNNEL) {
 			if (t != NULL) {
 				if (t->dev != dev) {
 					err = -EEXIST;
@@ -1392,8 +1400,10 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 				}
 			} else
 				t = netdev_priv(dev);
-
-			err = ip6_tnl_update(t, &p1);
+			if (dev == ip6n->fb_tnl_dev)
+				err = ip6_tnl0_update(t, &p1);
+			else
+				err = ip6_tnl_update(t, &p1);
 		}
 		if (t) {
 			err = 0;
-- 
1.9.1

^ permalink raw reply related

* [net-next 2/2] ip6_tunnel: allow to change mode for the ip6tnl0
From: Alexey Andriyanov @ 2014-10-21  8:19 UTC (permalink / raw)
  To: Roman Gushchin, netdev; +Cc: Alexey Andriyanov, David Miller, Eric Dumazet
In-Reply-To: <1413879596-13726-1-git-send-email-alan@al-an.info>

The fallback device is in ipv6 mode by default.
The mode cannot be changed at runtime, so there
is no way to decapsulate ip4in6 packets coming from
various sources without creating a specific tunnel
interface for each peer.

This patch allows updating the fallback tunnel device, but only
its mode can be changed.

Cc: David Miller <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Alexey Andriyanov <alan@al-an.info>
---
 net/ipv6/ip6_tunnel.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index a48f212..0e45ec9 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1285,6 +1285,14 @@ static int ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p)
 	return err;
 }
 
+static int ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p)
+{
+	/* for default tnl0 device allow to change only the proto */
+	t->parms.proto = p->proto;
+	netdev_state_change(t->dev);
+	return 0;
+}
+
 static void
 ip6_tnl_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm *u)
 {
@@ -1384,7 +1392,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 			break;
 		ip6_tnl_parm_from_user(&p1, &p);
 		t = ip6_tnl_locate(net, &p1, cmd == SIOCADDTUNNEL);
-		if (dev != ip6n->fb_tnl_dev && cmd == SIOCCHGTUNNEL) {
+		if (cmd == SIOCCHGTUNNEL) {
 			if (t != NULL) {
 				if (t->dev != dev) {
 					err = -EEXIST;
@@ -1392,8 +1400,10 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 				}
 			} else
 				t = netdev_priv(dev);
-
-			err = ip6_tnl_update(t, &p1);
+			if (dev == ip6n->fb_tnl_dev)
+				err = ip6_tnl0_update(t, &p1);
+			else
+				err = ip6_tnl_update(t, &p1);
 		}
 		if (t) {
 			err = 0;
-- 
1.9.1

^ permalink raw reply related

* [net-next 1/2] ip6_tunnel: put ip6tnl0 FB device into 'any' mode
From: Alexey Andriyanov @ 2014-10-21  8:11 UTC (permalink / raw)
  To: netdev; +Cc: Alexey Andriyanov, David Miller, Eric Dumazet

The fallback device is in ipv6 mode by default.
The mode cannot be changed at runtime, so there
is no way to decapsulate ip4in6 packets coming from
various sources without creating a specific tunnel
interface for each peer.

Cc: David Miller <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Alexey Andriyanov <alan@al-an.info>
---
 net/ipv6/ip6_tunnel.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 9409887..a48f212 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1551,7 +1551,8 @@ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev)
 	if (err)
 		return err;
 
-	t->parms.proto = IPPROTO_IPV6;
+	/* allow any registered unrelying proto for the FB device */
+	t->parms.proto = 0;
 	dev_hold(dev);
 
 	ip6_tnl_link_config(t);
-- 
1.9.1

^ permalink raw reply related

* [net-next 1/2] ip6_tunnel: put ip6tnl0 FB device into 'any' mode
From: Alexey Andriyanov @ 2014-10-21  8:19 UTC (permalink / raw)
  To: Roman Gushchin, netdev; +Cc: Alexey Andriyanov, David Miller, Eric Dumazet

The fallback device is in ipv6 mode by default.
The mode cannot be changed at runtime, so there
is no way to decapsulate ip4in6 packets coming from
various sources without creating a specific tunnel
interface for each peer.

Cc: David Miller <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Alexey Andriyanov <alan@al-an.info>
---
 net/ipv6/ip6_tunnel.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 9409887..a48f212 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1551,7 +1551,8 @@ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev)
 	if (err)
 		return err;
 
-	t->parms.proto = IPPROTO_IPV6;
+	/* allow any registered unrelying proto for the FB device */
+	t->parms.proto = 0;
 	dev_hold(dev);
 
 	ip6_tnl_link_config(t);
-- 
1.9.1

^ permalink raw reply related

* RCU stall in af_unix.c, should use spin_lock_irqsave?
From: Thomas Petazzoni @ 2014-10-21  8:03 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet
  Cc: netdev, linux-kernel, Alexandre FOURNIER, Ezequiel Garcia,
	Marcin Wojtas, Gregory Clément

Hello,

I stumbled across a reproducible RCU stall related to the AF_UNIX code
(on 3.17, on an ARM SMP system), and I'm not sure whether the problem
is caused by:

 * The af_unix.c code using spin_lock() on sk->sk_receive_queue.lock
   while it should be using spin_lock_irqsave().

OR

 * The mvpp2 Ethernet driver using on_each_cpu() in a softirq context.

At least, switching to use spin_lock_irqsave() instead of spin_lock()
has proven to fix the issue (see patch below). The spinlock validator
complains that a lockup has been detected, which might indicate that
something is indeed wrong with the spinlock handling.

Now, to the gory details. When stress-testing a lighttpd web server
that does a lot of CGI calls and therefore a lot of Unix socket
traffic, I typically get the following RCU stall after a couple of
minutes:

INFO: rcu_sched self-detected stall on CPU { 0}  (t=2100 jiffies g=14665 c=14664 q=1352)
Task dump for CPU 0:
lighttpd        R running      0  1549      1 0x00000002
[<c0014f94>] (unwind_backtrace) from [<c001130c>] (show_stack+0x10/0x14)
[<c001130c>] (show_stack) from [<c0059688>] (rcu_dump_cpu_stacks+0x98/0xd4)
[<c0059688>] (rcu_dump_cpu_stacks) from [<c005c3ec>] (rcu_check_callbacks+0x424/0x740)
[<c005c3ec>] (rcu_check_callbacks) from [<c005e7c8>] (update_process_times+0x40/0x60)
[<c005e7c8>] (update_process_times) from [<c006ce70>] (tick_sched_timer+0x70/0x210)
[<c006ce70>] (tick_sched_timer) from [<c005efc4>] (__run_hrtimer.isra.35+0x6c/0x128)
[<c005efc4>] (__run_hrtimer.isra.35) from [<c005f600>] (hrtimer_interrupt+0x11c/0x2d0)
[<c005f600>] (hrtimer_interrupt) from [<c00148f8>] (twd_handler+0x34/0x44)
[<c00148f8>] (twd_handler) from [<c00557ec>] (handle_percpu_devid_irq+0x6c/0x84)
[<c00557ec>] (handle_percpu_devid_irq) from [<c0051c80>] (generic_handle_irq+0x2c/0x3c)
[<c0051c80>] (generic_handle_irq) from [<c000eafc>] (handle_IRQ+0x40/0x90)
[<c000eafc>] (handle_IRQ) from [<c00086d0>] (gic_handle_irq+0x2c/0x5c)
[<c00086d0>] (gic_handle_irq) from [<c0011e40>] (__irq_svc+0x40/0x54)
Exception stack(0xde0c1ce8 to 0xde0c1d30)
1ce0:                   c073a684 20010113 c06e30fc 00000003 de0c1d30 00000001
1d00: 00000001 0000012c dfbdcc40 ffffe70c dfbdcc48 df5bd800 00000002 de0c1d30
1d20: c00712d4 c00712bc 20010113 ffffffff
[<c0011e40>] (__irq_svc) from [<c00712bc>] (generic_exec_single+0x10c/0x180)
[<c00712bc>] (generic_exec_single) from [<c00713d0>] (smp_call_function_single+0xa0/0xcc)
[<c00713d0>] (smp_call_function_single) from [<c0071818>] (on_each_cpu+0x2c/0x48)
[<c0071818>] (on_each_cpu) from [<c03312dc>] (mvpp2_poll+0x30/0x594)
[<c03312dc>] (mvpp2_poll) from [<c041d024>] (net_rx_action+0xb0/0x170)
[<c041d024>] (net_rx_action) from [<c00220c4>] (__do_softirq+0x120/0x274)
[<c00220c4>] (__do_softirq) from [<c0022468>] (irq_exit+0x78/0xb0)
[<c0022468>] (irq_exit) from [<c000eb00>] (handle_IRQ+0x44/0x90)
[<c000eb00>] (handle_IRQ) from [<c00086d0>] (gic_handle_irq+0x2c/0x5c)
[<c00086d0>] (gic_handle_irq) from [<c0011e40>] (__irq_svc+0x40/0x54)
Exception stack(0xde0c1eb8 to 0xde0c1f00)
1ea0:                                                       de1b986c 00000000
1ec0: 00000420 de1b986c de1b9800 d761c080 be913a34 be913a34 00000007 de0c0000
1ee0: d761c0a0 be913a34 00000010 de0c1f00 c0491898 c0491918 60010013 ffffffff
[<c0011e40>] (__irq_svc) from [<c0491918>] (unix_inq_len+0x9c/0xa8)
[<c0491918>] (unix_inq_len) from [<c049194c>] (unix_ioctl+0x28/0x88)
[<c049194c>] (unix_ioctl) from [<c0407ccc>] (sock_ioctl+0x124/0x280)
[<c0407ccc>] (sock_ioctl) from [<c00c11bc>] (do_vfs_ioctl+0x3fc/0x5c0)
[<c00c11bc>] (do_vfs_ioctl) from [<c00c13b4>] (SyS_ioctl+0x34/0x5c)
[<c00c13b4>] (SyS_ioctl) from [<c000e220>] (ret_fast_syscall+0x0/0x30)
Task dump for CPU 1:
kiplink_admin.f R running      0  1932   1549 0x00000000
[<c0513a04>] (__schedule) from [<00000007>] (0x7)

If my analysis is correct, what happens on CPU0 is that:

 * lighttpd does an ioctl() on a socket, which ends up calling
   unix_inq_len(), which tries to get a spinlock using spin_lock(). The
   lock is probably taken.

 * while waiting for this lock, we get a network RX interrupt, which
   schedules the network RX softirq, which ends up calling the ->poll()
   function of the network driver, in our case mvpp2_poll().

 * since the network hardware has some per-CPU registers that we need
   to read on all CPUs, the network driver does an on_each_cpu() call.
   This apparently leads nowhere, as after a while, the timer interrupt
   kicks in and decides we're not making progress anymore.

After enabling spinlock debugging, I get the following right before the
RCU stall (note how the RCU stall happens on CPU0, while the spinlock
lockup suspected happens on CPU1) :

BUG: spinlock lockup suspected on CPU#1, kiplink_admin.f/1938
 lock: 0xde4998c0, .magic: dead4ead, .owner: lighttpd/1910, .owner_cpu: 0
CPU: 1 PID: 1938 Comm: kiplink_admin.f Tainted: G        W  O   3.17.0-00017-g53fa061 #2
[<c00154d8>] (unwind_backtrace) from [<c001183c>] (show_stack+0x10/0x14)
[<c001183c>] (show_stack) from [<c053f560>] (dump_stack+0x9c/0xbc)
[<c053f560>] (dump_stack) from [<c0057338>] (do_raw_spin_lock+0x118/0x18c)
[<c0057338>] (do_raw_spin_lock) from [<c05466fc>] (_raw_spin_lock_irqsave+0x60/0x6c)
[<c05466fc>] (_raw_spin_lock_irqsave) from [<c042a7d4>] (skb_queue_tail+0x18/0x48)
[<c042a7d4>] (skb_queue_tail) from [<c04b9f58>] (unix_stream_sendmsg+0x1c8/0x36c)
[<c04b9f58>] (unix_stream_sendmsg) from [<c0422eb8>] (sock_aio_write+0xcc/0xec)
[<c0422eb8>] (sock_aio_write) from [<c00bf414>] (do_sync_write+0x80/0xa8)
[<c00bf414>] (do_sync_write) from [<c00bfe60>] (vfs_write+0x108/0x1b0)
[<c00bfe60>] (vfs_write) from [<c00c0418>] (SyS_write+0x40/0x94)
[<c00c0418>] (SyS_write) from [<c000e3a0>] (ret_fast_syscall+0x0/0x48)

And interestingly, skb_queue_tail() is also taking the same spinlock as
unix_inq_len(), except that it does so with spin_lock_irqsave(). And
this is causing the issue: since this spin_lock_irqsave() takes place
on CPU1, the interrupts are disabled, and therefore we're not getting
the IPI that allows the on_each_cpu() call coming from CPU0 to make
progress, causing the lockup.

The patch below has proven to fix the issue: I was able to reproduce
the issue in maximum 5 to 10 minutes, and with the patch the system has
survived an entire night of testing.

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e968843..c60205a 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2124,11 +2124,12 @@ long unix_inq_len(struct sock *sk)
 {
        struct sk_buff *skb;
        long amount = 0;
+       unsigned long flags;
 
        if (sk->sk_state == TCP_LISTEN)
                return -EINVAL;
 
-       spin_lock(&sk->sk_receive_queue.lock);
+       spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
        if (sk->sk_type == SOCK_STREAM ||
            sk->sk_type == SOCK_SEQPACKET) {
                skb_queue_walk(&sk->sk_receive_queue, skb)
@@ -2138,7 +2139,7 @@ long unix_inq_len(struct sock *sk)
                if (skb)
                        amount = skb->len;
        }
-       spin_unlock(&sk->sk_receive_queue.lock);
+       spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
 
        return amount;
 }

So, the question is: is this patch the correct solution (in which case
other uses of spin_lock() in af_unix.c might also need fixing)? Or is
the network driver at fault?

Thanks for your input,

Thomas
-- 
Thomas Petazzoni, CTO, Free Electrons
Embedded Linux, Kernel and Android engineering
http://free-electrons.com

^ permalink raw reply related

* Re: [PATCH] netlink: Re-add locking to netlink_lookup() and seq walker
From: Heiko Carstens @ 2014-10-21  7:04 UTC (permalink / raw)
  To: Thomas Graf
  Cc: Eric Dumazet, Sasha Levin, paulmck, Nikolay Aleksandrov,
	David S. Miller, netdev, linux-kernel, Ursula Braun
In-Reply-To: <20141020195355.GA2299@casper.infradead.org>

On Mon, Oct 20, 2014 at 08:53:55PM +0100, Thomas Graf wrote:
> Heiko,
> 
> Can you test the following patch:
> 
> The synchronize_rcu() in netlink_release() introduces unacceptable
> latency. Reintroduce minimal lookup so we can drop the
> synchronize_rcu() until socket destruction has been RCUfied.
> 
> Signed-off-by: Thomas Graf <tgraf@suug.ch>
> ---
>  net/netlink/af_netlink.c | 37 +++++++++++++++++++++++++------------
>  1 file changed, 25 insertions(+), 12 deletions(-)

Thanks a lot! Your patch fixes the issue for me.

Reported-and-tested-by: Heiko Carstens <heiko.carstens@de.ibm.com>

^ permalink raw reply

* [RFC PATCH v2 net-next] net: ipv6: Add a sysctl to make optimistic addresses useful candidates
From: Erik Kline @ 2014-10-21  4:05 UTC (permalink / raw)
  To: netdev; +Cc: ben, lorenzo, hannes, Erik Kline

Add a sysctl that causes an interface's optimistic addresses
to be considered equivalent to other non-deprecated addresses
for source address selection purposes.  Preferred addresses
will still take precedence over optimistic addresses, subject
to other ranking in the source address selection algorithm.

This is useful where different interfaces are connected to
different networks from different ISPs (e.g., a cell network
and a home wifi network).

The current behaviour complies with RFC 3484/6724, and it
makes sense if the host has only one interface, or has
multiple interfaces on the same network (same or cooperating
administrative domain(s)), but not in the multiple distinct
networks case.

For example, if a mobile device has an IPv6 address on an LTE
network and then connects to IPv6-enabled wifi, while the wifi
IPv6 address is undergoing DAD, IPv6 connections will try to use
the wifi default route with the LTE IPv6 address, and will get
stuck until they time out.

Also, because optimistic addresses can actually be used, issue
an RTM_NEWADDR as soon as DAD starts.  If DAD fails, a separate
RTM_DELADDR will be sent.

Also: add an entry in ip-sysctl.txt for optimistic_dad.

Signed-off-by: Erik Kline <ek@google.com>
---
 Documentation/networking/ip-sysctl.txt | 13 ++++++++++++
 include/linux/ipv6.h                   |  1 +
 include/uapi/linux/ipv6.h              |  1 +
 net/ipv6/addrconf.c                    | 36 ++++++++++++++++++++++++++++++++--
 4 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 0307e28..e03cf49 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1452,6 +1452,19 @@ suppress_frag_ndisc - INTEGER
 	1 - (default) discard fragmented neighbor discovery packets
 	0 - allow fragmented neighbor discovery packets
 
+optimistic_dad - BOOLEAN
+	Whether to perform Optimistic Duplicate Address Detection (RFC 4429).
+		0: disabled (default)
+		1: enabled
+
+use_optimistic - BOOLEAN
+	If enabled, do not classify optimistic addresses as deprecated during
+	source address selection.  Preferred addresses will still be chosen
+	before optimistic addresses, subject to other ranking in the source
+	address selection algorithm.
+		0: disabled (default)
+		1: enabled
+
 icmp/*:
 ratelimit - INTEGER
 	Limit the maximal rates for sending ICMPv6 packets.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index ff56053..7121a2e 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -42,6 +42,7 @@ struct ipv6_devconf {
 	__s32		accept_ra_from_local;
 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 	__s32		optimistic_dad;
+	__s32		use_optimistic;
 #endif
 #ifdef CONFIG_IPV6_MROUTE
 	__s32		mc_forwarding;
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index efa2666..e863d08 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -164,6 +164,7 @@ enum {
 	DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL,
 	DEVCONF_SUPPRESS_FRAG_NDISC,
 	DEVCONF_ACCEPT_RA_FROM_LOCAL,
+	DEVCONF_USE_OPTIMISTIC,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 725c763..c2fddb7 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1169,6 +1169,9 @@ enum {
 	IPV6_SADDR_RULE_LABEL,
 	IPV6_SADDR_RULE_PRIVACY,
 	IPV6_SADDR_RULE_ORCHID,
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+	IPV6_SADDR_RULE_NOT_OPTIMISTIC,
+#endif
 	IPV6_SADDR_RULE_PREFIX,
 	IPV6_SADDR_RULE_MAX
 };
@@ -1257,10 +1260,17 @@ static int ipv6_get_saddr_eval(struct net *net,
 		score->scopedist = ret;
 		break;
 	case IPV6_SADDR_RULE_PREFERRED:
+	    {
 		/* Rule 3: Avoid deprecated and optimistic addresses */
+		u8 avoid = IFA_F_DEPRECATED;
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+		if (!score->ifa->idev->cnf.use_optimistic)
+			avoid |= IFA_F_OPTIMISTIC;
+#endif
 		ret = ipv6_saddr_preferred(score->addr_type) ||
-		      !(score->ifa->flags & (IFA_F_DEPRECATED|IFA_F_OPTIMISTIC));
+		      !(score->ifa->flags & avoid);
 		break;
+	    }
 #ifdef CONFIG_IPV6_MIP6
 	case IPV6_SADDR_RULE_HOA:
 	    {
@@ -1299,6 +1309,14 @@ static int ipv6_get_saddr_eval(struct net *net,
 		ret = !(ipv6_addr_orchid(&score->ifa->addr) ^
 			ipv6_addr_orchid(dst->addr));
 		break;
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+	case IPV6_SADDR_RULE_NOT_OPTIMISTIC:
+		/* Optimistic addresses still have lower precedence than other
+		 * preferred addresses.
+		 */
+		ret = !(score->ifa->flags & IFA_F_OPTIMISTIC);
+		break;
+#endif
 	case IPV6_SADDR_RULE_PREFIX:
 		/* Rule 8: Use longest matching prefix */
 		ret = ipv6_addr_diff(&score->ifa->addr, dst->addr);
@@ -3222,8 +3240,13 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
 	 * Optimistic nodes can start receiving
 	 * Frames right away
 	 */
-	if (ifp->flags & IFA_F_OPTIMISTIC)
+	if (ifp->flags & IFA_F_OPTIMISTIC) {
 		ip6_ins_rt(ifp->rt);
+		/* Because optimistic nodes can receive frames, notify
+		 * listeners. If DAD fails, RTM_DELADDR is sent.
+		 */
+		ipv6_ifa_notify(RTM_NEWADDR, ifp);
+	}
 
 	addrconf_dad_kick(ifp);
 out:
@@ -4330,6 +4353,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_ACCEPT_SOURCE_ROUTE] = cnf->accept_source_route;
 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 	array[DEVCONF_OPTIMISTIC_DAD] = cnf->optimistic_dad;
+	array[DEVCONF_USE_OPTIMISTIC] = cnf->use_optimistic;
 #endif
 #ifdef CONFIG_IPV6_MROUTE
 	array[DEVCONF_MC_FORWARDING] = cnf->mc_forwarding;
@@ -5155,6 +5179,14 @@ static struct addrconf_sysctl_table
 			.proc_handler   = proc_dointvec,
 
 		},
+		{
+			.procname       = "use_optimistic",
+			.data           = &ipv6_devconf.use_optimistic,
+			.maxlen         = sizeof(int),
+			.mode           = 0644,
+			.proc_handler   = proc_dointvec,
+
+		},
 #endif
 #ifdef CONFIG_IPV6_MROUTE
 		{
-- 
2.1.0.rc2.206.gedb03e5

^ permalink raw reply related

* Re: [RFC PATCH net-next] net: ipv6: Add a sysctl to make optimistic addresses useful candidates
From: Erik Kline @ 2014-10-21  4:02 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: netdev, hannes, Lorenzo Colitti, Eric Dumazet
In-Reply-To: <1413844815.31953.32.camel@decadent.org.uk>

Thanks.  Patch v2 coming shortly.

On Tue, Oct 21, 2014 at 7:40 AM, Ben Hutchings <ben@decadent.org.uk> wrote:
> On Fri, 2014-10-17 at 13:31 +0900, Erik Kline wrote:
> [...]
>> --- a/include/uapi/linux/ipv6.h
>> +++ b/include/uapi/linux/ipv6.h
>> @@ -154,6 +154,7 @@ enum {
>>       DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN,
>>       DEVCONF_PROXY_NDP,
>>       DEVCONF_OPTIMISTIC_DAD,
>> +     DEVCONF_USE_OPTIMISTIC,
>>       DEVCONF_ACCEPT_SOURCE_ROUTE,
>>       DEVCONF_MC_FORWARDING,
>>       DEVCONF_DISABLE_IPV6,
> [...]
>
> As this is UAPI, the new entry must be added at the end of the
> enumeration (well, before DEVCONF_MAX).
>
> Ben.
>
> --
> Ben Hutchings
> Reality is just a crutch for people who can't handle science fiction.

^ permalink raw reply

* [net-next] iptunnel: Fix iptunnel_xmit return code for stats maintenance
From: Andy Zhou @ 2014-10-21  2:22 UTC (permalink / raw)
  To: davem; +Cc: netdev, Andy Zhou

iptunnel_xmit() currently always returns >= 0 instead of a proper error
code, which is used to maintain stats. For example, the current return code
conflicts with how iptunnel_xmit_stats() maintains stats.

Unfortunately, the return code can not be changed without readjusting
how SKB memory is managed through the call chain.  The following two
rules are adopted for this patch:

1) Proper error codes are always propagated back through the call chain
   so that the caller can maintain stats.

2) Tunnel xmit functions always free resources, e.g. skb and route
   entry.

Signed-off-by: Andy Zhou <azhou@nicira.com>
---
 drivers/net/vxlan.c            |   21 +++++++++++++--------
 include/net/ip_tunnels.h       |    7 +++++++
 net/ipv4/geneve.c              |    8 ++++++--
 net/ipv4/ip_tunnel_core.c      |   14 +++++++++++---
 net/openvswitch/vport-geneve.c |    5 ++---
 net/openvswitch/vport-gre.c    |    1 +
 net/openvswitch/vport-vxlan.c  |    6 +++---
 net/openvswitch/vport.c        |    8 ++++----
 8 files changed, 47 insertions(+), 23 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index ca30982..93348cb 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1626,8 +1626,10 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
 	bool udp_sum = !vs->sock->sk->sk_no_check_tx;
 
 	skb = udp_tunnel_handle_offloads(skb, udp_sum);
-	if (IS_ERR(skb))
-		return -EINVAL;
+	if (IS_ERR(skb)) {
+		err = -EINVAL;
+		goto error;
+	}
 
 	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
 			+ VXLAN_HLEN + sizeof(struct iphdr)
@@ -1636,13 +1638,15 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
 	/* Need space for new headers (invalidates iph ptr) */
 	err = skb_cow_head(skb, min_headroom);
 	if (unlikely(err))
-		return err;
+		goto error;
 
 	if (vlan_tx_tag_present(skb)) {
 		if (WARN_ON(!__vlan_put_tag(skb,
 					    skb->vlan_proto,
-					    vlan_tx_tag_get(skb))))
-			return -ENOMEM;
+					    vlan_tx_tag_get(skb)))) {
+			err = -ENOMEM;
+			goto error;
+		}
 
 		skb->vlan_tci = 0;
 	}
@@ -1655,6 +1659,10 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
 
 	return udp_tunnel_xmit_skb(vs->sock, rt, skb, src, dst, tos,
 				   ttl, df, src_port, dst_port, xnet);
+error:
+	kfree_skb(skb);
+	ip_rt_put(rt);
+	return err;
 }
 EXPORT_SYMBOL_GPL(vxlan_xmit_skb);
 
@@ -1786,9 +1794,6 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 				     tos, ttl, df, src_port, dst_port,
 				     htonl(vni << 8),
 				     !net_eq(vxlan->net, dev_net(vxlan->dev)));
-
-		if (err < 0)
-			goto rt_tx_error;
 		iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
 #if IS_ENABLED(CONFIG_IPV6)
 	} else {
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 5bc6ede..80bcf2e 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -174,6 +174,13 @@ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph,
 }
 
 int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto);
+
+/* Transmit a packet over IP tunnel
+ * Returns:
+ *	0 Congestion notification received
+ *	>0  Number of bytes in the packet successfully sent
+ *	<0 packet dropped due to error
+ */
 int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
 		  __be32 src, __be32 dst, __u8 proto,
 		  __u8 tos, __u8 ttl, __be16 df, bool xnet);
diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
index 065cd94..90fea48 100644
--- a/net/ipv4/geneve.c
+++ b/net/ipv4/geneve.c
@@ -129,14 +129,14 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
 
 	err = skb_cow_head(skb, min_headroom);
 	if (unlikely(err))
-		return err;
+		goto error;
 
 	if (vlan_tx_tag_present(skb)) {
 		if (unlikely(!__vlan_put_tag(skb,
 					     skb->vlan_proto,
 					     vlan_tx_tag_get(skb)))) {
 			err = -ENOMEM;
-			return err;
+			goto error;
 		}
 		skb->vlan_tci = 0;
 	}
@@ -146,6 +146,10 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
 
 	return udp_tunnel_xmit_skb(gs->sock, rt, skb, src, dst,
 				   tos, ttl, df, src_port, dst_port, xnet);
+error:
+	kfree_skb(skb);
+	ip_rt_put(rt);
+	return err;
 }
 EXPORT_SYMBOL_GPL(geneve_xmit_skb);
 
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 88c386c..b3ba4a3 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -77,9 +77,17 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
 	__ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1);
 
 	err = ip_local_out_sk(sk, skb);
-	if (unlikely(net_xmit_eval(err)))
-		pkt_len = 0;
-	return pkt_len;
+
+	/* Deal with positive error numbers. Filter out NET_XMIT_CN */
+	if (err > 0)
+		return net_xmit_errno(err);
+
+	/* Success, return number of bytes transmitted */
+	if (err == 0)
+		err = pkt_len;
+
+	/* Return pkt_len or an error code */
+	return err;
 }
 EXPORT_SYMBOL_GPL(iptunnel_xmit);
 
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
index 106a9d8..34276fb 100644
--- a/net/openvswitch/vport-geneve.c
+++ b/net/openvswitch/vport-geneve.c
@@ -206,15 +206,14 @@ static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb)
 	tunnel_id_to_vni(tun_key->tun_id, vni);
 	skb->ignore_df = 1;
 
-	err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr,
+	return  geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr,
 			      tun_key->ipv4_dst, tun_key->ipv4_tos,
 			      tun_key->ipv4_ttl, df, sport, dport,
 			      tun_key->tun_flags, vni,
 			      tun_info->options_len, (u8 *)tun_info->options,
 			      false);
-	if (err < 0)
-		ip_rt_put(rt);
 error:
+	kfree_skb(skb);
 	return err;
 }
 
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index 108b82d..8721b30 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -200,6 +200,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
 err_free_rt:
 	ip_rt_put(rt);
 error:
+	kfree_skb(skb);
 	return err;
 }
 
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index 2735e01..ace849a 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -174,15 +174,15 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
 
 	src_port = udp_flow_src_port(net, skb, 0, 0, true);
 
-	err = vxlan_xmit_skb(vxlan_port->vs, rt, skb,
+	return vxlan_xmit_skb(vxlan_port->vs, rt, skb,
 			     fl.saddr, tun_key->ipv4_dst,
 			     tun_key->ipv4_tos, tun_key->ipv4_ttl, df,
 			     src_port, dst_port,
 			     htonl(be64_to_cpu(tun_key->tun_id) << 8),
 			     false);
-	if (err < 0)
-		ip_rt_put(rt);
+
 error:
+	kfree_skb(skb);
 	return err;
 }
 
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 6015802..eb0a72f 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -480,11 +480,11 @@ int ovs_vport_send(struct vport *vport, struct sk_buff *skb)
 		stats->tx_packets++;
 		stats->tx_bytes += sent;
 		u64_stats_update_end(&stats->syncp);
-	} else if (sent < 0) {
-		ovs_vport_record_error(vport, VPORT_E_TX_ERROR);
-		kfree_skb(skb);
-	} else
+	} else if (sent == -ENOBUFS || sent == -ENOMEM || sent == 0) {
 		ovs_vport_record_error(vport, VPORT_E_TX_DROPPED);
+	} else {
+		ovs_vport_record_error(vport, VPORT_E_TX_ERROR);
+	}
 
 	return sent;
 }
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH 2/2][V2] xfrm6: fix a potential use after free in xfrm6_policy.c
From: roy.qing.li @ 2014-10-21  0:34 UTC (permalink / raw)
  To: netdev

From: Li RongQing <roy.qing.li@gmail.com>

pskb_may_pull() may change skb->data and make the nh and exthdr pointers
obsolete, so recompute nh and exthdr

V2: insert a space between data type (like __be16) and * as suggested by
Sergei Shtylyov

Signed-off-by: Li RongQing <roy.qing.li@gmail.com>
---
 net/ipv6/xfrm6_policy.c |   11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index ac49f84..115fd3b 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -170,8 +170,10 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
 		case IPPROTO_DCCP:
 			if (!onlyproto && (nh + offset + 4 < skb->data ||
 			     pskb_may_pull(skb, nh + offset + 4 - skb->data))) {
-				__be16 *ports = (__be16 *)exthdr;
+				__be16 *ports;
 
+				nh = skb_network_header(skb);
+				ports = (__be16 *)(nh + offset);
 				fl6->fl6_sport = ports[!!reverse];
 				fl6->fl6_dport = ports[!reverse];
 			}
@@ -180,8 +182,10 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
 
 		case IPPROTO_ICMPV6:
 			if (!onlyproto && pskb_may_pull(skb, nh + offset + 2 - skb->data)) {
-				u8 *icmp = (u8 *)exthdr;
+				u8 *icmp;
 
+				nh = skb_network_header(skb);
+				icmp = (u8 *)(nh + offset);
 				fl6->fl6_icmp_type = icmp[0];
 				fl6->fl6_icmp_code = icmp[1];
 			}
@@ -192,8 +196,9 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
 		case IPPROTO_MH:
 			if (!onlyproto && pskb_may_pull(skb, nh + offset + 3 - skb->data)) {
 				struct ip6_mh *mh;
-				mh = (struct ip6_mh *)exthdr;
 
+				nh = skb_network_header(skb);
+				mh = (struct ip6_mh *)(nh + offset);
 				fl6->fl6_mh_type = mh->ip6mh_type;
 			}
 			fl6->flowi6_proto = nexthdr;
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH] netfilter: Fix wastful cleanup check for unconfirmed conn in get_next_corpse
From: Feng Gao @ 2014-10-21  0:31 UTC (permalink / raw)
  To: Pablo Neira Ayuso, Patrick McHardy, kadlec, davem
  Cc: Netfilter Developer Mailing List, coreteam, netdev, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 788 bytes --]

Hi all,

I am sorry to send the patch commit again because the last email is
not plain text and is rejected by some servers.

This is the patch to branch master of kernel.

The function get_next_corpse is only invoked by nf_ct_iterate_cleanup
in one while loop, and it will check the per cpu unconfirmed conntrack
list every time.

I think the whole list of unconfirmed conntracks could be accessed by
one call, so the others are not necessary.

So I moved the checks out of get_next_corpse and created a new
function, clean_up_unconfirmed_conntracks.
nf_ct_iterate_cleanup now invokes
clean_up_unconfirmed_conntracks after the while loop.

This code has existed for a long time. At first I thought there might
be a reason for it, but I could not find one.


Best Regards
Feng

[-- Attachment #2: 0001-netfilter-Fix-wastful-cleanup-check-for-unconfirmed-.patch --]
[-- Type: application/octet-stream, Size: 2964 bytes --]

From 52de457b3cfd1e94a52df1c3dfcd9dbf3511fa0d Mon Sep 17 00:00:00 2001
From: fgao <gfree.wind@gmail.com>
Date: Mon, 20 Oct 2014 07:18:05 -0700
Subject: [PATCH 1/1] netfilter: Fix wastful cleanup check for unconfirmed conn in get_next_corpse

The function get_next_corpse is used to iterate the conntracks.
It will check the per cpu unconfirmed list of every cpu too.
Now it is only invoked by nf_ct_iterate_cleanup in one while loop.
Actually the unconfirmed list can be processed completely in one call, so the other checks are wasteful.

So move the unconfirmed list check out of get_next_corpse into a new function,
and let nf_ct_iterate_cleanup invoke the new function clean_up_unconfirmed_conntracks once after the loop.

Signed-off-by: fgao <gfree.wind@gmail.com>
---
 net/netfilter/nf_conntrack_core.c | 36 ++++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 5016a69..ace7c2c2 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1348,6 +1348,28 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
 	nf_conntrack_get(nskb->nfct);
 }
 
+static void clean_up_unconfirmed_conntracks(struct net *net,
+		int (*iter)(struct nf_conn *i, void *data),
+		void *data)
+{
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conn *ct;
+	struct hlist_nulls_node *n;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+		spin_lock_bh(&pcpu->lock);
+		hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
+			ct = nf_ct_tuplehash_to_ctrack(h);
+			if (iter(ct, data))
+				set_bit(IPS_DYING_BIT, &ct->status);
+		}
+		spin_unlock_bh(&pcpu->lock);
+	}
+}
+
 /* Bring out ya dead! */
 static struct nf_conn *
 get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
@@ -1356,7 +1378,6 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct;
 	struct hlist_nulls_node *n;
-	int cpu;
 	spinlock_t *lockp;
 
 	for (; *bucket < net->ct.htable_size; (*bucket)++) {
@@ -1376,17 +1397,6 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
 		local_bh_enable();
 	}
 
-	for_each_possible_cpu(cpu) {
-		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
-
-		spin_lock_bh(&pcpu->lock);
-		hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
-			ct = nf_ct_tuplehash_to_ctrack(h);
-			if (iter(ct, data))
-				set_bit(IPS_DYING_BIT, &ct->status);
-		}
-		spin_unlock_bh(&pcpu->lock);
-	}
 	return NULL;
 found:
 	atomic_inc(&ct->ct_general.use);
@@ -1411,6 +1421,8 @@ void nf_ct_iterate_cleanup(struct net *net,
 
 		nf_ct_put(ct);
 	}
+
+	clean_up_unconfirmed_conntracks(net, iter, data);
 }
 EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
 
-- 
1.9.1


^ permalink raw reply related

* Re: Queue with wait-free enqueue, blocking dequeue, splice
From: Mathieu Desnoyers @ 2014-10-21  0:04 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: Paul E. McKenney, netdev, Jamal Hadi Salim
In-Reply-To: <20141020160237.302aa17c@redhat.com>

----- Original Message -----
> From: "Jesper Dangaard Brouer" <brouer@redhat.com>
> To: "Mathieu Desnoyers" <mathieu.desnoyers@efficios.com>
> Cc: brouer@redhat.com, "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>, netdev@vger.kernel.org, "Jamal Hadi Salim"
> <jhs@mojatatu.com>
> Sent: Monday, October 20, 2014 10:02:37 AM
> Subject: Re: Queue with wait-free enqueue, blocking dequeue, splice
> 
> 
> On Sat, 18 Oct 2014 11:48:12 +0000 (UTC) Mathieu Desnoyers
> <mathieu.desnoyers@efficios.com> wrote:
> 
> > Following our LPC discussion on lock-free queue algorithms
> > for qdisc, here is some info on the wfcqueue implementation
> > found in Userspace RCU. See http://urcu.so for info and
> > git repository.
> 
> Thank for following up on our very interesting discussions.
> 
> I've started with the more simple variant "urcu/static/wfqueue.h" to
> understand the concepts.  And I'm now reading wfcqueue.h, which I guess
> it replacing wfqueue.h.

Yep, this is correct.

> 
>  
> > Here is the wfcqueue ported to the Linux kernel I sent last
> > year as RFC:
> > https://lkml.org/lkml/2013/3/14/289
> > 
> > I'm very interested to learn if it fits well for your
> > use-case,
> 
> Does this wfcqueue API support bulk dequeue?  (A feature needed for the
> lock-less qdisc implementation, else it cannot compete with our new
> bulk dequeue strategy).

Yes, it does. See the "__wfcq_splice" API.

> 
> AFAIK your queue implementation is a CAS-based, Wait-Free on enqueue,
> but Lock-Free on dequeue with the potential for waiting/blocking on
> a enqueue processes.
>  I'm not 100% sure, that we want this behavior for the qdisc system.

It's actually xchg-based (not CAS-based). It is indeed wait-free
in the strictest sense of the term on enqueue (at least on x86,
some other arch implement xchg using ll/sc, which is not strictly
wait-free).

On dequeue, it can busy-wait for a short while until the enqueue
completes. Note that in the kernel, since we disable preemption during
enqueue, the dequeue does not have to ever block, just busy-looping
is OK, since the longest thing that could nest over the enqueue
is possibly an interrupt and softirq. So yes, I guess the dequeue
would qualify as lock-free.

> 
> I can certainly use the wfcq_empty() check,

Not sure why you would want to use it, considering that the dequeue
operation implies it. If there is nothing to dequeue, we just return
immediately. The dequeue operation does not block on an empty queue. It
just busy-waits if it happens to see the queue in an intermediate
state of the enqueue operation (which is very short, a few instructions
at most, with preemption disabled).

> but I guess I need to
> maintain a separate counter to maintain the qdisc limit, right?
> (I would use the approximate/split counter API percpu_counter to keep
> this scalable, and wfcq_empty() would provide an accurate empty check)

Yes for split counters, not sure why you need the empty check explicitly
in your use-case though.

> 
> 
> I think, we/I should start micro benchmarking the different approaches.
> As our time budget is only 67.2ns
>  http://netoptimizer.blogspot.dk/2014/05/the-calculations-10gbits-wirespeed.html
> (or bulking tricks artificially "increase" this budget)
> 
> 
> The motivation behind this lockless qdisc is, the current qdisc locking
> cost 48ns, see slide 9/16 titled "Qdisc locking is nasty":
>  http://people.netfilter.org/hawk/presentations/LinuxPlumbers2014/performance_tx_qdisc_bulk_LPC2014.pdf
> 

Chances are that the scheme:

__wfcq_enqueue() on enqueue

__wfcq_splice() for bulk dequeue

__wfcq_first()
__wfcq_next()  for iteration on the splice dest queue

Would be much faster than the ton of locks you use currently.
The nice part about using just enqueue and splice is that you
don't need locking on the queue at all. Iteration on the
destination queue can be done by a single thread, so no need
for explicit locking there either.

By the way, you should look at the kernel wfcq implementation
I posted earlier. It's more compact than the one in userspace RCU
because we don't need to support blocking/nonblocking modes,
because we have the luxury of disabling preemption in the kernel.

I look forward to seeing the numbers,

Thanks!

Mathieu

> --
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Sr. Network Kernel Developer at Red Hat
>   Author of http://www.iptv-analyzer.org
>   LinkedIn: http://www.linkedin.com/in/brouer
> 

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply

* Re: [RFC v2 2/6] driver-core: add driver async_probe support
From: Luis R. Rodriguez @ 2014-10-20 23:43 UTC (permalink / raw)
  To: Dmitry Torokhov
  Cc: Greg Kroah-Hartman, Wu Zhangjin, Takashi Iwai, Tejun Heo,
	Arjan van de Ven, linux-kernel@vger.kernel.org, Oleg Nesterov,
	hare, Andrew Morton, Tetsuo Handa, Joseph Salisbury,
	Benjamin Poirier, Santosh Rastapur, Kay Sievers,
	One Thousand Gnomes, Tim Gardner, Pierre Fersing,
	Nagalakshmi Nandigama, Praveen Krishnamoorthy, Sreekanth Reddy,
	Abhijit Mahajan <abhi
In-Reply-To: <20140905221029.GA35667@core.coreip.homeip.net>

> diff --git a/drivers/base/bus.c b/drivers/base/bus.c
> index 83e910a..49fe573 100644
> --- a/drivers/base/bus.c
> +++ b/drivers/base/bus.c
> @@ -10,6 +10,7 @@
>   *
>   */
>
> +#include <linux/async.h>
>  #include <linux/device.h>
>  #include <linux/module.h>
>  #include <linux/errno.h>
> @@ -547,15 +548,12 @@ void bus_probe_device(struct device *dev)
>  {
>         struct bus_type *bus = dev->bus;
>         struct subsys_interface *sif;
> -       int ret;
>
>         if (!bus)
>                 return;
>
> -       if (bus->p->drivers_autoprobe) {
> -               ret = device_attach(dev);
> -               WARN_ON(ret < 0);
> -       }
> +       if (bus->p->drivers_autoprobe)
> +               device_initial_probe(dev);
>
>         mutex_lock(&bus->p->mutex);
>         list_for_each_entry(sif, &bus->p->interfaces, node)
> @@ -657,6 +655,17 @@ static ssize_t uevent_store(struct device_driver *drv, const char *buf,
>  }
>  static DRIVER_ATTR_WO(uevent);

Based on my review with my latest changes this is what I was missing,
I'll be sure to address this.

 Luis

^ permalink raw reply

* Re: [RFC PATCH net-next] net: ipv6: Add a sysctl to make optimistic addresses useful candidates
From: Ben Hutchings @ 2014-10-20 22:40 UTC (permalink / raw)
  To: Erik Kline; +Cc: netdev, hannes, lorenzo, edumazet
In-Reply-To: <1413520309-13814-1-git-send-email-ek@google.com>

[-- Attachment #1: Type: text/plain, Size: 588 bytes --]

On Fri, 2014-10-17 at 13:31 +0900, Erik Kline wrote:
[...]
> --- a/include/uapi/linux/ipv6.h
> +++ b/include/uapi/linux/ipv6.h
> @@ -154,6 +154,7 @@ enum {
>  	DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN,
>  	DEVCONF_PROXY_NDP,
>  	DEVCONF_OPTIMISTIC_DAD,
> +	DEVCONF_USE_OPTIMISTIC,
>  	DEVCONF_ACCEPT_SOURCE_ROUTE,
>  	DEVCONF_MC_FORWARDING,
>  	DEVCONF_DISABLE_IPV6,
[...]

As this is UAPI, the new entry must be added at the end of the
enumeration (well, before DEVCONF_MAX).

Ben.

-- 
Ben Hutchings
Reality is just a crutch for people who can't handle science fiction.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 811 bytes --]

^ permalink raw reply

* Re: [linux-nics] [PATCH 1/1] e1000: unset IFF_UNICAST_FLT on WMware 82545EM
From: Jeff Kirsher @ 2014-10-20 22:25 UTC (permalink / raw)
  To: Francesco Ruggeri
  Cc: netdev, linux.nics, pv-drivers, fruggeri, linux-kernel,
	e1000-devel, davem
In-Reply-To: <20141020221115.1FC9B480090@fruggeri-Arora18.sjc.aristanetworks.com>

[-- Attachment #1: Type: text/plain, Size: 551 bytes --]

On Mon, 2014-10-20 at 15:11 -0700, Francesco Ruggeri wrote:
> VMWare's e1000 implementation does not seem to support unicast
> filtering.
> This can be observed by configuring a macvlan interface on eth0 in a
> VM in
> VMWare Fusion 5.0.5, and trying to use that interface instead of eth0.
> Tested on 3.16.
> 
> Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
> ---
>  drivers/net/ethernet/intel/e1000/e1000_main.c | 5 ++++-
>  1 file changed, 4 insertions(+), 1 deletion(-)

Thanks Francesco, I will add your patch to my queue.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* Re: qdisc running
From: Jamal Hadi Salim @ 2014-10-20 22:17 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: john Fastabend, Herbert Xu, netdev@vger.kernel.org, eric Dumazet,
	Mathieu Desnoyers
In-Reply-To: <20141020181756.2c8f33b9@redhat.com>

On 10/20/14 12:17, Jesper Dangaard Brouer wrote:
>
> On Sun, 19 Oct 2014 15:24:42 -0400 Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>

> I guess it is good for our recent dequeue batching.

It is i think ;->

> But I think/hope we
> can come up with a scheme that does not requires 6 lock/unlock
> operations (as illustrated on slide 9).
>

To be clear:
2 locks + 2 unlock and 2 atomic ops.


> John and I have talked about doing a lockless qdisc, but maintaining
> this __QDISC___STATE_RUNNING in a lockless scenario, would cost us
> extra atomic ops...
>

In the animation this __QDISC___STATE_RUNNING is shown as "occupied"
flag. It is like someone is in the toilet and you cant come in;->
They have to finish dropping the packages into the toilet^Whardware ;->
If it is occupied, you put your package outside and go.

> Are we still sure, that this model of only allowing a single CPU in the
> dequeue path, is still the best solution?

For sure it is the best if you want to batch. Look at that last orange
guy picking all the packages (busylock.swf). This is where all the
batching would  happen.

>(The TXQ lock should already
> protect several CPUs in this code path).


Note:
Maybe for the orange guy (the dequeur) the tx lock could
be avoided? Double check the code. Important to note under
busy period contention is reduced to :
1 lock + 1 unlock + 2 atomic ops for N-1 CPUs.
The orange guy on the other hand is doing 2 lock/unlock.


> I can see that you really needed the budget/fairness in the dequeue
> loop, that we recently mangled with.
>

Yes, fairness is needed so the orange guy doesnt spend all his cycles
doing all the work (that was the basis of my presentation); unless
that is not an issue and the scheduler would move things away from
that cpu.


> What tool do I use to play these SWF files? (I tried VLC but no luck).
>

Firefox should work fine.

cheers,
jamal

^ permalink raw reply

* [PATCH 1/1] e1000: unset IFF_UNICAST_FLT on WMware 82545EM
From: Francesco Ruggeri @ 2014-10-20 22:11 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel, davem, pv-drivers, linux.nics, fruggeri,
	e1000-devel

VMWare's e1000 implementation does not seem to support unicast filtering.
This can be observed by configuring a macvlan interface on eth0 in a VM in
VMWare Fusion 5.0.5, and trying to use that interface instead of eth0.
Tested on 3.16.

Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
---
 drivers/net/ethernet/intel/e1000/e1000_main.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
index 5f6aded..24f3986 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_main.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
@@ -1075,7 +1075,10 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 				  NETIF_F_HW_CSUM |
 				  NETIF_F_SG);
 
-	netdev->priv_flags |= IFF_UNICAST_FLT;
+	/* Do not set IFF_UNICAST_FLT for VMWare's 82545EM */
+	if (hw->device_id != E1000_DEV_ID_82545EM_COPPER ||
+	    hw->subsystem_vendor_id != PCI_VENDOR_ID_VMWARE)
+		netdev->priv_flags |= IFF_UNICAST_FLT;
 
 	adapter->en_mng_pt = e1000_enable_mng_pass_thru(hw);
 
-- 
1.8.1.4

^ permalink raw reply related

* [PATCH] netfilter: log: protect nf_log_register against double registering
From: Marcelo Ricardo Leitner @ 2014-10-20 21:58 UTC (permalink / raw)
  To: pablo; +Cc: netfilter-devel, netdev

Currently, despite the comment right before the function,
nf_log_register allows registering two loggers with the same type and
ends up overwriting the previous registration.

Not a real issue today as current tree doesn't have two loggers for the
same type but it's better to get this protected.

Also make sure that all of its callers do error checking.

Signed-off-by: Marcelo Ricardo Leitner <mleitner@redhat.com>
---

Notes:
    Please let me know if you have any issues with the indentation in
    nf_log_register. I just couldn't find a better one.
    
    Thanks

 net/ipv4/netfilter/nf_log_arp.c  |  8 +++++++-
 net/ipv4/netfilter/nf_log_ipv4.c |  8 +++++++-
 net/ipv6/netfilter/nf_log_ipv6.c |  8 +++++++-
 net/netfilter/nf_log.c           | 13 ++++++++++++-
 4 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
index ccfc78db12ee8acae68faf451f2cf6bc5597f2c1..8b39174b7be390397a110ec9d3ed497bf8ce6d26 100644
--- a/net/ipv4/netfilter/nf_log_arp.c
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -130,7 +130,13 @@ static int __init nf_log_arp_init(void)
 	if (ret < 0)
 		return ret;
 
-	nf_log_register(NFPROTO_ARP, &nf_arp_logger);
+	ret = nf_log_register(NFPROTO_ARP, &nf_arp_logger);
+	if (ret < 0) {
+		pr_err("log: failed to register logger\n");
+		unregister_pernet_subsys(&nf_log_arp_net_ops);
+		return ret;
+	}
+
 	return 0;
 }
 
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 078bdca1b607a167e05e7cf1bdfedccdd5aca92a..b3cb2ff6580343a9f7537aa2f48fd23858872b4d 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -366,7 +366,13 @@ static int __init nf_log_ipv4_init(void)
 	if (ret < 0)
 		return ret;
 
-	nf_log_register(NFPROTO_IPV4, &nf_ip_logger);
+	ret = nf_log_register(NFPROTO_IPV4, &nf_ip_logger);
+	if (ret < 0) {
+		pr_err("log: failed to register logger\n");
+		unregister_pernet_subsys(&nf_log_ipv4_net_ops);
+		return ret;
+	}
+
 	return 0;
 }
 
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index 7b17a0be93e7eccb2a26cd3294713d0f1112158d..b89576a5ca3e2b3964b8ce4aec09c8965496c2f2 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -398,7 +398,13 @@ static int __init nf_log_ipv6_init(void)
 	if (ret < 0)
 		return ret;
 
-	nf_log_register(NFPROTO_IPV6, &nf_ip6_logger);
+	ret = nf_log_register(NFPROTO_IPV6, &nf_ip6_logger);
+	if (ret < 0) {
+		pr_err("log: failed to register logger\n");
+		unregister_pernet_subsys(&nf_log_ipv6_net_ops);
+		return ret;
+	}
+
 	return 0;
 }
 
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index daad6022c689c47a66a47e7a89a83c0c848c53d6..04495d9debe784827fd0cbcf5e541f10fa06839d 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -82,10 +82,21 @@ int nf_log_register(u_int8_t pf, struct nf_logger *logger)
 	mutex_lock(&nf_log_mutex);
 
 	if (pf == NFPROTO_UNSPEC) {
+		for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) {
+			if (rcu_dereference_protected(loggers[i][logger->type],
+					lockdep_is_held(&nf_log_mutex))) {
+				mutex_unlock(&nf_log_mutex);
+				return -EEXIST;
+			}
+		}
 		for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++)
 			rcu_assign_pointer(loggers[i][logger->type], logger);
 	} else {
-		/* register at end of list to honor first register win */
+		if (rcu_dereference_protected(loggers[pf][logger->type],
+				lockdep_is_held(&nf_log_mutex))) {
+			mutex_unlock(&nf_log_mutex);
+			return -EEXIST;
+		}
 		rcu_assign_pointer(loggers[pf][logger->type], logger);
 	}
 
-- 
1.9.3


^ permalink raw reply related

* [PATCH] rtlwifi: prevent format string usage from leaking
From: Kees Cook @ 2014-10-20 21:57 UTC (permalink / raw)
  To: linux-kernel
  Cc: Larry Finger, Chaoming Li, John W. Linville, linux-wireless,
	netdev

Use "%s" in the workqueue allocation to make sure the rtl_hal_cfg name
can never accidentally leak information via a format string.

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 drivers/net/wireless/rtlwifi/base.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/rtlwifi/base.c b/drivers/net/wireless/rtlwifi/base.c
index 58ba71830886..40b6d1d006d7 100644
--- a/drivers/net/wireless/rtlwifi/base.c
+++ b/drivers/net/wireless/rtlwifi/base.c
@@ -467,7 +467,7 @@ static void _rtl_init_deferred_work(struct ieee80211_hw *hw)
 		    rtl_easy_concurrent_retrytimer_callback, (unsigned long)hw);
 	/* <2> work queue */
 	rtlpriv->works.hw = hw;
-	rtlpriv->works.rtl_wq = alloc_workqueue(rtlpriv->cfg->name, 0, 0);
+	rtlpriv->works.rtl_wq = alloc_workqueue("%s", 0, 0, rtlpriv->cfg->name);
 	INIT_DELAYED_WORK(&rtlpriv->works.watchdog_wq,
 			  (void *)rtl_watchdog_wq_callback);
 	INIT_DELAYED_WORK(&rtlpriv->works.ips_nic_off_wq,
-- 
1.9.1


-- 
Kees Cook
Chrome OS Security

^ permalink raw reply related

* [PATCH net] bpf: fix bug in eBPF verifier
From: Alexei Starovoitov @ 2014-10-20 21:54 UTC (permalink / raw)
  To: David S. Miller
  Cc: Hannes Frederic Sowa, Daniel Borkmann, netdev, linux-kernel

While comparing verifier states for equivalency, the comparison
was missing a check for an uninitialized register.
Add the missing check and a testcase.

Fixes: f1bca824dabb ("bpf: add search pruning optimization to verifier")
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---

While we were staring at the verifier code with Hannes during LPC,
something felt odd in this spot. Yes. It was a bug. Fix it.

 kernel/bpf/verifier.c       |    3 ++-
 samples/bpf/test_verifier.c |   11 +++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 801f5f3..9f81818 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1409,7 +1409,8 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
 		if (memcmp(&old->regs[i], &cur->regs[i],
 			   sizeof(old->regs[0])) != 0) {
 			if (old->regs[i].type == NOT_INIT ||
-			    old->regs[i].type == UNKNOWN_VALUE)
+			    (old->regs[i].type == UNKNOWN_VALUE &&
+			     cur->regs[i].type != NOT_INIT))
 				continue;
 			return false;
 		}
diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c
index f44ef11..eb4bec0 100644
--- a/samples/bpf/test_verifier.c
+++ b/samples/bpf/test_verifier.c
@@ -209,6 +209,17 @@ static struct bpf_test tests[] = {
 		.result = REJECT,
 	},
 	{
+		"program doesn't init R0 before exit in all branches",
+		.insns = {
+			BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 2),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 !read_ok",
+		.result = REJECT,
+	},
+	{
 		"stack out of bounds",
 		.insns = {
 			BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0),
-- 
1.7.9.5

^ permalink raw reply related

* Re: localed stuck in recent 3.18 git in copy_net_ns?
From: Kevin Fenzi @ 2014-10-20 20:53 UTC (permalink / raw)
  To: Dave Jones; +Cc: netdev, linux-kernel
In-Reply-To: <20141020204326.GA25668@redhat.com>

[-- Attachment #1: Type: text/plain, Size: 752 bytes --]

On Mon, 20 Oct 2014 16:43:26 -0400
Dave Jones <davej@redhat.com> wrote:

> I've seen similar soft lockup traces from the sys_unshare path when
> running my fuzz tester.  It seems that if you create enough network
> namespaces, it can take a huge amount of time for them to be iterated.
> (Running trinity with '-c unshare' you can see the slow down happen.
> In some cases, it takes so long that the watchdog process kills it --
>  though the SIGKILL won't get delivered until the unshare() completes)
> 
> Any idea what this machine had been doing prior to this that may have
> involved creating lots of namespaces ?

That was right after boot. ;) 

This is my main rawhide running laptop.

A 'ip netns list' shows nothing.

kevin

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* [PATCH RFC v5 net 3/3] ipv6: Avoid redoing fib6_lookup() with reachable = 0 by saving fn
From: Martin KaFai Lau @ 2014-10-20 20:42 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Hannes Frederic Sowa
In-Reply-To: <1413837765-5446-1-git-send-email-kafai@fb.com>

This patch saves the fn before doing fib6_backtrack().
Hence, without redo-ing the fib6_lookup(), saved_fn can be used
to redo rt6_select() with RT6_LOOKUP_F_REACHABLE off.

Some minor changes I think make sense to review as a single patch:
* Remove the 'out:' goto label.
* Remove the 'reachable' variable. Only use the 'strict' variable instead.

After this patch, "failing ip6_ins_rt()" should be the only case that
requires a redo of fib6_lookup().

Cc: David Miller <davem@davemloft.net>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 net/ipv6/route.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 98c523f..c910831 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -917,31 +917,40 @@ static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
 				      struct flowi6 *fl6, int flags)
 {
-	struct fib6_node *fn;
+	struct fib6_node *fn, *saved_fn;
 	struct rt6_info *rt, *nrt;
 	int strict = 0;
 	int attempts = 3;
 	int err;
-	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
 
 	strict |= flags & RT6_LOOKUP_F_IFACE;
+	if (net->ipv6.devconf_all->forwarding == 0)
+		strict |= RT6_LOOKUP_F_REACHABLE;
 
 redo_fib6_lookup_lock:
 	read_lock_bh(&table->tb6_lock);
 
-redo_fib6_lookup:
 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+	saved_fn = fn;
 
 redo_rt6_select:
-	rt = rt6_select(fn, oif, strict | reachable);
+	rt = rt6_select(fn, oif, strict);
 	if (rt->rt6i_nsiblings)
-		rt = rt6_multipath_select(rt, fl6, oif, strict | reachable);
+		rt = rt6_multipath_select(rt, fl6, oif, strict);
 	if (rt == net->ipv6.ip6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
 		if (fn)
 			goto redo_rt6_select;
-		else
-			goto out;
+		else if (strict & RT6_LOOKUP_F_REACHABLE) {
+			/* also consider unreachable route */
+			strict &= ~RT6_LOOKUP_F_REACHABLE;
+			fn = saved_fn;
+			goto redo_rt6_select;
+		} else {
+			dst_hold(&rt->dst);
+			read_unlock_bh(&table->tb6_lock);
+			goto out2;
+		}
 	}
 
 	dst_hold(&rt->dst);
@@ -977,13 +986,6 @@ redo_rt6_select:
 	ip6_rt_put(rt);
 	goto redo_fib6_lookup_lock;
 
-out:
-	if (reachable) {
-		reachable = 0;
-		goto redo_fib6_lookup;
-	}
-	dst_hold(&rt->dst);
-	read_unlock_bh(&table->tb6_lock);
 out2:
 	rt->dst.lastuse = jiffies;
 	rt->dst.__use++;
-- 
1.8.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox