Netdev List
 help / color / mirror / Atom feed
* [PATCH net 1/3] hv_netvsc: drop common code until callback model fixed
From: Stephen Hemminger @ 2018-06-11 19:44 UTC (permalink / raw)
  To: kys, haiyangz, sthemmin; +Cc: devel, netdev
In-Reply-To: <20180611194456.8268-1-sthemmin@microsoft.com>

The callback model of handling network failover is not suitable
in the current form.
  1. It was merged without addressing all the review feedback.
  2. It was merged without approval of any of the netvsc maintainers.
  3. Design discussion on how to handle PV/VF fallback is still
     not complete.
  4. IMHO the code model using callbacks is trying to make
     something common which isn't.

Revert the netvsc specific changes for now. Does not impact ongoing
development of failover model for virtio.
Revisit this after a simpler library based failover kernel
routines are extracted.

This reverts
commit 9c6ffbacdb57 ("hv_netvsc: fix error return code in netvsc_probe()")
and
commit 1ff78076d8dd ("netvsc: refactor notifier/event handling code to use the failover framework")

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
---
 drivers/net/hyperv/Kconfig      |   1 -
 drivers/net/hyperv/hyperv_net.h |   2 -
 drivers/net/hyperv/netvsc_drv.c | 224 +++++++++++++++++++++++---------
 3 files changed, 165 insertions(+), 62 deletions(-)

diff --git a/drivers/net/hyperv/Kconfig b/drivers/net/hyperv/Kconfig
index 23a2d145813a..0765d5f61714 100644
--- a/drivers/net/hyperv/Kconfig
+++ b/drivers/net/hyperv/Kconfig
@@ -2,6 +2,5 @@ config HYPERV_NET
 	tristate "Microsoft Hyper-V virtual network driver"
 	depends on HYPERV
 	select UCS2_STRING
-	select FAILOVER
 	help
 	  Select this option to enable the Hyper-V virtual network driver.
diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 99d8e7398a5b..1be34d2e3563 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -932,8 +932,6 @@ struct net_device_context {
 	u32 vf_alloc;
 	/* Serial number of the VF to team with */
 	u32 vf_serial;
-
-	struct failover *failover;
 };
 
 /* Per channel data */
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 8eec156418ea..2d4370c94b6e 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -43,7 +43,6 @@
 #include <net/pkt_sched.h>
 #include <net/checksum.h>
 #include <net/ip6_checksum.h>
-#include <net/failover.h>
 
 #include "hyperv_net.h"
 
@@ -1782,6 +1781,46 @@ static void netvsc_link_change(struct work_struct *w)
 	rtnl_unlock();
 }
 
+static struct net_device *get_netvsc_bymac(const u8 *mac)
+{
+	struct net_device *dev;
+
+	ASSERT_RTNL();
+
+	for_each_netdev(&init_net, dev) {
+		if (dev->netdev_ops != &device_ops)
+			continue;	/* not a netvsc device */
+
+		if (ether_addr_equal(mac, dev->perm_addr))
+			return dev;
+	}
+
+	return NULL;
+}
+
+static struct net_device *get_netvsc_byref(struct net_device *vf_netdev)
+{
+	struct net_device *dev;
+
+	ASSERT_RTNL();
+
+	for_each_netdev(&init_net, dev) {
+		struct net_device_context *net_device_ctx;
+
+		if (dev->netdev_ops != &device_ops)
+			continue;	/* not a netvsc device */
+
+		net_device_ctx = netdev_priv(dev);
+		if (!rtnl_dereference(net_device_ctx->nvdev))
+			continue;	/* device is removed */
+
+		if (rtnl_dereference(net_device_ctx->vf_netdev) == vf_netdev)
+			return dev;	/* a match */
+	}
+
+	return NULL;
+}
+
 /* Called when VF is injecting data into network stack.
  * Change the associated network device from VF to netvsc.
  * note: already called with rcu_read_lock
@@ -1804,6 +1843,46 @@ static rx_handler_result_t netvsc_vf_handle_frame(struct sk_buff **pskb)
 	return RX_HANDLER_ANOTHER;
 }
 
+static int netvsc_vf_join(struct net_device *vf_netdev,
+			  struct net_device *ndev)
+{
+	struct net_device_context *ndev_ctx = netdev_priv(ndev);
+	int ret;
+
+	ret = netdev_rx_handler_register(vf_netdev,
+					 netvsc_vf_handle_frame, ndev);
+	if (ret != 0) {
+		netdev_err(vf_netdev,
+			   "can not register netvsc VF receive handler (err = %d)\n",
+			   ret);
+		goto rx_handler_failed;
+	}
+
+	ret = netdev_master_upper_dev_link(vf_netdev, ndev,
+					   NULL, NULL, NULL);
+	if (ret != 0) {
+		netdev_err(vf_netdev,
+			   "can not set master device %s (err = %d)\n",
+			   ndev->name, ret);
+		goto upper_link_failed;
+	}
+
+	/* set slave flag before open to prevent IPv6 addrconf */
+	vf_netdev->flags |= IFF_SLAVE;
+
+	schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
+
+	call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
+
+	netdev_info(vf_netdev, "joined to %s\n", ndev->name);
+	return 0;
+
+upper_link_failed:
+	netdev_rx_handler_unregister(vf_netdev);
+rx_handler_failed:
+	return ret;
+}
+
 static void __netvsc_vf_setup(struct net_device *ndev,
 			      struct net_device *vf_netdev)
 {
@@ -1854,95 +1933,85 @@ static void netvsc_vf_setup(struct work_struct *w)
 	rtnl_unlock();
 }
 
-static int netvsc_pre_register_vf(struct net_device *vf_netdev,
-				  struct net_device *ndev)
+static int netvsc_register_vf(struct net_device *vf_netdev)
 {
+	struct net_device *ndev;
 	struct net_device_context *net_device_ctx;
 	struct netvsc_device *netvsc_dev;
 
+	if (vf_netdev->addr_len != ETH_ALEN)
+		return NOTIFY_DONE;
+
+	/*
+	 * We will use the MAC address to locate the synthetic interface to
+	 * associate with the VF interface. If we don't find a matching
+	 * synthetic interface, move on.
+	 */
+	ndev = get_netvsc_bymac(vf_netdev->perm_addr);
+	if (!ndev)
+		return NOTIFY_DONE;
+
 	net_device_ctx = netdev_priv(ndev);
 	netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
 	if (!netvsc_dev || rtnl_dereference(net_device_ctx->vf_netdev))
-		return -ENODEV;
-
-	return 0;
-}
-
-static int netvsc_register_vf(struct net_device *vf_netdev,
-			      struct net_device *ndev)
-{
-	struct net_device_context *ndev_ctx = netdev_priv(ndev);
-
-	/* set slave flag before open to prevent IPv6 addrconf */
-	vf_netdev->flags |= IFF_SLAVE;
+		return NOTIFY_DONE;
 
-	schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
+	if (netvsc_vf_join(vf_netdev, ndev) != 0)
+		return NOTIFY_DONE;
 
-	call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
-
-	netdev_info(vf_netdev, "joined to %s\n", ndev->name);
+	netdev_info(ndev, "VF registering: %s\n", vf_netdev->name);
 
 	dev_hold(vf_netdev);
-	rcu_assign_pointer(ndev_ctx->vf_netdev, vf_netdev);
-
-	return 0;
+	rcu_assign_pointer(net_device_ctx->vf_netdev, vf_netdev);
+	return NOTIFY_OK;
 }
 
 /* VF up/down change detected, schedule to change data path */
-static int netvsc_vf_changed(struct net_device *vf_netdev,
-			     struct net_device *ndev)
+static int netvsc_vf_changed(struct net_device *vf_netdev)
 {
 	struct net_device_context *net_device_ctx;
 	struct netvsc_device *netvsc_dev;
+	struct net_device *ndev;
 	bool vf_is_up = netif_running(vf_netdev);
 
+	ndev = get_netvsc_byref(vf_netdev);
+	if (!ndev)
+		return NOTIFY_DONE;
+
 	net_device_ctx = netdev_priv(ndev);
 	netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
 	if (!netvsc_dev)
-		return -ENODEV;
+		return NOTIFY_DONE;
 
 	netvsc_switch_datapath(ndev, vf_is_up);
 	netdev_info(ndev, "Data path switched %s VF: %s\n",
 		    vf_is_up ? "to" : "from", vf_netdev->name);
 
-	return 0;
+	return NOTIFY_OK;
 }
 
-static int netvsc_pre_unregister_vf(struct net_device *vf_netdev,
-				    struct net_device *ndev)
+static int netvsc_unregister_vf(struct net_device *vf_netdev)
 {
+	struct net_device *ndev;
 	struct net_device_context *net_device_ctx;
 
-	net_device_ctx = netdev_priv(ndev);
-	cancel_delayed_work_sync(&net_device_ctx->vf_takeover);
-
-	return 0;
-}
-
-static int netvsc_unregister_vf(struct net_device *vf_netdev,
-				struct net_device *ndev)
-{
-	struct net_device_context *net_device_ctx;
+	ndev = get_netvsc_byref(vf_netdev);
+	if (!ndev)
+		return NOTIFY_DONE;
 
 	net_device_ctx = netdev_priv(ndev);
+	cancel_delayed_work_sync(&net_device_ctx->vf_takeover);
 
 	netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name);
 
+	netdev_rx_handler_unregister(vf_netdev);
+	netdev_upper_dev_unlink(vf_netdev, ndev);
 	RCU_INIT_POINTER(net_device_ctx->vf_netdev, NULL);
 	dev_put(vf_netdev);
 
-	return 0;
+	return NOTIFY_OK;
 }
 
-static struct failover_ops netvsc_failover_ops = {
-	.slave_pre_register	= netvsc_pre_register_vf,
-	.slave_register		= netvsc_register_vf,
-	.slave_pre_unregister	= netvsc_pre_unregister_vf,
-	.slave_unregister	= netvsc_unregister_vf,
-	.slave_link_change	= netvsc_vf_changed,
-	.slave_handle_frame	= netvsc_vf_handle_frame,
-};
-
 static int netvsc_probe(struct hv_device *dev,
 			const struct hv_vmbus_device_id *dev_id)
 {
@@ -2032,16 +2101,8 @@ static int netvsc_probe(struct hv_device *dev,
 		goto register_failed;
 	}
 
-	net_device_ctx->failover = failover_register(net, &netvsc_failover_ops);
-	if (IS_ERR(net_device_ctx->failover)) {
-		ret = PTR_ERR(net_device_ctx->failover);
-		goto err_failover;
-	}
-
 	return ret;
 
-err_failover:
-	unregister_netdev(net);
 register_failed:
 	rndis_filter_device_remove(dev, nvdev);
 rndis_failed:
@@ -2082,15 +2143,13 @@ static int netvsc_remove(struct hv_device *dev)
 	rtnl_lock();
 	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
 	if (vf_netdev)
-		failover_slave_unregister(vf_netdev);
+		netvsc_unregister_vf(vf_netdev);
 
 	if (nvdev)
 		rndis_filter_device_remove(dev, nvdev);
 
 	unregister_netdevice(net);
 
-	failover_unregister(ndev_ctx->failover);
-
 	rtnl_unlock();
 	rcu_read_unlock();
 
@@ -2117,8 +2176,54 @@ static struct  hv_driver netvsc_drv = {
 	.remove = netvsc_remove,
 };
 
+/*
+ * On Hyper-V, every VF interface is matched with a corresponding
+ * synthetic interface. The synthetic interface is presented first
+ * to the guest. When the corresponding VF instance is registered,
+ * we will take care of switching the data path.
+ */
+static int netvsc_netdev_event(struct notifier_block *this,
+			       unsigned long event, void *ptr)
+{
+	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+
+	/* Skip our own events */
+	if (event_dev->netdev_ops == &device_ops)
+		return NOTIFY_DONE;
+
+	/* Avoid non-Ethernet type devices */
+	if (event_dev->type != ARPHRD_ETHER)
+		return NOTIFY_DONE;
+
+	/* Avoid Vlan dev with same MAC registering as VF */
+	if (is_vlan_dev(event_dev))
+		return NOTIFY_DONE;
+
+	/* Avoid Bonding master dev with same MAC registering as VF */
+	if ((event_dev->priv_flags & IFF_BONDING) &&
+	    (event_dev->flags & IFF_MASTER))
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_REGISTER:
+		return netvsc_register_vf(event_dev);
+	case NETDEV_UNREGISTER:
+		return netvsc_unregister_vf(event_dev);
+	case NETDEV_UP:
+	case NETDEV_DOWN:
+		return netvsc_vf_changed(event_dev);
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static struct notifier_block netvsc_netdev_notifier = {
+	.notifier_call = netvsc_netdev_event,
+};
+
 static void __exit netvsc_drv_exit(void)
 {
+	unregister_netdevice_notifier(&netvsc_netdev_notifier);
 	vmbus_driver_unregister(&netvsc_drv);
 }
 
@@ -2138,6 +2243,7 @@ static int __init netvsc_drv_init(void)
 	if (ret)
 		return ret;
 
+	register_netdevice_notifier(&netvsc_netdev_notifier);
 	return 0;
 }
 
-- 
2.17.1

^ permalink raw reply related

* [PATCH net 2/3] hv_netvsc: fix network namespace issues with VF support
From: Stephen Hemminger @ 2018-06-11 19:44 UTC (permalink / raw)
  To: kys, haiyangz, sthemmin; +Cc: devel, netdev
In-Reply-To: <20180611194456.8268-1-sthemmin@microsoft.com>

When finding the parent netvsc device, the search needs to be across
all netvsc device instances (independent of network namespace).

Find parent device of VF using upper_dev_get routine which
searches only adjacent list.

Fixes: e8ff40d4bff1 ("hv_netvsc: improve VF device matching")
Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>

netns aware byref
---
 drivers/net/hyperv/hyperv_net.h |  2 ++
 drivers/net/hyperv/netvsc_drv.c | 43 +++++++++++++++------------------
 2 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 1be34d2e3563..c0b3f3c125d4 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -902,6 +902,8 @@ struct net_device_context {
 	struct hv_device *device_ctx;
 	/* netvsc_device */
 	struct netvsc_device __rcu *nvdev;
+	/* list of netvsc net_devices */
+	struct list_head list;
 	/* reconfigure work */
 	struct delayed_work dwork;
 	/* last reconfig time */
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 2d4370c94b6e..8cb21e013d1d 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -69,6 +69,8 @@ static int debug = -1;
 module_param(debug, int, 0444);
 MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
 
+static LIST_HEAD(netvsc_dev_list);
+
 static void netvsc_change_rx_flags(struct net_device *net, int change)
 {
 	struct net_device_context *ndev_ctx = netdev_priv(net);
@@ -1783,13 +1785,10 @@ static void netvsc_link_change(struct work_struct *w)
 
 static struct net_device *get_netvsc_bymac(const u8 *mac)
 {
-	struct net_device *dev;
-
-	ASSERT_RTNL();
+	struct net_device_context *ndev_ctx;
 
-	for_each_netdev(&init_net, dev) {
-		if (dev->netdev_ops != &device_ops)
-			continue;	/* not a netvsc device */
+	list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) {
+		struct net_device *dev = hv_get_drvdata(ndev_ctx->device_ctx);
 
 		if (ether_addr_equal(mac, dev->perm_addr))
 			return dev;
@@ -1800,25 +1799,18 @@ static struct net_device *get_netvsc_bymac(const u8 *mac)
 
 static struct net_device *get_netvsc_byref(struct net_device *vf_netdev)
 {
+	struct net_device_context *net_device_ctx;
 	struct net_device *dev;
 
-	ASSERT_RTNL();
-
-	for_each_netdev(&init_net, dev) {
-		struct net_device_context *net_device_ctx;
+	dev = netdev_master_upper_dev_get(vf_netdev);
+	if (!dev || dev->netdev_ops != &device_ops)
+		return NULL;	/* not a netvsc device */
 
-		if (dev->netdev_ops != &device_ops)
-			continue;	/* not a netvsc device */
+	net_device_ctx = netdev_priv(dev);
+	if (!rtnl_dereference(net_device_ctx->nvdev))
+		return NULL;	/* device is removed */
 
-		net_device_ctx = netdev_priv(dev);
-		if (!rtnl_dereference(net_device_ctx->nvdev))
-			continue;	/* device is removed */
-
-		if (rtnl_dereference(net_device_ctx->vf_netdev) == vf_netdev)
-			return dev;	/* a match */
-	}
-
-	return NULL;
+	return dev;
 }
 
 /* Called when VF is injecting data into network stack.
@@ -2095,15 +2087,19 @@ static int netvsc_probe(struct hv_device *dev,
 	else
 		net->max_mtu = ETH_DATA_LEN;
 
-	ret = register_netdev(net);
+	rtnl_lock();
+	ret = register_netdevice(net);
 	if (ret != 0) {
 		pr_err("Unable to register netdev.\n");
 		goto register_failed;
 	}
 
-	return ret;
+	list_add(&net_device_ctx->list, &netvsc_dev_list);
+	rtnl_unlock();
+	return 0;
 
 register_failed:
+	rtnl_unlock();
 	rndis_filter_device_remove(dev, nvdev);
 rndis_failed:
 	free_percpu(net_device_ctx->vf_stats);
@@ -2149,6 +2145,7 @@ static int netvsc_remove(struct hv_device *dev)
 		rndis_filter_device_remove(dev, nvdev);
 
 	unregister_netdevice(net);
+	list_del(&ndev_ctx->list);
 
 	rtnl_unlock();
 	rcu_read_unlock();
-- 
2.17.1

^ permalink raw reply related

* [PATCH net 0/3] hv_netvsc: notification and namespace fixes
From: Stephen Hemminger @ 2018-06-11 19:44 UTC (permalink / raw)
  To: kys, haiyangz, sthemmin; +Cc: devel, netdev

This set of patches addresses two set of fixes. First it backs out
the common callback model which was merged in net-next without
completing all the review feedback or getting maintainer approval.

Then it fixes the transparent VF management code to handle network
namespaces.

Stephen Hemminger (3):
  hv_netvsc: drop common code until callback model fixed
  hv_netvsc: fix network namespace issues with VF support
  hv_netvsc: move VF to same namespace as netvsc device

 drivers/net/hyperv/Kconfig      |   1 -
 drivers/net/hyperv/hyperv_net.h |   4 +-
 drivers/net/hyperv/netvsc_drv.c | 242 ++++++++++++++++++++++++--------
 3 files changed, 184 insertions(+), 63 deletions(-)

-- 
2.17.1

^ permalink raw reply

* [PATCH net 3/3] hv_netvsc: move VF to same namespace as netvsc device
From: Stephen Hemminger @ 2018-06-11 19:44 UTC (permalink / raw)
  To: kys, haiyangz, sthemmin; +Cc: devel, netdev
In-Reply-To: <20180611194456.8268-1-sthemmin@microsoft.com>

When VF is added, the paravirtual device is already present
and may have been moved to another network namespace. For example,
sometimes the management interface is put in another net namespace
in some environments.

The VF should get moved to where the netvsc device is when the
VF is discovered. The user can move it later (if desired).

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
---
 drivers/net/hyperv/netvsc_drv.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 8cb21e013d1d..bf1b845c1147 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -1930,6 +1930,7 @@ static int netvsc_register_vf(struct net_device *vf_netdev)
 	struct net_device *ndev;
 	struct net_device_context *net_device_ctx;
 	struct netvsc_device *netvsc_dev;
+	int ret;
 
 	if (vf_netdev->addr_len != ETH_ALEN)
 		return NOTIFY_DONE;
@@ -1948,11 +1949,29 @@ static int netvsc_register_vf(struct net_device *vf_netdev)
 	if (!netvsc_dev || rtnl_dereference(net_device_ctx->vf_netdev))
 		return NOTIFY_DONE;
 
-	if (netvsc_vf_join(vf_netdev, ndev) != 0)
+	/* if syntihetic interface is a different namespace,
+	 * then move the VF to that namespace; join will be
+	 * done again in that context.
+	 */
+	if (!net_eq(dev_net(ndev), dev_net(vf_netdev))) {
+		ret = dev_change_net_namespace(vf_netdev,
+					       dev_net(ndev), "eth%d");
+		if (ret)
+			netdev_err(vf_netdev,
+				   "could not move to same namespace as %s: %d\n",
+				   ndev->name, ret);
+		else
+			netdev_info(vf_netdev,
+				    "VF moved to namespace with: %s\n",
+				    ndev->name);
 		return NOTIFY_DONE;
+	}
 
 	netdev_info(ndev, "VF registering: %s\n", vf_netdev->name);
 
+	if (netvsc_vf_join(vf_netdev, ndev) != 0)
+		return NOTIFY_DONE;
+
 	dev_hold(vf_netdev);
 	rcu_assign_pointer(net_device_ctx->vf_netdev, vf_netdev);
 	return NOTIFY_OK;
-- 
2.17.1

^ permalink raw reply related

* RE: locking in wimax/i2400m
From: Perez-Gonzalez, Inaky @ 2018-06-11 19:56 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior, linux-wimax
  Cc: netdev@vger.kernel.org, tglx@linutronix.de
In-Reply-To: <20180611160528.be37vniefy4est6m@linutronix.de>

Hello Sebastian

Thanks for checking this out,

    SA> I tried to figure out if the URB-completion handler uses any
    SA> locking and stumbled here.

    SA> i2400m_pm_notifier() is called from process context. This
    SA> function invokes i2400m_fw_cache() + i2400m_fw_uncache(). Both
    SA> functions do spin_lock(&i2400m->rx_lock); while in other
    SA> places (say i2400mu_rxd()) it does
    SA> spin_lock_irqsave(&i2400m->rx_lock, flags);

    SA> So what do I miss? Is this lock never used in interrupt
    SA> context and lockdep didn't complain or did nobody try suspend
    SA> with this driver before?  From what I can tell
    SA> i2400m_dev_bootstrap() has the same locking problem.

I don't remember ever getting to try suspend in the driver, so that
might be the case. That said, I haven't touched this code in years, or
the current locking best practices, so I'll let others chime in,
Thomas being prolly one of the key ones.

^ permalink raw reply

* RE: [net] fq_codel: fix NULL pointer deref in fq_codel_reset
From: Keller, Jacob E @ 2018-06-11 19:57 UTC (permalink / raw)
  To: Kirsher, Jeffrey T, davem@davemloft.net
  Cc: netdev@vger.kernel.org, nhorman@redhat.com, sassmann@redhat.com,
	jogreene@redhat.com, Eric Dumazet
In-Reply-To: <20180611170011.7200-1-jeffrey.t.kirsher@intel.com>

> -----Original Message-----
> From: Kirsher, Jeffrey T
> Sent: Monday, June 11, 2018 10:00 AM
> To: davem@davemloft.net
> Cc: Keller, Jacob E <jacob.e.keller@intel.com>; netdev@vger.kernel.org;
> nhorman@redhat.com; sassmann@redhat.com; jogreene@redhat.com; Eric
> Dumazet <edumazet@google.com>; Kirsher, Jeffrey T
> <jeffrey.t.kirsher@intel.com>
> Subject: [net] fq_codel: fix NULL pointer deref in fq_codel_reset
> 
> From: Jacob Keller <jacob.e.keller@intel.com>
> 
> The function qdisc_create_dftl attempts to create a default qdisc. If
> this fails, it calls qdisc_destroy when cleaning up. The qdisc_destroy
> function calls the ->reset op on the qdisc.
> 
> In the case of sch_fq_codel.c, this function will panic when the qdisc
> wasn't properly initialized:
> 
>    kernel: BUG: unable to handle kernel NULL pointer dereference at
> 0000000000000008
>    kernel: IP: fq_codel_reset+0x58/0xd0 [sch_fq_codel]
>    kernel: PGD 0 P4D 0
>    kernel: Oops: 0000 [#1] SMP PTI
>    kernel: Modules linked in: i40iw i40e(OE) xt_CHECKSUM iptable_mangle
> ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat
> nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack tun bridge stp llc
> devlink ebtable_filter ebtables ip6table_filter ip6_tables rpcrdma ib_isert
> iscsi_target_mod sunrpc ib_iser libiscsi scsi_transport_iscsi ib_srpt
> target_core_mod ib_srp scsi_transport_srp ib_ipoib rdma_ucm ib_ucm
> ib_uverbs ib_umad rdma_cm ib_cm iw_cm intel_rapl sb_edac
> x86_pkg_temp_thermal intel_powerclamp coretemp kvm irqbypass
> crct10dif_pclmul crc32_pclmul ghash_clmulni_intel intel_cstate iTCO_wdt
> iTCO_vendor_support intel_uncore ib_core intel_rapl_perf mei_me mei joydev
> i2c_i801 lpc_ich ioatdma shpchp wmi sch_fq_codel xfs libcrc32c mgag200 ixgbe
> drm_kms_helper isci ttm firewire_ohci
>    kernel:  mdio drm igb libsas crc32c_intel firewire_core ptp pps_core
> scsi_transport_sas crc_itu_t dca i2c_algo_bit ipmi_si ipmi_devintf
> ipmi_msghandler [last unloaded: i40e]
>    kernel: CPU: 10 PID: 4219 Comm: ip Tainted: G           OE    4.16.13custom-fq-
> codel-test+ #3
>    kernel: Hardware name: Intel Corporation S2600CO/S2600CO, BIOS
> SE5C600.86B.02.05.0004.051120151007 05/11/2015
>    kernel: RIP: 0010:fq_codel_reset+0x58/0xd0 [sch_fq_codel]
>    kernel: RSP: 0018:ffffbfbf4c1fb620 EFLAGS: 00010246
>    kernel: RAX: 0000000000000400 RBX: 0000000000000000 RCX: 00000000000005b9
>    kernel: RDX: 0000000000000000 RSI: ffff9d03264a60c0 RDI: ffff9cfd17b31c00
>    kernel: RBP: 0000000000000001 R08: 00000000000260c0 R09: ffffffffb679c3e9
>    kernel: R10: fffff1dab06a0e80 R11: ffff9cfd163af800 R12: ffff9cfd17b31c00
>    kernel: R13: 0000000000000001 R14: ffff9cfd153de600 R15: 0000000000000001
>    kernel: FS:  00007fdec2f92800(0000) GS:ffff9d0326480000(0000)
> knlGS:0000000000000000
>    kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>    kernel: CR2: 0000000000000008 CR3: 0000000c1956a006 CR4: 00000000000606e0
>    kernel: Call Trace:
>    kernel:  qdisc_destroy+0x56/0x140
>    kernel:  qdisc_create_dflt+0x8b/0xb0
>    kernel:  mq_init+0xc1/0xf0
>    kernel:  qdisc_create_dflt+0x5a/0xb0
>    kernel:  dev_activate+0x205/0x230
>    kernel:  __dev_open+0xf5/0x160
>    kernel:  __dev_change_flags+0x1a3/0x210
>    kernel:  dev_change_flags+0x21/0x60
>    kernel:  do_setlink+0x660/0xdf0
>    kernel:  ? down_trylock+0x25/0x30
>    kernel:  ? xfs_buf_trylock+0x1a/0xd0 [xfs]
>    kernel:  ? rtnl_newlink+0x816/0x990
>    kernel:  ? _xfs_buf_find+0x327/0x580 [xfs]
>    kernel:  ? _cond_resched+0x15/0x30
>    kernel:  ? kmem_cache_alloc+0x20/0x1b0
>    kernel:  ? rtnetlink_rcv_msg+0x200/0x2f0
>    kernel:  ? rtnl_calcit.isra.30+0x100/0x100
>    kernel:  ? netlink_rcv_skb+0x4c/0x120
>    kernel:  ? netlink_unicast+0x19e/0x260
>    kernel:  ? netlink_sendmsg+0x1ff/0x3c0
>    kernel:  ? sock_sendmsg+0x36/0x40
>    kernel:  ? ___sys_sendmsg+0x295/0x2f0
>    kernel:  ? ebitmap_cmp+0x6d/0x90
>    kernel:  ? dev_get_by_name_rcu+0x73/0x90
>    kernel:  ? skb_dequeue+0x52/0x60
>    kernel:  ? __inode_wait_for_writeback+0x7f/0xf0
>    kernel:  ? bit_waitqueue+0x30/0x30
>    kernel:  ? fsnotify_grab_connector+0x3c/0x60
>    kernel:  ? __sys_sendmsg+0x51/0x90
>    kernel:  ? do_syscall_64+0x74/0x180
>    kernel:  ? entry_SYSCALL_64_after_hwframe+0x3d/0xa2
>    kernel: Code: 00 00 48 89 87 00 02 00 00 8b 87 a0 01 00 00 85 c0 0f 84 84 00 00 00
> 31 ed 48 63 dd 83 c5 01 48 c1 e3 06 49 03 9c 24 90 01 00 00 <48> 8b 73 08 48 8b 3b e8
> 6c 9a 4f f6 48 8d 43 10 48 c7 03 00 00
>    kernel: RIP: fq_codel_reset+0x58/0xd0 [sch_fq_codel] RSP: ffffbfbf4c1fb620
>    kernel: CR2: 0000000000000008
>    kernel: ---[ end trace e81a62bede66274e ]---
> 
> This occurs because if fq_codel_init fails, it has left the private data
> in an incomplete state. For example, if tcf_block_get fails, (as in the
> above panic), then q->flows and q->backlogs will be NULL. Thus they will
> cause NULL pointer access when attempting to reset them in
> fq_codel_reset.
> 
> We could mitigate some of these issues by changing fq_codel_init to more
> explicitly cleanup after itself when failing. For example, we could
> ensure that q->flowcnt was set to 0 so that the loop over each flow in
> fq_codel_reset would not trigger. However, this would not prevent a NULL
> pointer dereference when attempting to memset the q->backlogs.
> 
> Instead, just add a NULL check prior to attempting to reset these
> fields.
> 
> Cc: Eric Dumazet <edumazet@google.com>
> Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
> Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> ---
>  net/sched/sch_fq_codel.c | 15 +++++++++------
>  1 file changed, 9 insertions(+), 6 deletions(-)
> 
> diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
> index 22fa13cf5d8b..1658c314ee40 100644
> --- a/net/sched/sch_fq_codel.c
> +++ b/net/sched/sch_fq_codel.c
> @@ -352,14 +352,17 @@ static void fq_codel_reset(struct Qdisc *sch)
> 
>  	INIT_LIST_HEAD(&q->new_flows);
>  	INIT_LIST_HEAD(&q->old_flows);
> -	for (i = 0; i < q->flows_cnt; i++) {
> -		struct fq_codel_flow *flow = q->flows + i;
> +	if (q->flows) {
> +		for (i = 0; i < q->flows_cnt; i++) {
> +			struct fq_codel_flow *flow = q->flows + i;
> 
> -		fq_codel_flow_purge(flow);
> -		INIT_LIST_HEAD(&flow->flowchain);
> -		codel_vars_init(&flow->cvars);
> +			fq_codel_flow_purge(flow);
> +			INIT_LIST_HEAD(&flow->flowchain);
> +			codel_vars_init(&flow->cvars);
> +		}
>  	}
> -	memset(q->backlogs, 0, q->flows_cnt * sizeof(u32));
> +	if (q->backlogs)
> +		memset(q->backlogs, 0, q->flows_cnt * sizeof(u32));

I'm open to alternative suggestinos for fixing this, I think Eric suggested that maybe we should just remove the ->reset() call from qdisc_destroy..?

I don't really understand enough about this code, so if someone has a better suggestion please feel free to suggest it.

Thanks,
Jake

>  	sch->q.qlen = 0;
>  	sch->qstats.backlog = 0;
>  	q->memory_usage = 0;
> --
> 2.17.1

^ permalink raw reply

* Re: [net] fq_codel: fix NULL pointer deref in fq_codel_reset
From: Cong Wang @ 2018-06-11 20:02 UTC (permalink / raw)
  To: Jeff Kirsher
  Cc: David Miller, Jacob Keller, Linux Kernel Network Developers,
	nhorman, sassmann, jogreene, Eric Dumazet
In-Reply-To: <20180611170011.7200-1-jeffrey.t.kirsher@intel.com>

On Mon, Jun 11, 2018 at 10:00 AM, Jeff Kirsher
<jeffrey.t.kirsher@intel.com> wrote:
>
> We could mitigate some of these issues by changing fq_codel_init to more
> explicitly cleanup after itself when failing. For example, we could
> ensure that q->flowcnt was set to 0 so that the loop over each flow in
> fq_codel_reset would not trigger. However, this would not prevent a NULL
> pointer dereference when attempting to memset the q->backlogs.

Are you saying memset(ptr, 0, 0) is not nop?? :-/

Making q->flows_cnt 0 is simpler and easier to understand.

^ permalink raw reply

* Re: 4.17.0-10146-gf0dc7f9c6dd9: hw csum failure on powerpc+sungem
From: Mathieu Malaterre @ 2018-06-11 20:20 UTC (permalink / raw)
  To: Meelis Roos; +Cc: netdev, Mauro Carvalho Chehab, linuxppc-dev, LKML
In-Reply-To: <alpine.LRH.2.21.1806111352330.17091@math.ut.ee>

Hi Meelis,

On Mon, Jun 11, 2018 at 1:21 PM Meelis Roos <mroos@linux.ee> wrote:
>
> I am seeing this on PowerMac G4 with sungem ethernet driver. 4.17 was
> OK, 4.17.0-10146-gf0dc7f9c6dd9 is problematic.

Same here.

> [  140.518664] eth0: hw csum failure
> [  140.518699] CPU: 0 PID: 1237 Comm: postconf Not tainted 4.17.0-10146-gf0dc7f9c6dd9 #83
> [  140.518707] Call Trace:
> [  140.518734] [effefd90] [c03d6db8] __skb_checksum_complete+0xd8/0xdc (unreliable)
> [  140.518759] [effefdb0] [c04c1284] icmpv6_rcv+0x248/0x4ec
> [  140.518775] [effefdd0] [c049a448] ip6_input_finish.constprop.0+0x11c/0x5f4
> [  140.518786] [effefe10] [c049b1c0] ip6_mc_input+0xcc/0x100
> [  140.518807] [effefe20] [c03e110c] __netif_receive_skb_core+0x310/0x944
> [  140.518820] [effefe70] [c03e76ec] napi_gro_receive+0xd0/0xe8
> [  140.518845] [effefe80] [f3e1f66c] gem_poll+0x618/0x1274 [sungem]
> [  140.518856] [effeff30] [c03e6f0c] net_rx_action+0x198/0x374
> [  140.518872] [effeff90] [c0501a88] __do_softirq+0x120/0x278
> [  140.518890] [effeffe0] [c0036188] irq_exit+0xd8/0xdc
> [  140.518908] [effefff0] [c000f478] call_do_irq+0x24/0x3c
> [  140.518925] [d05a5d30] [c0007120] do_IRQ+0x74/0xf0
> [  140.518941] [d05a5d50] [c0012474] ret_from_except+0x0/0x14
> [  140.518960] --- interrupt: 501 at copy_page+0x40/0x90
>                    LR = copy_user_page+0x18/0x30
> [  140.518973] [d05a5e10] [d058cd80] 0xd058cd80 (unreliable)
> [  140.518989] [d05a5e20] [c00fa2bc] wp_page_copy+0xec/0x654
> [  140.519002] [d05a5e60] [c00fd3a4] do_wp_page+0xa8/0x5b4
> [  140.519013] [d05a5e90] [c00fe934] handle_mm_fault+0x564/0xa84
> [  140.519025] [d05a5f00] [c0016230] do_page_fault+0x1bc/0x7e8
> [  140.519037] [d05a5f40] [c0012300] handle_page_fault+0x14/0x40
> [  140.519048] --- interrupt: 301 at 0xb78b6864
>                    LR = 0xb78b6c54
>

For some reason if I do a git bisect it returns that:

$ git bisect good
3036bc45364f98515a2c446d7fac2c34dcfbeff4 is the first bad commit

Could you also check on your side please.

> --
> Meelis Roos (mroos@linux.ee)

^ permalink raw reply

* Re: [net] fq_codel: fix NULL pointer deref in fq_codel_reset
From: Cong Wang @ 2018-06-11 20:22 UTC (permalink / raw)
  To: Keller, Jacob E
  Cc: Kirsher, Jeffrey T, davem@davemloft.net, netdev@vger.kernel.org,
	nhorman@redhat.com, sassmann@redhat.com, jogreene@redhat.com,
	Eric Dumazet
In-Reply-To: <02874ECE860811409154E81DA85FBB5884BC3EC7@ORSMSX115.amr.corp.intel.com>

On Mon, Jun 11, 2018 at 12:57 PM, Keller, Jacob E
<jacob.e.keller@intel.com> wrote:
>
> I'm open to alternative suggestinos for fixing this, I think Eric suggested that maybe we should just remove the ->reset() call from qdisc_destroy..?

You can't remove ->reset() for non-failure call path.

For failure path, yeah, but it is much simpler to just make
q->flows_cnt be 0 for this specific case.

^ permalink raw reply

* Re: [PATCH] iwlwifi: pcie: make array prop static, shrinks object size
From: Greg Kroah-Hartman @ 2018-06-11 20:45 UTC (permalink / raw)
  To: Joe Perches
  Cc: Colin King, Johannes Berg, Emmanuel Grumbach, Luca Coelho,
	Intel Linux Wireless, Kalle Valo, David S . Miller,
	linux-wireless, netdev, kernel-janitors, linux-kernel
In-Reply-To: <7f508dccf95cff7fb8e083666b8939c58f65f894.camel@perches.com>

On Mon, Jun 11, 2018 at 12:40:55PM -0700, Joe Perches wrote:
> (adding Greg KH)
> 
> On Mon, 2018-06-11 at 18:15 +0100, Colin King wrote:
> > From: Colin Ian King <colin.king@canonical.com>
> > 
> > Don't populate the read-only array 'prop' on the stack but
> > instead make it static. Makes the object code smaller by 20 bytes:
> > 
> > Before:
> >    text    data     bss     dec     hex filename
> >   71659   14614     576   86849   15341 trans.o
> > 
> > After:
> >    text    data     bss     dec     hex filename
> >   71479   14774     576   86829   1532d trans.o
> > 
> > (gcc version 7.3.0 x86_64)
> > 
> > Signed-off-by: Colin Ian King <colin.king@canonical.com>
> > ---
> >  drivers/net/wireless/intel/iwlwifi/pcie/trans.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c
> > index 7229991ae70d..c4626ebe5da1 100644
> > --- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c
> > +++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c
> > @@ -1946,7 +1946,7 @@ static void iwl_trans_pcie_removal_wk(struct work_struct *wk)
> >  	struct iwl_trans_pcie_removal *removal =
> >  		container_of(wk, struct iwl_trans_pcie_removal, work);
> >  	struct pci_dev *pdev = removal->pdev;
> > -	char *prop[] = {"EVENT=INACCESSIBLE", NULL};
> > +	static char *prop[] = {"EVENT=INACCESSIBLE", NULL};
> >  
> >  	dev_err(&pdev->dev, "Device gone - attempting removal\n");
> >  	kobject_uevent_env(&pdev->dev.kobj, KOBJ_CHANGE, prop);
> 
> Now what is happening is that prop is being reloaded
> each invocation with the constant addresses of the strings.
> 
> It seems the prototype and function for kobject_uevent_env
> should change as well to avoid this.
> 
> Perhaps this should become:
> ---
>  drivers/net/wireless/intel/iwlwifi/pcie/trans.c | 2 +-
>  include/linux/kobject.h                         | 2 +-
>  lib/kobject_uevent.c                            | 2 +-
>  3 files changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c
> index 7229991ae70d..6668a8aad22e 100644
> --- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c
> +++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c
> @@ -1946,7 +1946,7 @@ static void iwl_trans_pcie_removal_wk(struct work_struct *wk)
>  	struct iwl_trans_pcie_removal *removal =
>  		container_of(wk, struct iwl_trans_pcie_removal, work);
>  	struct pci_dev *pdev = removal->pdev;
> -	char *prop[] = {"EVENT=INACCESSIBLE", NULL};
> +	static const char * const prop[] = {"EVENT=INACCESSIBLE", NULL};
>  
>  	dev_err(&pdev->dev, "Device gone - attempting removal\n");
>  	kobject_uevent_env(&pdev->dev.kobj, KOBJ_CHANGE, prop);
> diff --git a/include/linux/kobject.h b/include/linux/kobject.h
> index 7f6f93c3df9c..9f5cf553dd1e 100644
> --- a/include/linux/kobject.h
> +++ b/include/linux/kobject.h
> @@ -217,7 +217,7 @@ extern struct kobject *firmware_kobj;
>  
>  int kobject_uevent(struct kobject *kobj, enum kobject_action action);
>  int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
> -			char *envp[]);
> +			const char * const envp[]);
>  int kobject_synth_uevent(struct kobject *kobj, const char *buf, size_t count);
>  
>  __printf(2, 3)
> diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
> index 63d0816ab23b..9107989a0cc8 100644
> --- a/lib/kobject_uevent.c
> +++ b/lib/kobject_uevent.c
> @@ -452,7 +452,7 @@ static void zap_modalias_env(struct kobj_uevent_env *env)
>   * corresponding error when it fails.
>   */
>  int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
> -		       char *envp_ext[])
> +		       const char * const envp_ext[])
>  {
>  	struct kobj_uevent_env *env;
>  	const char *action_string = kobject_actions[action];

No objection from me, care to make it a real patch so that I can apply
it after 4.18-rc1 is out?

thanks,

greg k-h

^ permalink raw reply

* RE: [net] fq_codel: fix NULL pointer deref in fq_codel_reset
From: Keller, Jacob E @ 2018-06-11 20:47 UTC (permalink / raw)
  To: Cong Wang, Kirsher, Jeffrey T
  Cc: David Miller, Linux Kernel Network Developers, nhorman@redhat.com,
	sassmann@redhat.com, jogreene@redhat.com, Eric Dumazet
In-Reply-To: <CAM_iQpWmcBtQn8+g-QG2UKENeWYD=Zqq_78E8oNxS7tkLi0Cvw@mail.gmail.com>

> -----Original Message-----
> From: Cong Wang [mailto:xiyou.wangcong@gmail.com]
> Sent: Monday, June 11, 2018 1:03 PM
> To: Kirsher, Jeffrey T <jeffrey.t.kirsher@intel.com>
> Cc: David Miller <davem@davemloft.net>; Keller, Jacob E
> <jacob.e.keller@intel.com>; Linux Kernel Network Developers
> <netdev@vger.kernel.org>; nhorman@redhat.com; sassmann@redhat.com;
> jogreene@redhat.com; Eric Dumazet <edumazet@google.com>
> Subject: Re: [net] fq_codel: fix NULL pointer deref in fq_codel_reset
> 

> Making q->flows_cnt 0 is simpler and easier to understand.

Feel free to propose such a patch :)

Thanks,
Jake

^ permalink raw reply

* [Patch net] smc: convert to ->poll_mask
From: Cong Wang @ 2018-06-11 21:07 UTC (permalink / raw)
  To: netdev; +Cc: penguin-kernel, Cong Wang, Christoph Hellwig, Ursula Braun

smc->clcsock is an internal TCP socket, after TCP socket
converts to ->poll_mask, ->poll doesn't exist any more.
So just convert smc socket to ->poll_mask too.

Fixes: 2c7d3dacebd4 ("net/tcp: convert to ->poll_mask")
Reported-by: syzbot+f5066e369b2d5fff630f@syzkaller.appspotmail.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
 net/smc/af_smc.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 973b4471b532..da7f02edcd37 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1273,8 +1273,7 @@ static __poll_t smc_accept_poll(struct sock *parent)
 	return mask;
 }
 
-static __poll_t smc_poll(struct file *file, struct socket *sock,
-			     poll_table *wait)
+static __poll_t smc_poll_mask(struct socket *sock, __poll_t events)
 {
 	struct sock *sk = sock->sk;
 	__poll_t mask = 0;
@@ -1290,7 +1289,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
 	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
 		/* delegate to CLC child sock */
 		release_sock(sk);
-		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
+		mask = smc->clcsock->ops->poll_mask(smc->clcsock, events);
 		lock_sock(sk);
 		sk->sk_err = smc->clcsock->sk->sk_err;
 		if (sk->sk_err) {
@@ -1308,11 +1307,6 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
 			}
 		}
 	} else {
-		if (sk->sk_state != SMC_CLOSED) {
-			release_sock(sk);
-			sock_poll_wait(file, sk_sleep(sk), wait);
-			lock_sock(sk);
-		}
 		if (sk->sk_err)
 			mask |= EPOLLERR;
 		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
@@ -1625,7 +1619,7 @@ static const struct proto_ops smc_sock_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= smc_accept,
 	.getname	= smc_getname,
-	.poll		= smc_poll,
+	.poll_mask	= smc_poll_mask,
 	.ioctl		= smc_ioctl,
 	.listen		= smc_listen,
 	.shutdown	= smc_shutdown,
-- 
2.13.0

^ permalink raw reply related

* Re: [PATCH net] ipv6: allow PMTU exceptions to local routes
From: David Miller @ 2018-06-11 21:17 UTC (permalink / raw)
  To: ja; +Cc: netdev, kafai, kernel-team, lvs-devel
In-Reply-To: <20180610230254.6347-1-ja@ssi.bg>

From: Julian Anastasov <ja@ssi.bg>
Date: Mon, 11 Jun 2018 02:02:54 +0300

> IPVS setups with local client and remote tunnel server need
> to create exception for the local virtual IP. What we do is to
> change PMTU from 64KB (on "lo") to 1460 in the common case.
> 
> Suggested-by: Martin KaFai Lau <kafai@fb.com>
> Fixes: 45e4fd26683c ("ipv6: Only create RTF_CACHE routes after encountering pmtu exception")
> Fixes: 7343ff31ebf0 ("ipv6: Don't create clones of host routes.")
> Signed-off-by: Julian Anastasov <ja@ssi.bg>

Applied and queued up for -stable, thanks.

^ permalink raw reply

* Re: [PATCH] net: dsa: add error handling for pskb_trim_rcsum
From: David Miller @ 2018-06-11 21:20 UTC (permalink / raw)
  To: jiazhouyang09; +Cc: andrew, vivien.didelot, f.fainelli, netdev, linux-kernel
In-Reply-To: <1528694795-32000-1-git-send-email-jiazhouyang09@gmail.com>

From: Zhouyang Jia <jiazhouyang09@gmail.com>
Date: Mon, 11 Jun 2018 13:26:35 +0800

> When pskb_trim_rcsum fails, the lack of error-handling code may
> cause unexpected results.
> 
> This patch adds error-handling code after calling pskb_trim_rcsum.
> 
> Signed-off-by: Zhouyang Jia <jiazhouyang09@gmail.com>

Applied and queued up for -stable, thanks.

^ permalink raw reply

* [PATCH net] tls: fix NULL pointer dereference on poll
From: Daniel Borkmann @ 2018-06-11 21:22 UTC (permalink / raw)
  To: davem; +Cc: davejwatson, netdev, ast, Daniel Borkmann, Christoph Hellwig

While hacking on kTLS, I ran into the following panic from an
unprivileged netserver / netperf TCP session:

  BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
  PGD 800000037f378067 P4D 800000037f378067 PUD 3c0e61067 PMD 0
  Oops: 0010 [#1] SMP KASAN PTI
  CPU: 1 PID: 2289 Comm: netserver Not tainted 4.17.0+ #139
  Hardware name: LENOVO 20FBCTO1WW/20FBCTO1WW, BIOS N1FET47W (1.21 ) 11/28/2016
  RIP: 0010:          (null)
  Code: Bad RIP value.
  RSP: 0018:ffff88036abcf740 EFLAGS: 00010246
  RAX: dffffc0000000000 RBX: ffff88036f5f6800 RCX: 1ffff1006debed26
  RDX: ffff88036abcf920 RSI: ffff8803cb1a4f00 RDI: ffff8803c258c280
  RBP: ffff8803c258c280 R08: ffff8803c258c280 R09: ffffed006f559d48
  R10: ffff88037aacea43 R11: ffffed006f559d49 R12: ffff8803c258c280
  R13: ffff8803cb1a4f20 R14: 00000000000000db R15: ffffffffc168a350
  FS:  00007f7e631f4700(0000) GS:ffff8803d1c80000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: ffffffffffffffd6 CR3: 00000003ccf64005 CR4: 00000000003606e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
   ? tls_sw_poll+0xa4/0x160 [tls]
   ? sock_poll+0x20a/0x680
   ? do_select+0x77b/0x11a0
   ? poll_schedule_timeout.constprop.12+0x130/0x130
   ? pick_link+0xb00/0xb00
   ? read_word_at_a_time+0x13/0x20
   ? vfs_poll+0x270/0x270
   ? deref_stack_reg+0xad/0xe0
   ? __read_once_size_nocheck.constprop.6+0x10/0x10
  [...]

Debugging further, it turns out that calling into ctx->sk_poll() is
invalid since sk_poll itself is NULL which was saved from the original
TCP socket in order for tls_sw_poll() to invoke it.

Looks like the recent conversion from poll to poll_mask callback started
in 152524231023 ("net: add support for ->poll_mask in proto_ops") missed
to eventually convert kTLS, too: TCP's ->poll was converted over to the
->poll_mask in commit 2c7d3dacebd4 ("net/tcp: convert to ->poll_mask")
and therefore kTLS wrongly saved the ->poll old one which is now NULL.

Convert kTLS over to use ->poll_mask instead. Also instead of POLLIN |
POLLRDNORM use the proper EPOLLIN | EPOLLRDNORM bits as the case in
tcp_poll_mask() as well that is mangled here.

Fixes: 2c7d3dacebd4 ("net/tcp: convert to ->poll_mask")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Watson <davejwatson@fb.com>
---
 include/net/tls.h  |  6 ++----
 net/tls/tls_main.c |  2 +-
 net/tls/tls_sw.c   | 19 +++++++++----------
 3 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/include/net/tls.h b/include/net/tls.h
index 70c2737..7f84ea3 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -109,8 +109,7 @@ struct tls_sw_context_rx {
 
 	struct strparser strp;
 	void (*saved_data_ready)(struct sock *sk);
-	unsigned int (*sk_poll)(struct file *file, struct socket *sock,
-				struct poll_table_struct *wait);
+	__poll_t (*sk_poll_mask)(struct socket *sock, __poll_t events);
 	struct sk_buff *recv_pkt;
 	u8 control;
 	bool decrypted;
@@ -225,8 +224,7 @@ void tls_sw_free_resources_tx(struct sock *sk);
 void tls_sw_free_resources_rx(struct sock *sk);
 int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		   int nonblock, int flags, int *addr_len);
-unsigned int tls_sw_poll(struct file *file, struct socket *sock,
-			 struct poll_table_struct *wait);
+__poll_t tls_sw_poll_mask(struct socket *sock, __poll_t events);
 ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
 			   struct pipe_inode_info *pipe,
 			   size_t len, unsigned int flags);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 301f224..a127d61 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -712,7 +712,7 @@ static int __init tls_register(void)
 	build_protos(tls_prots[TLSV4], &tcp_prot);
 
 	tls_sw_proto_ops = inet_stream_ops;
-	tls_sw_proto_ops.poll = tls_sw_poll;
+	tls_sw_proto_ops.poll_mask = tls_sw_poll_mask;
 	tls_sw_proto_ops.splice_read = tls_sw_splice_read;
 
 #ifdef CONFIG_TLS_DEVICE
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 8ca57d0..34895b7 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -915,23 +915,22 @@ ssize_t tls_sw_splice_read(struct socket *sock,  loff_t *ppos,
 	return copied ? : err;
 }
 
-unsigned int tls_sw_poll(struct file *file, struct socket *sock,
-			 struct poll_table_struct *wait)
+__poll_t tls_sw_poll_mask(struct socket *sock, __poll_t events)
 {
-	unsigned int ret;
 	struct sock *sk = sock->sk;
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+	__poll_t mask;
 
-	/* Grab POLLOUT and POLLHUP from the underlying socket */
-	ret = ctx->sk_poll(file, sock, wait);
+	/* Grab EPOLLOUT and EPOLLHUP from the underlying socket */
+	mask = ctx->sk_poll_mask(sock, events);
 
-	/* Clear POLLIN bits, and set based on recv_pkt */
-	ret &= ~(POLLIN | POLLRDNORM);
+	/* Clear EPOLLIN bits, and set based on recv_pkt */
+	mask &= ~(EPOLLIN | EPOLLRDNORM);
 	if (ctx->recv_pkt)
-		ret |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
-	return ret;
+	return mask;
 }
 
 static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
@@ -1188,7 +1187,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 		sk->sk_data_ready = tls_data_ready;
 		write_unlock_bh(&sk->sk_callback_lock);
 
-		sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll;
+		sw_ctx_rx->sk_poll_mask = sk->sk_socket->ops->poll_mask;
 
 		strp_check_rcv(&sw_ctx_rx->strp);
 	}
-- 
2.9.5

^ permalink raw reply related

* Re: [PATCH] net: cxgb3: add error handling for some functions
From: David Miller @ 2018-06-11 21:23 UTC (permalink / raw)
  To: jiazhouyang09; +Cc: santosh, netdev, linux-kernel
In-Reply-To: <1528706547-35243-1-git-send-email-jiazhouyang09@gmail.com>

From: Zhouyang Jia <jiazhouyang09@gmail.com>
Date: Mon, 11 Jun 2018 16:42:26 +0800

> When sysfs_create_group or alloc_skb fails, the lack of error-handling
> code may cause unexpected results.
> 
> This patch adds error-handling code after the functions.
> 
> Signed-off-by: Zhouyang Jia <jiazhouyang09@gmail.com>

The ->nofail_skb handling is perfectly fine.

It is a backup SKB, meant to handle alloc_skb() failures for the
primary 'skb' allocation the _next_ time this function is called.

Even if the ->nofail_skb allocation fails, the primary operation
done by init_tp_parity() has succeeeded and we should return
success.

All of the code in this function is perfectly prepared to handle
the case where ->nofail_skb is NULL.

^ permalink raw reply

* Re: [PATCH] net: do not allow changing SO_REUSEADDR/SO_REUSEPORT on bound sockets
From: Marc Dionne @ 2018-06-11 21:25 UTC (permalink / raw)
  To: Maciej Żenczykowski
  Cc: Maciej Żenczykowski, David S . Miller, Eric Dumazet, netdev,
	Jeffrey Altman, Marc Dionne
In-Reply-To: <20180603174705.51802-1-zenczykowski@gmail.com>

On Sun, Jun 3, 2018 at 2:47 PM, Maciej Żenczykowski
<zenczykowski@gmail.com> wrote:
> From: Maciej Żenczykowski <maze@google.com>
>
> It is not safe to do so because such sockets are already in the
> hash tables and changing these options can result in invalidating
> the tb->fastreuse(port) caching.
>
> This can have later far reaching consequences wrt. bind conflict checks
> which rely on these caches (for optimization purposes).
>
> Not to mention that you can currently end up with two identical
> non-reuseport listening sockets bound to the same local ip:port
> by clearing reuseport on them after they've already both been bound.
>
> There is unfortunately no EISBOUND error or anything similar,
> and EISCONN seems to be misleading for a bound-but-not-connected
> socket, so use EUCLEAN 'Structure needs cleaning' which AFAICT
> is the closest you can get to meaning 'socket in bad state'.
> (although perhaps EINVAL wouldn't be a bad choice either?)
>
> This does unfortunately run the risk of breaking buggy
> userspace programs...

This change is a potential performance regression for us.  Our
networking code sets up multiple sockets used by multiple threads to
listen on the same udp port.  For the first socket, the bind is done
before setting SO_REUSEPORT so that we can get an error and detect
that the port is currently in use by a different process, possibly a
previous incarnation of the same application.

With this change, the servers end up with a single listener socket and
thread, which can have a large impact on performance for a busy
server.

While we can adjust for the new behaviour going forward, this could be
an unpleasant surprise for our customers who get this update when
moving to a new kernel version or through a backport to stable
kernels, if this is targeted for stable.

Marc

^ permalink raw reply

* Re: [RFC nf-next 0/5] netfilter: add ebpf translation infrastructure
From: Alexei Starovoitov @ 2018-06-11 22:12 UTC (permalink / raw)
  To: Florian Westphal
  Cc: netfilter-devel, ast, daniel, netdev, David S. Miller, ecree
In-Reply-To: <20180601153216.10901-1-fw@strlen.de>

On Fri, Jun 01, 2018 at 05:32:11PM +0200, Florian Westphal wrote:
> This patch series adds a JIT layer to translate nft expressions
> to ebpf programs.
> 
> From commit phase, spawn a userspace program (using recently added UMH
> infrastructure).
> 
> We then provide rules that came in this transaction to the helper via pipe,
> using same nf_tables netlink that nftables already uses.
> 
> The userspace helper translates the rules, and, if successful, installs the
> generated program(s) via bpf syscall.
> 
> For each rule a small response containing the corresponding epbf file
> descriptor (can be -1 on failure) and a attribute count (how many
> expressions were jitted) gets sent back to kernel via pipe.
> 
> If translation fails, the rule is will be processed by nf_tables
> interpreter (as before this patch).
> 
> If translation succeeded, nf_tables fetches the bpf program using the file
> descriptor identifier, allocates a new rule blob containing the new 'ebpf'
> expression (and possible trailing un-translated expressions).
> 
> It then replaces the original rule in the transaction log with the new
> 'ebpf-rule'.  The original rule is retained in a private area inside the epbf
> expression to be able to present the original expressions back to userspace
> on 'nft list ruleset'.
> 
> For easier review, this contains the kernel-side only.
> nf_tables_jit_work() will not do anything, yet.
> 
> Unresolved issues:
>  - maps and sets.
>    It might be possible to add a new ebpf map type that just wraps
>    the nft set infrastructure for lookups.
>    This would allow nft userspace to continue to work as-is while
>    not requiring new ebpf helper.
>    Anonymous set should be a lot easier as they're immutable
>    and could probably be handled already by existing infra.
> 
>  - BPF_PROG_RUN() is bolted into nft main loop via a middleman expression.
>    I'm also abusing skb->cb[] to pass network and transport header offsets.
>    Its not 'public' api so this can be changed later.
> 
>  - always uses BPF_PROG_TYPE_SCHED_CLS.
>    This is because it "works" for current RFC purposes.
> 
>  - we should eventually support translating multiple (adjacent) rules
>    into single program.
> 
>    If we do this kernel will need to track mapping of rules to
>    program (to re-jit when a rule is changed.  This isn't implemented
>    so far, but can be added later.  Alternatively, one could also add a
>    'readonly' table switch to just prevent further updates.
> 
>    We will also need to dump the 'next' generation of the
>    to-be-translated table.  The kernel has this information, so its only
>    a matter of serializing it back to userspace from the commit phase.
> 
> The jitter is still limited.  So far it supports:
> 
>  * payload expression for network and transport header
>  * meta mark, nfproto, l4proto
>  * 32 bit immediates
>  * 32 bit bitmask ops
>  * accept/drop verdicts
> 
> As this uses netlink, there is also no technical requirement for
> libnftnl, its simply used here for convienience.
> 
> It doesn't need any userspace changes. Patches for libnftnl and nftables
> make debug info available (e.g. to map rule to its bpf prog id).
> 
> Comments welcome.

The implementation of patch 5 looks good to me, but I'm concerned with
patch 2 that adds 'ebpf expression' to nft. I see no reason to do so.
It seems existing support for infinite number of nft expressions is
used as a way to execute infinite number of bpf programs sequentially.
I don't think it was a scalable approach before and won't scale in the future.
I think the algorithm should consider all nft rules at once and generate
a program or two that will execute fast even when number of rules is large.
We have the same scalability issue with bpfilter RFC patches. That's why
they're still in RFC stage, since we need to figure out a way to support
thousands of iptable rules in scalable way.
There are papers on scalable packet classification algorithms that
use decision trees (hicuts, hypercuts, efficuts, etc)
Imo that is the direction should we should be looking at.
If we implement one of the algorithms as a tree(trie) with a generic lookup
it will be usuable from bpf(bpfilter), from XDP, and other places
inside the kernel.
We can even have multiple algorithms implemented and pick and choose
depending on the size of ruleset and its properties, since one size
doesn't always fit all.
I'm imagining umh will be doing iptables->trie+bpf conversion and
nft->trie+bpf conversion where bpf progs will be dealing with pieces
of logic that don't fit into trie lookup and provide generic mechanism
for parsing the packet in the specific way suited for trie lookup
for the given ruleset. The trie will be sized differently depending
on tuples needed in the lookup. Like if there is no ipv6 in the ruleset
the bpf prog won't be parsing that to prepare a tuple for given trie.
Just like bpf hash map can be of different key/value size, this new
trie will be customized for specific ruleset on the fly by umh.
At the end the trie lookup is fully generic and bpf progs before
and after are generic as well.
imo this way majority of iptables/nft rules can be converted and
performance will be great even with large rulesets.

^ permalink raw reply

* Re: [PATCH bpf] xsk: silence warning on memory allocation failure
From: Daniel Borkmann @ 2018-06-11 22:27 UTC (permalink / raw)
  To: Björn Töpel, magnus.karlsson, magnus.karlsson, ast,
	netdev
  Cc: Björn Töpel, penguin-kernel, syzkaller-bugs,
	syzbot+4abadc5d69117b346506
In-Reply-To: <20180611115712.9680-1-bjorn.topel@gmail.com>

On 06/11/2018 01:57 PM, Björn Töpel wrote:
> From: Björn Töpel <bjorn.topel@intel.com>
> 
> syzkaller reported a warning from xdp_umem_pin_pages():
> 
>   WARNING: CPU: 1 PID: 4537 at mm/slab_common.c:996 kmalloc_slab+0x56/0x70 mm/slab_common.c:996
>   ...
>   __do_kmalloc mm/slab.c:3713 [inline]
>   __kmalloc+0x25/0x760 mm/slab.c:3727
>   kmalloc_array include/linux/slab.h:634 [inline]
>   kcalloc include/linux/slab.h:645 [inline]
>   xdp_umem_pin_pages net/xdp/xdp_umem.c:205 [inline]
>   xdp_umem_reg net/xdp/xdp_umem.c:318 [inline]
>   xdp_umem_create+0x5c9/0x10f0 net/xdp/xdp_umem.c:349
>   xsk_setsockopt+0x443/0x550 net/xdp/xsk.c:531
>   __sys_setsockopt+0x1bd/0x390 net/socket.c:1935
>   __do_sys_setsockopt net/socket.c:1946 [inline]
>   __se_sys_setsockopt net/socket.c:1943 [inline]
>   __x64_sys_setsockopt+0xbe/0x150 net/socket.c:1943
>   do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
>   entry_SYSCALL_64_after_hwframe+0x49/0xbe
> 
> This is a warning about attempting to allocate more than
> KMALLOC_MAX_SIZE memory. The request originates from userspace, and if
> the request is too big, the kernel is free to deny its allocation. In
> this patch, the failed allocation attempt is silenced with
> __GFP_NOWARN.
> 
> Fixes: c0c77d8fb787 ("xsk: add user memory registration support sockopt")
> Reported-by: syzbot+4abadc5d69117b346506@syzkaller.appspotmail.com
> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>

Applied to bpf, thanks Björn!

^ permalink raw reply

* Re: [PATCH net] tls: fix NULL pointer dereference on poll
From: Dave Watson @ 2018-06-11 22:27 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: davem, netdev, ast, Christoph Hellwig
In-Reply-To: <20180611212204.24002-1-daniel@iogearbox.net>

On 06/11/18 11:22 PM, Daniel Borkmann wrote:
> While hacking on kTLS, I ran into the following panic from an
> unprivileged netserver / netperf TCP session:
> 
>   [...]
> Looks like the recent conversion from poll to poll_mask callback started
> in 152524231023 ("net: add support for ->poll_mask in proto_ops") missed
> to eventually convert kTLS, too: TCP's ->poll was converted over to the
> ->poll_mask in commit 2c7d3dacebd4 ("net/tcp: convert to ->poll_mask")
> and therefore kTLS wrongly saved the ->poll old one which is now NULL.
> 
> Convert kTLS over to use ->poll_mask instead. Also instead of POLLIN |
> POLLRDNORM use the proper EPOLLIN | EPOLLRDNORM bits as the case in
> tcp_poll_mask() as well that is mangled here.

Thanks, was just trying to bisect this myself. Works for me. 

Tested-by: Dave Watson <davejwatson@fb.com>

> Fixes: 2c7d3dacebd4 ("net/tcp: convert to ->poll_mask")
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Dave Watson <davejwatson@fb.com>

^ permalink raw reply

* Re: [PATCH] net: do not allow changing SO_REUSEADDR/SO_REUSEPORT on bound sockets
From: Maciej Żenczykowski @ 2018-06-11 22:29 UTC (permalink / raw)
  To: Marc Dionne
  Cc: David S . Miller, Eric Dumazet, netdev, Jeffrey Altman,
	Marc Dionne
In-Reply-To: <CAB9dFdueWF6iXdqC+uN5pmfs5Z6hdb9b-Zq0g+dUtX20cMuWSg@mail.gmail.com>

> This change is a potential performance regression for us.  Our
> networking code sets up multiple sockets used by multiple threads to
> listen on the same udp port.  For the first socket, the bind is done
> before setting SO_REUSEPORT so that we can get an error and detect
> that the port is currently in use by a different process, possibly a
> previous incarnation of the same application.
>
> With this change, the servers end up with a single listener socket and
> thread, which can have a large impact on performance for a busy
> server.
>
> While we can adjust for the new behaviour going forward, this could be
> an unpleasant surprise for our customers who get this update when
> moving to a new kernel version or through a backport to stable
> kernels, if this is targeted for stable.

Probably you can:
fd1=socket()
fd2=socket()
bind(fd1,port=0)
setsockopt(fd2,REUSEPORT,1)
port = getsockname(fd1)
close(fd1)
bind(fd2,port)

Although yeah there's a slight chance of a race (ie. 2nd bind failing,
in which case close() and retry).

^ permalink raw reply

* Re: [PATCH bpf] xsk: silence warning on memory allocation failure
From: Y Song @ 2018-06-11 22:44 UTC (permalink / raw)
  To: Björn Töpel
  Cc: magnus.karlsson, magnus.karlsson, Alexei Starovoitov,
	Daniel Borkmann, netdev, Björn Töpel, Tetsuo Handa,
	syzkaller-bugs, syzbot+4abadc5d69117b346506
In-Reply-To: <20180611115712.9680-1-bjorn.topel@gmail.com>

On Mon, Jun 11, 2018 at 4:57 AM, Björn Töpel <bjorn.topel@gmail.com> wrote:
> From: Björn Töpel <bjorn.topel@intel.com>
>
> syzkaller reported a warning from xdp_umem_pin_pages():
>
>   WARNING: CPU: 1 PID: 4537 at mm/slab_common.c:996 kmalloc_slab+0x56/0x70 mm/slab_common.c:996
>   ...
>   __do_kmalloc mm/slab.c:3713 [inline]
>   __kmalloc+0x25/0x760 mm/slab.c:3727
>   kmalloc_array include/linux/slab.h:634 [inline]
>   kcalloc include/linux/slab.h:645 [inline]
>   xdp_umem_pin_pages net/xdp/xdp_umem.c:205 [inline]
>   xdp_umem_reg net/xdp/xdp_umem.c:318 [inline]
>   xdp_umem_create+0x5c9/0x10f0 net/xdp/xdp_umem.c:349
>   xsk_setsockopt+0x443/0x550 net/xdp/xsk.c:531
>   __sys_setsockopt+0x1bd/0x390 net/socket.c:1935
>   __do_sys_setsockopt net/socket.c:1946 [inline]
>   __se_sys_setsockopt net/socket.c:1943 [inline]
>   __x64_sys_setsockopt+0xbe/0x150 net/socket.c:1943
>   do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
>   entry_SYSCALL_64_after_hwframe+0x49/0xbe
>
> This is a warning about attempting to allocate more than
> KMALLOC_MAX_SIZE memory. The request originates from userspace, and if
> the request is too big, the kernel is free to deny its allocation. In
> this patch, the failed allocation attempt is silenced with
> __GFP_NOWARN.
>
> Fixes: c0c77d8fb787 ("xsk: add user memory registration support sockopt")
> Reported-by: syzbot+4abadc5d69117b346506@syzkaller.appspotmail.com
> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>

Acked-by: Yonghong Song <yhs@fb.com>

^ permalink raw reply

* Re: Qualcomm rmnet driver and qmi_wwan
From: Subash Abhinov Kasiviswanathan @ 2018-06-11 23:00 UTC (permalink / raw)
  To: Bjørn Mork, Daniele Palmas; +Cc: Dan Williams, netdev
In-Reply-To: <87zi01w753.fsf@miraculix.mork.no>

> both patches work properly for me.
> 
> Maybe it could be helpful in the first patch to add a print when
> pass-through setting fails if raw-ip has not been set, just to let the
> user know what is happening.
> 

Thanks for testing Daniele.
I can add an error message there when pass through mode setting fails.
Will you be submitting the patch for iproute2. Otherwise, I can do so
a bit later.

> Maybe rmnet_is_real_dev_registered(dev->net) instead, since we use that
> elsewhere in this function?
> 
> Like Daniele said: It would be good to have some way to know when the
> rawip condition fails.  Or even better: Automatically force rawip mode
> when the rmnet driver attaches.  But that doesn't seem possible?  No
> notifications or anything when an rx handler is registered?
> 
> Hmm, looking at this I wonder: Is the rawip check really necessary?  
> You
> skip all the extra rawip code in the driver anyway, so I don't see how
> it matters.  But maybe the ethernet header_ops are a problem?
> 
> And I wonder about using skb->dev here.  Does that really work?  I
> didn't think we set that until later.  Why not use dev->net instead?
> 
> 

Hi Bjørn

Yes, I will switch to using dev->net.
There doesnt seem to be a net device notifier event when the rx 
registration
happens.

If the dev type is ethernet, rmnet driver will try to remove the first 
14
bytes since it assumes an ethernet header is present in the packet. 
Hence the
need for raw ip mode in qmi_wwan.

-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project

^ permalink raw reply

* Re: [PATCH] net: do not allow changing SO_REUSEADDR/SO_REUSEPORT on bound sockets
From: Marc Dionne @ 2018-06-11 23:09 UTC (permalink / raw)
  To: Maciej Żenczykowski
  Cc: David S . Miller, Eric Dumazet, netdev, Jeffrey Altman,
	Marc Dionne
In-Reply-To: <CANP3RGc=hsJ8VmB28ZgN_iBpvC0Ck3_HSctYQMjioZngNKnC7g@mail.gmail.com>

On Mon, Jun 11, 2018 at 7:29 PM, Maciej Żenczykowski
<zenczykowski@gmail.com> wrote:
>> This change is a potential performance regression for us.  Our
>> networking code sets up multiple sockets used by multiple threads to
>> listen on the same udp port.  For the first socket, the bind is done
>> before setting SO_REUSEPORT so that we can get an error and detect
>> that the port is currently in use by a different process, possibly a
>> previous incarnation of the same application.
>>
>> With this change, the servers end up with a single listener socket and
>> thread, which can have a large impact on performance for a busy
>> server.
>>
>> While we can adjust for the new behaviour going forward, this could be
>> an unpleasant surprise for our customers who get this update when
>> moving to a new kernel version or through a backport to stable
>> kernels, if this is targeted for stable.
>
> Probably you can:
> fd1=socket()
> fd2=socket()
> bind(fd1,port=0)
> setsockopt(fd2,REUSEPORT,1)
> port = getsockname(fd1)
> close(fd1)
> bind(fd2,port)
>
> Although yeah there's a slight chance of a race (ie. 2nd bind failing,
> in which case close() and retry).

This would be a bit different since we want a specific port rather
than a wildcard, but yes, a temporary socket could be bound just to
check if the port is in use and closed afterwards.  The problem is
that since fd2 has REUSEPORT set, that second bind might well succeed
if there was a race and the other user of the port also has REUSEPORT
set, as could be the case if it's another instance of the same
application.

Marc

^ permalink raw reply

* Re: [Intel-wired-lan] [jkirsher/next-queue PATCH 5/7] net: Add generic ndo_select_queue functions
From: kbuild test robot @ 2018-06-11 23:10 UTC (permalink / raw)
  To: Alexander Duyck; +Cc: kbuild-all, netdev, intel-wired-lan, jeffrey.t.kirsher
In-Reply-To: <20180611174119.41352.28975.stgit@ahduyck-green-test.jf.intel.com>

[-- Attachment #1: Type: text/plain, Size: 2094 bytes --]

Hi Alexander,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on net-next/master]
[also build test ERROR on next-20180608]
[cannot apply to v4.17]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Alexander-Duyck/Add-support-for-L2-Fwd-Offload-w-o-ndo_select_queue/20180612-015220
config: arm-allmodconfig (attached as .config)
compiler: arm-linux-gnueabi-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=7.2.0 make.cross ARCH=arm 

All errors (new ones prefixed by >>):

>> drivers/net//ethernet/ti/netcp_core.c:1968:22: error: initialization from incompatible pointer type [-Werror=incompatible-pointer-types]
     .ndo_select_queue = dev_pick_tx_zero,
                         ^~~~~~~~~~~~~~~~
   drivers/net//ethernet/ti/netcp_core.c:1968:22: note: (near initialization for 'netcp_netdev_ops.ndo_select_queue')
   cc1: some warnings being treated as errors

vim +1968 drivers/net//ethernet/ti/netcp_core.c

  1955	
  1956	static const struct net_device_ops netcp_netdev_ops = {
  1957		.ndo_open		= netcp_ndo_open,
  1958		.ndo_stop		= netcp_ndo_stop,
  1959		.ndo_start_xmit		= netcp_ndo_start_xmit,
  1960		.ndo_set_rx_mode	= netcp_set_rx_mode,
  1961		.ndo_do_ioctl           = netcp_ndo_ioctl,
  1962		.ndo_get_stats64        = netcp_get_stats,
  1963		.ndo_set_mac_address	= eth_mac_addr,
  1964		.ndo_validate_addr	= eth_validate_addr,
  1965		.ndo_vlan_rx_add_vid	= netcp_rx_add_vid,
  1966		.ndo_vlan_rx_kill_vid	= netcp_rx_kill_vid,
  1967		.ndo_tx_timeout		= netcp_ndo_tx_timeout,
> 1968		.ndo_select_queue	= dev_pick_tx_zero,
  1969		.ndo_setup_tc		= netcp_setup_tc,
  1970	};
  1971	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 65910 bytes --]

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox