Netdev List
 help / color / mirror / Atom feed
* [PATCH v5 17/17] net: qualcomm: add QCA7000 UART driver
From: Stefan Wahren @ 2017-05-10  8:53 UTC (permalink / raw)
  To: Rob Herring, David S. Miller
  Cc: Mark Rutland, Greg Kroah-Hartman, Jiri Slaby, linux-serial,
	linux-kernel, netdev, devicetree, Stefan Wahren
In-Reply-To: <1494406408-31760-1-git-send-email-stefan.wahren@i2se.com>

This patch adds the Ethernet over UART driver for the
Qualcomm QCA7000 HomePlug GreenPHY.

Signed-off-by: Stefan Wahren <stefan.wahren@i2se.com>
---
 drivers/net/ethernet/qualcomm/Kconfig         |  16 +
 drivers/net/ethernet/qualcomm/Makefile        |   2 +
 drivers/net/ethernet/qualcomm/qca_7k_common.h |   6 +
 drivers/net/ethernet/qualcomm/qca_uart.c      | 423 ++++++++++++++++++++++++++
 4 files changed, 447 insertions(+)
 create mode 100644 drivers/net/ethernet/qualcomm/qca_uart.c

diff --git a/drivers/net/ethernet/qualcomm/Kconfig b/drivers/net/ethernet/qualcomm/Kconfig
index b4c369d..877675a 100644
--- a/drivers/net/ethernet/qualcomm/Kconfig
+++ b/drivers/net/ethernet/qualcomm/Kconfig
@@ -30,6 +30,22 @@ config QCA7000_SPI
 	  To compile this driver as a module, choose M here. The module
 	  will be called qcaspi.
 
+config QCA7000_UART
+	tristate "Qualcomm Atheros QCA7000 UART support"
+	select QCA7000
+	depends on SERIAL_DEV_BUS && OF
+	---help---
+	  This UART protocol driver supports the Qualcomm Atheros QCA7000.
+
+	  Currently the driver assumes these device UART settings:
+	    Data bits: 8
+	    Parity: None
+	    Stop bits: 1
+	    Flow control: None
+
+	  To compile this driver as a module, choose M here. The module
+	  will be called qcauart.
+
 config QCOM_EMAC
 	tristate "Qualcomm Technologies, Inc. EMAC Gigabit Ethernet support"
 	depends on HAS_DMA && HAS_IOMEM
diff --git a/drivers/net/ethernet/qualcomm/Makefile b/drivers/net/ethernet/qualcomm/Makefile
index 65556ca..92fa7c4 100644
--- a/drivers/net/ethernet/qualcomm/Makefile
+++ b/drivers/net/ethernet/qualcomm/Makefile
@@ -5,5 +5,7 @@
 obj-$(CONFIG_QCA7000) += qca_7k_common.o
 obj-$(CONFIG_QCA7000_SPI) += qcaspi.o
 qcaspi-objs := qca_7k.o qca_debug.o qca_spi.o
+obj-$(CONFIG_QCA7000_UART) += qcauart.o
+qcauart-objs := qca_uart.o
 
 obj-y += emac/
diff --git a/drivers/net/ethernet/qualcomm/qca_7k_common.h b/drivers/net/ethernet/qualcomm/qca_7k_common.h
index 07bdd6c..928554f 100644
--- a/drivers/net/ethernet/qualcomm/qca_7k_common.h
+++ b/drivers/net/ethernet/qualcomm/qca_7k_common.h
@@ -122,6 +122,12 @@ static inline void qcafrm_fsm_init_spi(struct qcafrm_handle *handle)
 	handle->state = handle->init;
 }
 
+static inline void qcafrm_fsm_init_uart(struct qcafrm_handle *handle)
+{
+	handle->init = QCAFRM_WAIT_AA1;
+	handle->state = handle->init;
+}
+
 /*   Gather received bytes and try to extract a full Ethernet frame
  *   by following a simple state machine.
  *
diff --git a/drivers/net/ethernet/qualcomm/qca_uart.c b/drivers/net/ethernet/qualcomm/qca_uart.c
new file mode 100644
index 0000000..1f6514c
--- /dev/null
+++ b/drivers/net/ethernet/qualcomm/qca_uart.c
@@ -0,0 +1,423 @@
+/*
+ *   Copyright (c) 2011, 2012, Qualcomm Atheros Communications Inc.
+ *   Copyright (c) 2017, I2SE GmbH
+ *
+ *   Permission to use, copy, modify, and/or distribute this software
+ *   for any purpose with or without fee is hereby granted, provided
+ *   that the above copyright notice and this permission notice appear
+ *   in all copies.
+ *
+ *   THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
+ *   WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
+ *   WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
+ *   THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
+ *   CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
+ *   LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
+ *   NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ *   CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*   This module implements the Qualcomm Atheros UART protocol for
+ *   kernel-based UART device; it is essentially an Ethernet-to-UART
+ *   serial converter;
+ */
+
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_net.h>
+#include <linux/sched.h>
+#include <linux/serdev.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+
+#include "qca_7k_common.h"
+
+#define QCAUART_DRV_VERSION "0.1.0"
+#define QCAUART_DRV_NAME "qcauart"
+#define QCAUART_TX_TIMEOUT (1 * HZ)
+
+struct qcauart {
+	struct net_device *net_dev;
+	spinlock_t lock;			/* transmit lock */
+	struct work_struct tx_work;		/* Flushes transmit buffer   */
+
+	struct serdev_device *serdev;
+	struct qcafrm_handle frm_handle;
+	struct sk_buff *rx_skb;
+
+	unsigned char *tx_head;			/* pointer to next XMIT byte */
+	int tx_left;				/* bytes left in XMIT queue  */
+	unsigned char *tx_buffer;
+};
+
+static int
+qca_tty_receive(struct serdev_device *serdev, const unsigned char *data,
+		size_t count)
+{
+	struct qcauart *qca = serdev_device_get_drvdata(serdev);
+	struct net_device *netdev = qca->net_dev;
+	struct net_device_stats *n_stats = &netdev->stats;
+	size_t i;
+
+	if (!qca->rx_skb) {
+		qca->rx_skb = netdev_alloc_skb_ip_align(netdev,
+							netdev->mtu +
+							VLAN_ETH_HLEN);
+		if (!qca->rx_skb) {
+			n_stats->rx_errors++;
+			n_stats->rx_dropped++;
+			return 0;
+		}
+	}
+
+	for (i = 0; i < count; i++) {
+		s32 retcode;
+
+		retcode = qcafrm_fsm_decode(&qca->frm_handle,
+					    qca->rx_skb->data,
+					    skb_tailroom(qca->rx_skb),
+					    data[i]);
+
+		switch (retcode) {
+		case QCAFRM_GATHER:
+		case QCAFRM_NOHEAD:
+			break;
+		case QCAFRM_NOTAIL:
+			netdev_dbg(netdev, "recv: no RX tail\n");
+			n_stats->rx_errors++;
+			n_stats->rx_dropped++;
+			break;
+		case QCAFRM_INVLEN:
+			netdev_dbg(netdev, "recv: invalid RX length\n");
+			n_stats->rx_errors++;
+			n_stats->rx_dropped++;
+			break;
+		default:
+			n_stats->rx_packets++;
+			n_stats->rx_bytes += retcode;
+			skb_put(qca->rx_skb, retcode);
+			qca->rx_skb->protocol = eth_type_trans(
+						qca->rx_skb, qca->rx_skb->dev);
+			qca->rx_skb->ip_summed = CHECKSUM_UNNECESSARY;
+			netif_rx_ni(qca->rx_skb);
+			qca->rx_skb = netdev_alloc_skb_ip_align(netdev,
+								netdev->mtu +
+								VLAN_ETH_HLEN);
+			if (!qca->rx_skb) {
+				netdev_dbg(netdev, "recv: out of RX resources\n");
+				n_stats->rx_errors++;
+				return i;
+			}
+		}
+	}
+
+	return i;
+}
+
+/* Write out any remaining transmit buffer. Scheduled when tty is writable */
+static void qcauart_transmit(struct work_struct *work)
+{
+	struct qcauart *qca = container_of(work, struct qcauart, tx_work);
+	struct net_device_stats *n_stats = &qca->net_dev->stats;
+	int written;
+
+	spin_lock_bh(&qca->lock);
+
+	/* First make sure we're connected. */
+	if (!netif_running(qca->net_dev)) {
+		spin_unlock_bh(&qca->lock);
+		return;
+	}
+
+	if (qca->tx_left <= 0)  {
+		/* Now serial buffer is almost free & we can start
+		 * transmission of another packet
+		 */
+		n_stats->tx_packets++;
+		spin_unlock_bh(&qca->lock);
+		netif_wake_queue(qca->net_dev);
+		return;
+	}
+
+	written = serdev_device_write_buf(qca->serdev, qca->tx_head,
+					  qca->tx_left);
+	if (written > 0) {
+		qca->tx_left -= written;
+		qca->tx_head += written;
+	}
+	spin_unlock_bh(&qca->lock);
+}
+
+/* Called by the driver when there's room for more data.
+ * Schedule the transmit.
+ */
+static void qca_tty_wakeup(struct serdev_device *serdev)
+{
+	struct qcauart *qca = serdev_device_get_drvdata(serdev);
+
+	schedule_work(&qca->tx_work);
+}
+
+static struct serdev_device_ops qca_serdev_ops = {
+	.receive_buf = qca_tty_receive,
+	.write_wakeup = qca_tty_wakeup,
+};
+
+static int qcauart_netdev_open(struct net_device *dev)
+{
+	struct qcauart *qca = netdev_priv(dev);
+
+	netif_start_queue(qca->net_dev);
+
+	return 0;
+}
+
+static int qcauart_netdev_close(struct net_device *dev)
+{
+	struct qcauart *qca = netdev_priv(dev);
+
+	spin_lock_bh(&qca->lock);
+	netif_stop_queue(dev);
+	qca->tx_left = 0;
+	spin_unlock_bh(&qca->lock);
+
+	return 0;
+}
+
+static netdev_tx_t
+qcauart_netdev_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct net_device_stats *n_stats = &dev->stats;
+	struct qcauart *qca = netdev_priv(dev);
+	u8 pad_len = 0;
+	int written;
+	u8 *pos;
+
+	spin_lock(&qca->lock);
+
+	WARN_ON(qca->tx_left);
+
+	if (!netif_running(dev))  {
+		spin_unlock(&qca->lock);
+		netdev_warn(qca->net_dev, "xmit: iface is down\n");
+		goto out;
+	}
+
+	pos = qca->tx_buffer;
+
+	if (skb->len < QCAFRM_MIN_LEN)
+		pad_len = QCAFRM_MIN_LEN - skb->len;
+
+	pos += qcafrm_create_header(pos, skb->len + pad_len);
+
+	memcpy(pos, skb->data, skb->len);
+	pos += skb->len;
+
+	if (pad_len) {
+		memset(pos, 0, pad_len);
+		pos += pad_len;
+	}
+
+	pos += qcafrm_create_footer(pos);
+
+	netif_stop_queue(qca->net_dev);
+
+	written = serdev_device_write_buf(qca->serdev, qca->tx_buffer,
+					  pos - qca->tx_buffer);
+	if (written > 0) {
+		qca->tx_left = (pos - qca->tx_buffer) - written;
+		qca->tx_head = qca->tx_buffer + written;
+		n_stats->tx_bytes += written;
+	}
+	spin_unlock(&qca->lock);
+
+	netif_trans_update(dev);
+out:
+	kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+static void qcauart_netdev_tx_timeout(struct net_device *dev)
+{
+	struct qcauart *qca = netdev_priv(dev);
+
+	netdev_info(qca->net_dev, "Transmit timeout at %ld, latency %ld\n",
+		    jiffies, dev_trans_start(dev));
+	dev->stats.tx_errors++;
+	dev->stats.tx_dropped++;
+}
+
+static int qcauart_netdev_init(struct net_device *dev)
+{
+	struct qcauart *qca = netdev_priv(dev);
+	size_t len;
+
+	/* Finish setting up the device info. */
+	dev->mtu = QCAFRM_MAX_MTU;
+	dev->type = ARPHRD_ETHER;
+
+	qca->rx_skb = netdev_alloc_skb_ip_align(qca->net_dev,
+						qca->net_dev->mtu +
+						VLAN_ETH_HLEN);
+	if (!qca->rx_skb)
+		return -ENOBUFS;
+
+	len = QCAFRM_HEADER_LEN + QCAFRM_MAX_LEN + QCAFRM_FOOTER_LEN;
+	qca->tx_buffer = kmalloc(len, GFP_KERNEL);
+	if (!qca->tx_buffer)
+		return -ENOBUFS;
+
+	return 0;
+}
+
+static void qcauart_netdev_uninit(struct net_device *dev)
+{
+	struct qcauart *qca = netdev_priv(dev);
+
+	kfree(qca->tx_buffer);
+	if (qca->rx_skb)
+		dev_kfree_skb(qca->rx_skb);
+}
+
+static const struct net_device_ops qcauart_netdev_ops = {
+	.ndo_init = qcauart_netdev_init,
+	.ndo_uninit = qcauart_netdev_uninit,
+	.ndo_open = qcauart_netdev_open,
+	.ndo_stop = qcauart_netdev_close,
+	.ndo_start_xmit = qcauart_netdev_xmit,
+	.ndo_set_mac_address = eth_mac_addr,
+	.ndo_tx_timeout = qcauart_netdev_tx_timeout,
+	.ndo_validate_addr = eth_validate_addr,
+};
+
+static void qcauart_netdev_setup(struct net_device *dev)
+{
+	struct qcauart *qca;
+
+	dev->netdev_ops = &qcauart_netdev_ops;
+	dev->watchdog_timeo = QCAUART_TX_TIMEOUT;
+	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+	dev->tx_queue_len = 100;
+
+	/* MTU range: 46 - 1500 */
+	dev->min_mtu = QCAFRM_MIN_MTU;
+	dev->max_mtu = QCAFRM_MAX_MTU;
+
+	qca = netdev_priv(dev);
+	memset(qca, 0, sizeof(struct qcauart));
+}
+
+static const struct of_device_id qca_uart_of_match[] = {
+	{
+	 .compatible = "qca,qca7000-uart",
+	},
+	{}
+};
+MODULE_DEVICE_TABLE(of, qca_uart_of_match);
+
+static int qca_uart_probe(struct serdev_device *serdev)
+{
+	struct net_device *qcauart_dev = alloc_etherdev(sizeof(struct qcauart));
+	struct qcauart *qca;
+	const char *mac;
+	u32 speed = 115200;
+	int ret;
+
+	if (!qcauart_dev)
+		return -ENOMEM;
+
+	qcauart_netdev_setup(qcauart_dev);
+
+	qca = netdev_priv(qcauart_dev);
+	if (!qca) {
+		pr_err("qca_uart: Fail to retrieve private structure\n");
+		ret = -ENOMEM;
+		goto free;
+	}
+	qca->net_dev = qcauart_dev;
+	qca->serdev = serdev;
+	qcafrm_fsm_init_uart(&qca->frm_handle);
+
+	spin_lock_init(&qca->lock);
+	INIT_WORK(&qca->tx_work, qcauart_transmit);
+
+	of_property_read_u32(serdev->dev.of_node, "current-speed", &speed);
+
+	mac = of_get_mac_address(serdev->dev.of_node);
+
+	if (mac)
+		ether_addr_copy(qca->net_dev->dev_addr, mac);
+
+	if (!is_valid_ether_addr(qca->net_dev->dev_addr)) {
+		eth_hw_addr_random(qca->net_dev);
+		dev_info(&serdev->dev, "Using random MAC address: %pM\n",
+			 qca->net_dev->dev_addr);
+	}
+
+	netif_carrier_on(qca->net_dev);
+	serdev_device_set_drvdata(serdev, qca);
+	serdev_device_set_client_ops(serdev, &qca_serdev_ops);
+
+	ret = serdev_device_open(serdev);
+	if (ret) {
+		dev_err(&serdev->dev, "Unable to open device %s\n",
+			qcauart_dev->name);
+		goto free;
+	}
+
+	speed = serdev_device_set_baudrate(serdev, speed);
+	dev_info(&serdev->dev, "Using baudrate: %u\n", speed);
+
+	serdev_device_set_flow_control(serdev, false);
+
+	ret = register_netdev(qcauart_dev);
+	if (ret) {
+		dev_err(&serdev->dev, "Unable to register net device %s\n",
+			qcauart_dev->name);
+		goto free;
+	}
+
+	return 0;
+
+free:
+	free_netdev(qcauart_dev);
+	return ret;
+}
+
+static void qca_uart_remove(struct serdev_device *serdev)
+{
+	struct qcauart *qca = serdev_device_get_drvdata(serdev);
+
+	/* Flush any pending characters in the driver. */
+	serdev_device_close(serdev);
+
+	netif_carrier_off(qca->net_dev);
+	unregister_netdev(qca->net_dev);
+	free_netdev(qca->net_dev);
+}
+
+static struct serdev_device_driver qca_uart_driver = {
+	.probe = qca_uart_probe,
+	.remove = qca_uart_remove,
+	.driver = {
+		.name = QCAUART_DRV_NAME,
+		.of_match_table = of_match_ptr(qca_uart_of_match),
+	},
+};
+
+module_serdev_device_driver(qca_uart_driver);
+
+MODULE_DESCRIPTION("Qualcomm Atheros QCA7000 UART Driver");
+MODULE_AUTHOR("Qualcomm Atheros Communications");
+MODULE_AUTHOR("Stefan Wahren <stefan.wahren@i2se.com>");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(QCAUART_DRV_VERSION);
-- 
2.1.4

^ permalink raw reply related

* [PATCH v5 10/17] net: qualcomm: prepare frame decoding for UART driver
From: Stefan Wahren @ 2017-05-10  8:53 UTC (permalink / raw)
  To: Rob Herring, David S. Miller
  Cc: Mark Rutland, Greg Kroah-Hartman, Jiri Slaby, linux-serial,
	linux-kernel, netdev, devicetree, Stefan Wahren
In-Reply-To: <1494406408-31760-1-git-send-email-stefan.wahren@i2se.com>

Unfortunately the frame format is not exactly identical between SPI
and UART. In case of SPI there is an additional HW length at the
beginning. So store the initial state to make the decoding state machine
more flexible and easy to extend for UART support.

Signed-off-by: Stefan Wahren <stefan.wahren@i2se.com>
---
 drivers/net/ethernet/qualcomm/qca_7k_common.c | 12 ++++++------
 drivers/net/ethernet/qualcomm/qca_7k_common.h |  8 ++++++--
 drivers/net/ethernet/qualcomm/qca_spi.c       |  2 +-
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/qca_7k_common.c b/drivers/net/ethernet/qualcomm/qca_7k_common.c
index 6d17fbd..0d3daa9 100644
--- a/drivers/net/ethernet/qualcomm/qca_7k_common.c
+++ b/drivers/net/ethernet/qualcomm/qca_7k_common.c
@@ -83,7 +83,7 @@ qcafrm_fsm_decode(struct qcafrm_handle *handle, u8 *buf, u16 buf_len, u8 recv_by
 
 		if (recv_byte != 0x00) {
 			/* first two bytes of length must be 0 */
-			handle->state = QCAFRM_HW_LEN0;
+			handle->state = handle->init;
 		}
 		break;
 	case QCAFRM_HW_LEN2:
@@ -97,7 +97,7 @@ qcafrm_fsm_decode(struct qcafrm_handle *handle, u8 *buf, u16 buf_len, u8 recv_by
 	case QCAFRM_WAIT_AA4:
 		if (recv_byte != 0xAA) {
 			ret = QCAFRM_NOHEAD;
-			handle->state = QCAFRM_HW_LEN0;
+			handle->state = handle->init;
 		} else {
 			handle->state--;
 		}
@@ -119,7 +119,7 @@ qcafrm_fsm_decode(struct qcafrm_handle *handle, u8 *buf, u16 buf_len, u8 recv_by
 		len = handle->offset;
 		if (len > buf_len || len < QCAFRM_MIN_LEN) {
 			ret = QCAFRM_INVLEN;
-			handle->state = QCAFRM_HW_LEN0;
+			handle->state = handle->init;
 		} else {
 			handle->state = (enum qcafrm_state)(len + 1);
 			/* Remaining number of bytes. */
@@ -135,7 +135,7 @@ qcafrm_fsm_decode(struct qcafrm_handle *handle, u8 *buf, u16 buf_len, u8 recv_by
 	case QCAFRM_WAIT_551:
 		if (recv_byte != 0x55) {
 			ret = QCAFRM_NOTAIL;
-			handle->state = QCAFRM_HW_LEN0;
+			handle->state = handle->init;
 		} else {
 			handle->state = QCAFRM_WAIT_552;
 		}
@@ -143,11 +143,11 @@ qcafrm_fsm_decode(struct qcafrm_handle *handle, u8 *buf, u16 buf_len, u8 recv_by
 	case QCAFRM_WAIT_552:
 		if (recv_byte != 0x55) {
 			ret = QCAFRM_NOTAIL;
-			handle->state = QCAFRM_HW_LEN0;
+			handle->state = handle->init;
 		} else {
 			ret = handle->offset;
 			/* Frame is fully received. */
-			handle->state = QCAFRM_HW_LEN0;
+			handle->state = handle->init;
 		}
 		break;
 	}
diff --git a/drivers/net/ethernet/qualcomm/qca_7k_common.h b/drivers/net/ethernet/qualcomm/qca_7k_common.h
index 5df7c65..07bdd6c 100644
--- a/drivers/net/ethernet/qualcomm/qca_7k_common.h
+++ b/drivers/net/ethernet/qualcomm/qca_7k_common.h
@@ -61,6 +61,7 @@
 #define QCAFRM_ERR_BASE -1000
 
 enum qcafrm_state {
+	/* HW length is only available on SPI */
 	QCAFRM_HW_LEN0 = 0x8000,
 	QCAFRM_HW_LEN1 = QCAFRM_HW_LEN0 - 1,
 	QCAFRM_HW_LEN2 = QCAFRM_HW_LEN1 - 1,
@@ -101,6 +102,8 @@ enum qcafrm_state {
 struct qcafrm_handle {
 	/*  Current decoding state */
 	enum qcafrm_state state;
+	/* Initial state depends on connection type */
+	enum qcafrm_state init;
 
 	/* Offset in buffer (borrowed for length too) */
 	u16 offset;
@@ -113,9 +116,10 @@ u16 qcafrm_create_header(u8 *buf, u16 len);
 
 u16 qcafrm_create_footer(u8 *buf);
 
-static inline void qcafrm_fsm_init(struct qcafrm_handle *handle)
+static inline void qcafrm_fsm_init_spi(struct qcafrm_handle *handle)
 {
-	handle->state = QCAFRM_HW_LEN0;
+	handle->init = QCAFRM_HW_LEN0;
+	handle->state = handle->init;
 }
 
 /*   Gather received bytes and try to extract a full Ethernet frame
diff --git a/drivers/net/ethernet/qualcomm/qca_spi.c b/drivers/net/ethernet/qualcomm/qca_spi.c
index 2bc3ba4..b7073a9 100644
--- a/drivers/net/ethernet/qualcomm/qca_spi.c
+++ b/drivers/net/ethernet/qualcomm/qca_spi.c
@@ -637,7 +637,7 @@ qcaspi_netdev_open(struct net_device *dev)
 	qca->intr_req = 1;
 	qca->intr_svc = 0;
 	qca->sync = QCASPI_SYNC_UNKNOWN;
-	qcafrm_fsm_init(&qca->frm_handle);
+	qcafrm_fsm_init_spi(&qca->frm_handle);
 
 	qca->spi_thread = kthread_run((void *)qcaspi_spi_thread,
 				      qca, "%s", dev->name);
-- 
2.1.4

^ permalink raw reply related

* Re: DQL and TCQ_F_CAN_BYPASS destroy performance under virtualizaiton (Was: "Re: net_sched strange in 4.11")
From: Jason Wang @ 2017-05-10  8:56 UTC (permalink / raw)
  To: Anton Ivanov, Stefan Hajnoczi; +Cc: David S. Miller, netdev, Michael S. Tsirkin
In-Reply-To: <16de49b9-dec1-d1df-aa59-dedd90fffb92@cambridgegreys.com>



On 2017年05月10日 13:28, Anton Ivanov wrote:
> On 10/05/17 03:18, Jason Wang wrote:
>>
>> On 2017年05月09日 23:11, Stefan Hajnoczi wrote:
>>> On Tue, May 09, 2017 at 08:46:46AM +0100, Anton Ivanov wrote:
>>>> I have figured it out. Two issues.
>>>>
>>>> 1) skb->xmit_more is hardly ever set under virtualization because
>>>> the qdisc
>>>> is usually bypassed because of TCQ_F_CAN_BYPASS. Once
>>>> TCQ_F_CAN_BYPASS is
>>>> set a virtual NIC driver is not likely see skb->xmit_more (this
>>>> answers my
>>>> "how does this work at all" question).
>>>>
>>>> 2) If that flag is turned off (I patched sched_generic to turn it
>>>> off in
>>>> pfifo_fast while testing), DQL keeps xmit_more from being set. If
>>>> the driver
>>>> is not DQL enabled xmit_more is never ever set. If the driver is DQL
>>>> enabled
>>>> the queue is adjusted to ensure xmit_more stops happening within
>>>> 10-15 xmit
>>>> cycles.
>>>>
>>>> That is plain *wrong* for virtual NICs - virtio, emulated NICs, etc.
>>>> There,
>>>> the BIG cost is telling the hypervisor that it needs to "kick" the
>>>> packets.
>>>> The cost of putting them into the vNIC buffers is negligible. You want
>>>> xmit_more to happen - it makes between 50% and 300% (depending on vNIC
>>>> design) difference. If there is no xmit_more the vNIC will immediately
>>>> "kick" the hypervisor and try to signal that  the packet needs to move
>>>> straight away (as for example in virtio_net).
>> How do you measure the performance? TCP or just measure pps?
> In this particular case - tcp from guest. I have a couple of other
> benchmarks (forwarding, etc).

One more question, is the number for virtio-net or other emulated vNIC?

>
>>>> In addition to that, the perceived line rate is proportional to this
>>>> cost,
>>>> so I am not sure that the current dql math holds. In fact, I think
>>>> it does
>>>> not - it is trying to adjust something which influences the
>>>> perceived line
>>>> rate.
>>>>
>>>> So - how do we turn BOTH bypass and DQL adjustment while under
>>>> virtualization and set them to be "always qdisc" + "always xmit_more
>>>> allowed"
>> Virtio-net net does not support BQL. Before commit ea7735d97ba9
>> ("virtio-net: move free_old_xmit_skbs"), it's even impossible to
>> support that since we don't have tx interrupt for each packet.  I
>> haven't measured the impact of xmit_more, maybe I was wrong but I
>> think it may help in some cases since it may improve the batching on
>> host more or less.
> If you do not support BQL, you might as well look the xmit_more part
> kick code path. Line 1127.
>
> bool kick = !skb->xmit_more; effectively means kick = true;
>
> It will never be triggered. You will be kicking each packet and per
> packet.

Probably not, we have several ways to try to suppress this on the virtio 
layer, host can give hints to disable the kicks through:

- explicitly set a flag
- implicitly by not publishing a new event idx

FYI, I can get 100-200 packets per vm exit when testing 64 byte 
TCP_STREAM using netperf.

> xmit_more is now set only out of BQL. If BQL is not enabled you
> never get it. Now, will the current dql code work correctly if you do
> not have a defined line rate and completion interrupts - no idea.
> Probably not. IMHO instead of trying to fix it there should be a way for
> a device or architecture to turn it off.

In fact BQL is not the only user for xmit_more. Pktgen with burst is 
another. Test does not show obvious difference if I set burst from 0 to 
64 since we already had other ways to avoid kicking host.

>
> To be clear - I ran into this working on my own drivers for UML, you are
> cc-ed because you are likely to be one of the most affected.

I'm still not quite sure the issue. Looks like virtio-net is ok since 
BQL is not supported and the impact of xmit_more could be ignored.

Thanks

>
> A.
>
>> Thanks
>>
>>>> A.
>>>>
>>>> P.S. Cc-ing virtio maintainer
>>> CCing Michael Tsirkin and Jason Wang, who are the core virtio and
>>> virtio-net maintainers.  (I maintain the vsock driver - it's unrelated
>>> to this discussion.)
>>>
>>>> A.
>>>>
>>>>
>>>> On 08/05/17 08:15, Anton Ivanov wrote:
>>>>> Hi all,
>>>>>
>>>>> I was revising some of my old work for UML to prepare it for
>>>>> submission
>>>>> and I noticed that skb->xmit_more does not seem to be set any more.
>>>>>
>>>>> I traced the issue as far as net/sched/sched_generic.c
>>>>>
>>>>> try_bulk_dequeue_skb() is never invoked (the drivers I am working
>>>>> on are
>>>>> dql enabled so that is not the problem).
>>>>>
>>>>> More interestingly, if I put a breakpoint and debug output into
>>>>> dequeue_skb() around line 147 - right before the bulk: tag that skb
>>>>> there is always NULL. ???
>>>>>
>>>>> Similarly, debug in pfifo_fast_dequeue shows only NULLs being
>>>>> dequeued.
>>>>> Again - ???
>>>>>
>>>>> First and foremost, I apologize for the silly question, but how can
>>>>> this
>>>>> work at all? I see the skbs showing up at the driver level, why are
>>>>> NULLs being returned at qdisc dequeue and where do the skbs at the
>>>>> driver level come from?
>>>>>
>>>>> Second, where should I look to fix it?
>>>>>
>>>>> A.
>>>>>
>>>> -- 
>>>> Anton R. Ivanov
>>>>
>>>> Cambridge Greys Limited, England company No 10273661
>>>> http://www.cambridgegreys.com/
>>>>
>>
>

^ permalink raw reply

* Re: [PATCH 1/3] ptr_ring: batch ring zeroing
From: Jesper Dangaard Brouer @ 2017-05-10  9:18 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: linux-kernel, Jason Wang, brouer, netdev@vger.kernel.org,
	John Fastabend
In-Reply-To: <20170509163238-mutt-send-email-mst@kernel.org>

On Tue, 9 May 2017 16:33:14 +0300
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Sat, Apr 08, 2017 at 02:14:08PM +0200, Jesper Dangaard Brouer wrote:
> > On Fri, 7 Apr 2017 08:49:57 +0300
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >   
> > > A known weakness in ptr_ring design is that it does not handle well the
> > > situation when ring is almost full: as entries are consumed they are
> > > immediately used again by the producer, so consumer and producer are
> > > writing to a shared cache line.
> > > 
> > > To fix this, add batching to consume calls: as entries are
> > > consumed do not write NULL into the ring until we get
> > > a multiple (in current implementation 2x) of cache lines
> > > away from the producer. At that point, write them all out.
> > > 
> > > We do the write out in the reverse order to keep
> > > producer from sharing cache with consumer for as long
> > > as possible.
> > > 
> > > Writeout also triggers when ring wraps around - there's
> > > no special reason to do this but it helps keep the code
> > > a bit simpler.
> > > 
> > > What should we do if getting away from producer by 2 cache lines
> > > would mean we are keeping the ring moe than half empty?
> > > Maybe we should reduce the batching in this case,
> > > current patch simply reduces the batching.
> > > 
> > > Notes:
> > > - it is no longer true that a call to consume guarantees
> > >   that the following call to produce will succeed.
> > >   No users seem to assume that.
> > > - batching can also in theory reduce the signalling rate:
> > >   users that would previously send interrups to the producer
> > >   to wake it up after consuming each entry would now only
> > >   need to do this once in a batch.
> > >   Doing this would be easy by returning a flag to the caller.
> > >   No users seem to do signalling on consume yet so this was not
> > >   implemented yet.
> > > 
> > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > ---
> > > 
> > > Jason, I am curious whether the following gives you some of
> > > the performance boost that you see with vhost batching
> > > patches. Is vhost batching on top still helpful?
> > > 
> > >  include/linux/ptr_ring.h | 63 +++++++++++++++++++++++++++++++++++++++++-------
> > >  1 file changed, 54 insertions(+), 9 deletions(-)
> > > 
> > > diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
> > > index 6c70444..6b2e0dd 100644
> > > --- a/include/linux/ptr_ring.h
> > > +++ b/include/linux/ptr_ring.h
> > > @@ -34,11 +34,13 @@
> > >  struct ptr_ring {
> > >  	int producer ____cacheline_aligned_in_smp;
> > >  	spinlock_t producer_lock;
> > > -	int consumer ____cacheline_aligned_in_smp;
> > > +	int consumer_head ____cacheline_aligned_in_smp; /* next valid entry */
> > > +	int consumer_tail; /* next entry to invalidate */
> > >  	spinlock_t consumer_lock;
> > >  	/* Shared consumer/producer data */
> > >  	/* Read-only by both the producer and the consumer */
> > >  	int size ____cacheline_aligned_in_smp; /* max entries in queue */
> > > +	int batch; /* number of entries to consume in a batch */
> > >  	void **queue;
> > >  };
> > >  
> > > @@ -170,7 +172,7 @@ static inline int ptr_ring_produce_bh(struct ptr_ring *r, void *ptr)
> > >  static inline void *__ptr_ring_peek(struct ptr_ring *r)
> > >  {
> > >  	if (likely(r->size))
> > > -		return r->queue[r->consumer];
> > > +		return r->queue[r->consumer_head];
> > >  	return NULL;
> > >  }
> > >  
> > > @@ -231,9 +233,38 @@ static inline bool ptr_ring_empty_bh(struct ptr_ring *r)
> > >  /* Must only be called after __ptr_ring_peek returned !NULL */
> > >  static inline void __ptr_ring_discard_one(struct ptr_ring *r)
> > >  {
> > > -	r->queue[r->consumer++] = NULL;
> > > -	if (unlikely(r->consumer >= r->size))
> > > -		r->consumer = 0;
> > > +	/* Fundamentally, what we want to do is update consumer
> > > +	 * index and zero out the entry so producer can reuse it.
> > > +	 * Doing it naively at each consume would be as simple as:
> > > +	 *       r->queue[r->consumer++] = NULL;
> > > +	 *       if (unlikely(r->consumer >= r->size))
> > > +	 *               r->consumer = 0;
> > > +	 * but that is suboptimal when the ring is full as producer is writing
> > > +	 * out new entries in the same cache line.  Defer these updates until a
> > > +	 * batch of entries has been consumed.
> > > +	 */
> > > +	int head = r->consumer_head++;
> > > +
> > > +	/* Once we have processed enough entries invalidate them in
> > > +	 * the ring all at once so producer can reuse their space in the ring.
> > > +	 * We also do this when we reach end of the ring - not mandatory
> > > +	 * but helps keep the implementation simple.
> > > +	 */
> > > +	if (unlikely(r->consumer_head - r->consumer_tail >= r->batch ||
> > > +		     r->consumer_head >= r->size)) {
> > > +		/* Zero out entries in the reverse order: this way we touch the
> > > +		 * cache line that producer might currently be reading the last;
> > > +		 * producer won't make progress and touch other cache lines
> > > +		 * besides the first one until we write out all entries.
> > > +		 */
> > > +		while (likely(head >= r->consumer_tail))
> > > +			r->queue[head--] = NULL;
> > > +		r->consumer_tail = r->consumer_head;
> > > +	}
> > > +	if (unlikely(r->consumer_head >= r->size)) {
> > > +		r->consumer_head = 0;
> > > +		r->consumer_tail = 0;
> > > +	}
> > >  }  
> > 
> > I love this idea.  Reviewed and discussed the idea in-person with MST
> > during netdevconf[1] at this laptop.  I promised I will also run it
> > through my micro-benchmarking[2] once I return home (hint ptr_ring gets
> > used in network stack as skb_array).  
> 
> I'm merging this through my tree. Any objections?

I just did the micro-benchmark evaluation I promised and everything
looks good, so no objections from me.

John Fastabend recently posted a RFC patchset for removing the qdisc
lock.  The main purpose is to separate enqueue'ers and dequeue'ers from
serializing on the same lock.  But we need a new queue implementation
that avoids the false-sharing between enq+deq.

This is why John choose to use ptr_ring, changed (pfifo_fast) qdisc to
use this ptr_ring.  I think this patch might help his overload testing,
as my theory is that he is hitting false-sharing on the queue, due to
the queue always being full.


> > Reviewed-by: Jesper Dangaard Brouer <brouer@redhat.com>
> > 
> > [1] http://netdevconf.org/2.1/
> > [2] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/skb_array_bench01.c

If you like can also add my:

Tested-by: Jesper Dangaard Brouer <brouer@redhat.com>

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

$ modprobe skb_array_test01 
$ dmesg 
[73228.381497] skb_array_test01: Loaded
[73228.381498] skb_array_test01: PASSED - basic_init_and_cleanup()
[73228.381498] skb_array_test01: PASSED - basic_add_and_remove_object()
[73228.381505] skb_array_test01: PASSED - test_queue_full_condition()
[73228.381505] skb_array_test01: PASSED - test_queue_empty_condition()
[73228.381510] skb_array_test01: PASSED - test_queue_resize()

^ permalink raw reply

* Re: [PATCH net 2/2] xdp: disallow use of native and generic hook at once
From: Daniel Borkmann @ 2017-05-10  9:36 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: davem, alexei.starovoitov, john.fastabend, netdev
In-Reply-To: <20170509201842.2ed5e330@cakuba.netronome.com>

On 05/10/2017 05:18 AM, Jakub Kicinski wrote:
> On Wed, 10 May 2017 03:31:31 +0200, Daniel Borkmann wrote:
>> While working on the iproute2 generic XDP frontend, I noticed that
>> as of right now it's possible to have native *and* generic XDP
>> programs loaded both at the same time for the case when a driver
>> supports native XDP.
>
> Nice improvement!  A couple of absolute nitpicks below..
>
>> The intended model for generic XDP from b5cdae3291f7 ("net: Generic
>> XDP") is, however, that only one out of the two can be present at
>> once which is also indicated as such in the XPD netlink dump part.
>                                                ^^^
>                                                   XDP

Good point.

>> @@ -6851,6 +6851,32 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
>>   }
>>   EXPORT_SYMBOL(dev_change_proto_down);
>>
>> +bool __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op)
>
> Out of curiosity - the leading underscores refer to caller having to
> hold rtnl?  I assume they are not needed in the function below because
> it's static?

I think I don't quite follow the last question, but it probably makes
sense to add an ASSERT_RTNL() into dev_xdp_attached() inline helper to
make it clearly visible to callers of this api.

>> +{
>> +	struct netdev_xdp xdp;
>> +
>> +	memset(&xdp, 0, sizeof(xdp));
>> +	xdp.command = XDP_QUERY_PROG;
>
> Probably personal preference, but seems like designated struct
> initializer would do quite nicely here and save the memset :)

I had that initially, but I recalled that gcc < 4.6 does not handle this
style for the initialization of anonymous struct/union properly (e.g.,
we fixed that in iproute2 as well). Andrew Morton still uses gcc 4.4.4
and occasionally sends kernel fixes, so we might end up like this anyway.

>> diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
>> index dda9f16..99320f0 100644
>> --- a/net/core/rtnetlink.c
>> +++ b/net/core/rtnetlink.c
>> @@ -1251,24 +1251,20 @@ static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
>>   {
>>   	struct nlattr *xdp;
>>   	u32 xdp_flags = 0;
>> -	u8 val = 0;
>>   	int err;
>> +	u8 val;
>>
>>   	xdp = nla_nest_start(skb, IFLA_XDP);
>>   	if (!xdp)
>>   		return -EMSGSIZE;
>> +
>>   	if (rcu_access_pointer(dev->xdp_prog)) {
>>   		xdp_flags = XDP_FLAGS_SKB_MODE;
>>   		val = 1;
>> -	} else if (dev->netdev_ops->ndo_xdp) {
>> -		struct netdev_xdp xdp_op = {};
>> -
>> -		xdp_op.command = XDP_QUERY_PROG;
>> -		err = dev->netdev_ops->ndo_xdp(dev, &xdp_op);
>> -		if (err)
>> -			goto err_cancel;
>> -		val = xdp_op.prog_attached;
>> +	} else {
>> +		val = dev_xdp_attached(dev);
>> 	}
>
> Would it make sense to set xdp_flags to XDP_FLAGS_DRV_MODE here to keep
> things symmetrical?  I know you are just preserving existing behaviour
> but it may seem slightly arbitrary to a new comer to report one of the
> very similarly named flags in the dump but not the other.

I thought about it, it's kind of redundant information since
IFLA_XDP_ATTACHED attribute w/o IFLA_XDP_FLAGS attribute today
says that it's native already. It might look strange if we add
also XDP_FLAGS_DRV_MODE there, since it doesn't give anything
new. I rather see it similar to XDP_FLAGS_UPDATE_IF_NOEXIST flag
that is for updating fd only, but I don't really have a strong
opinion on this though. I could add it to the respin if preferred.

^ permalink raw reply

* Re: DQL and TCQ_F_CAN_BYPASS destroy performance under virtualizaiton (Was: "Re: net_sched strange in 4.11")
From: Anton Ivanov @ 2017-05-10  9:42 UTC (permalink / raw)
  To: Jason Wang, Stefan Hajnoczi; +Cc: David S. Miller, netdev, Michael S. Tsirkin
In-Reply-To: <6accf7bd-fb21-604b-9586-ecd0f6830a65@redhat.com>

On 10/05/17 09:56, Jason Wang wrote:
>
>
> On 2017年05月10日 13:28, Anton Ivanov wrote:
>> On 10/05/17 03:18, Jason Wang wrote:
>>>
>>> On 2017年05月09日 23:11, Stefan Hajnoczi wrote:
>>>> On Tue, May 09, 2017 at 08:46:46AM +0100, Anton Ivanov wrote:
>>>>> I have figured it out. Two issues.
>>>>>
>>>>> 1) skb->xmit_more is hardly ever set under virtualization because
>>>>> the qdisc
>>>>> is usually bypassed because of TCQ_F_CAN_BYPASS. Once
>>>>> TCQ_F_CAN_BYPASS is
>>>>> set a virtual NIC driver is not likely see skb->xmit_more (this
>>>>> answers my
>>>>> "how does this work at all" question).
>>>>>
>>>>> 2) If that flag is turned off (I patched sched_generic to turn it
>>>>> off in
>>>>> pfifo_fast while testing), DQL keeps xmit_more from being set. If
>>>>> the driver
>>>>> is not DQL enabled xmit_more is never ever set. If the driver is DQL
>>>>> enabled
>>>>> the queue is adjusted to ensure xmit_more stops happening within
>>>>> 10-15 xmit
>>>>> cycles.
>>>>>
>>>>> That is plain *wrong* for virtual NICs - virtio, emulated NICs, etc.
>>>>> There,
>>>>> the BIG cost is telling the hypervisor that it needs to "kick" the
>>>>> packets.
>>>>> The cost of putting them into the vNIC buffers is negligible. You 
>>>>> want
>>>>> xmit_more to happen - it makes between 50% and 300% (depending on 
>>>>> vNIC
>>>>> design) difference. If there is no xmit_more the vNIC will 
>>>>> immediately
>>>>> "kick" the hypervisor and try to signal that  the packet needs to 
>>>>> move
>>>>> straight away (as for example in virtio_net).
>>> How do you measure the performance? TCP or just measure pps?
>> In this particular case - tcp from guest. I have a couple of other
>> benchmarks (forwarding, etc).
>
> One more question, is the number for virtio-net or other emulated vNIC?

Other for now - you are cc-ed to keep you in the loop.

Virtio is next on my list - I am revisiting the l2tpv3.c driver in QEMU 
and looking at how to preserve bulking by adding back sendmmsg (as well 
as a list of other features/transports).

We had sendmmsg removed for the final inclusion in QEMU 2.1, it 
presently uses only recvmmsg so for the time being it does not care. 
That will most likely change once it starts using sendmmsg as well.

>
>>
>>>>> In addition to that, the perceived line rate is proportional to this
>>>>> cost,
>>>>> so I am not sure that the current dql math holds. In fact, I think
>>>>> it does
>>>>> not - it is trying to adjust something which influences the
>>>>> perceived line
>>>>> rate.
>>>>>
>>>>> So - how do we turn BOTH bypass and DQL adjustment while under
>>>>> virtualization and set them to be "always qdisc" + "always xmit_more
>>>>> allowed"
>>> Virtio-net net does not support BQL. Before commit ea7735d97ba9
>>> ("virtio-net: move free_old_xmit_skbs"), it's even impossible to
>>> support that since we don't have tx interrupt for each packet.  I
>>> haven't measured the impact of xmit_more, maybe I was wrong but I
>>> think it may help in some cases since it may improve the batching on
>>> host more or less.
>> If you do not support BQL, you might as well look the xmit_more part
>> kick code path. Line 1127.
>>
>> bool kick = !skb->xmit_more; effectively means kick = true;
>>
>> It will never be triggered. You will be kicking each packet and per
>> packet.
>
> Probably not, we have several ways to try to suppress this on the 
> virtio layer, host can give hints to disable the kicks through:
>
> - explicitly set a flag
> - implicitly by not publishing a new event idx
>
> FYI, I can get 100-200 packets per vm exit when testing 64 byte 
> TCP_STREAM using netperf.

I am aware of that. If, however, the host is providing a hint we might 
as well use it.

>
>> xmit_more is now set only out of BQL. If BQL is not enabled you
>> never get it. Now, will the current dql code work correctly if you do
>> not have a defined line rate and completion interrupts - no idea.
>> Probably not. IMHO instead of trying to fix it there should be a way for
>> a device or architecture to turn it off.
>
> In fact BQL is not the only user for xmit_more. Pktgen with burst is 
> another. Test does not show obvious difference if I set burst from 0 
> to 64 since we already had other ways to avoid kicking host.

That, as well as this not being wired to bulk transport.

>
>>
>> To be clear - I ran into this working on my own drivers for UML, you are
>> cc-ed because you are likely to be one of the most affected.
>
> I'm still not quite sure the issue. Looks like virtio-net is ok since 
> BQL is not supported and the impact of xmit_more could be ignored.

Presently - yes. If you have bulk aware transports to wire into that is 
likely to make a difference.

>
> Thanks
>
>>
>> A.
>>
>>> Thanks
>>>
>>>>> A.
>>>>>
>>>>> P.S. Cc-ing virtio maintainer
>>>> CCing Michael Tsirkin and Jason Wang, who are the core virtio and
>>>> virtio-net maintainers.  (I maintain the vsock driver - it's unrelated
>>>> to this discussion.)
>>>>
>>>>> A.
>>>>>
>>>>>
>>>>> On 08/05/17 08:15, Anton Ivanov wrote:
>>>>>> Hi all,
>>>>>>
>>>>>> I was revising some of my old work for UML to prepare it for
>>>>>> submission
>>>>>> and I noticed that skb->xmit_more does not seem to be set any more.
>>>>>>
>>>>>> I traced the issue as far as net/sched/sched_generic.c
>>>>>>
>>>>>> try_bulk_dequeue_skb() is never invoked (the drivers I am working
>>>>>> on are
>>>>>> dql enabled so that is not the problem).
>>>>>>
>>>>>> More interestingly, if I put a breakpoint and debug output into
>>>>>> dequeue_skb() around line 147 - right before the bulk: tag that skb
>>>>>> there is always NULL. ???
>>>>>>
>>>>>> Similarly, debug in pfifo_fast_dequeue shows only NULLs being
>>>>>> dequeued.
>>>>>> Again - ???
>>>>>>
>>>>>> First and foremost, I apologize for the silly question, but how can
>>>>>> this
>>>>>> work at all? I see the skbs showing up at the driver level, why are
>>>>>> NULLs being returned at qdisc dequeue and where do the skbs at the
>>>>>> driver level come from?
>>>>>>
>>>>>> Second, where should I look to fix it?
>>>>>>
>>>>>> A.
>>>>>>
>>>>> -- 
>>>>> Anton R. Ivanov
>>>>>
>>>>> Cambridge Greys Limited, England company No 10273661
>>>>> http://www.cambridgegreys.com/
>>>>>
>>>
>>
>
>


-- 
Anton R. Ivanov

Cambridge Greys Limited, England company No 10273661
http://www.cambridgegreys.com/

^ permalink raw reply

* Re: [PATCH] ip: mpls: fix printing of mpls labels
From: Simon Horman @ 2017-05-10 10:02 UTC (permalink / raw)
  To: David Ahern; +Cc: stephen, netdev, roopa
In-Reply-To: <20170509060413.11596-1-dsahern@gmail.com>

On Mon, May 08, 2017 at 11:04:13PM -0700, David Ahern wrote:
> If the kernel returns more labels than iproute2 expects, none of
> the labels are printed and (null) is shown instead:
>     $ ip -f mpls ro ls
>     101 as to (null) via inet 172.16.2.2 dev virt12
>     201 as to 202/203 via inet6 2001:db8:2::2 dev virt12
> 
> Remove the use of MPLS_MAX_LABELS and rely on buffer length that is
> passed to mpls_ntop. With this change ip can print the label stack
> returned by the kernel up to 255 characters (limit is due to size of
> buf passed in) which amounts to 31 labels with a separator.
> 
> With this change the above is:
>     $ ip/ip -f mpls ro ls
>     101 as to 102/103/104/105/106/107/108/109/110 via inet 172.16.2.2 dev virt12
> 
> Signed-off-by: David Ahern <dsahern@gmail.com>

Reviewed-by: Simon Horman <simon.horman@netronome.com>

^ permalink raw reply

* Re: iproute2 ss outputs duplicate tcp sockets info on kernel 3.10.105
From: Phil Sutter @ 2017-05-10 10:37 UTC (permalink / raw)
  To: Li Er; +Cc: netdev, Cyrill Gorcunov
In-Reply-To: <591121b4.213b9f0a.5aed.b3a8@mx.google.com>

Hi,

Cc'ing Cyrill who wrote the code in question. Maybe he has an idea
what's going wrong here.

Cheers, Phil

On Mon, May 08, 2017 at 06:56:04PM -0700, Li Er wrote:
> i'm using v4.11.0 release of iproute2 and kernel 3.10.105, simply
> running
> 
>         $ ss
>         Netid  State      Recv-Q Send-Q Local Address:Port                 Peer Address:Port
>         tcp    CLOSE-WAIT 434    0      10.0.0.1:47931                65.49.18.136:https
>         tcp    CLOSE-WAIT 432    0      10.0.0.1:47932                65.49.18.136:https
>         tcp    CLOSE-WAIT 434    0      10.0.0.1:47931                65.49.18.136:https
>         tcp    CLOSE-WAIT 432    0      10.0.0.1:47932                65.49.18.136:https
> 
> as you can see, there's one duplicate entry of each  tcp  socket,
> however,  if  i  explicitly  specify  tcp socket by adding the -t
> switch,
> 
>         $ ss -t
>         State      Recv-Q Send-Q Local Address:Port                 Peer Address:Port
>         CLOSE-WAIT 434    0      10.0.0.1:47931                65.49.18.136:https
>         CLOSE-WAIT 432    0      10.0.0.1:47932                65.49.18.136:https
> 
> the duplication is gone.
> 
> this problem also occurs on  git  master,  but  not  on  iproute2
> v4.3.0,  so  i  did  a  git bisect and found out the commit which
> caused this is 9f66764e308e9c645b3fb2d1cfbb7fb207eb5de1,  and  by
> revert this commit on git master, i.e. removing
> 
>                                         err = rtnl_dump_done(rth, h);
>                                         if (err < 0)
>                                                 return -1;
> 
> these 3 lines of code of lib/libnetlink.c, the problem is gone.
> 
> since  i'm not familiar with the source code, i doubt this is the
> right way to solve the problem, what's your suggestions? thanks.

^ permalink raw reply

* RE: bpf pointer alignment validation
From: David Laight @ 2017-05-10 11:12 UTC (permalink / raw)
  To: 'Alexei Starovoitov', David Miller
  Cc: daniel@iogearbox.net, ast@fb.com, netdev@vger.kernel.org
In-Reply-To: <20170510055735.hfkoh4w3xaka5yl5@ast-mbp>

From: Alexei Starovoitov
> Sent: 10 May 2017 06:58
> > +static u32 calc_align(u32 imm)
> > +{
> > +	u32 align = 1;
> > +
> > +	if (!imm)
> > +		return 1U << 31;
> > +
> > +	while (!(imm & 1)) {
> > +		imm >>= 1;
> > +		align <<= 1;
> > +	}
> > +	return align;
> > +}
> 
> same question as in previous reply.
> Why not to use something like:
> static u32 calc_align(u32 n)
> {
>         if (!n)
>                 return 1U << 31;
>         return n - ((n - 1) & n);
> }

That function needs a comment saying what it returns.
Think I'd write it as:
		return n & ~(n & (n - 1));
(even though that might be one more instruction)

	David

^ permalink raw reply

* Re: [PATCH 1/2] net: Added mtu parameter to dev_forward_skb calls
From: kbuild test robot @ 2017-05-10 11:47 UTC (permalink / raw)
  To: Fredrik Markstrom
  Cc: kbuild-all, Eric Dumazet, David S. Miller, Stephen Hemminger,
	Alexei Starovoitov, Daniel Borkmann, netdev, linux-kernel, bridge,
	Fredrik Markström
In-Reply-To: <20170509124439.45674-2-fredrik.markstrom@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 1341 bytes --]

Hi Fredrik,

[auto build test ERROR on net/master]
[also build test ERROR on v4.11 next-20170510]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Fredrik-Markstrom/net-Added-mtu-parameter-to-dev_forward_skb-calls/20170509-231142
config: x86_64-rhel (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All errors (new ones prefixed by >>):

   net//bridge/br_forward.c: In function '__br_forward':
>> net//bridge/br_forward.c:99:46: error: 'struct sk_buff' has no member named 'dev_mtu'
       if (!is_skb_forwardable(skb->dev, skb, skb->dev_mtu)) {
                                                 ^~

vim +99 net//bridge/br_forward.c

    93			}
    94			br_hook = NF_BR_FORWARD;
    95			skb_forward_csum(skb);
    96			net = dev_net(indev);
    97		} else {
    98			if (unlikely(netpoll_tx_running(to->br->dev))) {
  > 99				if (!is_skb_forwardable(skb->dev, skb, skb->dev_mtu)) {
   100					kfree_skb(skb);
   101				} else {
   102					skb_push(skb, ETH_HLEN);

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 39200 bytes --]

^ permalink raw reply

* Your response Is highly appreciated!
From: Mr.Adams Salem @ 2017-05-10 12:08 UTC (permalink / raw)




Hello , 

I am specifically contacting you in respect of a business proposal that I have for you as you appear very relevant in the proposal. 

Please kindly reply back to me for further details.

Waiting to hear from you. 

Regards,

Mr.Adams Salem

Email: mradams@salem-my.com

^ permalink raw reply

* Re: [PATCH 1/3] ptr_ring: batch ring zeroing
From: Michael S. Tsirkin @ 2017-05-10 12:20 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: linux-kernel, Jason Wang, netdev@vger.kernel.org, John Fastabend
In-Reply-To: <20170510111813.35f21ab0@redhat.com>

On Wed, May 10, 2017 at 11:18:13AM +0200, Jesper Dangaard Brouer wrote:
> On Tue, 9 May 2017 16:33:14 +0300
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Sat, Apr 08, 2017 at 02:14:08PM +0200, Jesper Dangaard Brouer wrote:
> > > On Fri, 7 Apr 2017 08:49:57 +0300
> > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > >   
> > > > A known weakness in ptr_ring design is that it does not handle well the
> > > > situation when ring is almost full: as entries are consumed they are
> > > > immediately used again by the producer, so consumer and producer are
> > > > writing to a shared cache line.
> > > > 
> > > > To fix this, add batching to consume calls: as entries are
> > > > consumed do not write NULL into the ring until we get
> > > > a multiple (in current implementation 2x) of cache lines
> > > > away from the producer. At that point, write them all out.
> > > > 
> > > > We do the write out in the reverse order to keep
> > > > producer from sharing cache with consumer for as long
> > > > as possible.
> > > > 
> > > > Writeout also triggers when ring wraps around - there's
> > > > no special reason to do this but it helps keep the code
> > > > a bit simpler.
> > > > 
> > > > What should we do if getting away from producer by 2 cache lines
> > > > would mean we are keeping the ring moe than half empty?
> > > > Maybe we should reduce the batching in this case,
> > > > current patch simply reduces the batching.
> > > > 
> > > > Notes:
> > > > - it is no longer true that a call to consume guarantees
> > > >   that the following call to produce will succeed.
> > > >   No users seem to assume that.
> > > > - batching can also in theory reduce the signalling rate:
> > > >   users that would previously send interrups to the producer
> > > >   to wake it up after consuming each entry would now only
> > > >   need to do this once in a batch.
> > > >   Doing this would be easy by returning a flag to the caller.
> > > >   No users seem to do signalling on consume yet so this was not
> > > >   implemented yet.
> > > > 
> > > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > > ---
> > > > 
> > > > Jason, I am curious whether the following gives you some of
> > > > the performance boost that you see with vhost batching
> > > > patches. Is vhost batching on top still helpful?
> > > > 
> > > >  include/linux/ptr_ring.h | 63 +++++++++++++++++++++++++++++++++++++++++-------
> > > >  1 file changed, 54 insertions(+), 9 deletions(-)
> > > > 
> > > > diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
> > > > index 6c70444..6b2e0dd 100644
> > > > --- a/include/linux/ptr_ring.h
> > > > +++ b/include/linux/ptr_ring.h
> > > > @@ -34,11 +34,13 @@
> > > >  struct ptr_ring {
> > > >  	int producer ____cacheline_aligned_in_smp;
> > > >  	spinlock_t producer_lock;
> > > > -	int consumer ____cacheline_aligned_in_smp;
> > > > +	int consumer_head ____cacheline_aligned_in_smp; /* next valid entry */
> > > > +	int consumer_tail; /* next entry to invalidate */
> > > >  	spinlock_t consumer_lock;
> > > >  	/* Shared consumer/producer data */
> > > >  	/* Read-only by both the producer and the consumer */
> > > >  	int size ____cacheline_aligned_in_smp; /* max entries in queue */
> > > > +	int batch; /* number of entries to consume in a batch */
> > > >  	void **queue;
> > > >  };
> > > >  
> > > > @@ -170,7 +172,7 @@ static inline int ptr_ring_produce_bh(struct ptr_ring *r, void *ptr)
> > > >  static inline void *__ptr_ring_peek(struct ptr_ring *r)
> > > >  {
> > > >  	if (likely(r->size))
> > > > -		return r->queue[r->consumer];
> > > > +		return r->queue[r->consumer_head];
> > > >  	return NULL;
> > > >  }
> > > >  
> > > > @@ -231,9 +233,38 @@ static inline bool ptr_ring_empty_bh(struct ptr_ring *r)
> > > >  /* Must only be called after __ptr_ring_peek returned !NULL */
> > > >  static inline void __ptr_ring_discard_one(struct ptr_ring *r)
> > > >  {
> > > > -	r->queue[r->consumer++] = NULL;
> > > > -	if (unlikely(r->consumer >= r->size))
> > > > -		r->consumer = 0;
> > > > +	/* Fundamentally, what we want to do is update consumer
> > > > +	 * index and zero out the entry so producer can reuse it.
> > > > +	 * Doing it naively at each consume would be as simple as:
> > > > +	 *       r->queue[r->consumer++] = NULL;
> > > > +	 *       if (unlikely(r->consumer >= r->size))
> > > > +	 *               r->consumer = 0;
> > > > +	 * but that is suboptimal when the ring is full as producer is writing
> > > > +	 * out new entries in the same cache line.  Defer these updates until a
> > > > +	 * batch of entries has been consumed.
> > > > +	 */
> > > > +	int head = r->consumer_head++;
> > > > +
> > > > +	/* Once we have processed enough entries invalidate them in
> > > > +	 * the ring all at once so producer can reuse their space in the ring.
> > > > +	 * We also do this when we reach end of the ring - not mandatory
> > > > +	 * but helps keep the implementation simple.
> > > > +	 */
> > > > +	if (unlikely(r->consumer_head - r->consumer_tail >= r->batch ||
> > > > +		     r->consumer_head >= r->size)) {
> > > > +		/* Zero out entries in the reverse order: this way we touch the
> > > > +		 * cache line that producer might currently be reading the last;
> > > > +		 * producer won't make progress and touch other cache lines
> > > > +		 * besides the first one until we write out all entries.
> > > > +		 */
> > > > +		while (likely(head >= r->consumer_tail))
> > > > +			r->queue[head--] = NULL;
> > > > +		r->consumer_tail = r->consumer_head;
> > > > +	}
> > > > +	if (unlikely(r->consumer_head >= r->size)) {
> > > > +		r->consumer_head = 0;
> > > > +		r->consumer_tail = 0;
> > > > +	}
> > > >  }  
> > > 
> > > I love this idea.  Reviewed and discussed the idea in-person with MST
> > > during netdevconf[1] at this laptop.  I promised I will also run it
> > > through my micro-benchmarking[2] once I return home (hint ptr_ring gets
> > > used in network stack as skb_array).  
> > 
> > I'm merging this through my tree. Any objections?
> 
> I just did the micro-benchmark evaluation I promised and everything
> looks good, so no objections from me.
> 
> John Fastabend recently posted a RFC patchset for removing the qdisc
> lock.  The main purpose is to separate enqueue'ers and dequeue'ers from
> serializing on the same lock.  But we need a new queue implementation
> that avoids the false-sharing between enq+deq.
> 
> This is why John choose to use ptr_ring, changed (pfifo_fast) qdisc to
> use this ptr_ring.  I think this patch might help his overload testing,
> as my theory is that he is hitting false-sharing on the queue, due to
> the queue always being full.
> 
> 
> > > Reviewed-by: Jesper Dangaard Brouer <brouer@redhat.com>
> > > 
> > > [1] http://netdevconf.org/2.1/
> > > [2] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/skb_array_bench01.c
> 
> If you like can also add my:
> 
> Tested-by: Jesper Dangaard Brouer <brouer@redhat.com>

I pushed it out already unfortunately so I can't attach that.
Sorry.  Thanks a lot for the testing!

> -- 
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   LinkedIn: http://www.linkedin.com/in/brouer
> 
> $ modprobe skb_array_test01 
> $ dmesg 
> [73228.381497] skb_array_test01: Loaded
> [73228.381498] skb_array_test01: PASSED - basic_init_and_cleanup()
> [73228.381498] skb_array_test01: PASSED - basic_add_and_remove_object()
> [73228.381505] skb_array_test01: PASSED - test_queue_full_condition()
> [73228.381505] skb_array_test01: PASSED - test_queue_empty_condition()
> [73228.381510] skb_array_test01: PASSED - test_queue_resize()

^ permalink raw reply

* [PULL] virtio: fixes, cleanups, performance
From: Michael S. Tsirkin @ 2017-05-10 12:24 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: kvm, virtualization, netdev, linux-kernel, borntraeger,
	colin.king, cornelia.huck, dan.carpenter, jasowang, mst, nsekhar,
	pasic

The following changes since commit a351e9b9fc24e982ec2f0e76379a49826036da12:

  Linux 4.11 (2017-04-30 19:47:48 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git tags/for_linus

for you to fetch changes up to c8b0d7290657996a29f318b948d2397d30e70c36:

  s390/virtio: change maintainership (2017-05-09 16:43:25 +0300)

----------------------------------------------------------------
virtio: fixes, cleanups, performance

A bunch of changes to virtio, most affecting virtio net.
ptr_ring batched zeroing - first of batching enhancements
that seems ready.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

----------------------------------------------------------------
Christian Borntraeger (1):
      s390/virtio: change maintainership

Colin Ian King (1):
      tools/virtio: fix spelling mistake: "wakeus" -> "wakeups"

Cornelia Huck (1):
      virtio: virtio_driver doc

Dan Carpenter (2):
      ringtest: fix an assert statement
      virtio_net: tidy a couple debug statements

Michael S. Tsirkin (11):
      virtio: wrap find_vqs
      virtio: add context flag to find vqs
      virtio: allow extra context per descriptor
      virtio_net: allow specifying context for rx
      virtio_net: rework mergeable buffer handling
      virtio_net: reduce alignment for buffers
      virtio_net: fix support for small rings
      virtio_net: don't reset twice on XDP on/off
      ptr_ring: batch ring zeroing
      ringtest: support test specific parameters
      ptr_ring: support testing different batching sizes

Sekhar Nori (1):
      tools/virtio: fix build breakage

 MAINTAINERS                                |   2 +-
 drivers/block/virtio_blk.c                 |   3 +-
 drivers/char/virtio_console.c              |   6 +-
 drivers/crypto/virtio/virtio_crypto_core.c |   3 +-
 drivers/gpu/drm/virtio/virtgpu_kms.c       |   3 +-
 drivers/misc/mic/vop/vop_main.c            |   9 +-
 drivers/net/caif/caif_virtio.c             |   3 +-
 drivers/net/virtio_net.c                   | 147 ++++++++++++++++-------------
 drivers/remoteproc/remoteproc_virtio.c     |  10 +-
 drivers/rpmsg/virtio_rpmsg_bus.c           |   2 +-
 drivers/s390/virtio/kvm_virtio.c           |   8 +-
 drivers/s390/virtio/virtio_ccw.c           |   7 +-
 drivers/scsi/virtio_scsi.c                 |   3 +-
 drivers/virtio/virtio_balloon.c            |   3 +-
 drivers/virtio/virtio_input.c              |   3 +-
 drivers/virtio/virtio_mmio.c               |   8 +-
 drivers/virtio/virtio_pci_common.c         |  17 ++--
 drivers/virtio/virtio_pci_common.h         |   4 +-
 drivers/virtio/virtio_pci_legacy.c         |   4 +-
 drivers/virtio/virtio_pci_modern.c         |  12 ++-
 drivers/virtio/virtio_ring.c               |  77 ++++++++++++---
 include/linux/ptr_ring.h                   |  63 +++++++++++--
 include/linux/virtio.h                     |  13 +++
 include/linux/virtio_config.h              |  25 ++++-
 include/linux/virtio_ring.h                |   3 +
 net/vmw_vsock/virtio_transport.c           |   6 +-
 tools/virtio/linux/virtio.h                |   1 +
 tools/virtio/ringtest/main.c               |  15 ++-
 tools/virtio/ringtest/main.h               |   2 +
 tools/virtio/ringtest/ptr_ring.c           |   3 +
 tools/virtio/virtio_test.c                 |   4 +-
 tools/virtio/vringh_test.c                 |   7 +-
 32 files changed, 330 insertions(+), 146 deletions(-)

^ permalink raw reply

* Re: [PATCH net-next V4 10/10] vhost_net: try batch dequing from skb array
From: Michael S. Tsirkin @ 2017-05-10 12:34 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel
In-Reply-To: <1494387382-19916-11-git-send-email-jasowang@redhat.com>

On Wed, May 10, 2017 at 11:36:22AM +0800, Jason Wang wrote:
> We used to dequeue one skb during recvmsg() from skb_array, this could
> be inefficient because of the bad cache utilization and spinlock
> touching for each packet. This patch tries to batch them by calling
> batch dequeuing helpers explicitly on the exported skb array and pass
> the skb back through msg_control for underlayer socket to finish the
> userspace copying.
> 
> Batch dequeuing is also the requirement for more batching improvement
> on rx.
> 
> Tests were done by pktgen on tap with XDP1 in guest on top of batch
> zeroing:
> 
> rx batch | pps
> 
> 256        2.41Mpps (+6.16%)
> 128        2.48Mpps (+8.80%)
> 64         2.38Mpps (+3.96%) <- Default
> 16         2.31Mpps (+1.76%)
> 4          2.31Mpps (+1.76%)
> 1          2.30Mpps (+1.32%)
> 0          2.27Mpps (+7.48%)
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/vhost/net.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 111 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 9b51989..fbaecf3 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -28,6 +28,8 @@
>  #include <linux/if_macvlan.h>
>  #include <linux/if_tap.h>
>  #include <linux/if_vlan.h>
> +#include <linux/skb_array.h>
> +#include <linux/skbuff.h>
>  
>  #include <net/sock.h>
>  
> @@ -85,6 +87,13 @@ struct vhost_net_ubuf_ref {
>  	struct vhost_virtqueue *vq;
>  };
>  
> +#define VHOST_RX_BATCH 64
> +struct vhost_net_buf {
> +	struct sk_buff *queue[VHOST_RX_BATCH];
> +	int tail;
> +	int head;
> +};
> +

Do you strictly need to put this inline? This structure is quite big
already. Do you see a measureabe difference if you make it

	struct sk_buff **queue;
	int tail;
	int head;

?

Will also make it easier to play with the size in the future
should someone want to see how does it work e.g. for different
ring sizes.

>  struct vhost_net_virtqueue {
>  	struct vhost_virtqueue vq;
>  	size_t vhost_hlen;
> @@ -99,6 +108,8 @@ struct vhost_net_virtqueue {
>  	/* Reference counting for outstanding ubufs.
>  	 * Protected by vq mutex. Writers must also take device mutex. */
>  	struct vhost_net_ubuf_ref *ubufs;
> +	struct skb_array *rx_array;
> +	struct vhost_net_buf rxq;
>  };
>  
>  struct vhost_net {
> @@ -117,6 +128,71 @@ struct vhost_net {
>  
>  static unsigned vhost_net_zcopy_mask __read_mostly;
>  
> +static void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq)
> +{
> +	if (rxq->tail != rxq->head)
> +		return rxq->queue[rxq->head];
> +	else
> +		return NULL;
> +}
> +
> +static int vhost_net_buf_get_size(struct vhost_net_buf *rxq)
> +{
> +	return rxq->tail - rxq->head;
> +}
> +
> +static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq)
> +{
> +	return rxq->tail == rxq->head;
> +}
> +
> +static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
> +{
> +	void *ret = vhost_net_buf_get_ptr(rxq);
> +	++rxq->head;
> +	return ret;
> +}
> +
> +static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
> +{
> +	struct vhost_net_buf *rxq = &nvq->rxq;
> +
> +	rxq->head = 0;
> +	rxq->tail = skb_array_consume_batched(nvq->rx_array, rxq->queue,
> +					      VHOST_RX_BATCH);
> +	return rxq->tail;
> +}
> +
> +static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
> +{
> +	struct vhost_net_buf *rxq = &nvq->rxq;
> +
> +	if (nvq->rx_array && !vhost_net_buf_is_empty(rxq)) {
> +		skb_array_unconsume(nvq->rx_array, rxq->queue + rxq->head,
> +				    vhost_net_buf_get_size(rxq));
> +		rxq->head = rxq->tail = 0;
> +	}
> +}
> +
> +static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
> +{
> +	struct vhost_net_buf *rxq = &nvq->rxq;
> +
> +	if (!vhost_net_buf_is_empty(rxq))
> +		goto out;
> +
> +	if (!vhost_net_buf_produce(nvq))
> +		return 0;
> +
> +out:
> +	return __skb_array_len_with_tag(vhost_net_buf_get_ptr(rxq));
> +}
> +
> +static void vhost_net_buf_init(struct vhost_net_buf *rxq)
> +{
> +	rxq->head = rxq->tail = 0;
> +}
> +
>  static void vhost_net_enable_zcopy(int vq)
>  {
>  	vhost_net_zcopy_mask |= 0x1 << vq;
> @@ -201,6 +277,7 @@ static void vhost_net_vq_reset(struct vhost_net *n)
>  		n->vqs[i].ubufs = NULL;
>  		n->vqs[i].vhost_hlen = 0;
>  		n->vqs[i].sock_hlen = 0;
> +		vhost_net_buf_init(&n->vqs[i].rxq);
>  	}
>  
>  }
> @@ -503,15 +580,14 @@ static void handle_tx(struct vhost_net *net)
>  	mutex_unlock(&vq->mutex);
>  }
>  
> -static int peek_head_len(struct sock *sk)
> +static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
>  {
> -	struct socket *sock = sk->sk_socket;
>  	struct sk_buff *head;
>  	int len = 0;
>  	unsigned long flags;
>  
> -	if (sock->ops->peek_len)
> -		return sock->ops->peek_len(sock);
> +	if (rvq->rx_array)
> +		return vhost_net_buf_peek(rvq);
>  
>  	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
>  	head = skb_peek(&sk->sk_receive_queue);
> @@ -537,10 +613,11 @@ static int sk_has_rx_data(struct sock *sk)
>  
>  static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
>  {
> +	struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
>  	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
>  	struct vhost_virtqueue *vq = &nvq->vq;
>  	unsigned long uninitialized_var(endtime);
> -	int len = peek_head_len(sk);
> +	int len = peek_head_len(rvq, sk);
>  
>  	if (!len && vq->busyloop_timeout) {
>  		/* Both tx vq and rx socket were polled here */
> @@ -561,7 +638,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
>  			vhost_poll_queue(&vq->poll);
>  		mutex_unlock(&vq->mutex);
>  
> -		len = peek_head_len(sk);
> +		len = peek_head_len(rvq, sk);
>  	}
>  
>  	return len;
> @@ -699,6 +776,8 @@ static void handle_rx(struct vhost_net *net)
>  		/* On error, stop handling until the next kick. */
>  		if (unlikely(headcount < 0))
>  			goto out;
> +		if (nvq->rx_array)
> +			msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
>  		/* On overrun, truncate and discard */
>  		if (unlikely(headcount > UIO_MAXIOV)) {
>  			iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
> @@ -841,6 +920,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
>  		n->vqs[i].done_idx = 0;
>  		n->vqs[i].vhost_hlen = 0;
>  		n->vqs[i].sock_hlen = 0;
> +		vhost_net_buf_init(&n->vqs[i].rxq);
>  	}
>  	vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX);
>  
> @@ -856,11 +936,14 @@ static struct socket *vhost_net_stop_vq(struct vhost_net *n,
>  					struct vhost_virtqueue *vq)
>  {
>  	struct socket *sock;
> +	struct vhost_net_virtqueue *nvq =
> +		container_of(vq, struct vhost_net_virtqueue, vq);
>  
>  	mutex_lock(&vq->mutex);
>  	sock = vq->private_data;
>  	vhost_net_disable_vq(n, vq);
>  	vq->private_data = NULL;
> +	vhost_net_buf_unproduce(nvq);
>  	mutex_unlock(&vq->mutex);
>  	return sock;
>  }
> @@ -953,6 +1036,25 @@ static struct socket *get_raw_socket(int fd)
>  	return ERR_PTR(r);
>  }
>  
> +static struct skb_array *get_tap_skb_array(int fd)
> +{
> +	struct skb_array *array;
> +	struct file *file = fget(fd);
> +
> +	if (!file)
> +		return NULL;
> +	array = tun_get_skb_array(file);
> +	if (!IS_ERR(array))
> +		goto out;
> +	array = tap_get_skb_array(file);
> +	if (!IS_ERR(array))
> +		goto out;
> +	array = NULL;
> +out:
> +	fput(file);
> +	return array;
> +}
> +
>  static struct socket *get_tap_socket(int fd)
>  {
>  	struct file *file = fget(fd);
> @@ -1029,6 +1131,9 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
>  
>  		vhost_net_disable_vq(n, vq);
>  		vq->private_data = sock;
> +		vhost_net_buf_unproduce(nvq);
> +		if (index == VHOST_NET_VQ_RX)
> +			nvq->rx_array = get_tap_skb_array(fd);
>  		r = vhost_vq_init_access(vq);
>  		if (r)
>  			goto err_used;
> -- 
> 2.7.4

^ permalink raw reply

* Re: [PATCH net-next V4 00/10] vhost_net batch dequeuing
From: Michael S. Tsirkin @ 2017-05-10 12:37 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel
In-Reply-To: <1494387382-19916-1-git-send-email-jasowang@redhat.com>

On Wed, May 10, 2017 at 11:36:12AM +0800, Jason Wang wrote:
> This series tries to implement rx batching for vhost-net. This is done
> by batching the dequeuing from skb_array which was exported by
> underlayer socket and pass the sbk back through msg_control to finish
> userspace copying. This is also the requirement for more batching
> implemention on rx path.
> 
> Tests shows at most 8.8% improvment bon rx pps on top of batch zeroing.
> 
> Please review.
> 
> Thanks
> 
> Changes from V3:
> - add batch zeroing patch to fix the build warnings

FYI that one's going upstream now.

> Changes from V2:
> - rebase to net-next HEAD
> - use unconsume helpers to put skb back on releasing
> - introduce and use vhost_net internal buffer helpers
> - renew performance numbers on top of batch zeroing
> 
> Changes from V1:
> - switch to use for() in __ptr_ring_consume_batched()
> - rename peek_head_len_batched() to fetch_skbs()
> - use skb_array_consume_batched() instead of
>   skb_array_consume_batched_bh() since no consumer run in bh
> - drop the lockless peeking patch since skb_array could be resized, so
>   it's not safe to call lockless one
> 
> Jason Wang (8):
>   skb_array: introduce skb_array_unconsume
>   ptr_ring: introduce batch dequeuing
>   skb_array: introduce batch dequeuing
>   tun: export skb_array
>   tap: export skb_array
>   tun: support receiving skb through msg_control
>   tap: support receiving skb from msg_control
>   vhost_net: try batch dequing from skb array
> 
> Michael S. Tsirkin (2):
>   ptr_ring: batch ring zeroing
>   ptr_ring: add ptr_ring_unconsume
> 
>  drivers/net/tap.c         |  25 ++++++-
>  drivers/net/tun.c         |  31 ++++++--
>  drivers/vhost/net.c       | 117 +++++++++++++++++++++++++++--
>  include/linux/if_tap.h    |   5 ++
>  include/linux/if_tun.h    |   5 ++
>  include/linux/ptr_ring.h  | 183 +++++++++++++++++++++++++++++++++++++++++++---
>  include/linux/skb_array.h |  31 ++++++++
>  7 files changed, 370 insertions(+), 27 deletions(-)
> 
> -- 
> 2.7.4

^ permalink raw reply

* [PATCH] mdio: mux: Correct mdio_mux_init error path issues
From: Jon Mason @ 2017-05-10 15:20 UTC (permalink / raw)
  To: Florian Fainelli; +Cc: netdev, linux-kernel, bcm-kernel-feedback-list

There is a potential unnecessary refcount decriment on error path of
put_device(&pb->mii_bus->dev), as it is possible to avoid the
of_mdio_find_bus() call if mux_bus is specified by the calling function.

The same put_device() is not called in the error path if the
devm_kzalloc of pb fails.  This caused the variable used in the
put_device() to be changed, as the pb pointer was obviously not set up.

There is an unnecessary of_node_get() on child_bus_node if the
of_mdiobus_register() is successful, as the
for_each_available_child_of_node() automatically increments this.
Thus the refcount on this node will always be +1 more than it should be.

There is no of_node_put() on child_bus_node if the of_mdiobus_register()
call fails.

Finally, it is lacking devm_kfree() of pb in the error path.  While this
might not be technically necessary, it was present in other parts of the
function.  So, I am adding it where necessary to make it uniform.

Signed-off-by: Jon Mason <jon.mason@broadcom.com>
Fixes: f20e6657a875 ("mdio: mux: Enhanced MDIO mux framework for integrated multiplexers")
Fixes: 0ca2997d1452 ("netdev/of/phy: Add MDIO bus multiplexer support.")
---
 drivers/net/phy/mdio-mux.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/net/phy/mdio-mux.c b/drivers/net/phy/mdio-mux.c
index 963838d4fac1..6943c5ece44a 100644
--- a/drivers/net/phy/mdio-mux.c
+++ b/drivers/net/phy/mdio-mux.c
@@ -122,10 +122,9 @@ int mdio_mux_init(struct device *dev,
 	pb = devm_kzalloc(dev, sizeof(*pb), GFP_KERNEL);
 	if (pb == NULL) {
 		ret_val = -ENOMEM;
-		goto err_parent_bus;
+		goto err_pb_kz;
 	}
 
-
 	pb->switch_data = data;
 	pb->switch_fn = switch_fn;
 	pb->current_child = -1;
@@ -154,6 +153,7 @@ int mdio_mux_init(struct device *dev,
 		cb->mii_bus = mdiobus_alloc();
 		if (!cb->mii_bus) {
 			ret_val = -ENOMEM;
+			devm_kfree(dev, cb);
 			of_node_put(child_bus_node);
 			break;
 		}
@@ -169,8 +169,8 @@ int mdio_mux_init(struct device *dev,
 		if (r) {
 			mdiobus_free(cb->mii_bus);
 			devm_kfree(dev, cb);
+			of_node_put(child_bus_node);
 		} else {
-			of_node_get(child_bus_node);
 			cb->next = pb->children;
 			pb->children = cb;
 		}
@@ -181,9 +181,11 @@ int mdio_mux_init(struct device *dev,
 		return 0;
 	}
 
+	devm_kfree(dev, pb);
+err_pb_kz:
 	/* balance the reference of_mdio_find_bus() took */
-	put_device(&pb->mii_bus->dev);
-
+	if (!mux_bus)
+		put_device(&parent_bus->dev);
 err_parent_bus:
 	of_node_put(parent_bus_node);
 	return ret_val;
-- 
2.7.4

^ permalink raw reply related

* Re: bpf pointer alignment validation
From: David Miller @ 2017-05-10 15:33 UTC (permalink / raw)
  To: alexei.starovoitov; +Cc: daniel, ast, netdev
In-Reply-To: <20170510055735.hfkoh4w3xaka5yl5@ast-mbp>

From: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Date: Tue, 9 May 2017 22:57:37 -0700

> On Tue, May 09, 2017 at 02:32:34PM -0400, David Miller wrote:
>>  
>> +static u32 calc_align(u32 imm)
>> +{
>> +	u32 align = 1;
>> +
>> +	if (!imm)
>> +		return 1U << 31;
>> +
>> +	while (!(imm & 1)) {
>> +		imm >>= 1;
>> +		align <<= 1;
>> +	}
>> +	return align;
>> +}
> 
> same question as in previous reply.
> Why not to use something like:
> static u32 calc_align(u32 n)
> {
>         if (!n)
>                 return 1U << 31;
>         return n - ((n - 1) & n);
> }

Ok.

I did a cursory search and we don't have a generic kernel helper for
this kind of calculation.  I was actually quite surprised, as we
have one for everything else :-)

> this needs to be tweaked like
> if (log_level > 1)
>         verbose("%d:", insn_idx);
> else
> 	verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
> 
> otherwise it prints prev_insn_idx which is meaningful
> only with processing exit and search pruning.

Agreed.

> Nice to see all these comments.
> I wonder how we can make them automatic in the verifier.
> Like if verifier can somehow hint the user in such human friendly way
> about what is happening with the program.
> Today that's the #1 problem. Most folks complaining
> that verifier error messages are too hard to understand.

We can put whatever text strings we want into that verifier buffer.
It is as flexible as netlink extended acks.

> would it make sense to bpf_prog_test_run() it here as well?

We could.

> On x86 not much value, but on sparc we can somehow look for traps?
> Is there some counter of unaligned traps that we can read and report
> as error to user space after prog_test_run ?

Unfortunately, no.

> These tests we cannot really run, since they don't do any load/store.
> I mean more for some future tests. Or some sort of debug warn
> that there were traps while bpf prog was executed, so the user
> is alarmed and reports to us, since that would be a bug in verifier
> align logic?

One thing I could definitely do is add logic to the unaligned trap
handler to print something special in the logs if we find that we
are executing BPF code.

The basic structure of the log message can be codified in a generic
bpf_xxx() helper, which architectures call with the PC and unaligned
address as arguments.

I was thinking more last night about strict alignment mode for the
verifier, so that bugs can be spotted on all architectures.  But
it stupidly will not work.

The problem is that x86 and powerpc define NET_IP_ALIGN as 0, so all
bets are off.

One thing we could do in "strict alignment" verifier mode is pretend
that NET_IP_ALIGN is 2 in the alignment checks.

^ permalink raw reply

* [net-mellanox] question about potential null pointer dereference
From: Gustavo A. R. Silva @ 2017-05-10 15:36 UTC (permalink / raw)
  To: Jiri Pirko, Ido Schimmel; +Cc: netdev, linux-kernel


Hello everybody,

While looking into Coverity ID 1350941 I ran into the following piece  
of code at  
drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c:1483:

1483static void mlxsw_sp_fdb_notify_mac_process(struct mlxsw_sp *mlxsw_sp,
1484                                            char *sfn_pl, int rec_index,
1485                                            bool adding)
1486{
1487        struct mlxsw_sp_port *mlxsw_sp_port;
1488        char mac[ETH_ALEN];
1489        u8 local_port;
1490        u16 vid, fid;
1491        bool do_notification = true;
1492        int err;
1493
1494        mlxsw_reg_sfn_mac_unpack(sfn_pl, rec_index, mac, &fid,  
&local_port);
1495        mlxsw_sp_port = mlxsw_sp->ports[local_port];
1496        if (!mlxsw_sp_port) {
1497                dev_err_ratelimited(mlxsw_sp->bus_info->dev,  
"Incorrect local port in FDB notification\n");
1498                goto just_remove;
1499        }
1500
1501        if (mlxsw_sp_fid_is_vfid(fid)) {
1502                struct mlxsw_sp_port *mlxsw_sp_vport;
1503
1504                mlxsw_sp_vport =  
mlxsw_sp_port_vport_find_by_fid(mlxsw_sp_port,
1505                                                                 fid);
1506                if (!mlxsw_sp_vport) {
1507                        netdev_err(mlxsw_sp_port->dev, "Failed to  
find a matching vPort following FDB notification\n");
1508                        goto just_remove;
1509                }
1510                vid = 0;
1511                /* Override the physical port with the vPort. */
1512                mlxsw_sp_port = mlxsw_sp_vport;
1513        } else {
1514                vid = fid;
1515        }
1516
1517do_fdb_op:
1518        err = mlxsw_sp_port_fdb_uc_op(mlxsw_sp, local_port, mac, fid,
1519                                      adding, true);
1520        if (err) {
1521                if (net_ratelimit())
1522                        netdev_err(mlxsw_sp_port->dev, "Failed to  
set FDB entry\n");
1523                return;
1524        }
1525
1526        if (!do_notification)
1527                return;
1528        mlxsw_sp_fdb_call_notifiers(mlxsw_sp_port->learning_sync,
1529                                    adding, mac, vid, mlxsw_sp_port->dev);
1530        return;
1531
1532just_remove:
1533        adding = false;
1534        do_notification = false;
1535        goto do_fdb_op;
1536}


The issue here is that line 1496 implies that mlxsw_sp_port might be  
NULL. If this is the case, the execution path jumps to line 1532 and  
then to line 1517. All this could end up dereferencing a NULL pointer  
at line 1522.

Is there any chance for mlxsw_sp_port to be NULL at line 1496 and, at  
the same time, a NULL pointer dereference occurs at line 1522?

I'm trying to figure out if this is a false positive or something that  
actually needs to be fixed.

I'd really appreciate any comment on this.
Thank you!

^ permalink raw reply

* Re: [net-mellanox] question about potential null pointer dereference
From: Ido Schimmel @ 2017-05-10 15:51 UTC (permalink / raw)
  To: Gustavo A. R. Silva; +Cc: Jiri Pirko, netdev, linux-kernel
In-Reply-To: <20170510103659.Horde.a0uOuKuhfTuy1DbqQ378EHK@gator4166.hostgator.com>

On Wed, May 10, 2017 at 10:36:59AM -0500, Gustavo A. R. Silva wrote:
> 
> Hello everybody,
> 
> While looking into Coverity ID 1350941 I ran into the following piece of
> code at drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c:1483:
> 
> 1483static void mlxsw_sp_fdb_notify_mac_process(struct mlxsw_sp *mlxsw_sp,
> 1484                                            char *sfn_pl, int rec_index,
> 1485                                            bool adding)
> 1486{
> 1487        struct mlxsw_sp_port *mlxsw_sp_port;
> 1488        char mac[ETH_ALEN];
> 1489        u8 local_port;
> 1490        u16 vid, fid;
> 1491        bool do_notification = true;
> 1492        int err;
> 1493
> 1494        mlxsw_reg_sfn_mac_unpack(sfn_pl, rec_index, mac, &fid,
> &local_port);
> 1495        mlxsw_sp_port = mlxsw_sp->ports[local_port];
> 1496        if (!mlxsw_sp_port) {
> 1497                dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Incorrect
> local port in FDB notification\n");
> 1498                goto just_remove;
> 1499        }
> 1500
> 1501        if (mlxsw_sp_fid_is_vfid(fid)) {
> 1502                struct mlxsw_sp_port *mlxsw_sp_vport;
> 1503
> 1504                mlxsw_sp_vport =
> mlxsw_sp_port_vport_find_by_fid(mlxsw_sp_port,
> 1505                                                                 fid);
> 1506                if (!mlxsw_sp_vport) {
> 1507                        netdev_err(mlxsw_sp_port->dev, "Failed to find a
> matching vPort following FDB notification\n");
> 1508                        goto just_remove;
> 1509                }
> 1510                vid = 0;
> 1511                /* Override the physical port with the vPort. */
> 1512                mlxsw_sp_port = mlxsw_sp_vport;
> 1513        } else {
> 1514                vid = fid;
> 1515        }
> 1516
> 1517do_fdb_op:
> 1518        err = mlxsw_sp_port_fdb_uc_op(mlxsw_sp, local_port, mac, fid,
> 1519                                      adding, true);
> 1520        if (err) {
> 1521                if (net_ratelimit())
> 1522                        netdev_err(mlxsw_sp_port->dev, "Failed to set
> FDB entry\n");
> 1523                return;
> 1524        }
> 1525
> 1526        if (!do_notification)
> 1527                return;
> 1528        mlxsw_sp_fdb_call_notifiers(mlxsw_sp_port->learning_sync,
> 1529                                    adding, mac, vid, mlxsw_sp_port->dev);
> 1530        return;
> 1531
> 1532just_remove:
> 1533        adding = false;
> 1534        do_notification = false;
> 1535        goto do_fdb_op;
> 1536}
> 
> 
> The issue here is that line 1496 implies that mlxsw_sp_port might be NULL.
> If this is the case, the execution path jumps to line 1532 and then to line
> 1517. All this could end up dereferencing a NULL pointer at line 1522.
> 
> Is there any chance for mlxsw_sp_port to be NULL at line 1496 and, at the
> same time, a NULL pointer dereference occurs at line 1522?
> 
> I'm trying to figure out if this is a false positive or something that
> actually needs to be fixed.

In theory, yes, it can happen, but it didn't happen yet. I recently
patched that and now it's in Jiri's queue. I guess he'll send it
tomorrow.

https://github.com/jpirko/linux_mlxsw/commit/4160fb9ad6eeacba7736cfdbf7f52248432c2e89

Thanks for looking into this!

> 
> I'd really appreciate any comment on this.
> Thank you!
> --
> Gustavo A. R. Silva

^ permalink raw reply

* Re: bpf pointer alignment validation
From: Daniel Borkmann @ 2017-05-10 15:51 UTC (permalink / raw)
  To: David Miller, alexei.starovoitov; +Cc: ast, netdev
In-Reply-To: <20170510.113357.362211347006868605.davem@davemloft.net>

On 05/10/2017 05:33 PM, David Miller wrote:
> From: Alexei Starovoitov <alexei.starovoitov@gmail.com>
> Date: Tue, 9 May 2017 22:57:37 -0700
>
>> On Tue, May 09, 2017 at 02:32:34PM -0400, David Miller wrote:
>>>
>>> +static u32 calc_align(u32 imm)
>>> +{
>>> +	u32 align = 1;
>>> +
>>> +	if (!imm)
>>> +		return 1U << 31;
>>> +
>>> +	while (!(imm & 1)) {
>>> +		imm >>= 1;
>>> +		align <<= 1;
>>> +	}
>>> +	return align;
>>> +}
>>
>> same question as in previous reply.
>> Why not to use something like:
>> static u32 calc_align(u32 n)
>> {
>>          if (!n)
>>                  return 1U << 31;
>>          return n - ((n - 1) & n);
>> }
>
> Ok.
>
> I did a cursory search and we don't have a generic kernel helper for
> this kind of calculation.  I was actually quite surprised, as we
> have one for everything else :-)
>
>> this needs to be tweaked like
>> if (log_level > 1)
>>          verbose("%d:", insn_idx);
>> else
>> 	verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
>>
>> otherwise it prints prev_insn_idx which is meaningful
>> only with processing exit and search pruning.
>
> Agreed.
>
>> Nice to see all these comments.
>> I wonder how we can make them automatic in the verifier.
>> Like if verifier can somehow hint the user in such human friendly way
>> about what is happening with the program.
>> Today that's the #1 problem. Most folks complaining
>> that verifier error messages are too hard to understand.
>
> We can put whatever text strings we want into that verifier buffer.
> It is as flexible as netlink extended acks.
>
>> would it make sense to bpf_prog_test_run() it here as well?
>
> We could.
>
>> On x86 not much value, but on sparc we can somehow look for traps?
>> Is there some counter of unaligned traps that we can read and report
>> as error to user space after prog_test_run ?
>
> Unfortunately, no.
>
>> These tests we cannot really run, since they don't do any load/store.
>> I mean more for some future tests. Or some sort of debug warn
>> that there were traps while bpf prog was executed, so the user
>> is alarmed and reports to us, since that would be a bug in verifier
>> align logic?
>
> One thing I could definitely do is add logic to the unaligned trap
> handler to print something special in the logs if we find that we
> are executing BPF code.
>
> The basic structure of the log message can be codified in a generic
> bpf_xxx() helper, which architectures call with the PC and unaligned
> address as arguments.
>
> I was thinking more last night about strict alignment mode for the
> verifier, so that bugs can be spotted on all architectures.  But
> it stupidly will not work.
>
> The problem is that x86 and powerpc define NET_IP_ALIGN as 0, so all
> bets are off.
>
> One thing we could do in "strict alignment" verifier mode is pretend
> that NET_IP_ALIGN is 2 in the alignment checks.

Would probably be good nevertheless to have this as a flag for
program loads, which gets then passed through to the verifier to
explicitly enable strict alignment checks.

Might certainly aide developing & testing programs on archs with
efficient unaligned access and later actually running them on archs
that don't have it. (And at minimum, it also helps for checking
the test suite against the verifier.)

^ permalink raw reply

* Re: bpf pointer alignment validation
From: David Miller @ 2017-05-10 15:57 UTC (permalink / raw)
  To: daniel; +Cc: alexei.starovoitov, ast, netdev
In-Reply-To: <59133716.7060800@iogearbox.net>

From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 10 May 2017 17:51:50 +0200

> Would probably be good nevertheless to have this as a flag for
> program loads, which gets then passed through to the verifier to
> explicitly enable strict alignment checks.
> 
> Might certainly aide developing & testing programs on archs with
> efficient unaligned access and later actually running them on archs
> that don't have it. (And at minimum, it also helps for checking
> the test suite against the verifier.)

Ok, I can implement this flag.

The only question is where to put it?  An unused bit in the program
type? :-)

^ permalink raw reply

* Re: [net-mellanox] question about potential null pointer dereference
From: Gustavo A. R. Silva @ 2017-05-10 16:01 UTC (permalink / raw)
  To: Ido Schimmel; +Cc: Jiri Pirko, netdev, linux-kernel
In-Reply-To: <20170510155131.GA11944@splinter.mtl.com>


Quoting Ido Schimmel <idosch@mellanox.com>:

> On Wed, May 10, 2017 at 10:36:59AM -0500, Gustavo A. R. Silva wrote:
>>
>> Hello everybody,
>>
>> While looking into Coverity ID 1350941 I ran into the following piece of
>> code at drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c:1483:
>>
>> 1483static void mlxsw_sp_fdb_notify_mac_process(struct mlxsw_sp *mlxsw_sp,
>> 1484                                            char *sfn_pl, int rec_index,
>> 1485                                            bool adding)
>> 1486{
>> 1487        struct mlxsw_sp_port *mlxsw_sp_port;
>> 1488        char mac[ETH_ALEN];
>> 1489        u8 local_port;
>> 1490        u16 vid, fid;
>> 1491        bool do_notification = true;
>> 1492        int err;
>> 1493
>> 1494        mlxsw_reg_sfn_mac_unpack(sfn_pl, rec_index, mac, &fid,
>> &local_port);
>> 1495        mlxsw_sp_port = mlxsw_sp->ports[local_port];
>> 1496        if (!mlxsw_sp_port) {
>> 1497                dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Incorrect
>> local port in FDB notification\n");
>> 1498                goto just_remove;
>> 1499        }
>> 1500
>> 1501        if (mlxsw_sp_fid_is_vfid(fid)) {
>> 1502                struct mlxsw_sp_port *mlxsw_sp_vport;
>> 1503
>> 1504                mlxsw_sp_vport =
>> mlxsw_sp_port_vport_find_by_fid(mlxsw_sp_port,
>> 1505                                                                 fid);
>> 1506                if (!mlxsw_sp_vport) {
>> 1507                        netdev_err(mlxsw_sp_port->dev, "Failed to find a
>> matching vPort following FDB notification\n");
>> 1508                        goto just_remove;
>> 1509                }
>> 1510                vid = 0;
>> 1511                /* Override the physical port with the vPort. */
>> 1512                mlxsw_sp_port = mlxsw_sp_vport;
>> 1513        } else {
>> 1514                vid = fid;
>> 1515        }
>> 1516
>> 1517do_fdb_op:
>> 1518        err = mlxsw_sp_port_fdb_uc_op(mlxsw_sp, local_port, mac, fid,
>> 1519                                      adding, true);
>> 1520        if (err) {
>> 1521                if (net_ratelimit())
>> 1522                        netdev_err(mlxsw_sp_port->dev, "Failed to set
>> FDB entry\n");
>> 1523                return;
>> 1524        }
>> 1525
>> 1526        if (!do_notification)
>> 1527                return;
>> 1528        mlxsw_sp_fdb_call_notifiers(mlxsw_sp_port->learning_sync,
>> 1529                                    adding, mac, vid,  
>> mlxsw_sp_port->dev);
>> 1530        return;
>> 1531
>> 1532just_remove:
>> 1533        adding = false;
>> 1534        do_notification = false;
>> 1535        goto do_fdb_op;
>> 1536}
>>
>>
>> The issue here is that line 1496 implies that mlxsw_sp_port might be NULL.
>> If this is the case, the execution path jumps to line 1532 and then to line
>> 1517. All this could end up dereferencing a NULL pointer at line 1522.
>>
>> Is there any chance for mlxsw_sp_port to be NULL at line 1496 and, at the
>> same time, a NULL pointer dereference occurs at line 1522?
>>
>> I'm trying to figure out if this is a false positive or something that
>> actually needs to be fixed.
>
> In theory, yes, it can happen, but it didn't happen yet. I recently
> patched that and now it's in Jiri's queue. I guess he'll send it
> tomorrow.
>
> https://github.com/jpirko/linux_mlxsw/commit/4160fb9ad6eeacba7736cfdbf7f52248432c2e89
>

Great, glad to see it is fixed now.

> Thanks for looking into this!
>

Sure thing, thanks for clarifying. :)

^ permalink raw reply

* Re: bpf pointer alignment validation
From: Alexei Starovoitov @ 2017-05-10 16:15 UTC (permalink / raw)
  To: David Miller, daniel; +Cc: alexei.starovoitov, netdev
In-Reply-To: <20170510.115720.1396306489898364855.davem@davemloft.net>

On 5/10/17 8:57 AM, David Miller wrote:
> From: Daniel Borkmann <daniel@iogearbox.net>
> Date: Wed, 10 May 2017 17:51:50 +0200
>
>> Would probably be good nevertheless to have this as a flag for
>> program loads, which gets then passed through to the verifier to
>> explicitly enable strict alignment checks.
>>
>> Might certainly aide developing & testing programs on archs with
>> efficient unaligned access and later actually running them on archs
>> that don't have it. (And at minimum, it also helps for checking
>> the test suite against the verifier.)
>
> Ok, I can implement this flag.
>
> The only question is where to put it?  An unused bit in the program
> type? :-)

just add '__u32 prog_flags' to bpf_attr PROG_LOAD anon union.
There was no need for such flags in the past, hence no flags field
existed. Now it's time.

^ permalink raw reply

* Re: bpf pointer alignment validation
From: Daniel Borkmann @ 2017-05-10 16:21 UTC (permalink / raw)
  To: David Miller; +Cc: alexei.starovoitov, ast, netdev
In-Reply-To: <20170510.115720.1396306489898364855.davem@davemloft.net>

On 05/10/2017 05:57 PM, David Miller wrote:
> From: Daniel Borkmann <daniel@iogearbox.net>
> Date: Wed, 10 May 2017 17:51:50 +0200
>
>> Would probably be good nevertheless to have this as a flag for
>> program loads, which gets then passed through to the verifier to
>> explicitly enable strict alignment checks.
>>
>> Might certainly aide developing & testing programs on archs with
>> efficient unaligned access and later actually running them on archs
>> that don't have it. (And at minimum, it also helps for checking
>> the test suite against the verifier.)
>
> Ok, I can implement this flag.
>
> The only question is where to put it?  An unused bit in the program
> type? :-)

See for example 7f677633379b ("bpf: introduce BPF_F_ALLOW_OVERRIDE
flag"). We can add a flags field to the prog loading part of union
bpf_attr; we would need to make sure to update BPF_PROG_LOAD_LAST_FIELD
to the new member, and to reject unknown flags, of course. Then the
syscall will handle compat with older binaries just fine by design,
the main bpf syscall code and CHECK_ATTR() macros will ensure this
(backward compat, and to a limited degree also forward compat).

^ permalink raw reply

* Re: [Patch net] ipv4: restore rt->fi for reference counting
From: Cong Wang @ 2017-05-10 16:32 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, Linux Kernel Network Developers, Andrey Konovalov,
	Eric Dumazet
In-Reply-To: <1494373805.7796.98.camel@edumazet-glaptop3.roam.corp.google.com>

On Tue, May 9, 2017 at 4:50 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Tue, 2017-05-09 at 16:35 -0700, Cong Wang wrote:
>
>> All of them take RCU read lock, so, as I explained in the code comment,
>> they all should be fine because of synchronize_net() on unregister path.
>> Do you see anything otherwise?
>
> They might take rcu lock, but compiler is still allowed to read
> fi->fib_dev multiple times, and crashes might happen.
>
> You will need to audit all code and fix it, using proper
> rcu_dereference() or similar code ensuring compiler wont do stupid
> things.
>

Point taken. But without my patch, nh_dev is supposed to be protected
by RCU too, it is freed in a rcu callback and dereferenced like:

struct in_device *in_dev = __in_dev_get_rcu(nh->nh_dev);

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox