Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next v2 07/12] amd-xgbe: Prepare for ethtool set-channel support
From: Tom Lendacky @ 2018-05-23 16:39 UTC (permalink / raw)
  To: netdev; +Cc: David Miller
In-Reply-To: <20180523163802.31625.76572.stgit@tlendack-t1.amdoffice.net>

To support dynamically changing the number of Rx and Tx channels,
update the code to:
 - Move alloc and free of device memory into callable functions
 - Move setting of the real number of Rx and Tx channels to device startup
 - Move mapping of the RSS channels to device startup

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c  |  108 ++++++++++++++++++-----------
 drivers/net/ethernet/amd/xgbe/xgbe-main.c |   20 -----
 2 files changed, 68 insertions(+), 60 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 2646c08..397e3a0 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1312,14 +1312,72 @@ int xgbe_powerup(struct net_device *netdev, unsigned int caller)
 	return 0;
 }
 
+static void xgbe_free_memory(struct xgbe_prv_data *pdata)
+{
+	struct xgbe_desc_if *desc_if = &pdata->desc_if;
+
+	/* Free the ring descriptors and buffers */
+	desc_if->free_ring_resources(pdata);
+
+	/* Free the channel and ring structures */
+	xgbe_free_channels(pdata);
+}
+
+static int xgbe_alloc_memory(struct xgbe_prv_data *pdata)
+{
+	struct xgbe_desc_if *desc_if = &pdata->desc_if;
+	struct net_device *netdev = pdata->netdev;
+	int ret;
+
+	/* Calculate the Rx buffer size before allocating rings */
+	pdata->rx_buf_size = xgbe_calc_rx_buf_size(netdev, netdev->mtu);
+
+	/* Allocate the channel and ring structures */
+	ret = xgbe_alloc_channels(pdata);
+	if (ret)
+		return ret;
+
+	/* Allocate the ring descriptors and buffers */
+	ret = desc_if->alloc_ring_resources(pdata);
+	if (ret)
+		goto err_channels;
+
+	/* Initialize the service and Tx timers */
+	xgbe_init_timers(pdata);
+
+	return 0;
+
+err_channels:
+	xgbe_free_memory(pdata);
+
+	return ret;
+}
+
 static int xgbe_start(struct xgbe_prv_data *pdata)
 {
 	struct xgbe_hw_if *hw_if = &pdata->hw_if;
 	struct xgbe_phy_if *phy_if = &pdata->phy_if;
 	struct net_device *netdev = pdata->netdev;
+	unsigned int i;
 	int ret;
 
-	DBGPR("-->xgbe_start\n");
+	/* Set the number of queues */
+	ret = netif_set_real_num_tx_queues(netdev, pdata->tx_ring_count);
+	if (ret) {
+		netdev_err(netdev, "error setting real tx queue count\n");
+		return ret;
+	}
+
+	ret = netif_set_real_num_rx_queues(netdev, pdata->rx_ring_count);
+	if (ret) {
+		netdev_err(netdev, "error setting real rx queue count\n");
+		return ret;
+	}
+
+	/* Set RSS lookup table data for programming */
+	for (i = 0; i < XGBE_RSS_MAX_TABLE_SIZE; i++)
+		XGMAC_SET_BITS(pdata->rss_table[i], MAC_RSSDR, DMCH,
+			       i % pdata->rx_ring_count);
 
 	ret = hw_if->init(pdata);
 	if (ret)
@@ -1347,8 +1405,6 @@ static int xgbe_start(struct xgbe_prv_data *pdata)
 
 	clear_bit(XGBE_STOPPED, &pdata->dev_state);
 
-	DBGPR("<--xgbe_start\n");
-
 	return 0;
 
 err_irqs:
@@ -1823,11 +1879,8 @@ static void xgbe_packet_info(struct xgbe_prv_data *pdata,
 static int xgbe_open(struct net_device *netdev)
 {
 	struct xgbe_prv_data *pdata = netdev_priv(netdev);
-	struct xgbe_desc_if *desc_if = &pdata->desc_if;
 	int ret;
 
-	DBGPR("-->xgbe_open\n");
-
 	/* Create the various names based on netdev name */
 	snprintf(pdata->an_name, sizeof(pdata->an_name) - 1, "%s-pcs",
 		 netdev_name(netdev));
@@ -1872,43 +1925,25 @@ static int xgbe_open(struct net_device *netdev)
 		goto err_sysclk;
 	}
 
-	/* Calculate the Rx buffer size before allocating rings */
-	ret = xgbe_calc_rx_buf_size(netdev, netdev->mtu);
-	if (ret < 0)
-		goto err_ptpclk;
-	pdata->rx_buf_size = ret;
-
-	/* Allocate the channel and ring structures */
-	ret = xgbe_alloc_channels(pdata);
-	if (ret)
-		goto err_ptpclk;
-
-	/* Allocate the ring descriptors and buffers */
-	ret = desc_if->alloc_ring_resources(pdata);
-	if (ret)
-		goto err_channels;
-
 	INIT_WORK(&pdata->service_work, xgbe_service);
 	INIT_WORK(&pdata->restart_work, xgbe_restart);
 	INIT_WORK(&pdata->stopdev_work, xgbe_stopdev);
 	INIT_WORK(&pdata->tx_tstamp_work, xgbe_tx_tstamp);
-	xgbe_init_timers(pdata);
+
+	ret = xgbe_alloc_memory(pdata);
+	if (ret)
+		goto err_ptpclk;
 
 	ret = xgbe_start(pdata);
 	if (ret)
-		goto err_rings;
+		goto err_mem;
 
 	clear_bit(XGBE_DOWN, &pdata->dev_state);
 
-	DBGPR("<--xgbe_open\n");
-
 	return 0;
 
-err_rings:
-	desc_if->free_ring_resources(pdata);
-
-err_channels:
-	xgbe_free_channels(pdata);
+err_mem:
+	xgbe_free_memory(pdata);
 
 err_ptpclk:
 	clk_disable_unprepare(pdata->ptpclk);
@@ -1928,18 +1963,11 @@ static int xgbe_open(struct net_device *netdev)
 static int xgbe_close(struct net_device *netdev)
 {
 	struct xgbe_prv_data *pdata = netdev_priv(netdev);
-	struct xgbe_desc_if *desc_if = &pdata->desc_if;
-
-	DBGPR("-->xgbe_close\n");
 
 	/* Stop the device */
 	xgbe_stop(pdata);
 
-	/* Free the ring descriptors and buffers */
-	desc_if->free_ring_resources(pdata);
-
-	/* Free the channel and ring structures */
-	xgbe_free_channels(pdata);
+	xgbe_free_memory(pdata);
 
 	/* Disable the clocks */
 	clk_disable_unprepare(pdata->ptpclk);
@@ -1953,8 +1981,6 @@ static int xgbe_close(struct net_device *netdev)
 
 	set_bit(XGBE_DOWN, &pdata->dev_state);
 
-	DBGPR("<--xgbe_close\n");
-
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-main.c b/drivers/net/ethernet/amd/xgbe/xgbe-main.c
index 441d0973..b41f236 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-main.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-main.c
@@ -265,7 +265,6 @@ int xgbe_config_netdev(struct xgbe_prv_data *pdata)
 {
 	struct net_device *netdev = pdata->netdev;
 	struct device *dev = pdata->dev;
-	unsigned int i;
 	int ret;
 
 	netdev->irq = pdata->dev_irq;
@@ -324,26 +323,9 @@ int xgbe_config_netdev(struct xgbe_prv_data *pdata)
 				pdata->tx_ring_count, pdata->rx_ring_count);
 	}
 
-	/* Set the number of queues */
-	ret = netif_set_real_num_tx_queues(netdev, pdata->tx_ring_count);
-	if (ret) {
-		dev_err(dev, "error setting real tx queue count\n");
-		return ret;
-	}
-
-	ret = netif_set_real_num_rx_queues(netdev, pdata->rx_ring_count);
-	if (ret) {
-		dev_err(dev, "error setting real rx queue count\n");
-		return ret;
-	}
-
-	/* Initialize RSS hash key and lookup table */
+	/* Initialize RSS hash key */
 	netdev_rss_key_fill(pdata->rss_key, sizeof(pdata->rss_key));
 
-	for (i = 0; i < XGBE_RSS_MAX_TABLE_SIZE; i++)
-		XGMAC_SET_BITS(pdata->rss_table[i], MAC_RSSDR, DMCH,
-			       i % pdata->rx_ring_count);
-
 	XGMAC_SET_BITS(pdata->rss_options, MAC_RSSCR, IP2TE, 1);
 	XGMAC_SET_BITS(pdata->rss_options, MAC_RSSCR, TCP4TE, 1);
 	XGMAC_SET_BITS(pdata->rss_options, MAC_RSSCR, UDP4TE, 1);

^ permalink raw reply related

* [PATCH net-next v2 08/12] amd-xgbe: Add ethtool show/set channels support
From: Tom Lendacky @ 2018-05-23 16:39 UTC (permalink / raw)
  To: netdev; +Cc: David Miller
In-Reply-To: <20180523163802.31625.76572.stgit@tlendack-t1.amdoffice.net>

Add ethtool support to show and set the device channel configuration.
Changing the channel configuration will result in a device restart.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c     |   25 +++++
 drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c |  134 ++++++++++++++++++++++++++
 drivers/net/ethernet/amd/xgbe/xgbe.h         |    4 +
 3 files changed, 163 insertions(+)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 397e3a0..24f1053 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1329,6 +1329,17 @@ static int xgbe_alloc_memory(struct xgbe_prv_data *pdata)
 	struct net_device *netdev = pdata->netdev;
 	int ret;
 
+	if (pdata->new_tx_ring_count) {
+		pdata->tx_ring_count = pdata->new_tx_ring_count;
+		pdata->tx_q_count = pdata->tx_ring_count;
+		pdata->new_tx_ring_count = 0;
+	}
+
+	if (pdata->new_rx_ring_count) {
+		pdata->rx_ring_count = pdata->new_rx_ring_count;
+		pdata->new_rx_ring_count = 0;
+	}
+
 	/* Calculate the Rx buffer size before allocating rings */
 	pdata->rx_buf_size = xgbe_calc_rx_buf_size(netdev, netdev->mtu);
 
@@ -1482,6 +1493,20 @@ static void xgbe_stopdev(struct work_struct *work)
 	netdev_alert(pdata->netdev, "device stopped\n");
 }
 
+void xgbe_full_restart_dev(struct xgbe_prv_data *pdata)
+{
+	/* If not running, "restart" will happen on open */
+	if (!netif_running(pdata->netdev))
+		return;
+
+	xgbe_stop(pdata);
+
+	xgbe_free_memory(pdata);
+	xgbe_alloc_memory(pdata);
+
+	xgbe_start(pdata);
+}
+
 void xgbe_restart_dev(struct xgbe_prv_data *pdata)
 {
 	/* If not running, "restart" will happen on open */
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
index d12f982..a880f10 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
@@ -705,6 +705,138 @@ static int xgbe_set_ringparam(struct net_device *netdev,
 	return 0;
 }
 
+static void xgbe_get_channels(struct net_device *netdev,
+			      struct ethtool_channels *channels)
+{
+	struct xgbe_prv_data *pdata = netdev_priv(netdev);
+	unsigned int rx, tx, combined;
+
+	/* Calculate maximums allowed:
+	 *   - Take into account the number of available IRQs
+	 *   - Do not take into account the number of online CPUs so that
+	 *     the user can over-subscribe if desired
+	 *   - Tx is additionally limited by the number of hardware queues
+	 */
+	rx = min(pdata->hw_feat.rx_ch_cnt, pdata->rx_max_channel_count);
+	rx = min(rx, pdata->channel_irq_count);
+	tx = min(pdata->hw_feat.tx_ch_cnt, pdata->tx_max_channel_count);
+	tx = min(tx, pdata->channel_irq_count);
+	tx = min(tx, pdata->tx_max_q_count);
+
+	combined = min(rx, tx);
+
+	channels->max_combined = combined;
+	channels->max_rx = rx ? rx - 1 : 0;
+	channels->max_tx = tx ? tx - 1 : 0;
+
+	/* Get current settings based on device state */
+	rx = pdata->new_rx_ring_count ? : pdata->rx_ring_count;
+	tx = pdata->new_tx_ring_count ? : pdata->tx_ring_count;
+
+	combined = min(rx, tx);
+	rx -= combined;
+	tx -= combined;
+
+	channels->combined_count = combined;
+	channels->rx_count = rx;
+	channels->tx_count = tx;
+}
+
+static void xgbe_print_set_channels_input(struct net_device *netdev,
+					  struct ethtool_channels *channels)
+{
+	netdev_err(netdev, "channel inputs: combined=%u, rx-only=%u, tx-only=%u\n",
+		   channels->combined_count, channels->rx_count,
+		   channels->tx_count);
+}
+
+static int xgbe_set_channels(struct net_device *netdev,
+			     struct ethtool_channels *channels)
+{
+	struct xgbe_prv_data *pdata = netdev_priv(netdev);
+	unsigned int rx, rx_curr, tx, tx_curr, combined;
+
+	/* Calculate maximums allowed:
+	 *   - Take into account the number of available IRQs
+	 *   - Do not take into account the number of online CPUs so that
+	 *     the user can over-subscribe if desired
+	 *   - Tx is additionally limited by the number of hardware queues
+	 */
+	rx = min(pdata->hw_feat.rx_ch_cnt, pdata->rx_max_channel_count);
+	rx = min(rx, pdata->channel_irq_count);
+	tx = min(pdata->hw_feat.tx_ch_cnt, pdata->tx_max_channel_count);
+	tx = min(tx, pdata->tx_max_q_count);
+	tx = min(tx, pdata->channel_irq_count);
+
+	combined = min(rx, tx);
+
+	/* Should not be setting other count */
+	if (channels->other_count) {
+		netdev_err(netdev,
+			   "other channel count must be zero\n");
+		return -EINVAL;
+	}
+
+	/* Require at least one Combined (Rx and Tx) channel */
+	if (!channels->combined_count) {
+		netdev_err(netdev,
+			   "at least one combined Rx/Tx channel is required\n");
+		xgbe_print_set_channels_input(netdev, channels);
+		return -EINVAL;
+	}
+
+	/* Check combined channels */
+	if (channels->combined_count > combined) {
+		netdev_err(netdev,
+			   "combined channel count cannot exceed %u\n",
+			   combined);
+		xgbe_print_set_channels_input(netdev, channels);
+		return -EINVAL;
+	}
+
+	/* Can have some Rx-only or Tx-only channels, but not both */
+	if (channels->rx_count && channels->tx_count) {
+		netdev_err(netdev,
+			   "cannot specify both Rx-only and Tx-only channels\n");
+		xgbe_print_set_channels_input(netdev, channels);
+		return -EINVAL;
+	}
+
+	/* Check that we don't exceed the maximum number of channels */
+	if ((channels->combined_count + channels->rx_count) > rx) {
+		netdev_err(netdev,
+			   "total Rx channels (%u) requested exceeds maximum available (%u)\n",
+			   channels->combined_count + channels->rx_count, rx);
+		xgbe_print_set_channels_input(netdev, channels);
+		return -EINVAL;
+	}
+
+	if ((channels->combined_count + channels->tx_count) > tx) {
+		netdev_err(netdev,
+			   "total Tx channels (%u) requested exceeds maximum available (%u)\n",
+			   channels->combined_count + channels->tx_count, tx);
+		xgbe_print_set_channels_input(netdev, channels);
+		return -EINVAL;
+	}
+
+	rx = channels->combined_count + channels->rx_count;
+	tx = channels->combined_count + channels->tx_count;
+
+	rx_curr = pdata->new_rx_ring_count ? : pdata->rx_ring_count;
+	tx_curr = pdata->new_tx_ring_count ? : pdata->tx_ring_count;
+
+	if ((rx == rx_curr) && (tx == tx_curr))
+		goto out;
+
+	pdata->new_rx_ring_count = rx;
+	pdata->new_tx_ring_count = tx;
+
+	xgbe_full_restart_dev(pdata);
+
+out:
+	return 0;
+}
+
 static const struct ethtool_ops xgbe_ethtool_ops = {
 	.get_drvinfo = xgbe_get_drvinfo,
 	.get_msglevel = xgbe_get_msglevel,
@@ -729,6 +861,8 @@ static int xgbe_set_ringparam(struct net_device *netdev,
 	.get_module_eeprom = xgbe_get_module_eeprom,
 	.get_ringparam = xgbe_get_ringparam,
 	.set_ringparam = xgbe_set_ringparam,
+	.get_channels = xgbe_get_channels,
+	.set_channels = xgbe_set_channels,
 };
 
 const struct ethtool_ops *xgbe_get_ethtool_ops(void)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h
index 7dc0fac..7a412cf 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe.h
@@ -1122,6 +1122,9 @@ struct xgbe_prv_data {
 	unsigned int rx_ring_count;
 	unsigned int rx_desc_count;
 
+	unsigned int new_tx_ring_count;
+	unsigned int new_rx_ring_count;
+
 	unsigned int tx_max_q_count;
 	unsigned int rx_max_q_count;
 	unsigned int tx_q_count;
@@ -1336,6 +1339,7 @@ void xgbe_dump_rx_desc(struct xgbe_prv_data *, struct xgbe_ring *,
 void xgbe_init_rx_coalesce(struct xgbe_prv_data *);
 void xgbe_init_tx_coalesce(struct xgbe_prv_data *);
 void xgbe_restart_dev(struct xgbe_prv_data *pdata);
+void xgbe_full_restart_dev(struct xgbe_prv_data *pdata);
 
 #ifdef CONFIG_DEBUG_FS
 void xgbe_debugfs_init(struct xgbe_prv_data *);

^ permalink raw reply related

* [PATCH net-next v2 09/12] amd-xgbe: Always attempt link training in KR mode
From: Tom Lendacky @ 2018-05-23 16:39 UTC (permalink / raw)
  To: netdev; +Cc: David Miller
In-Reply-To: <20180523163802.31625.76572.stgit@tlendack-t1.amdoffice.net>

Link training is always attempted when in KR mode, but the code is
structured to check if link training has been enabled before attempting
to perform it.  Since that check will always be true, simplify the code
to always enable and start link training during KR auto-negotiation.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c |   69 +++++++----------------------
 1 file changed, 16 insertions(+), 53 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index 9c39c72..450b89c 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -216,31 +216,8 @@ static void xgbe_an_clear_interrupts_all(struct xgbe_prv_data *pdata)
 	xgbe_an37_clear_interrupts(pdata);
 }
 
-static void xgbe_an73_enable_kr_training(struct xgbe_prv_data *pdata)
-{
-	unsigned int reg;
-
-	reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
-
-	reg |= XGBE_KR_TRAINING_ENABLE;
-	XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
-}
-
-static void xgbe_an73_disable_kr_training(struct xgbe_prv_data *pdata)
-{
-	unsigned int reg;
-
-	reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
-
-	reg &= ~XGBE_KR_TRAINING_ENABLE;
-	XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
-}
-
 static void xgbe_kr_mode(struct xgbe_prv_data *pdata)
 {
-	/* Enable KR training */
-	xgbe_an73_enable_kr_training(pdata);
-
 	/* Set MAC to 10G speed */
 	pdata->hw_if.set_speed(pdata, SPEED_10000);
 
@@ -250,9 +227,6 @@ static void xgbe_kr_mode(struct xgbe_prv_data *pdata)
 
 static void xgbe_kx_2500_mode(struct xgbe_prv_data *pdata)
 {
-	/* Disable KR training */
-	xgbe_an73_disable_kr_training(pdata);
-
 	/* Set MAC to 2.5G speed */
 	pdata->hw_if.set_speed(pdata, SPEED_2500);
 
@@ -262,9 +236,6 @@ static void xgbe_kx_2500_mode(struct xgbe_prv_data *pdata)
 
 static void xgbe_kx_1000_mode(struct xgbe_prv_data *pdata)
 {
-	/* Disable KR training */
-	xgbe_an73_disable_kr_training(pdata);
-
 	/* Set MAC to 1G speed */
 	pdata->hw_if.set_speed(pdata, SPEED_1000);
 
@@ -278,9 +249,6 @@ static void xgbe_sfi_mode(struct xgbe_prv_data *pdata)
 	if (pdata->kr_redrv)
 		return xgbe_kr_mode(pdata);
 
-	/* Disable KR training */
-	xgbe_an73_disable_kr_training(pdata);
-
 	/* Set MAC to 10G speed */
 	pdata->hw_if.set_speed(pdata, SPEED_10000);
 
@@ -290,9 +258,6 @@ static void xgbe_sfi_mode(struct xgbe_prv_data *pdata)
 
 static void xgbe_x_mode(struct xgbe_prv_data *pdata)
 {
-	/* Disable KR training */
-	xgbe_an73_disable_kr_training(pdata);
-
 	/* Set MAC to 1G speed */
 	pdata->hw_if.set_speed(pdata, SPEED_1000);
 
@@ -302,9 +267,6 @@ static void xgbe_x_mode(struct xgbe_prv_data *pdata)
 
 static void xgbe_sgmii_1000_mode(struct xgbe_prv_data *pdata)
 {
-	/* Disable KR training */
-	xgbe_an73_disable_kr_training(pdata);
-
 	/* Set MAC to 1G speed */
 	pdata->hw_if.set_speed(pdata, SPEED_1000);
 
@@ -314,9 +276,6 @@ static void xgbe_sgmii_1000_mode(struct xgbe_prv_data *pdata)
 
 static void xgbe_sgmii_100_mode(struct xgbe_prv_data *pdata)
 {
-	/* Disable KR training */
-	xgbe_an73_disable_kr_training(pdata);
-
 	/* Set MAC to 1G speed */
 	pdata->hw_if.set_speed(pdata, SPEED_1000);
 
@@ -425,6 +384,12 @@ static void xgbe_an73_set(struct xgbe_prv_data *pdata, bool enable,
 {
 	unsigned int reg;
 
+	/* Disable KR training for now */
+	reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
+	reg &= ~XGBE_KR_TRAINING_ENABLE;
+	XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
+
+	/* Update AN settings */
 	reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_CTRL1);
 	reg &= ~MDIO_AN_CTRL1_ENABLE;
 
@@ -522,21 +487,19 @@ static enum xgbe_an xgbe_an73_tx_training(struct xgbe_prv_data *pdata,
 	XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_FECCTRL, reg);
 
 	/* Start KR training */
-	reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
-	if (reg & XGBE_KR_TRAINING_ENABLE) {
-		if (pdata->phy_if.phy_impl.kr_training_pre)
-			pdata->phy_if.phy_impl.kr_training_pre(pdata);
+	if (pdata->phy_if.phy_impl.kr_training_pre)
+		pdata->phy_if.phy_impl.kr_training_pre(pdata);
 
-		reg |= XGBE_KR_TRAINING_START;
-		XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL,
-			    reg);
+	reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
+	reg |= XGBE_KR_TRAINING_ENABLE;
+	reg |= XGBE_KR_TRAINING_START;
+	XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
 
-		netif_dbg(pdata, link, pdata->netdev,
-			  "KR training initiated\n");
+	netif_dbg(pdata, link, pdata->netdev,
+		  "KR training initiated\n");
 
-		if (pdata->phy_if.phy_impl.kr_training_post)
-			pdata->phy_if.phy_impl.kr_training_post(pdata);
-	}
+	if (pdata->phy_if.phy_impl.kr_training_post)
+		pdata->phy_if.phy_impl.kr_training_post(pdata);
 
 	return XGBE_AN_PAGE_RECEIVED;
 }

^ permalink raw reply related

* [PATCH net-next v2 10/12] amd-xgbe: Advertise FEC support with the KR re-driver
From: Tom Lendacky @ 2018-05-23 16:39 UTC (permalink / raw)
  To: netdev; +Cc: David Miller
In-Reply-To: <20180523163802.31625.76572.stgit@tlendack-t1.amdoffice.net>

When a KR re-driver is present, indicate that FEC support is available
during auto-negotiation.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |    4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 141bb13..dd747f6 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -1720,6 +1720,10 @@ static void xgbe_phy_an_advertising(struct xgbe_prv_data *pdata,
 	XGBE_CLR_ADV(dlks, 1000baseKX_Full);
 	XGBE_CLR_ADV(dlks, 10000baseKR_Full);
 
+	/* Advertise FEC support is present */
+	if (pdata->fec_ability & MDIO_PMA_10GBR_FECABLE_ABLE)
+		XGBE_SET_ADV(dlks, 10000baseR_FEC);
+
 	switch (phy_data->port_mode) {
 	case XGBE_PORT_MODE_BACKPLANE:
 		XGBE_SET_ADV(dlks, 10000baseKR_Full);

^ permalink raw reply related

* [PATCH net-next v2 11/12] amd-xgbe: Update the BelFuse quirk to support SGMII
From: Tom Lendacky @ 2018-05-23 16:39 UTC (permalink / raw)
  To: netdev; +Cc: David Miller
In-Reply-To: <20180523163802.31625.76572.stgit@tlendack-t1.amdoffice.net>

Instead of using a quirk to make the BelFuse 1GBT-SFP06 part look like
a 1000baseX part, program the SFP PHY to support SGMII and 10/100/1000
baseT.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |  109 +++++++++++++++++++--------
 1 file changed, 75 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index dd747f6..194a569 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -860,6 +860,9 @@ static bool xgbe_phy_finisar_phy_quirks(struct xgbe_prv_data *pdata)
 	struct xgbe_phy_data *phy_data = pdata->phy_data;
 	unsigned int phy_id = phy_data->phydev->phy_id;
 
+	if (phy_data->port_mode != XGBE_PORT_MODE_SFP)
+		return false;
+
 	if ((phy_id & 0xfffffff0) != 0x01ff0cc0)
 		return false;
 
@@ -885,8 +888,80 @@ static bool xgbe_phy_finisar_phy_quirks(struct xgbe_prv_data *pdata)
 	return true;
 }
 
+static bool xgbe_phy_belfuse_phy_quirks(struct xgbe_prv_data *pdata)
+{
+	struct xgbe_phy_data *phy_data = pdata->phy_data;
+	struct xgbe_sfp_eeprom *sfp_eeprom = &phy_data->sfp_eeprom;
+	unsigned int phy_id = phy_data->phydev->phy_id;
+	int reg;
+
+	if (phy_data->port_mode != XGBE_PORT_MODE_SFP)
+		return false;
+
+	if (memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_NAME],
+		   XGBE_BEL_FUSE_VENDOR, XGBE_SFP_BASE_VENDOR_NAME_LEN))
+		return false;
+
+	if (memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_PN],
+		   XGBE_BEL_FUSE_PARTNO, XGBE_SFP_BASE_VENDOR_PN_LEN))
+		return false;
+
+	if ((phy_id & 0xfffffff0) != 0x03625d10)
+		return false;
+
+	/* Disable RGMII mode */
+	phy_write(phy_data->phydev, 0x18, 0x7007);
+	reg = phy_read(phy_data->phydev, 0x18);
+	phy_write(phy_data->phydev, 0x18, reg & ~0x0080);
+
+	/* Enable fiber register bank */
+	phy_write(phy_data->phydev, 0x1c, 0x7c00);
+	reg = phy_read(phy_data->phydev, 0x1c);
+	reg &= 0x03ff;
+	reg &= ~0x0001;
+	phy_write(phy_data->phydev, 0x1c, 0x8000 | 0x7c00 | reg | 0x0001);
+
+	/* Power down SerDes */
+	reg = phy_read(phy_data->phydev, 0x00);
+	phy_write(phy_data->phydev, 0x00, reg | 0x00800);
+
+	/* Configure SGMII-to-Copper mode */
+	phy_write(phy_data->phydev, 0x1c, 0x7c00);
+	reg = phy_read(phy_data->phydev, 0x1c);
+	reg &= 0x03ff;
+	reg &= ~0x0006;
+	phy_write(phy_data->phydev, 0x1c, 0x8000 | 0x7c00 | reg | 0x0004);
+
+	/* Power up SerDes */
+	reg = phy_read(phy_data->phydev, 0x00);
+	phy_write(phy_data->phydev, 0x00, reg & ~0x00800);
+
+	/* Enable copper register bank */
+	phy_write(phy_data->phydev, 0x1c, 0x7c00);
+	reg = phy_read(phy_data->phydev, 0x1c);
+	reg &= 0x03ff;
+	reg &= ~0x0001;
+	phy_write(phy_data->phydev, 0x1c, 0x8000 | 0x7c00 | reg);
+
+	/* Power up SerDes */
+	reg = phy_read(phy_data->phydev, 0x00);
+	phy_write(phy_data->phydev, 0x00, reg & ~0x00800);
+
+	phy_data->phydev->supported = PHY_GBIT_FEATURES;
+	phy_data->phydev->supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
+	phy_data->phydev->advertising = phy_data->phydev->supported;
+
+	netif_dbg(pdata, drv, pdata->netdev,
+		  "BelFuse PHY quirk in place\n");
+
+	return true;
+}
+
 static void xgbe_phy_external_phy_quirks(struct xgbe_prv_data *pdata)
 {
+	if (xgbe_phy_belfuse_phy_quirks(pdata))
+		return;
+
 	if (xgbe_phy_finisar_phy_quirks(pdata))
 		return;
 }
@@ -1027,37 +1102,6 @@ static bool xgbe_phy_check_sfp_mod_absent(struct xgbe_phy_data *phy_data)
 	return false;
 }
 
-static bool xgbe_phy_belfuse_parse_quirks(struct xgbe_prv_data *pdata)
-{
-	struct xgbe_phy_data *phy_data = pdata->phy_data;
-	struct xgbe_sfp_eeprom *sfp_eeprom = &phy_data->sfp_eeprom;
-
-	if (memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_NAME],
-		   XGBE_BEL_FUSE_VENDOR, XGBE_SFP_BASE_VENDOR_NAME_LEN))
-		return false;
-
-	if (!memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_PN],
-		    XGBE_BEL_FUSE_PARTNO, XGBE_SFP_BASE_VENDOR_PN_LEN)) {
-		phy_data->sfp_base = XGBE_SFP_BASE_1000_SX;
-		phy_data->sfp_cable = XGBE_SFP_CABLE_ACTIVE;
-		phy_data->sfp_speed = XGBE_SFP_SPEED_1000;
-		if (phy_data->sfp_changed)
-			netif_dbg(pdata, drv, pdata->netdev,
-				  "Bel-Fuse SFP quirk in place\n");
-		return true;
-	}
-
-	return false;
-}
-
-static bool xgbe_phy_sfp_parse_quirks(struct xgbe_prv_data *pdata)
-{
-	if (xgbe_phy_belfuse_parse_quirks(pdata))
-		return true;
-
-	return false;
-}
-
 static void xgbe_phy_sfp_parse_eeprom(struct xgbe_prv_data *pdata)
 {
 	struct xgbe_phy_data *phy_data = pdata->phy_data;
@@ -1076,9 +1120,6 @@ static void xgbe_phy_sfp_parse_eeprom(struct xgbe_prv_data *pdata)
 	phy_data->sfp_tx_fault = xgbe_phy_check_sfp_tx_fault(phy_data);
 	phy_data->sfp_rx_los = xgbe_phy_check_sfp_rx_los(phy_data);
 
-	if (xgbe_phy_sfp_parse_quirks(pdata))
-		return;
-
 	/* Assume ACTIVE cable unless told it is PASSIVE */
 	if (sfp_base[XGBE_SFP_BASE_CABLE] & XGBE_SFP_BASE_CABLE_PASSIVE) {
 		phy_data->sfp_cable = XGBE_SFP_CABLE_PASSIVE;

^ permalink raw reply related

* [PATCH net-next v2 12/12] amd-xgbe: Improve SFP 100Mbps auto-negotiation
From: Tom Lendacky @ 2018-05-23 16:39 UTC (permalink / raw)
  To: netdev; +Cc: David Miller
In-Reply-To: <20180523163802.31625.76572.stgit@tlendack-t1.amdoffice.net>

After changing speed to 100Mbps as a result of auto-negotiation (AN),
some 10/100/1000Mbps SFPs indicate a successful link (no faults or loss
of signal), but cannot successfully transmit or receive data.  These
SFPs require an extra AN after the speed change in
order to operate properly.  Add a quirk for these SFPs so that if the
outcome of the AN actually results in changing to a new speed, re-initiate
AN at that new speed.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c   |   77 +++++++++++++++------------
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |    6 ++
 drivers/net/ethernet/amd/xgbe/xgbe.h        |    1 
 3 files changed, 50 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index 450b89c..4b5d625 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -331,13 +331,15 @@ static void xgbe_switch_mode(struct xgbe_prv_data *pdata)
 	xgbe_change_mode(pdata, pdata->phy_if.phy_impl.switch_mode(pdata));
 }
 
-static void xgbe_set_mode(struct xgbe_prv_data *pdata,
+static bool xgbe_set_mode(struct xgbe_prv_data *pdata,
 			  enum xgbe_mode mode)
 {
 	if (mode == xgbe_cur_mode(pdata))
-		return;
+		return false;
 
 	xgbe_change_mode(pdata, mode);
+
+	return true;
 }
 
 static bool xgbe_use_mode(struct xgbe_prv_data *pdata,
@@ -1178,21 +1180,23 @@ static int xgbe_phy_config_fixed(struct xgbe_prv_data *pdata)
 	return 0;
 }
 
-static int __xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
+static int __xgbe_phy_config_aneg(struct xgbe_prv_data *pdata, bool set_mode)
 {
 	int ret;
 
+	mutex_lock(&pdata->an_mutex);
+
 	set_bit(XGBE_LINK_INIT, &pdata->dev_state);
 	pdata->link_check = jiffies;
 
 	ret = pdata->phy_if.phy_impl.an_config(pdata);
 	if (ret)
-		return ret;
+		goto out;
 
 	if (pdata->phy.autoneg != AUTONEG_ENABLE) {
 		ret = xgbe_phy_config_fixed(pdata);
 		if (ret || !pdata->kr_redrv)
-			return ret;
+			goto out;
 
 		netif_dbg(pdata, link, pdata->netdev, "AN redriver support\n");
 	} else {
@@ -1202,24 +1206,27 @@ static int __xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
 	/* Disable auto-negotiation interrupt */
 	disable_irq(pdata->an_irq);
 
-	/* Start auto-negotiation in a supported mode */
-	if (xgbe_use_mode(pdata, XGBE_MODE_KR)) {
-		xgbe_set_mode(pdata, XGBE_MODE_KR);
-	} else if (xgbe_use_mode(pdata, XGBE_MODE_KX_2500)) {
-		xgbe_set_mode(pdata, XGBE_MODE_KX_2500);
-	} else if (xgbe_use_mode(pdata, XGBE_MODE_KX_1000)) {
-		xgbe_set_mode(pdata, XGBE_MODE_KX_1000);
-	} else if (xgbe_use_mode(pdata, XGBE_MODE_SFI)) {
-		xgbe_set_mode(pdata, XGBE_MODE_SFI);
-	} else if (xgbe_use_mode(pdata, XGBE_MODE_X)) {
-		xgbe_set_mode(pdata, XGBE_MODE_X);
-	} else if (xgbe_use_mode(pdata, XGBE_MODE_SGMII_1000)) {
-		xgbe_set_mode(pdata, XGBE_MODE_SGMII_1000);
-	} else if (xgbe_use_mode(pdata, XGBE_MODE_SGMII_100)) {
-		xgbe_set_mode(pdata, XGBE_MODE_SGMII_100);
-	} else {
-		enable_irq(pdata->an_irq);
-		return -EINVAL;
+	if (set_mode) {
+		/* Start auto-negotiation in a supported mode */
+		if (xgbe_use_mode(pdata, XGBE_MODE_KR)) {
+			xgbe_set_mode(pdata, XGBE_MODE_KR);
+		} else if (xgbe_use_mode(pdata, XGBE_MODE_KX_2500)) {
+			xgbe_set_mode(pdata, XGBE_MODE_KX_2500);
+		} else if (xgbe_use_mode(pdata, XGBE_MODE_KX_1000)) {
+			xgbe_set_mode(pdata, XGBE_MODE_KX_1000);
+		} else if (xgbe_use_mode(pdata, XGBE_MODE_SFI)) {
+			xgbe_set_mode(pdata, XGBE_MODE_SFI);
+		} else if (xgbe_use_mode(pdata, XGBE_MODE_X)) {
+			xgbe_set_mode(pdata, XGBE_MODE_X);
+		} else if (xgbe_use_mode(pdata, XGBE_MODE_SGMII_1000)) {
+			xgbe_set_mode(pdata, XGBE_MODE_SGMII_1000);
+		} else if (xgbe_use_mode(pdata, XGBE_MODE_SGMII_100)) {
+			xgbe_set_mode(pdata, XGBE_MODE_SGMII_100);
+		} else {
+			enable_irq(pdata->an_irq);
+			ret = -EINVAL;
+			goto out;
+		}
 	}
 
 	/* Disable and stop any in progress auto-negotiation */
@@ -1239,16 +1246,7 @@ static int __xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
 	xgbe_an_init(pdata);
 	xgbe_an_restart(pdata);
 
-	return 0;
-}
-
-static int xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
-{
-	int ret;
-
-	mutex_lock(&pdata->an_mutex);
-
-	ret = __xgbe_phy_config_aneg(pdata);
+out:
 	if (ret)
 		set_bit(XGBE_LINK_ERR, &pdata->dev_state);
 	else
@@ -1259,6 +1257,16 @@ static int xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
 	return ret;
 }
 
+static int xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
+{
+	return __xgbe_phy_config_aneg(pdata, true);
+}
+
+static int xgbe_phy_reconfig_aneg(struct xgbe_prv_data *pdata)
+{
+	return __xgbe_phy_config_aneg(pdata, false);
+}
+
 static bool xgbe_phy_aneg_done(struct xgbe_prv_data *pdata)
 {
 	return (pdata->an_result == XGBE_AN_COMPLETE);
@@ -1315,7 +1323,8 @@ static void xgbe_phy_status_result(struct xgbe_prv_data *pdata)
 
 	pdata->phy.duplex = DUPLEX_FULL;
 
-	xgbe_set_mode(pdata, mode);
+	if (xgbe_set_mode(pdata, mode) && pdata->an_again)
+		xgbe_phy_reconfig_aneg(pdata);
 }
 
 static void xgbe_phy_status(struct xgbe_prv_data *pdata)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 194a569..3ceb4f9 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -902,6 +902,9 @@ static bool xgbe_phy_belfuse_phy_quirks(struct xgbe_prv_data *pdata)
 		   XGBE_BEL_FUSE_VENDOR, XGBE_SFP_BASE_VENDOR_NAME_LEN))
 		return false;
 
+	/* For Bel-Fuse, use the extra AN flag */
+	pdata->an_again = 1;
+
 	if (memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_PN],
 		   XGBE_BEL_FUSE_PARTNO, XGBE_SFP_BASE_VENDOR_PN_LEN))
 		return false;
@@ -978,6 +981,9 @@ static int xgbe_phy_find_phy_device(struct xgbe_prv_data *pdata)
 	if (phy_data->phydev)
 		return 0;
 
+	/* Clear the extra AN flag */
+	pdata->an_again = 0;
+
 	/* Check for the use of an external PHY */
 	if (phy_data->phydev_mode == XGBE_MDIO_MODE_NONE)
 		return 0;
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h
index 7a412cf..47bcbcf 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe.h
@@ -1261,6 +1261,7 @@ struct xgbe_prv_data {
 	enum xgbe_rx kr_state;
 	enum xgbe_rx kx_state;
 	struct work_struct an_work;
+	unsigned int an_again;
 	unsigned int an_supported;
 	unsigned int parallel_detect;
 	unsigned int fec_ability;

^ permalink raw reply related

* [PATCH net-next] selftests: net: Test headroom handling of ip6_gre devices
From: Petr Machata @ 2018-05-23 16:41 UTC (permalink / raw)
  To: netdev, linux-kselftest; +Cc: davem, shuah

Commit 5691484df961 ("net: ip6_gre: Fix headroom request in
ip6erspan_tunnel_xmit()") and commit 01b8d064d58b ("net: ip6_gre:
Request headroom in __gre6_xmit()") fix problems in reserving headroom
in the packets tunneled through ip6gre/tap and ip6erspan netdevices.

These two patches included snippets that reproduced the issues. This
patch elevates the snippets to a full-fledged test case.

Suggested-by: David Miller <davem@davemloft.net>
Signed-off-by: Petr Machata <petrm@mellanox.com>
---
 tools/testing/selftests/net/ip6_gre_headroom.sh | 59 +++++++++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100755 tools/testing/selftests/net/ip6_gre_headroom.sh

diff --git a/tools/testing/selftests/net/ip6_gre_headroom.sh b/tools/testing/selftests/net/ip6_gre_headroom.sh
new file mode 100755
index 0000000..9aaf63fd
--- /dev/null
+++ b/tools/testing/selftests/net/ip6_gre_headroom.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test that enough headroom is reserved for the first packet passing through an
+# IPv6 GRE-like netdevice.
+
+setup_prepare()
+{
+	ip link add h1 type veth peer name swp1
+	ip link add h3 type veth peer name swp3
+
+	ip link set dev h1 up
+	ip address add 192.0.2.1/28 dev h1
+
+	ip link add dev vh3 type vrf table 20
+	ip link set dev h3 master vh3
+	ip link set dev vh3 up
+	ip link set dev h3 up
+
+	ip link set dev swp3 up
+	ip address add dev swp3 2001:db8:2::1/64
+
+	ip link set dev swp1 up
+	tc qdisc add dev swp1 clsact
+}
+
+cleanup()
+{
+	ip link del dev swp1
+	ip link del dev swp3
+	ip link del dev vh3
+}
+
+test_headroom()
+{
+	ip link add name gt6 "$@"
+	ip link set dev gt6 up
+
+	sleep 1
+
+	tc filter add dev swp1 ingress pref 1000 matchall skip_hw \
+		action mirred egress mirror dev gt6
+	ping -I h1 192.0.2.2 -c 1 -w 2 &> /dev/null
+	tc filter del dev swp1 ingress pref 1000
+
+	ip link del dev gt6
+
+	# If it doesn't panic, it passes.
+	printf "TEST: %-60s  [PASS]\n" "$2 headroom"
+}
+
+trap cleanup EXIT
+
+setup_prepare
+
+test_headroom type ip6erspan \
+	      local 2001:db8:2::1 remote 2001:db8:2::2 oseq okey 123
+test_headroom type ip6gretap \
+	      local 2001:db8:2::1 remote 2001:db8:2::2
-- 
2.4.11

^ permalink raw reply related

* Re: [PATCH 2/4] arcnet: com20020: bindings for smsc com20020
From: Rob Herring @ 2018-05-23 16:49 UTC (permalink / raw)
  To: Andrea Greco
  Cc: tobin, Andrea Greco, Mark Rutland, netdev, devicetree,
	linux-kernel
In-Reply-To: <20180517130628.2770-1-andrea.greco.gapmilano@gmail.com>

On Thu, May 17, 2018 at 03:06:26PM +0200, Andrea Greco wrote:
> From: Andrea Greco <a.greco@4sigma.it>
> 
> Add devicetree bindings for smsc com20020
> 
> Signed-off-by: Andrea Greco <a.greco@4sigma.it>
> ---
>  .../devicetree/bindings/net/smsc-com20020.txt       | 21 +++++++++++++++++++++
>  1 file changed, 21 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/net/smsc-com20020.txt

One typo, otherwise:

Reviewed-by: Rob Herring <robh@kernel.org>

> 
> diff --git a/Documentation/devicetree/bindings/net/smsc-com20020.txt b/Documentation/devicetree/bindings/net/smsc-com20020.txt
> new file mode 100644
> index 000000000000..92360b054873
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/net/smsc-com20020.txt
> @@ -0,0 +1,21 @@
> +SMSC com20020 Arcnet network controller
> +
> +Required propelty:

property

> +- timeout-ns: Arcnet bus timeout, Idle Time (328000 - 20500)
> +- bus-speed-bps: Arcnet bus speed (10000000 - 156250)
> +- smsc,xtal-mhz: External oscillator frequency
> +- smsc,backplane-enabled: Controller use backplane mode
> +- reset-gpios: Chip reset pin
> +- interrupts: Should contain controller interrupt
> +
> +arcnet@28000000 {
> +    compatible = "smsc,com20020";
> +
> +	timeout-ns = <20500>;
> +	bus-speed-bps = <10000000>;
> +	smsc,xtal-mhz = <20>;
> +	smsc,backplane-enabled;
> +
> +	reset-gpios = <&gpio3 21 GPIO_ACTIVE_LOW>;
> +	interrupts = <&gpio2 10 GPIO_ACTIVE_LOW>;
> +};
> -- 
> 2.14.3
> 

^ permalink raw reply

* Re: [PATCH net] net: phy: broadcom: Fix bcm_write_exp()
From: Florian Fainelli @ 2018-05-23 16:57 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: netdev, arunp, David S. Miller, Ray Jui, Scott Branden, Jon Mason,
	maintainer:BROADCOM IPROC ARM ARCHITECTURE,
	moderated list:BROADCOM IPROC ARM ARCHITECTURE, open list
In-Reply-To: <89a1f8d7-5303-f3c0-aa38-43e64488ec5a@gmail.com>

On 05/22/2018 06:20 PM, Florian Fainelli wrote:
> Hi Andrew,
> 
> On 05/22/2018 05:15 PM, Andrew Lunn wrote:
>> On Tue, May 22, 2018 at 05:04:49PM -0700, Florian Fainelli wrote:
>>> On newer PHYs, we need to select the expansion register to write with
>>> setting bits [11:8] to 0xf. This was done correctly by bcm7xxx.c prior
>>> to being migrated to generic code under bcm-phy-lib.c which
>>> unfortunately used the older implementation from the BCM54xx days.
>>
>> Hi Florian
>>
>> Does selecting the expansion register affect access to the standard
>> registers? Does this need locking like the Marvell PHY has when
>> changing pages?
> 
> We should probably convert this to the page accessors since the
> expansion, misc and other shadow 0x1c accesses are all indirection
> layers to poke into a different address space of the PHY. That would be
> a separate fix though for a number of reasons.

I realize I did not quite answer your question. The answer to your
question AFAICT is no: setting the expansion register sequence and then
aborting mid-way is not a problem and does not impact the standard MII
registers because of how this is implemented. The registers are accessed
and latched through a specific indirect sequence, but there is no page
switching, unlike the Marvell PHYs.
-- 
Florian

^ permalink raw reply

* Re: [PATCH net-next 2/3] net/ipv6: Udate fib6_table_lookup tracepoint
From: David Miller @ 2018-05-23 17:09 UTC (permalink / raw)
  To: dsahern; +Cc: netdev, dsahern
In-Reply-To: <20180521212443.23612-3-dsahern@kernel.org>

From: dsahern@kernel.org
Date: Mon, 21 May 2018 14:24:42 -0700

> +		__entry->err = ip6_rt_type_to_error(f6i->fib6_type);

As the kbuild bot discovered, this doesn't work when IPV6=m.

^ permalink raw reply

* Re: [PATCH net-next 2/3] net/ipv6: Udate fib6_table_lookup tracepoint
From: David Ahern @ 2018-05-23 17:13 UTC (permalink / raw)
  To: David Miller, dsahern; +Cc: netdev
In-Reply-To: <20180523.130952.2276806407125412362.davem@davemloft.net>

On 5/23/18 11:09 AM, David Miller wrote:
> From: dsahern@kernel.org
> Date: Mon, 21 May 2018 14:24:42 -0700
> 
>> +		__entry->err = ip6_rt_type_to_error(f6i->fib6_type);
> 
> As the kbuild bot discovered, this doesn't work when IPV6=m.
> 

yep. I'll take a look later today. Thinking about moving the tracepoint
creation from net/core/net-traces.c to net/ipv6/route.c.

^ permalink raw reply

* Re: [PATCH net-next v4 0/2] openvswitch: Support conntrack zone limit
From: David Miller @ 2018-05-23 17:13 UTC (permalink / raw)
  To: yihung.wei; +Cc: netdev, pshelar
In-Reply-To: <1526948165-32443-1-git-send-email-yihung.wei@gmail.com>

From: Yi-Hung Wei <yihung.wei@gmail.com>
Date: Mon, 21 May 2018 17:16:03 -0700

> v3->v4:
>   - Addresses comments from Pravin, including simplifying the netlink API
>     and removing unnecessary RCU locking.
>   - Rebases to master.

Pravin, please review.

^ permalink raw reply

* Re: [PATCH bpf-next v3 2/7] bpf: introduce bpf subcommand BPF_TASK_FD_QUERY
From: Martin KaFai Lau @ 2018-05-23 17:13 UTC (permalink / raw)
  To: Yonghong Song; +Cc: peterz, ast, daniel, netdev, kernel-team
In-Reply-To: <20180522163048.3128924-3-yhs@fb.com>

On Tue, May 22, 2018 at 09:30:46AM -0700, Yonghong Song wrote:
> Currently, suppose a userspace application has loaded a bpf program
> and attached it to a tracepoint/kprobe/uprobe, and a bpf
> introspection tool, e.g., bpftool, wants to show which bpf program
> is attached to which tracepoint/kprobe/uprobe. Such attachment
> information will be really useful to understand the overall bpf
> deployment in the system.
> 
> There is a name field (16 bytes) for each program, which could
> be used to encode the attachment point. There are some drawbacks
> to this approach. First, a bpftool user (e.g., an admin) may not
> really understand the association between the name and the
> attachment point. Second, if one program is attached to multiple
> places, encoding a proper name which can imply all these
> attachments becomes difficult.
> 
> This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
> Given a pid and fd, if the <pid, fd> is associated with a
> tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
>    . prog_id
>    . tracepoint name, or
>    . k[ret]probe funcname + offset or kernel addr, or
>    . u[ret]probe filename + offset
> to the userspace.
> The user can use "bpftool prog" to find more information about
> bpf program itself with prog_id.
LGTM, some comments inline.

> 
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/linux/trace_events.h |  16 ++++++
>  include/uapi/linux/bpf.h     |  27 ++++++++++
>  kernel/bpf/syscall.c         | 124 +++++++++++++++++++++++++++++++++++++++++++
>  kernel/trace/bpf_trace.c     |  48 +++++++++++++++++
>  kernel/trace/trace_kprobe.c  |  29 ++++++++++
>  kernel/trace/trace_uprobe.c  |  22 ++++++++
>  6 files changed, 266 insertions(+)
> 
> diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
> index 2bde3ef..eab806d 100644
> --- a/include/linux/trace_events.h
> +++ b/include/linux/trace_events.h
> @@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info);
>  int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
>  int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
>  struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
> +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
> +			    u32 *attach_info, const char **buf,
> +			    u64 *probe_offset, u64 *probe_addr);
The first arg is 'const struct perf_event *event' while...

>  #else
>  static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
>  {
> @@ -504,6 +507,12 @@ static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name
>  {
>  	return NULL;
>  }
> +static inline int bpf_get_perf_event_info(const struct file *file, u32 *prog_id,
this one has 'const struct file *file'?

> +					  u32 *attach_info, const char **buf,
> +					  u64 *probe_offset, u64 *probe_addr)
> +{
> +	return -EOPNOTSUPP;
> +}
>  #endif
>  
>  enum {
> @@ -560,10 +569,17 @@ extern void perf_trace_del(struct perf_event *event, int flags);
>  #ifdef CONFIG_KPROBE_EVENTS
>  extern int  perf_kprobe_init(struct perf_event *event, bool is_retprobe);
>  extern void perf_kprobe_destroy(struct perf_event *event);
> +extern int bpf_get_kprobe_info(const struct perf_event *event,
> +			       u32 *attach_info, const char **symbol,
> +			       u64 *probe_offset, u64 *probe_addr,
> +			       bool perf_type_tracepoint);
>  #endif
>  #ifdef CONFIG_UPROBE_EVENTS
>  extern int  perf_uprobe_init(struct perf_event *event, bool is_retprobe);
>  extern void perf_uprobe_destroy(struct perf_event *event);
> +extern int bpf_get_uprobe_info(const struct perf_event *event,
> +			       u32 *attach_info, const char **filename,
> +			       u64 *probe_offset, bool perf_type_tracepoint);
>  #endif
>  extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
>  				     char *filter_str);
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 97446bb..a602150 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -97,6 +97,7 @@ enum bpf_cmd {
>  	BPF_RAW_TRACEPOINT_OPEN,
>  	BPF_BTF_LOAD,
>  	BPF_BTF_GET_FD_BY_ID,
> +	BPF_TASK_FD_QUERY,
>  };
>  
>  enum bpf_map_type {
> @@ -379,6 +380,22 @@ union bpf_attr {
>  		__u32		btf_log_size;
>  		__u32		btf_log_level;
>  	};
> +
> +	struct {
> +		int		pid;		/* input: pid */
> +		int		fd;		/* input: fd */
Should fd and pid be always positive?
The current fd (like map_fd) in bpf_attr is using __u32.

> +		__u32		flags;		/* input: flags */
> +		__u32		buf_len;	/* input: buf len */
> +		__aligned_u64	buf;		/* input/output:
> +						 *   tp_name for tracepoint
> +						 *   symbol for kprobe
> +						 *   filename for uprobe
> +						 */
> +		__u32		prog_id;	/* output: prod_id */
> +		__u32		attach_info;	/* output: BPF_ATTACH_* */
> +		__u64		probe_offset;	/* output: probe_offset */
> +		__u64		probe_addr;	/* output: probe_addr */
> +	} task_fd_query;
>  } __attribute__((aligned(8)));
>  
>  /* The description below is an attempt at providing documentation to eBPF
> @@ -2458,4 +2475,14 @@ struct bpf_fib_lookup {
>  	__u8	dmac[6];     /* ETH_ALEN */
>  };
>  
> +/* used by <task, fd> based query */
> +enum {
Nit. Instead of a comment, is it better to give this
enum a descriptive name?

> +	BPF_ATTACH_RAW_TRACEPOINT,	/* tp name */
> +	BPF_ATTACH_TRACEPOINT,		/* tp name */
> +	BPF_ATTACH_KPROBE,		/* (symbol + offset) or addr */
> +	BPF_ATTACH_KRETPROBE,		/* (symbol + offset) or addr */
> +	BPF_ATTACH_UPROBE,		/* filename + offset */
> +	BPF_ATTACH_URETPROBE,		/* filename + offset */
> +};
> +
>  #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index bfcde94..9356f0e 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -18,7 +18,9 @@
>  #include <linux/vmalloc.h>
>  #include <linux/mmzone.h>
>  #include <linux/anon_inodes.h>
> +#include <linux/fdtable.h>
>  #include <linux/file.h>
> +#include <linux/fs.h>
>  #include <linux/license.h>
>  #include <linux/filter.h>
>  #include <linux/version.h>
> @@ -2102,6 +2104,125 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
>  	return btf_get_fd_by_id(attr->btf_id);
>  }
>  
> +static int bpf_task_fd_query_copy(const union bpf_attr *attr,
> +				    union bpf_attr __user *uattr,
> +				    u32 prog_id, u32 attach_info,
> +				    const char *buf, u64 probe_offset,
> +				    u64 probe_addr)
> +{
> +	__u64 __user *ubuf;
Nit. ubuf is a string instead of an array of __u64?

> +	int len;
> +
> +	ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
> +	if (buf) {
> +		len = strlen(buf);
> +		if (attr->task_fd_query.buf_len < len + 1)
I think the current convention is to take the min,
copy whatever it can to buf and return the real
len/size in buf_len.  F.e., the prog_ids and prog_cnt in
__cgroup_bpf_query().

Should the same be done here, or does it not make sense to
truncate the string?  The user may or may not need the trailing
chars anyway if its pretty print has a limited width.
The user also still needs to know what the buf_len should be in
order to retry, but I guess any reasonable buf_len should
work?

> +			return -ENOSPC;
> +		if (copy_to_user(ubuf, buf, len + 1))
> +			return -EFAULT;
> +	} else if (attr->task_fd_query.buf_len) {
> +		/* copy '\0' to ubuf */
> +		__u8 zero = 0;
> +
> +		if (copy_to_user(ubuf, &zero, 1))
> +			return -EFAULT;
> +	}
> +
> +	if (copy_to_user(&uattr->task_fd_query.prog_id, &prog_id,
> +			 sizeof(prog_id)) ||
> +	    copy_to_user(&uattr->task_fd_query.attach_info, &attach_info,
> +			 sizeof(attach_info)) ||
> +	    copy_to_user(&uattr->task_fd_query.probe_offset, &probe_offset,
> +			 sizeof(probe_offset)) ||
> +	    copy_to_user(&uattr->task_fd_query.probe_addr, &probe_addr,
> +			 sizeof(probe_addr)))
Nit. put_user() may be able to shorten them.

> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
> +
> +static int bpf_task_fd_query(const union bpf_attr *attr,
> +			     union bpf_attr __user *uattr)
> +{
> +	pid_t pid = attr->task_fd_query.pid;
> +	int fd = attr->task_fd_query.fd;
> +	const struct perf_event *event;
> +	struct files_struct *files;
> +	struct task_struct *task;
> +	struct file *file;
> +	int err;
> +
> +	if (CHECK_ATTR(BPF_TASK_FD_QUERY))
> +		return -EINVAL;
> +
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EPERM;
> +
> +	if (attr->task_fd_query.flags != 0)
How is flags used?

> +		return -EINVAL;
> +
> +	task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
> +	if (!task)
> +		return -ENOENT;
> +
> +	files = get_files_struct(task);
> +	put_task_struct(task);
> +	if (!files)
> +		return -ENOENT;
> +
> +	err = 0;
> +	spin_lock(&files->file_lock);
> +	file = fcheck_files(files, fd);
> +	if (!file)
> +		err = -EBADF;
> +	else
> +		get_file(file);
> +	spin_unlock(&files->file_lock);
> +	put_files_struct(files);
> +
> +	if (err)
> +		goto out;
> +
> +	if (file->f_op == &bpf_raw_tp_fops) {
> +		struct bpf_raw_tracepoint *raw_tp = file->private_data;
> +		struct bpf_raw_event_map *btp = raw_tp->btp;
> +
> +		if (!raw_tp->prog)
> +			err = -ENOENT;
> +		else
> +			err = bpf_task_fd_query_copy(attr, uattr,
> +						     raw_tp->prog->aux->id,
> +						     BPF_ATTACH_RAW_TRACEPOINT,
> +						     btp->tp->name, 0, 0);
> +		goto put_file;
> +	}
> +
> +	event = perf_get_event(file);
> +	if (!IS_ERR(event)) {
> +		u64 probe_offset, probe_addr;
> +		u32 prog_id, attach_info;
> +		const char *buf;
> +
> +		err = bpf_get_perf_event_info(event, &prog_id, &attach_info,
> +					      &buf, &probe_offset,
> +					      &probe_addr);
> +		if (!err)
> +			err = bpf_task_fd_query_copy(attr, uattr, prog_id,
> +						     attach_info, buf,
> +						     probe_offset,
> +						     probe_addr);
> +		goto put_file;
> +	}
> +
> +	err = -ENOTSUPP;
> +put_file:
> +	fput(file);
> +out:
> +	return err;
> +}
> +
>  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
>  {
>  	union bpf_attr attr = {};
> @@ -2188,6 +2309,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
>  	case BPF_BTF_GET_FD_BY_ID:
>  		err = bpf_btf_get_fd_by_id(&attr);
>  		break;
> +	case BPF_TASK_FD_QUERY:
> +		err = bpf_task_fd_query(&attr, uattr);
> +		break;
>  	default:
>  		err = -EINVAL;
>  		break;
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index ce2cbbf..323c80e 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -14,6 +14,7 @@
>  #include <linux/uaccess.h>
>  #include <linux/ctype.h>
>  #include <linux/kprobes.h>
> +#include <linux/syscalls.h>
>  #include <linux/error-injection.h>
>  
>  #include "trace_probe.h"
> @@ -1163,3 +1164,50 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
>  	mutex_unlock(&bpf_event_mutex);
>  	return err;
>  }
> +
> +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
> +			    u32 *attach_info, const char **buf,
> +			    u64 *probe_offset, u64 *probe_addr)
> +{
> +	bool is_tracepoint, is_syscall_tp;
> +	struct bpf_prog *prog;
> +	int flags, err = 0;
> +
> +	prog = event->prog;
> +	if (!prog)
> +		return -ENOENT;
> +
> +	/* not supporting BPF_PROG_TYPE_PERF_EVENT yet */
> +	if (prog->type == BPF_PROG_TYPE_PERF_EVENT)
> +		return -EOPNOTSUPP;
> +
> +	*prog_id = prog->aux->id;
> +	flags = event->tp_event->flags;
> +	is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
> +	is_syscall_tp = is_syscall_trace_event(event->tp_event);
> +
> +	if (is_tracepoint || is_syscall_tp) {
> +		*buf = is_tracepoint ? event->tp_event->tp->name
> +				     : event->tp_event->name;
> +		*attach_info = BPF_ATTACH_TRACEPOINT;
> +		*probe_offset = 0x0;
> +		*probe_addr = 0x0;
> +	} else {
> +		/* kprobe/uprobe */
> +		err = -EOPNOTSUPP;
> +#ifdef CONFIG_KPROBE_EVENTS
> +		if (flags & TRACE_EVENT_FL_KPROBE)
> +			err = bpf_get_kprobe_info(event, attach_info, buf,
> +						  probe_offset, probe_addr,
> +						  event->attr.type == PERF_TYPE_TRACEPOINT);
> +#endif
> +#ifdef CONFIG_UPROBE_EVENTS
> +		if (flags & TRACE_EVENT_FL_UPROBE)
> +			err = bpf_get_uprobe_info(event, attach_info, buf,
> +						  probe_offset,
> +						  event->attr.type == PERF_TYPE_TRACEPOINT);
> +#endif
> +	}
> +
> +	return err;
> +}
> diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> index 02aed76..32e9190 100644
> --- a/kernel/trace/trace_kprobe.c
> +++ b/kernel/trace/trace_kprobe.c
> @@ -1287,6 +1287,35 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
>  			      head, NULL);
>  }
>  NOKPROBE_SYMBOL(kretprobe_perf_func);
> +
> +int bpf_get_kprobe_info(const struct perf_event *event, u32 *attach_info,
> +			const char **symbol, u64 *probe_offset,
> +			u64 *probe_addr, bool perf_type_tracepoint)
> +{
> +	const char *pevent = trace_event_name(event->tp_event);
> +	const char *group = event->tp_event->class->system;
> +	struct trace_kprobe *tk;
> +
> +	if (perf_type_tracepoint)
> +		tk = find_trace_kprobe(pevent, group);
> +	else
> +		tk = event->tp_event->data;
> +	if (!tk)
> +		return -EINVAL;
> +
> +	*attach_info = trace_kprobe_is_return(tk) ? BPF_ATTACH_KRETPROBE
> +						  : BPF_ATTACH_KPROBE;
> +	if (tk->symbol) {
> +		*symbol = tk->symbol;
> +		*probe_offset = tk->rp.kp.offset;
> +		*probe_addr = 0;
> +	} else {
> +		*symbol = NULL;
> +		*probe_offset = 0;
> +		*probe_addr = (unsigned long)tk->rp.kp.addr;
> +	}
> +	return 0;
> +}
>  #endif	/* CONFIG_PERF_EVENTS */
>  
>  /*
> diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
> index ac89287..12a3667 100644
> --- a/kernel/trace/trace_uprobe.c
> +++ b/kernel/trace/trace_uprobe.c
> @@ -1161,6 +1161,28 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
>  {
>  	__uprobe_perf_func(tu, func, regs, ucb, dsize);
>  }
> +
> +int bpf_get_uprobe_info(const struct perf_event *event, u32 *attach_info,
> +			const char **filename, u64 *probe_offset,
> +			bool perf_type_tracepoint)
> +{
> +	const char *pevent = trace_event_name(event->tp_event);
> +	const char *group = event->tp_event->class->system;
> +	struct trace_uprobe *tu;
> +
> +	if (perf_type_tracepoint)
> +		tu = find_probe_event(pevent, group);
> +	else
> +		tu = event->tp_event->data;
> +	if (!tu)
> +		return -EINVAL;
> +
> +	*attach_info = is_ret_probe(tu) ? BPF_ATTACH_URETPROBE
> +					: BPF_ATTACH_UPROBE;
> +	*filename = tu->filename;
> +	*probe_offset = tu->offset;
> +	return 0;
> +}
>  #endif	/* CONFIG_PERF_EVENTS */
>  
>  static int
> -- 
> 2.9.5
> 

^ permalink raw reply

* Re: [PATCH bpf-next v3 4/7] tools/bpf: add ksym_get_addr() in trace_helpers
From: Martin KaFai Lau @ 2018-05-23 17:16 UTC (permalink / raw)
  To: Yonghong Song; +Cc: peterz, ast, daniel, netdev, kernel-team
In-Reply-To: <20180522163048.3128924-5-yhs@fb.com>

On Tue, May 22, 2018 at 09:30:48AM -0700, Yonghong Song wrote:
> Given a kernel function name, ksym_get_addr() will return the kernel
> address for this function, or 0 if it cannot find this function name
> in /proc/kallsyms. This function will be used later when a kernel
> address is used to initiate a kprobe perf event.
> 
> Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>

^ permalink raw reply

* Re: [PATCH v3 net-next 0/2] bpfilter
From: David Miller @ 2018-05-23 17:26 UTC (permalink / raw)
  To: ast
  Cc: daniel, torvalds, gregkh, luto, mcgrof, keescook, netdev,
	linux-kernel, kernel-team
In-Reply-To: <20180522022230.2492505-1-ast@kernel.org>

From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 21 May 2018 19:22:28 -0700

> v2->v3:
> - followed Luis's suggestion and significantly simplified the first patch
>   with shmem_kernel_file_setup+kernel_write. Added kdoc for new helper
> - fixed typos and race to access pipes with mutex
> - tested with bpfilter being 'builtin'. CONFIG_BPFILTER_UMH=y|m both work.
>   Interesting to see a usermode executable being embedded inside vmlinux.
> - it doesn't hurt to enable bpfilter in .config.
>   ip_setsockopt commands sent to usermode via pipes and -ENOPROTOOPT is
>   returned from userspace, so kernel falls back to original iptables code
> 
> v1->v2:
> this patch set is almost a full rewrite of the earlier umh modules approach
> The v1 of patches and follow up discussion was covered by LWN:
> https://lwn.net/Articles/749108/
> 
> I believe the v2 addresses all issues brought up by Andy and others.
> Mainly there are zero changes to kernel/module.c
> Instead of teaching module loading logic to recognize special
> umh module, let normal kernel modules execute part of its own
> .init.rodata as a new user space process (Andy's idea)
> Patch 1 introduces this new helper:
> int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
> Input:
>   data + len == executable file
> Output:
>   struct umh_info {
>        struct file *pipe_to_umh;
>        struct file *pipe_from_umh;
>        pid_t pid;
>   };

Series applied, let the madness begin... :-)

^ permalink raw reply

* Re: [PATCH v3 net-next 0/2] bpfilter
From: Greg KH @ 2018-05-23 17:33 UTC (permalink / raw)
  To: David Miller
  Cc: ast, daniel, torvalds, luto, mcgrof, keescook, netdev,
	linux-kernel, kernel-team
In-Reply-To: <20180523.132648.459690706167609338.davem@davemloft.net>

On Wed, May 23, 2018 at 01:26:48PM -0400, David Miller wrote:
> From: Alexei Starovoitov <ast@kernel.org>
> Date: Mon, 21 May 2018 19:22:28 -0700
> 
> > v2->v3:
> > - followed Luis's suggestion and significantly simplified the first patch
> >   with shmem_kernel_file_setup+kernel_write. Added kdoc for new helper
> > - fixed typos and race to access pipes with mutex
> > - tested with bpfilter being 'builtin'. CONFIG_BPFILTER_UMH=y|m both work.
> >   Interesting to see a usermode executable being embedded inside vmlinux.
> > - it doesn't hurt to enable bpfilter in .config.
> >   ip_setsockopt commands sent to usermode via pipes and -ENOPROTOOPT is
> >   returned from userspace, so kernel falls back to original iptables code
> > 
> > v1->v2:
> > this patch set is almost a full rewrite of the earlier umh modules approach
> > The v1 of patches and follow up discussion was covered by LWN:
> > https://lwn.net/Articles/749108/
> > 
> > I believe the v2 addresses all issues brought up by Andy and others.
> > Mainly there are zero changes to kernel/module.c
> > Instead of teaching module loading logic to recognize special
> > umh module, let normal kernel modules execute part of its own
> > .init.rodata as a new user space process (Andy's idea)
> > Patch 1 introduces this new helper:
> > int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
> > Input:
> >   data + len == executable file
> > Output:
> >   struct umh_info {
> >        struct file *pipe_to_umh;
> >        struct file *pipe_from_umh;
> >        pid_t pid;
> >   };
> 
> Series applied, let the madness begin... :-)

Yeah, this is going to be fun :)

^ permalink raw reply

* Re: [PATCH net V2 0/4] Fix several issues of virtio-net mergeable XDP
From: David Miller @ 2018-05-23 17:37 UTC (permalink / raw)
  To: jasowang; +Cc: mst, virtualization, netdev, linux-kernel
In-Reply-To: <1526960671-11782-1-git-send-email-jasowang@redhat.com>

From: Jason Wang <jasowang@redhat.com>
Date: Tue, 22 May 2018 11:44:27 +0800

> Please review the patches that try to fix several issues with
> virtio-net mergeable XDP.
> 
> Changes from V1:
> - check against 1 before decreasing instead of resetting to 1
> - typo fixes

Series applied and queued up for -stable.

^ permalink raw reply

* [PATCH bpf-next] bpf: btf: Avoid variable length array
From: Martin KaFai Lau @ 2018-05-23 17:46 UTC (permalink / raw)
  To: netdev; +Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

Sparse warning:
kernel/bpf/btf.c:1985:34: warning: Variable length array is used.

This patch moves the nr_secs from btf_check_sec_info() to a macro.

Fixes: f80442a4cd18 ("bpf: btf: Change how section is supported in btf_header")
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 kernel/bpf/btf.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 9cbeabb5aca3..517296712774 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1970,6 +1970,8 @@ static const size_t btf_sec_info_offset[] = {
 	offsetof(struct btf_header, str_off),
 };
 
+#define NR_SECS ARRAY_SIZE(btf_sec_info_offset)
+
 static int btf_sec_info_cmp(const void *a, const void *b)
 {
 	const struct btf_sec_info *x = a;
@@ -1981,8 +1983,7 @@ static int btf_sec_info_cmp(const void *a, const void *b)
 static int btf_check_sec_info(struct btf_verifier_env *env,
 			      u32 btf_data_size)
 {
-	const unsigned int nr_secs = ARRAY_SIZE(btf_sec_info_offset);
-	struct btf_sec_info secs[nr_secs];
+	struct btf_sec_info secs[NR_SECS];
 	u32 total, expected_total, i;
 	const struct btf_header *hdr;
 	const struct btf *btf;
@@ -1991,17 +1992,17 @@ static int btf_check_sec_info(struct btf_verifier_env *env,
 	hdr = &btf->hdr;
 
 	/* Populate the secs from hdr */
-	for (i = 0; i < nr_secs; i++)
+	for (i = 0; i < NR_SECS; i++)
 		secs[i] = *(struct btf_sec_info *)((void *)hdr +
 						   btf_sec_info_offset[i]);
 
-	sort(secs, nr_secs, sizeof(struct btf_sec_info),
+	sort(secs, NR_SECS, sizeof(struct btf_sec_info),
 	     btf_sec_info_cmp, NULL);
 
 	/* Check for gaps and overlap among sections */
 	total = 0;
 	expected_total = btf_data_size - hdr->hdr_len;
-	for (i = 0; i < nr_secs; i++) {
+	for (i = 0; i < NR_SECS; i++) {
 		if (expected_total < secs[i].off) {
 			btf_verifier_log(env, "Invalid section offset");
 			return -EINVAL;
-- 
2.9.5

^ permalink raw reply related

* Re: [PATCH v2] ath10k: transmit queued frames after waking queues
From: Rajkumar Manoharan @ 2018-05-23 18:05 UTC (permalink / raw)
  To: Erik Stromdahl
  Cc: Niklas Cassel, Kalle Valo, David S. Miller, ath10k,
	linux-wireless, netdev, linux-kernel, linux-wireless-owner
In-Reply-To: <c131da6e-6479-3a40-fbd3-9c61d6690ba8@gmail.com>

On 2018-05-23 09:25, Erik Stromdahl wrote:
> On 05/22/2018 11:15 PM, Niklas Cassel wrote:
> 
[...]
>> 
>> Perhaps it would be possible to call ath10k_mac_tx_push_pending()
>> from the equivalent to ath10k_htt_txrx_compl_task(),
>> but from SDIO's point of view.
> An equivalent for SDIO would most likely be 
> *ath10k_htt_htc_t2h_msg_handler*
> or any of the other functions called from this function.
> 
> *ath10k_txrx_tx_unref* is actually called from 
> *ath10k_htt_htc_t2h_msg_handler*,
> so that function could be viewed as an equivalent.
> 
> If the call should be added in the bus driver (sdio.c) it should most 
> likely be
> placed in *ath10k_sdio_mbox_rx_process_packets*
> 
> 		if (!pkt->trailer_only) {
> 			ep->ep_ops.ep_rx_complete(ar_sdio->ar, pkt->skb);
> 			ath10k_mac_tx_push_pending(ar_sdio->ar);
> 		} else {
> 			kfree_skb(pkt->skb)
> 		}
> 
> The above call would of course result in lots of calls to
> *ath10k_mac_tx_push_pending*
> Adding a htt_num_pending check here wouldn't look nice.
> 
> The HL RX path differs from the LL path in that the t2h_msg_handler 
> returns
> false indicating that it has consumed the skb.
> 
> This is because it is the HL RX indication handler that delivers the 
> skb's
> to mac80211.
> 
I would also prefer not to call *_push_pending for every HTC packet.
Similar to the LL approach, call ath10k_mac_tx_push_pending after
processing all pending rx messages, e.g. from
ath10k_sdio_mbox_rxmsg_pending_handler.

--- a/drivers/net/wireless/ath/ath10k/sdio.c
+++ b/drivers/net/wireless/ath/ath10k/sdio.c
@@ -807,6 +807,8 @@ static int 
ath10k_sdio_mbox_rxmsg_pending_handler(struct ath10k *ar,
                 ath10k_warn(ar, "failed to get pending recv messages: 
%d\n",
                             ret);

+       ath10k_mac_tx_push_pending(ar);
+
         return ret;
  }

> Another solution could be to add an *else-statement* as a part of the
> *if (release)*
> in *ath10k_htt_htc_t2h_msg_handler*, where
> *ath10k_mac_tx_push_pending* could be called.
> 
> Something like this perhaps:
> 
> 	/* Free the indication buffer */
> 	if (release)
> 		dev_kfree_skb_any(skb);
> 	else if (!ar->htt.num_pending_tx)
> 		ath10k_mac_tx_push_pending(ar);
> 
> I think I prefer your original patch though.
>> 

Better to make the changes in the HL-specific path instead of in the common path.
The above change will impact QCA6174 based devices.

-Rajkumar

^ permalink raw reply

* Re: [PATCH bpf-next] bpf: btf: Avoid variable length array
From: Joe Perches @ 2018-05-23 18:11 UTC (permalink / raw)
  To: Martin KaFai Lau, netdev; +Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20180523174659.354660-1-kafai@fb.com>

On Wed, 2018-05-23 at 10:46 -0700, Martin KaFai Lau wrote:
> Sparse warning:
> kernel/bpf/btf.c:1985:34: warning: Variable length array is used.

Perhaps use ARRAY_SIZE directly instead of indirectly via a #define

> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
[]
> @@ -1970,6 +1970,8 @@ static const size_t btf_sec_info_offset[] = {
>  	offsetof(struct btf_header, str_off),
>  };
>  
> +#define NR_SECS ARRAY_SIZE(btf_sec_info_offset)
> +
>  static int btf_sec_info_cmp(const void *a, const void *b)
>  {
>  	const struct btf_sec_info *x = a;
> @@ -1981,8 +1983,7 @@ static int btf_sec_info_cmp(const void *a, const void *b)
>  static int btf_check_sec_info(struct btf_verifier_env *env,
>  			      u32 btf_data_size)
>  {
> -	const unsigned int nr_secs = ARRAY_SIZE(btf_sec_info_offset);
> -	struct btf_sec_info secs[nr_secs];
> +	struct btf_sec_info secs[NR_SECS];

	struct btf_sec_info secs[ARRAY_SIZE(btf_sec_info_offset)];

>  	u32 total, expected_total, i;
>  	const struct btf_header *hdr;
>  	const struct btf *btf;
> @@ -1991,17 +1992,17 @@ static int btf_check_sec_info(struct btf_verifier_env *env,
>  	hdr = &btf->hdr;
>  
>  	/* Populate the secs from hdr */
> -	for (i = 0; i < nr_secs; i++)
> +	for (i = 0; i < NR_SECS; i++)

	for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++)

>  		secs[i] = *(struct btf_sec_info *)((void *)hdr +
>  						   btf_sec_info_offset[i]);

which makes this loop more intelligible.

> -	sort(secs, nr_secs, sizeof(struct btf_sec_info),
> +	sort(secs, NR_SECS, sizeof(struct btf_sec_info),
>  	     btf_sec_info_cmp, NULL);

etc...

^ permalink raw reply

* Re: [PATCH v2 1/1] tools/lib/libbpf.c: fix string format to allow build on arm32
From: Daniel Borkmann @ 2018-05-23 18:19 UTC (permalink / raw)
  To: Sirio Balmelli; +Cc: netdev
In-Reply-To: <20180523161704.4f5af2ehqdh6cqrh@vm4>

On 05/23/2018 06:17 PM, Sirio Balmelli wrote:
> On arm32, 'cd tools/testing/selftests/bpf && make' fails with:
> 
> libbpf.c:80:10: error: format ‘%ld’ expects argument of type ‘long int’, but argument 4 has type ‘int64_t {aka long long int}’ [-Werror=format=]
>    (func)("libbpf: " fmt, ##__VA_ARGS__); \
>           ^
> libbpf.c:83:30: note: in expansion of macro ‘__pr’
>  #define pr_warning(fmt, ...) __pr(__pr_warning, fmt, ##__VA_ARGS__)
>                               ^~~~
> libbpf.c:1072:3: note: in expansion of macro ‘pr_warning’
>    pr_warning("map:%s value_type:%s has BTF type_size:%ld != value_size:%u\n",
> 
> To fix, typecast 'key_size' and amend format string.
> 
> Signed-off-by: Sirio Balmelli <sirio@b-ad.ch>

Applied to bpf-next, thanks Sirio!

^ permalink raw reply

* Re: [PATCH net-next 00/13] nfp: abm: add basic support for advanced buffering NIC
From: David Miller @ 2018-05-23 18:28 UTC (permalink / raw)
  To: jakub.kicinski; +Cc: netdev, oss-drivers
In-Reply-To: <20180522051255.9438-1-jakub.kicinski@netronome.com>

From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 21 May 2018 22:12:42 -0700

> This series lays groundwork for advanced buffer management NIC feature.
> It makes necessary NFP core changes, spawns representors and adds devlink
> glue.  Following series will add the actual buffering configuration (patch
> series size limit).
> 
> First three patches add support for configuring NFP buffer pools via a
> mailbox.  The existing devlink APIs are used for the purpose.
> 
> Third patch allows us to perform small reads from the NFP memory.
> 
> The rest of the patch set adds eswitch mode change support and makes
> the driver spawn appropriate representors.

Series applied, thank you!

^ permalink raw reply

* [PATCH net] ipv4: remove warning in ip_recv_error
From: Willem de Bruijn @ 2018-05-23 18:29 UTC (permalink / raw)
  To: netdev; +Cc: davem, Willem de Bruijn

From: Willem de Bruijn <willemb@google.com>

A precondition check in ip_recv_error triggered on an otherwise benign
race. Remove the warning.

The warning triggers when passing an ipv6 socket to this ipv4 error
handling function. RaceFuzzer was able to trigger it due to a race
in setsockopt IPV6_ADDRFORM.

  ---
  CPU0
    do_ipv6_setsockopt
      sk->sk_socket->ops = &inet_dgram_ops;

  ---
  CPU1
    sk->sk_prot->recvmsg
      udp_recvmsg
        ip_recv_error
          WARN_ON_ONCE(sk->sk_family == AF_INET6);

  ---
  CPU0
    do_ipv6_setsockopt
      sk->sk_family = PF_INET;

This socket option converts a v6 socket that is connected to a v4 peer
to a v4 socket. It updates the socket on the fly, changing fields in
sk as well as other structs. This is inherently non-atomic. It races
with the lockless udp_recvmsg path.

No other code makes an assumption that these fields are updated
atomically. It is benign here, too, as ip_recv_error cares only about
the protocol of the skbs enqueued on the error queue, for which
sk_family is not a precise predictor (thanks to another issue with
IPV6_ADDRFORM).

Link: http://lkml.kernel.org/r/20180518120826.GA19515@dragonet.kaist.ac.kr
Fixes: 7ce875e5ecb8 ("ipv4: warn once on passing AF_INET6 socket to ip_recv_error")
Reported-by: DaeRyong Jeong <threeearcat@gmail.com>
Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 net/ipv4/ip_sockglue.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 5ad2d8ed3a3f..57bbb060faaf 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -505,8 +505,6 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 	int err;
 	int copied;

-	WARN_ON_ONCE(sk->sk_family == AF_INET6);
-
 	err = -EAGAIN;
 	skb = sock_dequeue_err_skb(sk);
 	if (!skb)
-- 
2.17.0.441.gb46fe60e1d-goog

^ permalink raw reply related

* Re: WARNING in ip_recv_error
From: Willem de Bruijn @ 2018-05-23 18:30 UTC (permalink / raw)
  To: David Miller
  Cc: Eric Dumazet, DaeLyong Jeong, Alexey Kuznetsov, Hideaki YOSHIFUJI,
	Network Development, LKML, Byoungyoung Lee, Kyungtae Kim,
	bammanag, Willem de Bruijn
In-Reply-To: <CAF=yD-KTfUbXGvU7qQy4=eHbuUB88=g_tQ8sp8TEebhW=rzKVQ@mail.gmail.com>

On Wed, May 23, 2018 at 11:40 AM, Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
> On Sun, May 20, 2018 at 7:13 PM, Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
>> On Fri, May 18, 2018 at 2:59 PM, Willem de Bruijn
>> <willemdebruijn.kernel@gmail.com> wrote:
>>> On Fri, May 18, 2018 at 2:46 PM, Willem de Bruijn
>>> <willemdebruijn.kernel@gmail.com> wrote:
>>>> On Fri, May 18, 2018 at 2:44 PM, Willem de Bruijn
>>>> <willemdebruijn.kernel@gmail.com> wrote:
>>>>> On Fri, May 18, 2018 at 1:09 PM, Willem de Bruijn
>>>>> <willemdebruijn.kernel@gmail.com> wrote:
>>>>>> On Fri, May 18, 2018 at 11:44 AM, David Miller <davem@davemloft.net> wrote:
>>>>>>> From: Eric Dumazet <eric.dumazet@gmail.com>
>>>>>>> Date: Fri, 18 May 2018 08:30:43 -0700
>>>>>>>
>>>>>>>> We probably need to revert Willem patch (7ce875e5ecb8562fd44040f69bda96c999e38bbc)
>>>>>>>
>>>>>>> Is it really valid to reach ip_recv_err with an ipv6 socket?
>>>>>>
>>>>>> I guess the issue is that setsockopt IPV6_ADDRFORM is not an
>>>>>> atomic operation, so that the socket is neither fully ipv4 nor fully
>>>>>> ipv6 by the time it reaches ip_recv_error.
>>>>>>
>>>>>>   sk->sk_socket->ops = &inet_dgram_ops;
>>>>>>   < HERE >
>>>>>>   sk->sk_family = PF_INET;
>>>>>>
>>>>>> Even calling inet_recv_error to demux would not necessarily help.
>>>>>>
>>>>>> Safest would be to look up by skb->protocol, similar to what
>>>>>> ipv6_recv_error does to handle v4-mapped-v6.
>>>>>>
>>>>>> Or to make that function safe with PF_INET and swap the order
>>>>>> of the above two operations.
>>>>>>
>>>>>> All sound needlessly complicated for this rare socket option, but
>>>>>> I don't have a better idea yet. Dropping on the floor is not nice,
>>>>>> either.
>>>>>
>>>>> Ensuring that ip_recv_error correctly handles packets from either
>>>>> socket and removing the warning should indeed be good.
>>>>>
>>>>> It is robust against v4-mapped packets from an AF_INET6 socket,
>>>>> but see caveat on reconnect below.
>>>>>
>>>>> The code between ipv6_recv_error for v4-mapped addresses and
>>>>> ip_recv_error is essentially the same, the main difference being
>>>>> whether to return network headers as sockaddr_in with SOL_IP
>>>>> or sockaddr_in6 with SOL_IPV6.
>>>>>
>>>>> There are very few other locations in the stack that explicitly test
>>>>> sk_family in this way and thus would be vulnerable to races with
>>>>> IPV6_ADDRFORM.
>>>>>
>>>>> I'm not sure whether it is possible for a udpv6 socket to queue a
>>>>> real ipv6 packet on the error queue, disconnect, connect to an
>>>>> ipv4 address, call IPV6_ADDRFORM and then call ip_recv_error
>>>>> on a true ipv6 packet. That would return buggy data, e.g., in
>>>>> msg_name.
>>>>
>>>> In do_ipv6_setsockopt IPV6_ADDRFORM we can test that the
>>>> error queue is empty, and then take its lock for the duration of the
>>>> operation.
>>>
>>> Actually, no reason to hold the lock. This setsockopt holds the socket
>>> lock, which connect would need, too. So testing that the queue
>>> is empty after testing that it is connected to a v4 address is
>>> sufficient to ensure that no ipv6 packets are queued for reception.
>>>
>>> diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
>>> index 4d780c7f0130..a975d6311341 100644
>>> --- a/net/ipv6/ipv6_sockglue.c
>>> +++ b/net/ipv6/ipv6_sockglue.c
>>> @@ -199,6 +199,11 @@ static int do_ipv6_setsockopt(struct sock *sk,
>>> int level, int optname,
>>>
>>>                         if (ipv6_only_sock(sk) ||
>>>                             !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) {
>>>                                 retv = -EADDRNOTAVAIL;
>>>                                 break;
>>>                         }
>>>
>>> +                       if (!skb_queue_empty(&sk->sk_error_queue)) {
>>> +                               retv = -EBUSY;
>>> +                               break;
>>> +                       }
>>> +
>>>                         fl6_free_socklist(sk);
>>>                         __ipv6_sock_mc_close(sk);
>>>
>>> After this it should be safe to remove the warning in ip_recv_error.
>>
>> Hmm.. nope.
>>
>> This ensures that the socket cannot produce any new true v6 packets.
>> But it does not guarantee that they are not already in the system, e.g.
>> queued in tc, and will find their way to the error queue later.
>>
>> We'll have to just be able to handle ipv6 packets in ip_recv_error.
>> Since IPV6_ADDRFORM is used to pass to legacy v4-only
>> processes and those likely are only confused by SOL_IPV6
>> error messages, it is probably best to just drop them and perhaps
>> WARN_ONCE.
>
> Even more fun, this is not limited to the error queue.
>
> I can queue a v6 packet for reception on a socket, connect to a v4
> address, call IPV6_ADDRFORM and then a regular recvfrom will
> return a partial v6 address as AF_INET.
>
> We definitely do not want to have to add a check
>
>   if (skb->protocol == htons(ETH_P_IPV6)) {
>     kfree_skb(skb);
>     goto try_again;
>   }
>
> to the normal recvmsg path.
>
> An alternative may be to tighten the check on when to allow
> IPV6_ADDRFORM. Not only return EBUSY if a packet is pending,
> but also if any sk_{rmem, omem, wmem}_alloc is non-zero. Only,
> these tightened constraints could break a legacy application.
>
> Either way, this race is somewhat tangential to the one that
> RaceFuzzer found. The sk changes that IPV6_ADDRFORM makes
> to sk_prot, sk_socket->ops and sk_family are not atomic and will
> not be. They need not be, because no other code assumes this
> consistency.
>
> So I'll start by removing the warning as Eric suggested.

http://patchwork.ozlabs.org/patch/919270/

^ permalink raw reply

* Re: [PATCH net] tuntap: correctly set SOCKWQ_ASYNC_NOSPACE
From: David Miller @ 2018-05-23 18:32 UTC (permalink / raw)
  To: jasowang; +Cc: netdev, linux-kernel, mst, hannes, edumazet
In-Reply-To: <1526970064-29711-1-git-send-email-jasowang@redhat.com>

From: Jason Wang <jasowang@redhat.com>
Date: Tue, 22 May 2018 14:21:04 +0800

> When link is down, writes to the device might fail with
> -EIO. Userspace needs an indication when the status is resolved.  As a
> fix, tun_net_open() attempts to wake up writers - but that is only
> effective if SOCKWQ_ASYNC_NOSPACE has been set in the past. This is
> not the case for vhost_net, which only polls for EPOLLOUT after it meets
> errors during sendmsg().
> 
> This patch fixes this by making sure SOCKWQ_ASYNC_NOSPACE is set when
> socket is not writable or device is down to guarantee EPOLLOUT will be
> raised in either tun_chr_poll() or tun_sock_write_space() after device
> is up.
> 
> Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
> Cc: Eric Dumazet <edumazet@google.com>
> Fixes: 1bd4978a88ac2 ("tun: honor IFF_UP in tun_get_user()")
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Applied and queued up for -stable, thanks Jason.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox