Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 2/5] net: stmmac: Switch stmmac_dma_ops to generic HW Interface Helpers
From: Jose Abreu @ 2018-04-06 13:08 UTC (permalink / raw)
  To: netdev
  Cc: Jose Abreu, David S. Miller, Joao Pinto, Giuseppe Cavallaro,
	Alexandre Torgue
In-Reply-To: <cover.1523019346.git.joabreu@synopsys.com>

Switch stmmac_dma_ops to generic Hardware Interface Helpers instead of
using hard-coded callbacks. This makes the code more readable and more
flexible.

No functional change.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
---
 drivers/net/ethernet/stmicro/stmmac/common.h       |  50 -------
 drivers/net/ethernet/stmicro/stmmac/hwif.h         | 106 +++++++++++++++
 .../net/ethernet/stmicro/stmmac/stmmac_ethtool.c   |  14 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 147 +++++++++------------
 4 files changed, 172 insertions(+), 145 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index 2c50d8c..b27221b 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -381,56 +381,6 @@ struct dma_features {
 extern const struct stmmac_desc_ops enh_desc_ops;
 extern const struct stmmac_desc_ops ndesc_ops;
 
-/* Specific DMA helpers */
-struct stmmac_dma_ops {
-	/* DMA core initialization */
-	int (*reset)(void __iomem *ioaddr);
-	void (*init)(void __iomem *ioaddr, struct stmmac_dma_cfg *dma_cfg,
-		     u32 dma_tx, u32 dma_rx, int atds);
-	void (*init_chan)(void __iomem *ioaddr,
-			  struct stmmac_dma_cfg *dma_cfg, u32 chan);
-	void (*init_rx_chan)(void __iomem *ioaddr,
-			     struct stmmac_dma_cfg *dma_cfg,
-			     u32 dma_rx_phy, u32 chan);
-	void (*init_tx_chan)(void __iomem *ioaddr,
-			     struct stmmac_dma_cfg *dma_cfg,
-			     u32 dma_tx_phy, u32 chan);
-	/* Configure the AXI Bus Mode Register */
-	void (*axi)(void __iomem *ioaddr, struct stmmac_axi *axi);
-	/* Dump DMA registers */
-	void (*dump_regs)(void __iomem *ioaddr, u32 *reg_space);
-	/* Set tx/rx threshold in the csr6 register
-	 * An invalid value enables the store-and-forward mode */
-	void (*dma_mode)(void __iomem *ioaddr, int txmode, int rxmode,
-			 int rxfifosz);
-	void (*dma_rx_mode)(void __iomem *ioaddr, int mode, u32 channel,
-			    int fifosz, u8 qmode);
-	void (*dma_tx_mode)(void __iomem *ioaddr, int mode, u32 channel,
-			    int fifosz, u8 qmode);
-	/* To track extra statistic (if supported) */
-	void (*dma_diagnostic_fr) (void *data, struct stmmac_extra_stats *x,
-				   void __iomem *ioaddr);
-	void (*enable_dma_transmission) (void __iomem *ioaddr);
-	void (*enable_dma_irq)(void __iomem *ioaddr, u32 chan);
-	void (*disable_dma_irq)(void __iomem *ioaddr, u32 chan);
-	void (*start_tx)(void __iomem *ioaddr, u32 chan);
-	void (*stop_tx)(void __iomem *ioaddr, u32 chan);
-	void (*start_rx)(void __iomem *ioaddr, u32 chan);
-	void (*stop_rx)(void __iomem *ioaddr, u32 chan);
-	int (*dma_interrupt) (void __iomem *ioaddr,
-			      struct stmmac_extra_stats *x, u32 chan);
-	/* If supported then get the optional core features */
-	void (*get_hw_feature)(void __iomem *ioaddr,
-			       struct dma_features *dma_cap);
-	/* Program the HW RX Watchdog */
-	void (*rx_watchdog)(void __iomem *ioaddr, u32 riwt, u32 number_chan);
-	void (*set_tx_ring_len)(void __iomem *ioaddr, u32 len, u32 chan);
-	void (*set_rx_ring_len)(void __iomem *ioaddr, u32 len, u32 chan);
-	void (*set_rx_tail_ptr)(void __iomem *ioaddr, u32 tail_ptr, u32 chan);
-	void (*set_tx_tail_ptr)(void __iomem *ioaddr, u32 tail_ptr, u32 chan);
-	void (*enable_tso)(void __iomem *ioaddr, bool en, u32 chan);
-};
-
 struct mac_device_info;
 
 /* Helpers to program the MAC core */
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index 4994677..e1a9ae6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -122,4 +122,110 @@ struct stmmac_desc_ops {
 #define stmmac_set_mss(__priv, __args...) \
 	stmmac_do_void_callback(__priv, desc, set_mss, __args)
 
+struct stmmac_dma_cfg;
+struct dma_features;
+
+/* Specific DMA helpers */
+struct stmmac_dma_ops {
+	/* DMA core initialization */
+	int (*reset)(void __iomem *ioaddr);
+	void (*init)(void __iomem *ioaddr, struct stmmac_dma_cfg *dma_cfg,
+		     u32 dma_tx, u32 dma_rx, int atds);
+	void (*init_chan)(void __iomem *ioaddr,
+			  struct stmmac_dma_cfg *dma_cfg, u32 chan);
+	void (*init_rx_chan)(void __iomem *ioaddr,
+			     struct stmmac_dma_cfg *dma_cfg,
+			     u32 dma_rx_phy, u32 chan);
+	void (*init_tx_chan)(void __iomem *ioaddr,
+			     struct stmmac_dma_cfg *dma_cfg,
+			     u32 dma_tx_phy, u32 chan);
+	/* Configure the AXI Bus Mode Register */
+	void (*axi)(void __iomem *ioaddr, struct stmmac_axi *axi);
+	/* Dump DMA registers */
+	void (*dump_regs)(void __iomem *ioaddr, u32 *reg_space);
+	/* Set tx/rx threshold in the csr6 register
+	 * An invalid value enables the store-and-forward mode */
+	void (*dma_mode)(void __iomem *ioaddr, int txmode, int rxmode,
+			 int rxfifosz);
+	void (*dma_rx_mode)(void __iomem *ioaddr, int mode, u32 channel,
+			    int fifosz, u8 qmode);
+	void (*dma_tx_mode)(void __iomem *ioaddr, int mode, u32 channel,
+			    int fifosz, u8 qmode);
+	/* To track extra statistic (if supported) */
+	void (*dma_diagnostic_fr) (void *data, struct stmmac_extra_stats *x,
+				   void __iomem *ioaddr);
+	void (*enable_dma_transmission) (void __iomem *ioaddr);
+	void (*enable_dma_irq)(void __iomem *ioaddr, u32 chan);
+	void (*disable_dma_irq)(void __iomem *ioaddr, u32 chan);
+	void (*start_tx)(void __iomem *ioaddr, u32 chan);
+	void (*stop_tx)(void __iomem *ioaddr, u32 chan);
+	void (*start_rx)(void __iomem *ioaddr, u32 chan);
+	void (*stop_rx)(void __iomem *ioaddr, u32 chan);
+	int (*dma_interrupt) (void __iomem *ioaddr,
+			      struct stmmac_extra_stats *x, u32 chan);
+	/* If supported then get the optional core features */
+	void (*get_hw_feature)(void __iomem *ioaddr,
+			       struct dma_features *dma_cap);
+	/* Program the HW RX Watchdog */
+	void (*rx_watchdog)(void __iomem *ioaddr, u32 riwt, u32 number_chan);
+	void (*set_tx_ring_len)(void __iomem *ioaddr, u32 len, u32 chan);
+	void (*set_rx_ring_len)(void __iomem *ioaddr, u32 len, u32 chan);
+	void (*set_rx_tail_ptr)(void __iomem *ioaddr, u32 tail_ptr, u32 chan);
+	void (*set_tx_tail_ptr)(void __iomem *ioaddr, u32 tail_ptr, u32 chan);
+	void (*enable_tso)(void __iomem *ioaddr, bool en, u32 chan);
+};
+
+#define stmmac_reset(__priv, __args...) \
+	stmmac_do_callback(__priv, dma, reset, __args)
+#define stmmac_dma_init(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, init, __args)
+#define stmmac_init_chan(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, init_chan, __args)
+#define stmmac_init_rx_chan(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, init_rx_chan, __args)
+#define stmmac_init_tx_chan(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, init_tx_chan, __args)
+#define stmmac_axi(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, axi, __args)
+#define stmmac_dump_dma_regs(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, dump_regs, __args)
+#define stmmac_dma_mode(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, dma_mode, __args)
+#define stmmac_dma_rx_mode(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, dma_rx_mode, __args)
+#define stmmac_dma_tx_mode(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, dma_tx_mode, __args)
+#define stmmac_dma_diagnostic_fr(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, dma_diagnostic_fr, __args)
+#define stmmac_enable_dma_transmission(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, enable_dma_transmission, __args)
+#define stmmac_enable_dma_irq(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, enable_dma_irq, __args)
+#define stmmac_disable_dma_irq(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, disable_dma_irq, __args)
+#define stmmac_start_tx(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, start_tx, __args)
+#define stmmac_stop_tx(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, stop_tx, __args)
+#define stmmac_start_rx(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, start_rx, __args)
+#define stmmac_stop_rx(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, stop_rx, __args)
+#define stmmac_dma_interrupt_status(__priv, __args...) \
+	stmmac_do_callback(__priv, dma, dma_interrupt, __args)
+#define stmmac_get_hw_feature(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, get_hw_feature, __args)
+#define stmmac_rx_watchdog(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, rx_watchdog, __args)
+#define stmmac_set_tx_ring_len(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, set_tx_ring_len, __args)
+#define stmmac_set_rx_ring_len(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, set_rx_ring_len, __args)
+#define stmmac_set_rx_tail_ptr(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, set_rx_tail_ptr, __args)
+#define stmmac_set_tx_tail_ptr(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, set_tx_tail_ptr, __args)
+#define stmmac_enable_tso(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, enable_tso, __args)
+
 #endif /* __STMMAC_HWIF_H__ */
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
index 2c6ed47..7009718 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
@@ -443,7 +443,7 @@ static void stmmac_ethtool_gregs(struct net_device *dev,
 	memset(reg_space, 0x0, REG_SPACE_SIZE);
 
 	priv->hw->mac->dump_regs(priv->hw, reg_space);
-	priv->hw->dma->dump_regs(priv->ioaddr, reg_space);
+	stmmac_dump_dma_regs(priv, priv->ioaddr, reg_space);
 	/* Copy DMA registers to where ethtool expects them */
 	memcpy(&reg_space[ETHTOOL_DMA_OFFSET], &reg_space[DMA_BUS_MODE / 4],
 	       NUM_DWMAC1000_DMA_REGS * 4);
@@ -529,7 +529,7 @@ static void stmmac_get_ethtool_stats(struct net_device *dev,
 	u32 rx_queues_count = priv->plat->rx_queues_to_use;
 	u32 tx_queues_count = priv->plat->tx_queues_to_use;
 	unsigned long count;
-	int i, j = 0;
+	int i, j = 0, ret;
 
 	if (priv->dma_cap.asp && priv->hw->mac->safety_feat_dump) {
 		dump = priv->hw->mac->safety_feat_dump;
@@ -541,11 +541,9 @@ static void stmmac_get_ethtool_stats(struct net_device *dev,
 	}
 
 	/* Update the DMA HW counters for dwmac10/100 */
-	if (priv->hw->dma->dma_diagnostic_fr)
-		priv->hw->dma->dma_diagnostic_fr(&dev->stats,
-						 (void *) &priv->xstats,
-						 priv->ioaddr);
-	else {
+	ret = stmmac_dma_diagnostic_fr(priv, &dev->stats, (void *) &priv->xstats,
+			priv->ioaddr);
+	if (ret) {
 		/* If supported, for new GMAC chips expose the MMC counters */
 		if (priv->dma_cap.rmon) {
 			dwmac_mmc_read(priv->mmcaddr, &priv->mmc);
@@ -810,7 +808,7 @@ static int stmmac_set_coalesce(struct net_device *dev,
 	priv->tx_coal_frames = ec->tx_max_coalesced_frames;
 	priv->tx_coal_timer = ec->tx_coalesce_usecs;
 	priv->rx_riwt = rx_riwt;
-	priv->hw->dma->rx_watchdog(priv->ioaddr, priv->rx_riwt, rx_cnt);
+	stmmac_rx_watchdog(priv, priv->ioaddr, priv->rx_riwt, rx_cnt);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index c44e19d..7f2aeae 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1677,7 +1677,7 @@ static void stmmac_mac_enable_rx_queues(struct stmmac_priv *priv)
 static void stmmac_start_rx_dma(struct stmmac_priv *priv, u32 chan)
 {
 	netdev_dbg(priv->dev, "DMA RX processes started in channel %d\n", chan);
-	priv->hw->dma->start_rx(priv->ioaddr, chan);
+	stmmac_start_rx(priv, priv->ioaddr, chan);
 }
 
 /**
@@ -1690,7 +1690,7 @@ static void stmmac_start_rx_dma(struct stmmac_priv *priv, u32 chan)
 static void stmmac_start_tx_dma(struct stmmac_priv *priv, u32 chan)
 {
 	netdev_dbg(priv->dev, "DMA TX processes started in channel %d\n", chan);
-	priv->hw->dma->start_tx(priv->ioaddr, chan);
+	stmmac_start_tx(priv, priv->ioaddr, chan);
 }
 
 /**
@@ -1703,7 +1703,7 @@ static void stmmac_start_tx_dma(struct stmmac_priv *priv, u32 chan)
 static void stmmac_stop_rx_dma(struct stmmac_priv *priv, u32 chan)
 {
 	netdev_dbg(priv->dev, "DMA RX processes stopped in channel %d\n", chan);
-	priv->hw->dma->stop_rx(priv->ioaddr, chan);
+	stmmac_stop_rx(priv, priv->ioaddr, chan);
 }
 
 /**
@@ -1716,7 +1716,7 @@ static void stmmac_stop_rx_dma(struct stmmac_priv *priv, u32 chan)
 static void stmmac_stop_tx_dma(struct stmmac_priv *priv, u32 chan)
 {
 	netdev_dbg(priv->dev, "DMA TX processes stopped in channel %d\n", chan);
-	priv->hw->dma->stop_tx(priv->ioaddr, chan);
+	stmmac_stop_tx(priv, priv->ioaddr, chan);
 }
 
 /**
@@ -1807,19 +1807,18 @@ static void stmmac_dma_operation_mode(struct stmmac_priv *priv)
 		for (chan = 0; chan < rx_channels_count; chan++) {
 			qmode = priv->plat->rx_queues_cfg[chan].mode_to_use;
 
-			priv->hw->dma->dma_rx_mode(priv->ioaddr, rxmode, chan,
-						   rxfifosz, qmode);
+			stmmac_dma_rx_mode(priv, priv->ioaddr, rxmode, chan,
+					rxfifosz, qmode);
 		}
 
 		for (chan = 0; chan < tx_channels_count; chan++) {
 			qmode = priv->plat->tx_queues_cfg[chan].mode_to_use;
 
-			priv->hw->dma->dma_tx_mode(priv->ioaddr, txmode, chan,
-						   txfifosz, qmode);
+			stmmac_dma_tx_mode(priv, priv->ioaddr, txmode, chan,
+					txfifosz, qmode);
 		}
 	} else {
-		priv->hw->dma->dma_mode(priv->ioaddr, txmode, rxmode,
-					rxfifosz);
+		stmmac_dma_mode(priv, priv->ioaddr, txmode, rxmode, rxfifosz);
 	}
 }
 
@@ -1927,16 +1926,6 @@ static void stmmac_tx_clean(struct stmmac_priv *priv, u32 queue)
 	netif_tx_unlock(priv->dev);
 }
 
-static inline void stmmac_enable_dma_irq(struct stmmac_priv *priv, u32 chan)
-{
-	priv->hw->dma->enable_dma_irq(priv->ioaddr, chan);
-}
-
-static inline void stmmac_disable_dma_irq(struct stmmac_priv *priv, u32 chan)
-{
-	priv->hw->dma->disable_dma_irq(priv->ioaddr, chan);
-}
-
 /**
  * stmmac_tx_err - to manage the tx error
  * @priv: driver private structure
@@ -2000,13 +1989,12 @@ static void stmmac_set_dma_operation_mode(struct stmmac_priv *priv, u32 txmode,
 	txfifosz /= tx_channels_count;
 
 	if (priv->synopsys_id >= DWMAC_CORE_4_00) {
-		priv->hw->dma->dma_rx_mode(priv->ioaddr, rxmode, chan,
-					   rxfifosz, rxqmode);
-		priv->hw->dma->dma_tx_mode(priv->ioaddr, txmode, chan,
-					   txfifosz, txqmode);
+		stmmac_dma_rx_mode(priv, priv->ioaddr, rxmode, chan, rxfifosz,
+				rxqmode);
+		stmmac_dma_tx_mode(priv, priv->ioaddr, txmode, chan, txfifosz,
+				txqmode);
 	} else {
-		priv->hw->dma->dma_mode(priv->ioaddr, txmode, rxmode,
-					rxfifosz);
+		stmmac_dma_mode(priv, priv->ioaddr, txmode, rxmode, rxfifosz);
 	}
 }
 
@@ -2050,16 +2038,15 @@ static void stmmac_dma_interrupt(struct stmmac_priv *priv)
 	 * all tx queues rather than just a single tx queue.
 	 */
 	for (chan = 0; chan < channels_to_check; chan++)
-		status[chan] = priv->hw->dma->dma_interrupt(priv->ioaddr,
-							    &priv->xstats,
-							    chan);
+		status[chan] = stmmac_dma_interrupt_status(priv, priv->ioaddr,
+				&priv->xstats, chan);
 
 	for (chan = 0; chan < rx_channel_count; chan++) {
 		if (likely(status[chan] & handle_rx)) {
 			struct stmmac_rx_queue *rx_q = &priv->rx_queue[chan];
 
 			if (likely(napi_schedule_prep(&rx_q->napi))) {
-				stmmac_disable_dma_irq(priv, chan);
+				stmmac_disable_dma_irq(priv, priv->ioaddr, chan);
 				__napi_schedule(&rx_q->napi);
 				poll_scheduled = true;
 			}
@@ -2080,7 +2067,8 @@ static void stmmac_dma_interrupt(struct stmmac_priv *priv)
 					&priv->rx_queue[0];
 
 				if (likely(napi_schedule_prep(&rx_q->napi))) {
-					stmmac_disable_dma_irq(priv, chan);
+					stmmac_disable_dma_irq(priv,
+							priv->ioaddr, chan);
 					__napi_schedule(&rx_q->napi);
 				}
 				break;
@@ -2176,15 +2164,7 @@ static void stmmac_selec_desc_mode(struct stmmac_priv *priv)
  */
 static int stmmac_get_hw_features(struct stmmac_priv *priv)
 {
-	u32 ret = 0;
-
-	if (priv->hw->dma->get_hw_feature) {
-		priv->hw->dma->get_hw_feature(priv->ioaddr,
-					      &priv->dma_cap);
-		ret = 1;
-	}
-
-	return ret;
+	return stmmac_get_hw_feature(priv, priv->ioaddr, &priv->dma_cap) == 0;
 }
 
 /**
@@ -2234,7 +2214,7 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv)
 	if (priv->extend_desc && (priv->mode == STMMAC_RING_MODE))
 		atds = 1;
 
-	ret = priv->hw->dma->reset(priv->ioaddr);
+	ret = stmmac_reset(priv, priv->ioaddr);
 	if (ret) {
 		dev_err(priv->device, "Failed to reset the dma\n");
 		return ret;
@@ -2242,51 +2222,48 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv)
 
 	if (priv->synopsys_id >= DWMAC_CORE_4_00) {
 		/* DMA Configuration */
-		priv->hw->dma->init(priv->ioaddr, priv->plat->dma_cfg,
-				    dummy_dma_tx_phy, dummy_dma_rx_phy, atds);
+		stmmac_dma_init(priv, priv->ioaddr, priv->plat->dma_cfg,
+				dummy_dma_tx_phy, dummy_dma_rx_phy, atds);
 
 		/* DMA RX Channel Configuration */
 		for (chan = 0; chan < rx_channels_count; chan++) {
 			rx_q = &priv->rx_queue[chan];
 
-			priv->hw->dma->init_rx_chan(priv->ioaddr,
-						    priv->plat->dma_cfg,
-						    rx_q->dma_rx_phy, chan);
+			stmmac_init_rx_chan(priv, priv->ioaddr,
+					priv->plat->dma_cfg, rx_q->dma_rx_phy,
+					chan);
 
 			rx_q->rx_tail_addr = rx_q->dma_rx_phy +
 				    (DMA_RX_SIZE * sizeof(struct dma_desc));
-			priv->hw->dma->set_rx_tail_ptr(priv->ioaddr,
-						       rx_q->rx_tail_addr,
-						       chan);
+			stmmac_set_rx_tail_ptr(priv, priv->ioaddr,
+					rx_q->rx_tail_addr, chan);
 		}
 
 		/* DMA TX Channel Configuration */
 		for (chan = 0; chan < tx_channels_count; chan++) {
 			tx_q = &priv->tx_queue[chan];
 
-			priv->hw->dma->init_chan(priv->ioaddr,
-						 priv->plat->dma_cfg,
-						 chan);
+			stmmac_init_chan(priv, priv->ioaddr,
+					priv->plat->dma_cfg, chan);
 
-			priv->hw->dma->init_tx_chan(priv->ioaddr,
-						    priv->plat->dma_cfg,
-						    tx_q->dma_tx_phy, chan);
+			stmmac_init_tx_chan(priv, priv->ioaddr,
+					priv->plat->dma_cfg, tx_q->dma_tx_phy,
+					chan);
 
 			tx_q->tx_tail_addr = tx_q->dma_tx_phy +
 				    (DMA_TX_SIZE * sizeof(struct dma_desc));
-			priv->hw->dma->set_tx_tail_ptr(priv->ioaddr,
-						       tx_q->tx_tail_addr,
-						       chan);
+			stmmac_set_tx_tail_ptr(priv, priv->ioaddr,
+					tx_q->tx_tail_addr, chan);
 		}
 	} else {
 		rx_q = &priv->rx_queue[chan];
 		tx_q = &priv->tx_queue[chan];
-		priv->hw->dma->init(priv->ioaddr, priv->plat->dma_cfg,
-				    tx_q->dma_tx_phy, rx_q->dma_rx_phy, atds);
+		stmmac_dma_init(priv, priv->ioaddr, priv->plat->dma_cfg,
+				tx_q->dma_tx_phy, rx_q->dma_rx_phy, atds);
 	}
 
-	if (priv->plat->axi && priv->hw->dma->axi)
-		priv->hw->dma->axi(priv->ioaddr, priv->plat->axi);
+	if (priv->plat->axi)
+		stmmac_axi(priv, priv->ioaddr, priv->plat->axi);
 
 	return ret;
 }
@@ -2332,18 +2309,14 @@ static void stmmac_set_rings_length(struct stmmac_priv *priv)
 	u32 chan;
 
 	/* set TX ring length */
-	if (priv->hw->dma->set_tx_ring_len) {
-		for (chan = 0; chan < tx_channels_count; chan++)
-			priv->hw->dma->set_tx_ring_len(priv->ioaddr,
-						       (DMA_TX_SIZE - 1), chan);
-	}
+	for (chan = 0; chan < tx_channels_count; chan++)
+		stmmac_set_tx_ring_len(priv, priv->ioaddr,
+				(DMA_TX_SIZE - 1), chan);
 
 	/* set RX ring length */
-	if (priv->hw->dma->set_rx_ring_len) {
-		for (chan = 0; chan < rx_channels_count; chan++)
-			priv->hw->dma->set_rx_ring_len(priv->ioaddr,
-						       (DMA_RX_SIZE - 1), chan);
-	}
+	for (chan = 0; chan < rx_channels_count; chan++)
+		stmmac_set_rx_ring_len(priv, priv->ioaddr,
+				(DMA_RX_SIZE - 1), chan);
 }
 
 /**
@@ -2619,9 +2592,10 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp)
 
 	priv->tx_lpi_timer = STMMAC_DEFAULT_TWT_LS;
 
-	if ((priv->use_riwt) && (priv->hw->dma->rx_watchdog)) {
-		priv->rx_riwt = MAX_DMA_RIWT;
-		priv->hw->dma->rx_watchdog(priv->ioaddr, MAX_DMA_RIWT, rx_cnt);
+	if (priv->use_riwt) {
+		ret = stmmac_rx_watchdog(priv, priv->ioaddr, MAX_DMA_RIWT, rx_cnt);
+		if (!ret)
+			priv->rx_riwt = MAX_DMA_RIWT;
 	}
 
 	if (priv->hw->pcs && priv->hw->mac->pcs_ctrl_ane)
@@ -2633,7 +2607,7 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp)
 	/* Enable TSO */
 	if (priv->tso) {
 		for (chan = 0; chan < tx_cnt; chan++)
-			priv->hw->dma->enable_tso(priv->ioaddr, 1, chan);
+			stmmac_enable_tso(priv, priv->ioaddr, 1, chan);
 	}
 
 	return 0;
@@ -3058,8 +3032,7 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	netdev_tx_sent_queue(netdev_get_tx_queue(dev, queue), skb->len);
 
-	priv->hw->dma->set_tx_tail_ptr(priv->ioaddr, tx_q->tx_tail_addr,
-				       queue);
+	stmmac_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr, queue);
 
 	return NETDEV_TX_OK;
 
@@ -3271,10 +3244,10 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 	netdev_tx_sent_queue(netdev_get_tx_queue(dev, queue), skb->len);
 
 	if (priv->synopsys_id < DWMAC_CORE_4_00)
-		priv->hw->dma->enable_dma_transmission(priv->ioaddr);
+		stmmac_enable_dma_transmission(priv, priv->ioaddr);
 	else
-		priv->hw->dma->set_tx_tail_ptr(priv->ioaddr, tx_q->tx_tail_addr,
-					       queue);
+		stmmac_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr,
+				queue);
 
 	return NETDEV_TX_OK;
 
@@ -3608,7 +3581,7 @@ static int stmmac_poll(struct napi_struct *napi, int budget)
 	work_done = stmmac_rx(priv, budget, rx_q->queue_index);
 	if (work_done < budget) {
 		napi_complete_done(napi, work_done);
-		stmmac_enable_dma_irq(priv, chan);
+		stmmac_enable_dma_irq(priv, priv->ioaddr, chan);
 	}
 	return work_done;
 }
@@ -3778,11 +3751,11 @@ static irqreturn_t stmmac_interrupt(int irq, void *dev_id)
 				priv->hw->mac->host_mtl_irq_status(priv->hw,
 								   queue);
 
-				if (status & CORE_IRQ_MTL_RX_OVERFLOW &&
-				    priv->hw->dma->set_rx_tail_ptr)
-					priv->hw->dma->set_rx_tail_ptr(priv->ioaddr,
-								rx_q->rx_tail_addr,
-								queue);
+				if (status & CORE_IRQ_MTL_RX_OVERFLOW)
+					stmmac_set_rx_tail_ptr(priv,
+							priv->ioaddr,
+							rx_q->rx_tail_addr,
+							queue);
 			}
 		}
 
-- 
2.9.3

^ permalink raw reply related

* [PATCH net-next 1/5] net: stmmac: Switch stmmac_desc_ops to generic HW Interface Helpers
From: Jose Abreu @ 2018-04-06 13:08 UTC (permalink / raw)
  To: netdev
  Cc: Jose Abreu, David S. Miller, Joao Pinto, Giuseppe Cavallaro,
	Alexandre Torgue
In-Reply-To: <cover.1523019346.git.joabreu@synopsys.com>

Switch stmmac_desc_ops to generic Hardware Interface Helpers instead of
using hard-coded callbacks. This makes the code more readable and more
flexible.

No functional change.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
---
 drivers/net/ethernet/stmicro/stmmac/chain_mode.c   |  14 +--
 drivers/net/ethernet/stmicro/stmmac/common.h       |  55 +--------
 drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c |   4 +-
 drivers/net/ethernet/stmicro/stmmac/enh_desc.c     |   4 +-
 drivers/net/ethernet/stmicro/stmmac/hwif.h         | 125 +++++++++++++++++++++
 drivers/net/ethernet/stmicro/stmmac/norm_desc.c    |   4 +-
 drivers/net/ethernet/stmicro/stmmac/ring_mode.c    |  15 +--
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 110 +++++++++---------
 8 files changed, 195 insertions(+), 136 deletions(-)
 create mode 100644 drivers/net/ethernet/stmicro/stmmac/hwif.h

diff --git a/drivers/net/ethernet/stmicro/stmmac/chain_mode.c b/drivers/net/ethernet/stmicro/stmmac/chain_mode.c
index e93c40b..ca0f9c2 100644
--- a/drivers/net/ethernet/stmicro/stmmac/chain_mode.c
+++ b/drivers/net/ethernet/stmicro/stmmac/chain_mode.c
@@ -51,8 +51,8 @@ static int stmmac_jumbo_frm(void *p, struct sk_buff *skb, int csum)
 	tx_q->tx_skbuff_dma[entry].buf = des2;
 	tx_q->tx_skbuff_dma[entry].len = bmax;
 	/* do not close the descriptor and do not set own bit */
-	priv->hw->desc->prepare_tx_desc(desc, 1, bmax, csum, STMMAC_CHAIN_MODE,
-					0, false, skb->len);
+	stmmac_prepare_tx_desc(priv, desc, 1, bmax, csum, STMMAC_CHAIN_MODE,
+			0, false, skb->len);
 
 	while (len != 0) {
 		tx_q->tx_skbuff[entry] = NULL;
@@ -68,9 +68,8 @@ static int stmmac_jumbo_frm(void *p, struct sk_buff *skb, int csum)
 				return -1;
 			tx_q->tx_skbuff_dma[entry].buf = des2;
 			tx_q->tx_skbuff_dma[entry].len = bmax;
-			priv->hw->desc->prepare_tx_desc(desc, 0, bmax, csum,
-							STMMAC_CHAIN_MODE, 1,
-							false, skb->len);
+			stmmac_prepare_tx_desc(priv, desc, 0, bmax, csum,
+					STMMAC_CHAIN_MODE, 1, false, skb->len);
 			len -= bmax;
 			i++;
 		} else {
@@ -83,9 +82,8 @@ static int stmmac_jumbo_frm(void *p, struct sk_buff *skb, int csum)
 			tx_q->tx_skbuff_dma[entry].buf = des2;
 			tx_q->tx_skbuff_dma[entry].len = len;
 			/* last descriptor can be set now */
-			priv->hw->desc->prepare_tx_desc(desc, 0, len, csum,
-							STMMAC_CHAIN_MODE, 1,
-							true, skb->len);
+			stmmac_prepare_tx_desc(priv, desc, 0, len, csum,
+					STMMAC_CHAIN_MODE, 1, true, skb->len);
 			len = 0;
 		}
 	}
diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index ad2388a..2c50d8c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -32,6 +32,7 @@
 #endif
 
 #include "descs.h"
+#include "hwif.h"
 #include "mmc.h"
 
 /* Synopsys Core versions */
@@ -377,60 +378,6 @@ struct dma_features {
 
 #define JUMBO_LEN		9000
 
-/* Descriptors helpers */
-struct stmmac_desc_ops {
-	/* DMA RX descriptor ring initialization */
-	void (*init_rx_desc) (struct dma_desc *p, int disable_rx_ic, int mode,
-			      int end);
-	/* DMA TX descriptor ring initialization */
-	void (*init_tx_desc) (struct dma_desc *p, int mode, int end);
-
-	/* Invoked by the xmit function to prepare the tx descriptor */
-	void (*prepare_tx_desc) (struct dma_desc *p, int is_fs, int len,
-				 bool csum_flag, int mode, bool tx_own,
-				 bool ls, unsigned int tot_pkt_len);
-	void (*prepare_tso_tx_desc)(struct dma_desc *p, int is_fs, int len1,
-				    int len2, bool tx_own, bool ls,
-				    unsigned int tcphdrlen,
-				    unsigned int tcppayloadlen);
-	/* Set/get the owner of the descriptor */
-	void (*set_tx_owner) (struct dma_desc *p);
-	int (*get_tx_owner) (struct dma_desc *p);
-	/* Clean the tx descriptor as soon as the tx irq is received */
-	void (*release_tx_desc) (struct dma_desc *p, int mode);
-	/* Clear interrupt on tx frame completion. When this bit is
-	 * set an interrupt happens as soon as the frame is transmitted */
-	void (*set_tx_ic)(struct dma_desc *p);
-	/* Last tx segment reports the transmit status */
-	int (*get_tx_ls) (struct dma_desc *p);
-	/* Return the transmit status looking at the TDES1 */
-	int (*tx_status) (void *data, struct stmmac_extra_stats *x,
-			  struct dma_desc *p, void __iomem *ioaddr);
-	/* Get the buffer size from the descriptor */
-	int (*get_tx_len) (struct dma_desc *p);
-	/* Handle extra events on specific interrupts hw dependent */
-	void (*set_rx_owner) (struct dma_desc *p);
-	/* Get the receive frame size */
-	int (*get_rx_frame_len) (struct dma_desc *p, int rx_coe_type);
-	/* Return the reception status looking at the RDES1 */
-	int (*rx_status) (void *data, struct stmmac_extra_stats *x,
-			  struct dma_desc *p);
-	void (*rx_extended_status) (void *data, struct stmmac_extra_stats *x,
-				    struct dma_extended_desc *p);
-	/* Set tx timestamp enable bit */
-	void (*enable_tx_timestamp) (struct dma_desc *p);
-	/* get tx timestamp status */
-	int (*get_tx_timestamp_status) (struct dma_desc *p);
-	/* get timestamp value */
-	 u64(*get_timestamp) (void *desc, u32 ats);
-	/* get rx timestamp status */
-	int (*get_rx_timestamp_status)(void *desc, void *next_desc, u32 ats);
-	/* Display ring */
-	void (*display_ring)(void *head, unsigned int size, bool rx);
-	/* set MSS via context descriptor */
-	void (*set_mss)(struct dma_desc *p, unsigned int mss);
-};
-
 extern const struct stmmac_desc_ops enh_desc_ops;
 extern const struct stmmac_desc_ops ndesc_ops;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
index 2a6521d..65ed896c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
@@ -223,7 +223,7 @@ static int dwmac4_wrback_get_tx_timestamp_status(struct dma_desc *p)
 	return 0;
 }
 
-static inline u64 dwmac4_get_timestamp(void *desc, u32 ats)
+static inline void dwmac4_get_timestamp(void *desc, u32 ats, u64 *ts)
 {
 	struct dma_desc *p = (struct dma_desc *)desc;
 	u64 ns;
@@ -232,7 +232,7 @@ static inline u64 dwmac4_get_timestamp(void *desc, u32 ats)
 	/* convert high/sec time stamp value to nanosecond */
 	ns += le32_to_cpu(p->des1) * 1000000000ULL;
 
-	return ns;
+	*ts = ns;
 }
 
 static int dwmac4_rx_check_timestamp(void *desc)
diff --git a/drivers/net/ethernet/stmicro/stmmac/enh_desc.c b/drivers/net/ethernet/stmicro/stmmac/enh_desc.c
index 6768a25..3bfb3f5 100644
--- a/drivers/net/ethernet/stmicro/stmmac/enh_desc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/enh_desc.c
@@ -382,7 +382,7 @@ static int enh_desc_get_tx_timestamp_status(struct dma_desc *p)
 	return (le32_to_cpu(p->des0) & ETDES0_TIME_STAMP_STATUS) >> 17;
 }
 
-static u64 enh_desc_get_timestamp(void *desc, u32 ats)
+static void enh_desc_get_timestamp(void *desc, u32 ats, u64 *ts)
 {
 	u64 ns;
 
@@ -397,7 +397,7 @@ static u64 enh_desc_get_timestamp(void *desc, u32 ats)
 		ns += le32_to_cpu(p->des3) * 1000000000ULL;
 	}
 
-	return ns;
+	*ts = ns;
 }
 
 static int enh_desc_get_rx_timestamp_status(void *desc, void *next_desc,
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
new file mode 100644
index 0000000..4994677
--- /dev/null
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: (GPL-2.0 OR MIT)
+// Copyright (c) 2018 Synopsys, Inc. and/or its affiliates.
+// stmmac HW Interface Callbacks
+
+#ifndef __STMMAC_HWIF_H__
+#define __STMMAC_HWIF_H__
+
+#define stmmac_do_void_callback(__priv, __module, __cname,  __arg0, __args...) \
+({ \
+	int __result = -EINVAL; \
+	if ((__priv)->hw->__module->__cname) { \
+		(__priv)->hw->__module->__cname((__arg0), ##__args); \
+		__result = 0; \
+	} \
+	__result; \
+})
+#define stmmac_do_callback(__priv, __module, __cname,  __arg0, __args...) \
+({ \
+	int __result = -EINVAL; \
+	if ((__priv)->hw->__module->__cname) \
+		__result = (__priv)->hw->__module->__cname((__arg0), ##__args); \
+	__result; \
+})
+
+struct stmmac_extra_stats;
+struct stmmac_safety_stats;
+struct dma_desc;
+struct dma_extended_desc;
+
+/* Descriptors helpers */
+struct stmmac_desc_ops {
+	/* DMA RX descriptor ring initialization */
+	void (*init_rx_desc)(struct dma_desc *p, int disable_rx_ic, int mode,
+			int end);
+	/* DMA TX descriptor ring initialization */
+	void (*init_tx_desc)(struct dma_desc *p, int mode, int end);
+	/* Invoked by the xmit function to prepare the tx descriptor */
+	void (*prepare_tx_desc)(struct dma_desc *p, int is_fs, int len,
+			bool csum_flag, int mode, bool tx_own, bool ls,
+			unsigned int tot_pkt_len);
+	void (*prepare_tso_tx_desc)(struct dma_desc *p, int is_fs, int len1,
+			int len2, bool tx_own, bool ls, unsigned int tcphdrlen,
+			unsigned int tcppayloadlen);
+	/* Set/get the owner of the descriptor */
+	void (*set_tx_owner)(struct dma_desc *p);
+	int (*get_tx_owner)(struct dma_desc *p);
+	/* Clean the tx descriptor as soon as the tx irq is received */
+	void (*release_tx_desc)(struct dma_desc *p, int mode);
+	/* Clear interrupt on tx frame completion. When this bit is
+	 * set an interrupt happens as soon as the frame is transmitted */
+	void (*set_tx_ic)(struct dma_desc *p);
+	/* Last tx segment reports the transmit status */
+	int (*get_tx_ls)(struct dma_desc *p);
+	/* Return the transmit status looking at the TDES1 */
+	int (*tx_status)(void *data, struct stmmac_extra_stats *x,
+			struct dma_desc *p, void __iomem *ioaddr);
+	/* Get the buffer size from the descriptor */
+	int (*get_tx_len)(struct dma_desc *p);
+	/* Handle extra events on specific interrupts hw dependent */
+	void (*set_rx_owner)(struct dma_desc *p);
+	/* Get the receive frame size */
+	int (*get_rx_frame_len)(struct dma_desc *p, int rx_coe_type);
+	/* Return the reception status looking at the RDES1 */
+	int (*rx_status)(void *data, struct stmmac_extra_stats *x,
+			struct dma_desc *p);
+	void (*rx_extended_status)(void *data, struct stmmac_extra_stats *x,
+			struct dma_extended_desc *p);
+	/* Set tx timestamp enable bit */
+	void (*enable_tx_timestamp) (struct dma_desc *p);
+	/* get tx timestamp status */
+	int (*get_tx_timestamp_status) (struct dma_desc *p);
+	/* get timestamp value */
+	void (*get_timestamp)(void *desc, u32 ats, u64 *ts);
+	/* get rx timestamp status */
+	int (*get_rx_timestamp_status)(void *desc, void *next_desc, u32 ats);
+	/* Display ring */
+	void (*display_ring)(void *head, unsigned int size, bool rx);
+	/* set MSS via context descriptor */
+	void (*set_mss)(struct dma_desc *p, unsigned int mss);
+};
+
+#define stmmac_init_rx_desc(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, init_rx_desc, __args)
+#define stmmac_init_tx_desc(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, init_tx_desc, __args)
+#define stmmac_prepare_tx_desc(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, prepare_tx_desc, __args)
+#define stmmac_prepare_tso_tx_desc(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, prepare_tso_tx_desc, __args)
+#define stmmac_set_tx_owner(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, set_tx_owner, __args)
+#define stmmac_get_tx_owner(__priv, __args...) \
+	stmmac_do_callback(__priv, desc, get_tx_owner, __args)
+#define stmmac_release_tx_desc(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, release_tx_desc, __args)
+#define stmmac_set_tx_ic(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, set_tx_ic, __args)
+#define stmmac_get_tx_ls(__priv, __args...) \
+	stmmac_do_callback(__priv, desc, get_tx_ls, __args)
+#define stmmac_tx_status(__priv, __args...) \
+	stmmac_do_callback(__priv, desc, tx_status, __args)
+#define stmmac_get_tx_len(__priv, __args...) \
+	stmmac_do_callback(__priv, desc, get_tx_len, __args)
+#define stmmac_set_rx_owner(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, set_rx_owner, __args)
+#define stmmac_get_rx_frame_len(__priv, __args...) \
+	stmmac_do_callback(__priv, desc, get_rx_frame_len, __args)
+#define stmmac_rx_status(__priv, __args...) \
+	stmmac_do_callback(__priv, desc, rx_status, __args)
+#define stmmac_rx_extended_status(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, rx_extended_status, __args)
+#define stmmac_enable_tx_timestamp(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, enable_tx_timestamp, __args)
+#define stmmac_get_tx_timestamp_status(__priv, __args...) \
+	stmmac_do_callback(__priv, desc, get_tx_timestamp_status, __args)
+#define stmmac_get_timestamp(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, get_timestamp, __args)
+#define stmmac_get_rx_timestamp_status(__priv, __args...) \
+	stmmac_do_callback(__priv, desc, get_rx_timestamp_status, __args)
+#define stmmac_display_ring(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, display_ring, __args)
+#define stmmac_set_mss(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, set_mss, __args)
+
+#endif /* __STMMAC_HWIF_H__ */
diff --git a/drivers/net/ethernet/stmicro/stmmac/norm_desc.c b/drivers/net/ethernet/stmicro/stmmac/norm_desc.c
index ebd9e5e..7b1d901 100644
--- a/drivers/net/ethernet/stmicro/stmmac/norm_desc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/norm_desc.c
@@ -253,7 +253,7 @@ static int ndesc_get_tx_timestamp_status(struct dma_desc *p)
 	return (le32_to_cpu(p->des0) & TDES0_TIME_STAMP_STATUS) >> 17;
 }
 
-static u64 ndesc_get_timestamp(void *desc, u32 ats)
+static void ndesc_get_timestamp(void *desc, u32 ats, u64 *ts)
 {
 	struct dma_desc *p = (struct dma_desc *)desc;
 	u64 ns;
@@ -262,7 +262,7 @@ static u64 ndesc_get_timestamp(void *desc, u32 ats)
 	/* convert high/sec time stamp value to nanosecond */
 	ns += le32_to_cpu(p->des3) * 1000000000ULL;
 
-	return ns;
+	*ts = ns;
 }
 
 static int ndesc_get_rx_timestamp_status(void *desc, void *next_desc, u32 ats)
diff --git a/drivers/net/ethernet/stmicro/stmmac/ring_mode.c b/drivers/net/ethernet/stmicro/stmmac/ring_mode.c
index 28e4b5d..808ca75 100644
--- a/drivers/net/ethernet/stmicro/stmmac/ring_mode.c
+++ b/drivers/net/ethernet/stmicro/stmmac/ring_mode.c
@@ -58,9 +58,8 @@ static int stmmac_jumbo_frm(void *p, struct sk_buff *skb, int csum)
 		tx_q->tx_skbuff_dma[entry].is_jumbo = true;
 
 		desc->des3 = cpu_to_le32(des2 + BUF_SIZE_4KiB);
-		priv->hw->desc->prepare_tx_desc(desc, 1, bmax, csum,
-						STMMAC_RING_MODE, 0,
-						false, skb->len);
+		stmmac_prepare_tx_desc(priv, desc, 1, bmax, csum,
+				STMMAC_RING_MODE, 0, false, skb->len);
 		tx_q->tx_skbuff[entry] = NULL;
 		entry = STMMAC_GET_ENTRY(entry, DMA_TX_SIZE);
 
@@ -79,9 +78,8 @@ static int stmmac_jumbo_frm(void *p, struct sk_buff *skb, int csum)
 		tx_q->tx_skbuff_dma[entry].is_jumbo = true;
 
 		desc->des3 = cpu_to_le32(des2 + BUF_SIZE_4KiB);
-		priv->hw->desc->prepare_tx_desc(desc, 0, len, csum,
-						STMMAC_RING_MODE, 1,
-						true, skb->len);
+		stmmac_prepare_tx_desc(priv, desc, 0, len, csum,
+				STMMAC_RING_MODE, 1, true, skb->len);
 	} else {
 		des2 = dma_map_single(priv->device, skb->data,
 				      nopaged_len, DMA_TO_DEVICE);
@@ -92,9 +90,8 @@ static int stmmac_jumbo_frm(void *p, struct sk_buff *skb, int csum)
 		tx_q->tx_skbuff_dma[entry].len = nopaged_len;
 		tx_q->tx_skbuff_dma[entry].is_jumbo = true;
 		desc->des3 = cpu_to_le32(des2 + BUF_SIZE_4KiB);
-		priv->hw->desc->prepare_tx_desc(desc, 1, nopaged_len, csum,
-						STMMAC_RING_MODE, 0,
-						true, skb->len);
+		stmmac_prepare_tx_desc(priv, desc, 1, nopaged_len, csum,
+				STMMAC_RING_MODE, 0, true, skb->len);
 	}
 
 	tx_q->cur_tx = entry;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 9a16931..c44e19d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -50,6 +50,7 @@
 #include <linux/reset.h>
 #include <linux/of_mdio.h>
 #include "dwmac1000.h"
+#include "hwif.h"
 
 #define STMMAC_ALIGN(x)	L1_CACHE_ALIGN(x)
 #define	TSO_MAX_BUFF_SIZE	(SZ_16K - 1)
@@ -464,9 +465,9 @@ static void stmmac_get_tx_hwtstamp(struct stmmac_priv *priv,
 		return;
 
 	/* check tx tstamp status */
-	if (priv->hw->desc->get_tx_timestamp_status(p)) {
+	if (stmmac_get_tx_timestamp_status(priv, p)) {
 		/* get the valid tstamp */
-		ns = priv->hw->desc->get_timestamp(p, priv->adv_ts);
+		stmmac_get_timestamp(priv, p, priv->adv_ts, &ns);
 
 		memset(&shhwtstamp, 0, sizeof(struct skb_shared_hwtstamps));
 		shhwtstamp.hwtstamp = ns_to_ktime(ns);
@@ -502,8 +503,8 @@ static void stmmac_get_rx_hwtstamp(struct stmmac_priv *priv, struct dma_desc *p,
 		desc = np;
 
 	/* Check if timestamp is available */
-	if (priv->hw->desc->get_rx_timestamp_status(p, np, priv->adv_ts)) {
-		ns = priv->hw->desc->get_timestamp(desc, priv->adv_ts);
+	if (stmmac_get_rx_timestamp_status(priv, p, np, priv->adv_ts)) {
+		stmmac_get_timestamp(priv, desc, priv->adv_ts, &ns);
 		netdev_dbg(priv->dev, "get valid RX hw timestamp %llu\n", ns);
 		shhwtstamp = skb_hwtstamps(skb);
 		memset(shhwtstamp, 0, sizeof(struct skb_shared_hwtstamps));
@@ -1008,7 +1009,7 @@ static void stmmac_display_rx_rings(struct stmmac_priv *priv)
 			head_rx = (void *)rx_q->dma_rx;
 
 		/* Display RX ring */
-		priv->hw->desc->display_ring(head_rx, DMA_RX_SIZE, true);
+		stmmac_display_ring(priv, head_rx, DMA_RX_SIZE, true);
 	}
 }
 
@@ -1029,7 +1030,7 @@ static void stmmac_display_tx_rings(struct stmmac_priv *priv)
 		else
 			head_tx = (void *)tx_q->dma_tx;
 
-		priv->hw->desc->display_ring(head_tx, DMA_TX_SIZE, false);
+		stmmac_display_ring(priv, head_tx, DMA_TX_SIZE, false);
 	}
 }
 
@@ -1073,13 +1074,13 @@ static void stmmac_clear_rx_descriptors(struct stmmac_priv *priv, u32 queue)
 	/* Clear the RX descriptors */
 	for (i = 0; i < DMA_RX_SIZE; i++)
 		if (priv->extend_desc)
-			priv->hw->desc->init_rx_desc(&rx_q->dma_erx[i].basic,
-						     priv->use_riwt, priv->mode,
-						     (i == DMA_RX_SIZE - 1));
+			stmmac_init_rx_desc(priv, &rx_q->dma_erx[i].basic,
+					priv->use_riwt, priv->mode,
+					(i == DMA_RX_SIZE - 1));
 		else
-			priv->hw->desc->init_rx_desc(&rx_q->dma_rx[i],
-						     priv->use_riwt, priv->mode,
-						     (i == DMA_RX_SIZE - 1));
+			stmmac_init_rx_desc(priv, &rx_q->dma_rx[i],
+					priv->use_riwt, priv->mode,
+					(i == DMA_RX_SIZE - 1));
 }
 
 /**
@@ -1097,13 +1098,11 @@ static void stmmac_clear_tx_descriptors(struct stmmac_priv *priv, u32 queue)
 	/* Clear the TX descriptors */
 	for (i = 0; i < DMA_TX_SIZE; i++)
 		if (priv->extend_desc)
-			priv->hw->desc->init_tx_desc(&tx_q->dma_etx[i].basic,
-						     priv->mode,
-						     (i == DMA_TX_SIZE - 1));
+			stmmac_init_tx_desc(priv, &tx_q->dma_etx[i].basic,
+					priv->mode, (i == DMA_TX_SIZE - 1));
 		else
-			priv->hw->desc->init_tx_desc(&tx_q->dma_tx[i],
-						     priv->mode,
-						     (i == DMA_TX_SIZE - 1));
+			stmmac_init_tx_desc(priv, &tx_q->dma_tx[i],
+					priv->mode, (i == DMA_TX_SIZE - 1));
 }
 
 /**
@@ -1851,9 +1850,8 @@ static void stmmac_tx_clean(struct stmmac_priv *priv, u32 queue)
 		else
 			p = tx_q->dma_tx + entry;
 
-		status = priv->hw->desc->tx_status(&priv->dev->stats,
-						      &priv->xstats, p,
-						      priv->ioaddr);
+		status = stmmac_tx_status(priv, &priv->dev->stats,
+				&priv->xstats, p, priv->ioaddr);
 		/* Check if the descriptor is owned by the DMA */
 		if (unlikely(status & tx_dma_own))
 			break;
@@ -1904,7 +1902,7 @@ static void stmmac_tx_clean(struct stmmac_priv *priv, u32 queue)
 			tx_q->tx_skbuff[entry] = NULL;
 		}
 
-		priv->hw->desc->release_tx_desc(p, priv->mode);
+		stmmac_release_tx_desc(priv, p, priv->mode);
 
 		entry = STMMAC_GET_ENTRY(entry, DMA_TX_SIZE);
 	}
@@ -1957,13 +1955,11 @@ static void stmmac_tx_err(struct stmmac_priv *priv, u32 chan)
 	dma_free_tx_skbufs(priv, chan);
 	for (i = 0; i < DMA_TX_SIZE; i++)
 		if (priv->extend_desc)
-			priv->hw->desc->init_tx_desc(&tx_q->dma_etx[i].basic,
-						     priv->mode,
-						     (i == DMA_TX_SIZE - 1));
+			stmmac_init_tx_desc(priv, &tx_q->dma_etx[i].basic,
+					priv->mode, (i == DMA_TX_SIZE - 1));
 		else
-			priv->hw->desc->init_tx_desc(&tx_q->dma_tx[i],
-						     priv->mode,
-						     (i == DMA_TX_SIZE - 1));
+			stmmac_init_tx_desc(priv, &tx_q->dma_tx[i],
+					priv->mode, (i == DMA_TX_SIZE - 1));
 	tx_q->dirty_tx = 0;
 	tx_q->cur_tx = 0;
 	tx_q->mss = 0;
@@ -2851,10 +2847,10 @@ static void stmmac_tso_allocator(struct stmmac_priv *priv, unsigned int des,
 		buff_size = tmp_len >= TSO_MAX_BUFF_SIZE ?
 			    TSO_MAX_BUFF_SIZE : tmp_len;
 
-		priv->hw->desc->prepare_tso_tx_desc(desc, 0, buff_size,
-			0, 1,
-			(last_segment) && (tmp_len <= TSO_MAX_BUFF_SIZE),
-			0, 0);
+		stmmac_prepare_tso_tx_desc(priv, desc, 0, buff_size,
+				0, 1,
+				(last_segment) && (tmp_len <= TSO_MAX_BUFF_SIZE),
+				0, 0);
 
 		tmp_len -= TSO_MAX_BUFF_SIZE;
 	}
@@ -2926,7 +2922,7 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* set new MSS value if needed */
 	if (mss != tx_q->mss) {
 		mss_desc = tx_q->dma_tx + tx_q->cur_tx;
-		priv->hw->desc->set_mss(mss_desc, mss);
+		stmmac_set_mss(priv, mss_desc, mss);
 		tx_q->mss = mss;
 		tx_q->cur_tx = STMMAC_GET_ENTRY(tx_q->cur_tx, DMA_TX_SIZE);
 		WARN_ON(tx_q->tx_skbuff[tx_q->cur_tx]);
@@ -3012,7 +3008,7 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
 			  STMMAC_COAL_TIMER(priv->tx_coal_timer));
 	} else {
 		priv->tx_count_frames = 0;
-		priv->hw->desc->set_tx_ic(desc);
+		stmmac_set_tx_ic(priv, desc);
 		priv->xstats.tx_set_ic_bit++;
 	}
 
@@ -3022,11 +3018,11 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
 		     priv->hwts_tx_en)) {
 		/* declare that device is doing timestamping */
 		skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
-		priv->hw->desc->enable_tx_timestamp(first);
+		stmmac_enable_tx_timestamp(priv, first);
 	}
 
 	/* Complete the first descriptor before granting the DMA */
-	priv->hw->desc->prepare_tso_tx_desc(first, 1,
+	stmmac_prepare_tso_tx_desc(priv, first, 1,
 			proto_hdr_len,
 			pay_len,
 			1, tx_q->tx_skbuff_dma[first_entry].last_segment,
@@ -3040,7 +3036,7 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
 		 * sure that MSS's own bit is the last thing written.
 		 */
 		dma_wmb();
-		priv->hw->desc->set_tx_owner(mss_desc);
+		stmmac_set_tx_owner(priv, mss_desc);
 	}
 
 	/* The own bit must be the latest setting done when prepare the
@@ -3054,8 +3050,7 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
 			__func__, tx_q->cur_tx, tx_q->dirty_tx, first_entry,
 			tx_q->cur_tx, first, nfrags);
 
-		priv->hw->desc->display_ring((void *)tx_q->dma_tx, DMA_TX_SIZE,
-					     0);
+		stmmac_display_ring(priv, (void *)tx_q->dma_tx, DMA_TX_SIZE, 0);
 
 		pr_info(">>> frame to be transmitted: ");
 		print_pkt(skb->data, skb_headlen(skb));
@@ -3174,9 +3169,8 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 		tx_q->tx_skbuff_dma[entry].last_segment = last_segment;
 
 		/* Prepare the descriptor and set the own bit too */
-		priv->hw->desc->prepare_tx_desc(desc, 0, len, csum_insertion,
-						priv->mode, 1, last_segment,
-						skb->len);
+		stmmac_prepare_tx_desc(priv, desc, 0, len, csum_insertion,
+				priv->mode, 1, last_segment, skb->len);
 	}
 
 	/* Only the last descriptor gets to point to the skb. */
@@ -3203,7 +3197,7 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 		else
 			tx_head = (void *)tx_q->dma_tx;
 
-		priv->hw->desc->display_ring(tx_head, DMA_TX_SIZE, false);
+		stmmac_display_ring(priv, tx_head, DMA_TX_SIZE, false);
 
 		netdev_dbg(priv->dev, ">>> frame to be transmitted: ");
 		print_pkt(skb->data, skb->len);
@@ -3228,7 +3222,7 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 			  STMMAC_COAL_TIMER(priv->tx_coal_timer));
 	} else {
 		priv->tx_count_frames = 0;
-		priv->hw->desc->set_tx_ic(desc);
+		stmmac_set_tx_ic(priv, desc);
 		priv->xstats.tx_set_ic_bit++;
 	}
 
@@ -3259,13 +3253,13 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 			     priv->hwts_tx_en)) {
 			/* declare that device is doing timestamping */
 			skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
-			priv->hw->desc->enable_tx_timestamp(first);
+			stmmac_enable_tx_timestamp(priv, first);
 		}
 
 		/* Prepare the first descriptor setting the OWN bit too */
-		priv->hw->desc->prepare_tx_desc(first, 1, nopaged_len,
-						csum_insertion, priv->mode, 1,
-						last_segment, skb->len);
+		stmmac_prepare_tx_desc(priv, first, 1, nopaged_len,
+				csum_insertion, priv->mode, 1, last_segment,
+				skb->len);
 
 		/* The own bit must be the latest setting done when prepare the
 		 * descriptor and then barrier is needed to make sure that
@@ -3382,9 +3376,9 @@ static inline void stmmac_rx_refill(struct stmmac_priv *priv, u32 queue)
 		dma_wmb();
 
 		if (unlikely(priv->synopsys_id >= DWMAC_CORE_4_00))
-			priv->hw->desc->init_rx_desc(p, priv->use_riwt, 0, 0);
+			stmmac_init_rx_desc(priv, p, priv->use_riwt, 0, 0);
 		else
-			priv->hw->desc->set_rx_owner(p);
+			stmmac_set_rx_owner(priv, p);
 
 		dma_wmb();
 
@@ -3418,7 +3412,7 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
 		else
 			rx_head = (void *)rx_q->dma_rx;
 
-		priv->hw->desc->display_ring(rx_head, DMA_RX_SIZE, true);
+		stmmac_display_ring(priv, rx_head, DMA_RX_SIZE, true);
 	}
 	while (count < limit) {
 		int status;
@@ -3431,8 +3425,8 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
 			p = rx_q->dma_rx + entry;
 
 		/* read the status of the incoming frame */
-		status = priv->hw->desc->rx_status(&priv->dev->stats,
-						   &priv->xstats, p);
+		status = stmmac_rx_status(priv, &priv->dev->stats,
+				&priv->xstats, p);
 		/* check if managed by the DMA otherwise go ahead */
 		if (unlikely(status & dma_own))
 			break;
@@ -3449,11 +3443,9 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
 
 		prefetch(np);
 
-		if ((priv->extend_desc) && (priv->hw->desc->rx_extended_status))
-			priv->hw->desc->rx_extended_status(&priv->dev->stats,
-							   &priv->xstats,
-							   rx_q->dma_erx +
-							   entry);
+		if (priv->extend_desc)
+			stmmac_rx_extended_status(priv, &priv->dev->stats,
+					&priv->xstats, rx_q->dma_erx + entry);
 		if (unlikely(status == discard_frame)) {
 			priv->dev->stats.rx_errors++;
 			if (priv->hwts_rx_en && !priv->extend_desc) {
@@ -3479,7 +3471,7 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
 			else
 				des = le32_to_cpu(p->des2);
 
-			frame_len = priv->hw->desc->get_rx_frame_len(p, coe);
+			frame_len = stmmac_get_rx_frame_len(priv, p, coe);
 
 			/*  If frame length is greater than skb buffer size
 			 *  (preallocated during init) then the packet is
-- 
2.9.3

^ permalink raw reply related

* [PATCH net-next 0/5] net: stmmac: Stop using hard-coded callbacks
From: Jose Abreu @ 2018-04-06 13:08 UTC (permalink / raw)
  To: netdev
  Cc: Jose Abreu, David S. Miller, Joao Pinto, Giuseppe Cavallaro,
	Alexandre Torgue

This a starting point for a cleanup and re-organization of stmmac.

In this series we stop using hard-coded callbacks along the code and use
instead helpers which are defined in a single place ("hwif.h").

This brings several advantages:
	1) Less typing :)
	2) Guaranteed function pointer check
	3) More flexibility

By 2) we stop using the repeated pattern of:
	if (priv->hw->mac->some_func)
		priv->hw->mac->some_func(...)

I didn't check but I expect the final .ko will be bigger with this series
because *all* of function pointers are checked.

Anyway, I hope this can make the code more readable and more flexible now.

Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>

Jose Abreu (5):
  net: stmmac: Switch stmmac_desc_ops to generic HW Interface Helpers
  net: stmmac: Switch stmmac_dma_ops to generic HW Interface Helpers
  net: stmmac: Switch stmmac_ops to generic HW Interface Helpers
  net: stmmac: Switch stmmac_hwtimestamp to generic HW Interface Helpers
  net: stmmac: Switch stmmac_mode_ops to generic HW Interface Helpers

 drivers/net/ethernet/stmicro/stmmac/chain_mode.c   |  34 +-
 drivers/net/ethernet/stmicro/stmmac/common.h       | 199 +---------
 drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c |   4 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac5.c       |  19 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac5.h       |   6 +-
 drivers/net/ethernet/stmicro/stmmac/enh_desc.c     |   4 +-
 drivers/net/ethernet/stmicro/stmmac/hwif.h         | 421 ++++++++++++++++++++
 drivers/net/ethernet/stmicro/stmmac/norm_desc.c    |   4 +-
 drivers/net/ethernet/stmicro/stmmac/ring_mode.c    |  39 +-
 .../net/ethernet/stmicro/stmmac/stmmac_ethtool.c   |  82 ++--
 .../net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c  |  34 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 439 +++++++++------------
 drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c   |  18 +-
 13 files changed, 726 insertions(+), 577 deletions(-)
 create mode 100644 drivers/net/ethernet/stmicro/stmmac/hwif.h

-- 
2.9.3

^ permalink raw reply

* Re: [PATCH net-next] netns: filter uevents correctly
From: Christian Brauner @ 2018-04-06 13:07 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Kirill Tkhai, davem, gregkh, netdev, linux-kernel, avagin, serge
In-Reply-To: <87in953ryi.fsf@xmission.com>

On Thu, Apr 05, 2018 at 10:59:49PM -0500, Eric W. Biederman wrote:
> Christian Brauner <christian.brauner@canonical.com> writes:
> 
> > On Thu, Apr 05, 2018 at 05:26:59PM +0300, Kirill Tkhai wrote:
> >> On 05.04.2018 17:07, Christian Brauner wrote:
> >> > On Thu, Apr 05, 2018 at 04:01:03PM +0300, Kirill Tkhai wrote:
> >> >> On 04.04.2018 22:48, Christian Brauner wrote:
> >> >>> commit 07e98962fa77 ("kobject: Send hotplug events in all network namespaces")
> >> >>>
> >> >>> enabled sending hotplug events into all network namespaces back in 2010.
> >> >>> Over time the set of uevents that get sent into all network namespaces has
> >> >>> shrunk. We have now reached the point where hotplug events for all devices
> >> >>> that carry a namespace tag are filtered according to that namespace.
> >> >>>
> >> >>> Specifically, they are filtered whenever the namespace tag of the kobject
> >> >>> does not match the namespace tag of the netlink socket. One example are
> >> >>> network devices. Uevents for network devices only show up in the network
> >> >>> namespaces these devices are moved to or created in.
> >> >>>
> >> >>> However, any uevent for a kobject that does not have a namespace tag
> >> >>> associated with it will not be filtered and we will *try* to broadcast it
> >> >>> into all network namespaces.
> >> >>>
> >> >>> The original patchset was written in 2010 before user namespaces were a
> >> >>> thing. With the introduction of user namespaces sending out uevents became
> >> >>> partially isolated as they were filtered by user namespaces:
> >> >>>
> >> >>> net/netlink/af_netlink.c:do_one_broadcast()
> >> >>>
> >> >>> if (!net_eq(sock_net(sk), p->net)) {
> >> >>>         if (!(nlk->flags & NETLINK_F_LISTEN_ALL_NSID))
> >> >>>                 return;
> >> >>>
> >> >>>         if (!peernet_has_id(sock_net(sk), p->net))
> >> >>>                 return;
> >> >>>
> >> >>>         if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns,
> >> >>>                              CAP_NET_BROADCAST))
> >> >>>         j       return;
> >> >>> }
> >> >>>
> >> >>> The file_ns_capable() check will check whether the caller had
> >> >>> CAP_NET_BROADCAST at the time of opening the netlink socket in the user
> >> >>> namespace of interest. This check is fine in general but seems insufficient
> >> >>> to me when paired with uevents. The reason is that devices always belong to
> >> >>> the initial user namespace so uevents for kobjects that do not carry a
> >> >>> namespace tag should never be sent into another user namespace. This has
> >> >>> been the intention all along. But there's one case where this breaks,
> >> >>> namely if a new user namespace is created by root on the host and an
> >> >>> identity mapping is established between root on the host and root in the
> >> >>> new user namespace. Here's a reproducer:
> >> >>>
> >> >>>  sudo unshare -U --map-root
> >> >>>  udevadm monitor -k
> >> >>>  # Now change to initial user namespace and e.g. do
> >> >>>  modprobe kvm
> >> >>>  # or
> >> >>>  rmmod kvm
> >> >>>
> >> >>> will allow the non-initial user namespace to retrieve all uevents from the
> >> >>> host. This seems very anecdotal given that in the general case user
> >> >>> namespaces do not see any uevents and also can't really do anything useful
> >> >>> with them.
> >> >>>
> >> >>> Additionally, it is now possible to send uevents from userspace. As such we
> >> >>> can let a sufficiently privileged (CAP_SYS_ADMIN in the owning user
> >> >>> namespace of the network namespace of the netlink socket) userspace process
> >> >>> make a decision what uevents should be sent.
> >> >>>
> >> >>> This makes me think that we should simply ensure that uevents for kobjects
> >> >>> that do not carry a namespace tag are *always* filtered by user namespace
> >> >>> in kobj_bcast_filter(). Specifically:
> >> >>> - If the owning user namespace of the uevent socket is not init_user_ns the
> >> >>>   event will always be filtered.
> >> >>> - If the network namespace the uevent socket belongs to was created in the
> >> >>>   initial user namespace but was opened from a non-initial user namespace
> >> >>>   the event will be filtered as well.
> >> >>> Put another way, uevents for kobjects not carrying a namespace tag are now
> >> >>> always only sent to the initial user namespace. The regression potential
> >> >>> for this is near to non-existent since user namespaces can't really do
> >> >>> anything with interesting devices.
> >> >>>
> >> >>> Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
> >> >>> ---
> >> >>>  lib/kobject_uevent.c | 10 +++++++++-
> >> >>>  1 file changed, 9 insertions(+), 1 deletion(-)
> >> >>>
> >> >>> diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
> >> >>> index 15ea216a67ce..cb98cddb6e3b 100644
> >> >>> --- a/lib/kobject_uevent.c
> >> >>> +++ b/lib/kobject_uevent.c
> >> >>> @@ -251,7 +251,15 @@ static int kobj_bcast_filter(struct sock *dsk, struct sk_buff *skb, void *data)
> >> >>>  		return sock_ns != ns;
> >> >>>  	}
> >> >>>  
> >> >>> -	return 0;
> >> >>> +	/*
> >> >>> +	 * The kobject does not carry a namespace tag so filter by user
> >> >>> +	 * namespace below.
> >> >>> +	 */
> >> >>> +	if (sock_net(dsk)->user_ns != &init_user_ns)
> >> >>> +		return 1;
> >> >>> +
> >> >>> +	/* Check if socket was opened from non-initial user namespace. */
> >> >>> +	return sk_user_ns(dsk) != &init_user_ns;
> >> >>>  }
> >> >>>  #endif
> >> >>
> >> >> So, this prohibits to listen events of all devices except network-related
> >> >> in containers? If it's so, I don't think it's a good solution. Uevents is not
> >> > 
> >> > No, this is not correct: As it is right now *without my patch* no
> >> > non-initial user namespace is receiving *any uevents* but those
> >> > specifically namespaced such as those for network devices. This patch
> >> > doesn't change that at all. The commit message outlines this in detail
> >> > how this comes about.
> >> > There is only one case where this currently breaks and this is as I
> >> > outlined explicitly in my commit message when you create a new user
> >> > namespace and map container(0) -> host(0). This patch fixes this.
> >> 
> >> Could you please point the place, where non-initial user namespaces are filtered?
> >> I only see the kobj_bcast_filter() logic, and it used to return 0, which means "accepted".
> >> Now it will return 1 sometimes.
> >
> > Oh sure, it's in the commit message though. The callchain is
> > lib/kobject_uevent.c:kobject_uevent_net_broadcast() ->
> > nnet/netlink/af_netlink.c:netlink_broadcast_filtered() ->
> > net/netlink/af_netlink.c:do_one_broadcast():
> >
> > This codepiece will check whether the openened socket holds
> > CAP_NET_BROADCAST in the user namespace of the target network namespace
> > which it won't because we don't have device namespaces and all devices
> > belong to the initial set of namespaces.
> >
> >         if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns,
> >                              CAP_NET_BROADCAST))
> >         j       return;
> >
> 
> The above that only applies if someone has set NETLINK_F_LISTEN_ALL_NSID
> on their socket and has had someone with the appropriate privileges
> assign a peerrnetid.
> 
> All of which is to say that file_ns_capable is not nearly as applicable
> as it might be, and if you can pass the other two checks I think it is
> pointless (because the peernet attributes are not generated for
> kobj_uevents) but valid to receive events from outside your network
> namespace.
> 
> 
> I might be missing something but I don't see anything excluding network
> namespaces owned by !init_user_ns excluded from the kobject_uevent
> logic.
> 
> The uevent_sock_list has one entry per network namespace. And
> kobject_uevent_net_broacast appears to walk each one.

Yeah, it definitely does.

> 
> I had a memory of filtering uevent messages and I had a memory
> that the netlink_has_listeners had a per network namespace effect.
> Neither seems true from my inspection of the code tonight.

Yeah, I drew the same conclusion.

> 
> If we are not filtering ordinary uevents at least at the user namespace
> level that does seem to be at least a nuisance.

This patch would filter based on user namespace and bump it from "we're
accidently doing the right thing" to "we're doing the right on purpose"
and without accidently leaking uevents in some corner cases.
Using kobj_bcast_filter() for this seems like the right thing to do.
This sounds like you don't disagree with the approach I'm taking here?

On a sidenote, it also very much feels like we're leaking information if
we're not filtering based on user namespaces on purpose. Non-initial
user namespaces should not be able to receive information about device
attribues simply via netlink uevent messages. At least not *trivially*
[1] by opening a netlink uevent socket.

> 
> 
> Christian can you dig a little deeper into this.  I have a feeling that
> there are some real efficiency improvements that we could make to
> kobject_uevent_net_broadcast if nothing else.

Yeah, for sure! I already started doing this. This patch here is
basically a preparatory patch for that work which is on my todo.

Christian

> 
> Perhaps you could see where uevents are broadcast by poking
> the sysfs uevent of an existing device, and triggering a retransmit.
> 
> Eric
> 
>                                                      
[1]: I mean they technically could very likely still try to gather the
     same info by constantly parsing all of sysfs and try to detect new
     PCI paths/directories and so on but it shouldn't be as easy as
     opening a netlink uevent socket.

^ permalink raw reply

* Re: [RFC PATCH net-next v5 2/4] net: Introduce generic bypass module
From: Jiri Pirko @ 2018-04-06 12:57 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: mst, stephen, davem, netdev, virtualization, virtio-dev,
	jesse.brandeburg, alexander.h.duyck, kubakici, jasowang,
	loseweigh
In-Reply-To: <1522962503-3963-3-git-send-email-sridhar.samudrala@intel.com>

Thu, Apr 05, 2018 at 11:08:21PM CEST, sridhar.samudrala@intel.com wrote:
>This provides a generic interface for paravirtual drivers to listen
>for netdev register/unregister/link change events from pci ethernet
>devices with the same MAC and takeover their datapath. The notifier and
>event handling code is based on the existing netvsc implementation. A
>paravirtual driver can use this module by registering a set of ops and
>each instance of the device when it is probed.
>
>Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>---
> include/net/bypass.h |  80 ++++++++++
> net/Kconfig          |  18 +++
> net/core/Makefile    |   1 +
> net/core/bypass.c    | 406 +++++++++++++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 505 insertions(+)
> create mode 100644 include/net/bypass.h
> create mode 100644 net/core/bypass.c
>
>diff --git a/include/net/bypass.h b/include/net/bypass.h
>new file mode 100644
>index 000000000000..e2dd122f951a
>--- /dev/null
>+++ b/include/net/bypass.h
>@@ -0,0 +1,80 @@
>+// SPDX-License-Identifier: GPL-2.0
>+/* Copyright (c) 2018, Intel Corporation. */
>+
>+#ifndef _NET_BYPASS_H
>+#define _NET_BYPASS_H
>+
>+#include <linux/netdevice.h>
>+
>+struct bypass_ops {

Perhaps "net_bypass_" would be better prefix for this module structs
and functions. No strong opinion though.


>+	int (*register_child)(struct net_device *bypass_netdev,
>+			      struct net_device *child_netdev);

We have master/slave upper/lower netdevices. This adds "child". Consider
using some existing names. Not sure if possible without loss of meaning.


>+	int (*join_child)(struct net_device *bypass_netdev,
>+			  struct net_device *child_netdev);
>+	int (*unregister_child)(struct net_device *bypass_netdev,
>+				struct net_device *child_netdev);
>+	int (*release_child)(struct net_device *bypass_netdev,
>+			     struct net_device *child_netdev);
>+	int (*update_link)(struct net_device *bypass_netdev,
>+			   struct net_device *child_netdev);
>+	rx_handler_result_t (*handle_frame)(struct sk_buff **pskb);
>+};
>+
>+struct bypass_instance {
>+	struct list_head list;
>+	struct net_device __rcu *bypass_netdev;
>+	struct bypass *bypass;
>+};
>+
>+struct bypass {
>+	struct list_head list;
>+	const struct bypass_ops *ops;
>+	const struct net_device_ops *netdev_ops;
>+	struct list_head instance_list;
>+	struct mutex lock;
>+};
>+
>+#if IS_ENABLED(CONFIG_NET_BYPASS)
>+
>+struct bypass *bypass_register_driver(const struct bypass_ops *ops,
>+				      const struct net_device_ops *netdev_ops);
>+void bypass_unregister_driver(struct bypass *bypass);
>+
>+int bypass_register_instance(struct bypass *bypass, struct net_device *dev);
>+int bypass_unregister_instance(struct bypass *bypass, struct net_device	*dev);
>+
>+int bypass_unregister_child(struct net_device *child_netdev);
>+
>+#else
>+
>+static inline
>+struct bypass *bypass_register_driver(const struct bypass_ops *ops,
>+				      const struct net_device_ops *netdev_ops)
>+{
>+	return NULL;
>+}
>+
>+static inline void bypass_unregister_driver(struct bypass *bypass)
>+{
>+}
>+
>+static inline int bypass_register_instance(struct bypass *bypass,
>+					   struct net_device *dev)
>+{
>+	return 0;
>+}
>+
>+static inline int bypass_unregister_instance(struct bypass *bypass,
>+					     struct net_device *dev)
>+{
>+	return 0;
>+}
>+
>+static inline int bypass_unregister_child(struct net_device *child_netdev)
>+{
>+	return 0;
>+}
>+
>+#endif
>+
>+#endif /* _NET_BYPASS_H */
>diff --git a/net/Kconfig b/net/Kconfig
>index 0428f12c25c2..994445f4a96a 100644
>--- a/net/Kconfig
>+++ b/net/Kconfig
>@@ -423,6 +423,24 @@ config MAY_USE_DEVLINK
> 	  on MAY_USE_DEVLINK to ensure they do not cause link errors when
> 	  devlink is a loadable module and the driver using it is built-in.
> 
>+config NET_BYPASS
>+	tristate "Bypass interface"
>+	---help---
>+	  This provides a generic interface for paravirtual drivers to listen
>+	  for netdev register/unregister/link change events from pci ethernet
>+	  devices with the same MAC and takeover their datapath. This also
>+	  enables live migration of a VM with direct attached VF by failing
>+	  over to the paravirtual datapath when the VF is unplugged.
>+
>+config MAY_USE_BYPASS
>+	tristate
>+	default m if NET_BYPASS=m
>+	default y if NET_BYPASS=y || NET_BYPASS=n
>+	help
>+	  Drivers using the bypass infrastructure should have a dependency
>+	  on MAY_USE_BYPASS to ensure they do not cause link errors when
>+	  bypass is a loadable module and the driver using it is built-in.
>+
> endif   # if NET
> 
> # Used by archs to tell that they support BPF JIT compiler plus which flavour.
>diff --git a/net/core/Makefile b/net/core/Makefile
>index 6dbbba8c57ae..a9727ed1c8fc 100644
>--- a/net/core/Makefile
>+++ b/net/core/Makefile
>@@ -30,3 +30,4 @@ obj-$(CONFIG_DST_CACHE) += dst_cache.o
> obj-$(CONFIG_HWBM) += hwbm.o
> obj-$(CONFIG_NET_DEVLINK) += devlink.o
> obj-$(CONFIG_GRO_CELLS) += gro_cells.o
>+obj-$(CONFIG_NET_BYPASS) += bypass.o
>diff --git a/net/core/bypass.c b/net/core/bypass.c
>new file mode 100644
>index 000000000000..7bde962ec3d4
>--- /dev/null
>+++ b/net/core/bypass.c
>@@ -0,0 +1,406 @@
>+// SPDX-License-Identifier: GPL-2.0
>+/* Copyright (c) 2018, Intel Corporation. */
>+
>+/* A common module to handle registrations and notifications for paravirtual
>+ * drivers to enable accelerated datapath and support VF live migration.
>+ *
>+ * The notifier and event handling code is based on netvsc driver.
>+ */
>+
>+#include <linux/netdevice.h>
>+#include <linux/etherdevice.h>
>+#include <linux/ethtool.h>
>+#include <linux/module.h>
>+#include <linux/slab.h>
>+#include <linux/netdevice.h>
>+#include <linux/netpoll.h>
>+#include <linux/rtnetlink.h>
>+#include <linux/if_vlan.h>
>+#include <net/sch_generic.h>
>+#include <uapi/linux/if_arp.h>
>+#include <net/bypass.h>
>+
>+static LIST_HEAD(bypass_list);
>+
>+static DEFINE_MUTEX(bypass_mutex);

Why mutex? Apparently you don't need to sleep while holding a lock.
Simple spinlock would do.


>+
>+struct bypass_instance *bypass_instance_alloc(struct net_device *dev)
>+{
>+	struct bypass_instance *bypass_instance;
>+
>+	bypass_instance = kzalloc(sizeof(*bypass_instance), GFP_KERNEL);
>+	if (!bypass_instance)
>+		return NULL;
>+
>+	dev_hold(dev);
>+	rcu_assign_pointer(bypass_instance->bypass_netdev, dev);
>+
>+	return bypass_instance;
>+}
>+
>+void bypass_instance_free(struct bypass_instance *bypass_instance)
>+{
>+	struct net_device *bypass_netdev;
>+
>+	bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
>+
>+	dev_put(bypass_netdev);
>+	kfree(bypass_instance);
>+}
>+
>+static struct bypass_instance *bypass_get_instance_bymac(u8 *mac)
>+{
>+	struct bypass_instance *bypass_instance;
>+	struct net_device *bypass_netdev;
>+	struct bypass *bypass;
>+
>+	list_for_each_entry(bypass, &bypass_list, list) {
>+		mutex_lock(&bypass->lock);
>+		list_for_each_entry(bypass_instance, &bypass->instance_list,
>+				    list) {
>+			bypass_netdev =
>+				rcu_dereference(bypass_instance->bypass_netdev);
>+			if (ether_addr_equal(bypass_netdev->perm_addr, mac)) {
>+				mutex_unlock(&bypass->lock);
>+				goto out;
>+			}
>+		}
>+		mutex_unlock(&bypass->lock);
>+	}
>+
>+	bypass_instance = NULL;
>+out:
>+	return bypass_instance;
>+}
>+
>+static int bypass_register_child(struct net_device *child_netdev)
>+{
>+	struct bypass_instance *bypass_instance;
>+	struct bypass *bypass;
>+	struct net_device *bypass_netdev;
>+	int ret, orig_mtu;
>+
>+	ASSERT_RTNL();
>+
>+	mutex_lock(&bypass_mutex);
>+	bypass_instance = bypass_get_instance_bymac(child_netdev->perm_addr);
>+	if (!bypass_instance) {
>+		mutex_unlock(&bypass_mutex);
>+		goto done;
>+	}
>+
>+	bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
>+	bypass = bypass_instance->bypass;
>+	mutex_unlock(&bypass_mutex);
>+
>+	if (!bypass->ops->register_child)
>+		goto done;
>+
>+	ret = bypass->ops->register_child(bypass_netdev, child_netdev);
>+	if (ret != 0)
>+		goto done;
>+
>+	ret = netdev_rx_handler_register(child_netdev,
>+					 bypass->ops->handle_frame,
>+					 bypass_netdev);
>+	if (ret != 0) {
>+		netdev_err(child_netdev,
>+			   "can not register bypass rx handler (err = %d)\n",
>+			   ret);
>+		goto rx_handler_failed;
>+	}
>+
>+	ret = netdev_upper_dev_link(child_netdev, bypass_netdev, NULL);
>+	if (ret != 0) {
>+		netdev_err(child_netdev,

No line-wrap is needed here and in other cases like this.


>+			   "can not set master device %s (err = %d)\n",
>+			   bypass_netdev->name, ret);
>+		goto upper_link_failed;
>+	}
>+
>+	child_netdev->flags |= IFF_SLAVE;

Don't reuse IFF_SLAVE. That is bonding-specific thing. I know that
netvsc uses it, it is wrong.
Please rather introduce:
IFF_BYPASS for master
and IFF_BYPASS_SLAVE for slaves.




>+
>+	if (netif_running(bypass_netdev)) {
>+		ret = dev_open(child_netdev);
>+		if (ret && (ret != -EBUSY)) {
>+			netdev_err(bypass_netdev,
>+				   "Opening child %s failed ret:%d\n",
>+				   child_netdev->name, ret);
>+			goto err_interface_up;
>+		}
>+	}
>+
>+	/* Align MTU of child with master */
>+	orig_mtu = child_netdev->mtu;
>+	ret = dev_set_mtu(child_netdev, bypass_netdev->mtu);
>+	if (ret != 0) {
>+		netdev_err(bypass_netdev,
>+			   "unable to change mtu of %s to %u register failed\n",
>+			   child_netdev->name, bypass_netdev->mtu);
>+		goto err_set_mtu;
>+	}
>+
>+	ret = bypass->ops->join_child(bypass_netdev, child_netdev);
>+	if (ret != 0)
>+		goto err_join;
>+
>+	call_netdevice_notifiers(NETDEV_JOIN, child_netdev);
>+
>+	goto done;
>+
>+err_join:
>+	dev_set_mtu(child_netdev, orig_mtu);
>+err_set_mtu:
>+	dev_close(child_netdev);
>+err_interface_up:
>+	netdev_upper_dev_unlink(child_netdev, bypass_netdev);
>+	child_netdev->flags &= ~IFF_SLAVE;
>+upper_link_failed:
>+	netdev_rx_handler_unregister(child_netdev);
>+rx_handler_failed:
>+	bypass->ops->unregister_child(bypass_netdev, child_netdev);
>+
>+done:
>+	return NOTIFY_DONE;
>+}
>+
>+int bypass_unregister_child(struct net_device *child_netdev)
>+{
>+	struct bypass_instance *bypass_instance;
>+	struct net_device *bypass_netdev;
>+	struct bypass *bypass;
>+	int ret;
>+
>+	ASSERT_RTNL();
>+
>+	mutex_lock(&bypass_mutex);
>+	bypass_instance = bypass_get_instance_bymac(child_netdev->perm_addr);
>+	if (!bypass_instance) {
>+		mutex_unlock(&bypass_mutex);
>+		goto done;
>+	}
>+
>+	bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
>+	bypass = bypass_instance->bypass;
>+	mutex_unlock(&bypass_mutex);
>+
>+	ret = bypass->ops->release_child(bypass_netdev, child_netdev);
>+	if (ret != 0)
>+		goto done;
>+
>+	netdev_rx_handler_unregister(child_netdev);
>+	netdev_upper_dev_unlink(child_netdev, bypass_netdev);
>+	child_netdev->flags &= ~IFF_SLAVE;
>+
>+	if (!bypass->ops->unregister_child)
>+		goto done;
>+
>+	bypass->ops->unregister_child(bypass_netdev, child_netdev);
>+
>+done:
>+	return NOTIFY_DONE;
>+}
>+EXPORT_SYMBOL(bypass_unregister_child);

Please use "EXPORT_SYMBOL_GPL" for all exported symbols.


>+
>+static int bypass_update_link(struct net_device *child_netdev)
>+{
>+	struct bypass_instance *bypass_instance;
>+	struct net_device *bypass_netdev;
>+	struct bypass *bypass;
>+
>+	ASSERT_RTNL();
>+
>+	mutex_lock(&bypass_mutex);
>+	bypass_instance = bypass_get_instance_bymac(child_netdev->perm_addr);

You don't really need this lookup. The kernel knows about the master
device, you can just use netdev_master_upper_dev_get_rcu() to get it.


>+	if (!bypass_instance) {
>+		mutex_unlock(&bypass_mutex);
>+		goto done;
>+	}
>+
>+	bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
>+	bypass = bypass_instance->bypass;
>+	mutex_unlock(&bypass_mutex);
>+
>+	if (!bypass->ops->update_link)
>+		goto done;
>+
>+	bypass->ops->update_link(bypass_netdev, child_netdev);
>+
>+done:
>+	return NOTIFY_DONE;
>+}
>+
>+static bool bypass_validate_child_dev(struct net_device *dev)
>+{
>+	/* Avoid non-Ethernet type devices */
>+	if (dev->type != ARPHRD_ETHER)
>+		return false;
>+
>+	/* Avoid Vlan dev with same MAC registering as VF */
>+	if (is_vlan_dev(dev))
>+		return false;
>+
>+	/* Avoid Bonding master dev with same MAC registering as child dev */
>+	if ((dev->priv_flags & IFF_BONDING) && (dev->flags & IFF_MASTER))
>+		return false;
>+
>+	return true;
>+}
>+
>+static int
>+bypass_event(struct notifier_block *this, unsigned long event, void *ptr)
>+{
>+	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
>+	struct bypass *bypass;
>+
>+	/* Skip Parent events */
>+	mutex_lock(&bypass_mutex);
>+	list_for_each_entry(bypass, &bypass_list, list) {
>+		if (event_dev->netdev_ops == bypass->netdev_ops) {
>+			mutex_unlock(&bypass_mutex);
>+			return NOTIFY_DONE;
>+		}

What you need instead of this is an identification helper
netif_is_bypass_master()
similar to
netif_is_team_master()
netif_is_bridge_master()
etc

 

>+	}
>+	mutex_unlock(&bypass_mutex);
>+
>+	if (!bypass_validate_child_dev(event_dev))
>+		return NOTIFY_DONE;
>+
>+	switch (event) {
>+	case NETDEV_REGISTER:
>+		return bypass_register_child(event_dev);
>+	case NETDEV_UNREGISTER:
>+		return bypass_unregister_child(event_dev);
>+	case NETDEV_UP:
>+	case NETDEV_DOWN:
>+	case NETDEV_CHANGE:
>+		return bypass_update_link(event_dev);
>+	default:
>+		return NOTIFY_DONE;
>+	}
>+}
>+
>+static struct notifier_block bypass_notifier = {
>+	.notifier_call = bypass_event,
>+};
>+
>+static void bypass_register_existing_child(struct net_device *bypass_netdev)
>+{
>+	struct net *net = dev_net(bypass_netdev);
>+	struct net_device *dev;
>+
>+	rtnl_lock();
>+	for_each_netdev(net, dev) {
>+		if (dev == bypass_netdev)
>+			continue;
>+		if (!bypass_validate_child_dev(dev))
>+			continue;
>+		if (ether_addr_equal(bypass_netdev->perm_addr, dev->perm_addr))
>+			bypass_register_child(dev);
>+	}
>+	rtnl_unlock();
>+}
>+
>+int bypass_register_instance(struct bypass *bypass, struct net_device *dev)
>+{
>+	struct bypass_instance *bypass_instance;

No need to allocate this instace here. You can just have is embedded
inside netdevice priv and pass pointer to it here. You can pass the
pointer back to the driver when you call ops as the driver can get priv
back by it.

I would also call it "struct bypass_master" and this function
"bypass_master_register".

It should contain the ops pointer too.


>+	struct net_device *bypass_netdev;
>+	int ret = 0;
>+
>+	mutex_lock(&bypass->lock);
>+	list_for_each_entry(bypass_instance, &bypass->instance_list, list) {
>+		bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
>+		if (bypass_netdev == dev) {

This means the driver registered one netdev twice. That is a bug in
driver, so WARN_ON would be nice here to point that out.


>+			ret = -EEXIST;
>+			goto done;
>+		}
>+	}
>+
>+	bypass_instance = bypass_instance_alloc(dev);
>+	if (!bypass_instance) {
>+		ret = -ENOMEM;
>+		goto done;
>+	}
>+
>+	bypass_instance->bypass = bypass;
>+	list_add_tail(&bypass_instance->list, &bypass->instance_list);
>+
>+done:
>+	mutex_unlock(&bypass->lock);
>+	bypass_register_existing_child(dev);
>+	return ret;
>+}
>+EXPORT_SYMBOL(bypass_register_instance);
>+
>+int bypass_unregister_instance(struct bypass *bypass, struct net_device *dev)
>+{
>+	struct bypass_instance *bypass_instance;
>+	struct net_device *bypass_netdev;
>+	int ret = 0;
>+
>+	mutex_lock(&bypass->lock);
>+	list_for_each_entry(bypass_instance, &bypass->instance_list, list) {
>+		bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
>+		if (bypass_netdev == dev) {
>+			list_del(&bypass_instance->list);
>+			bypass_instance_free(bypass_instance);
>+			goto done;
>+		}
>+	}
>+
>+	ret = -ENOENT;
>+done:
>+	mutex_unlock(&bypass->lock);
>+	return ret;
>+}
>+EXPORT_SYMBOL(bypass_unregister_instance);
>+
>+struct bypass *bypass_register_driver(const struct bypass_ops *ops,
>+				      const struct net_device_ops *netdev_ops)

I don't see why you need a list of drivers. What you need is just a list
of instances - bypass masters (probably to call them like that in the
code as well). Well, you can use the common netdevice list for that
purpose with the identification helper I mentioned above. Then you need
no lists and no mutexes/spinlocks.


>+{
>+	struct bypass *bypass;
>+
>+	bypass = kzalloc(sizeof(*bypass), GFP_KERNEL);
>+	if (!bypass)
>+		return NULL;
>+
>+	bypass->ops = ops;
>+	bypass->netdev_ops = netdev_ops;
>+	INIT_LIST_HEAD(&bypass->instance_list);
>+
>+	mutex_lock(&bypass_mutex);
>+	list_add_tail(&bypass->list, &bypass_list);
>+	mutex_unlock(&bypass_mutex);
>+
>+	return bypass;
>+}
>+EXPORT_SYMBOL_GPL(bypass_register_driver);
>+
>+void bypass_unregister_driver(struct bypass *bypass)
>+{
>+	mutex_lock(&bypass_mutex);
>+	list_del(&bypass->list);
>+	mutex_unlock(&bypass_mutex);
>+
>+	kfree(bypass);
>+}
>+EXPORT_SYMBOL_GPL(bypass_unregister_driver);
>+
>+static __init int
>+bypass_init(void)
>+{
>+	register_netdevice_notifier(&bypass_notifier);
>+
>+	return 0;
>+}
>+module_init(bypass_init);
>+
>+static __exit
>+void bypass_exit(void)
>+{
>+	unregister_netdevice_notifier(&bypass_notifier);
>+}
>+module_exit(bypass_exit);
>+
>+MODULE_DESCRIPTION("Bypass infrastructure/interface for Paravirtual drivers");
>+MODULE_LICENSE("GPL v2");
>-- 
>2.14.3
>

^ permalink raw reply

* Re: [RFC PATCH net-next v5 3/4] virtio_net: Extend virtio to use VF datapath when available
From: Jiri Pirko @ 2018-04-06 12:48 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: mst, stephen, davem, netdev, virtualization, virtio-dev,
	jesse.brandeburg, alexander.h.duyck, kubakici, jasowang,
	loseweigh
In-Reply-To: <1522962503-3963-4-git-send-email-sridhar.samudrala@intel.com>

Thu, Apr 05, 2018 at 11:08:22PM CEST, sridhar.samudrala@intel.com wrote:
>This patch enables virtio_net to switch over to a VF datapath when a VF
>netdev is present with the same MAC address. It allows live migration
>of a VM with a direct attached VF without the need to setup a bond/team
>between a VF and virtio net device in the guest.
>
>The hypervisor needs to enable only one datapath at any time so that
>packets don't get looped back to the VM over the other datapath. When a VF
>is plugged, the virtio datapath link state can be marked as down. The
>hypervisor needs to unplug the VF device from the guest on the source host
>and reset the MAC filter of the VF to initiate failover of datapath to
>virtio before starting the migration. After the migration is completed,
>the destination hypervisor sets the MAC filter on the VF and plugs it back
>to the guest to switch over to VF datapath.
>
>When BACKUP feature is enabled, an additional netdev(bypass netdev) is
>created that acts as a master device and tracks the state of the 2 lower
>netdevs. The original virtio_net netdev is marked as 'backup' netdev and a
>passthru device with the same MAC is registered as 'active' netdev.
>
>This patch is based on the discussion initiated by Jesse on this thread.
>https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
>
>Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>---
> drivers/net/Kconfig      |   1 +
> drivers/net/virtio_net.c | 612 ++++++++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 612 insertions(+), 1 deletion(-)
>
>diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
>index 891846655000..9e2cf61fd1c1 100644
>--- a/drivers/net/Kconfig
>+++ b/drivers/net/Kconfig
>@@ -331,6 +331,7 @@ config VETH
> config VIRTIO_NET
> 	tristate "Virtio network driver"
> 	depends on VIRTIO
>+	depends on MAY_USE_BYPASS
> 	---help---
> 	  This is the virtual network driver for virtio.  It can be used with
> 	  QEMU based VMMs (like KVM or Xen).  Say Y or M.
>diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>index befb5944f3fd..86b2f8f2947d 100644
>--- a/drivers/net/virtio_net.c
>+++ b/drivers/net/virtio_net.c
>@@ -30,8 +30,11 @@
> #include <linux/cpu.h>
> #include <linux/average.h>
> #include <linux/filter.h>
>+#include <linux/netdevice.h>
>+#include <linux/pci.h>
> #include <net/route.h>
> #include <net/xdp.h>
>+#include <net/bypass.h>
> 
> static int napi_weight = NAPI_POLL_WEIGHT;
> module_param(napi_weight, int, 0444);
>@@ -206,6 +209,9 @@ struct virtnet_info {
> 	u32 speed;
> 
> 	unsigned long guest_offloads;
>+
>+	/* upper netdev created when BACKUP feature enabled */
>+	struct net_device __rcu *bypass_netdev;
> };
> 
> struct padded_vnet_hdr {
>@@ -2275,6 +2281,22 @@ static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
> 	}
> }
> 
>+static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
>+				      size_t len)
>+{
>+	struct virtnet_info *vi = netdev_priv(dev);
>+	int ret;
>+
>+	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_BACKUP))
>+		return -EOPNOTSUPP;
>+
>+	ret = snprintf(buf, len, "_bkup");
>+	if (ret >= len)
>+		return -EOPNOTSUPP;
>+
>+	return 0;
>+}
>+
> static const struct net_device_ops virtnet_netdev = {
> 	.ndo_open            = virtnet_open,
> 	.ndo_stop   	     = virtnet_close,
>@@ -2292,6 +2314,7 @@ static const struct net_device_ops virtnet_netdev = {
> 	.ndo_xdp_xmit		= virtnet_xdp_xmit,
> 	.ndo_xdp_flush		= virtnet_xdp_flush,
> 	.ndo_features_check	= passthru_features_check,
>+	.ndo_get_phys_port_name	= virtnet_get_phys_port_name,
> };
> 
> static void virtnet_config_changed_work(struct work_struct *work)
>@@ -2689,6 +2712,576 @@ static int virtnet_validate(struct virtio_device *vdev)
> 	return 0;
> }
> 
>+/* START of functions supporting VIRTIO_NET_F_BACKUP feature.
>+ * When BACKUP feature is enabled, an additional netdev(bypass netdev)
>+ * is created that acts as a master device and tracks the state of the
>+ * 2 lower netdevs. The original virtio_net netdev is registered as
>+ * 'backup' netdev and a passthru device with the same MAC is registered
>+ * as 'active' netdev.
>+ */
>+
>+/* bypass state maintained when BACKUP feature is enabled */
>+struct virtnet_bypass_info {
>+	/* passthru netdev with same MAC */
>+	struct net_device __rcu *active_netdev;
>+
>+	/* virtio_net netdev */
>+	struct net_device __rcu *backup_netdev;
>+
>+	/* active netdev stats */
>+	struct rtnl_link_stats64 active_stats;
>+
>+	/* backup netdev stats */
>+	struct rtnl_link_stats64 backup_stats;
>+
>+	/* aggregated stats */
>+	struct rtnl_link_stats64 bypass_stats;
>+
>+	/* spinlock while updating stats */
>+	spinlock_t stats_lock;
>+};
>+
>+static int virtnet_bypass_open(struct net_device *dev)
>+{
>+	struct virtnet_bypass_info *vbi = netdev_priv(dev);
>+	struct net_device *active_netdev, *backup_netdev;
>+	int err;
>+
>+	netif_carrier_off(dev);
>+	netif_tx_wake_all_queues(dev);
>+
>+	active_netdev = rtnl_dereference(vbi->active_netdev);
>+	if (active_netdev) {
>+		err = dev_open(active_netdev);
>+		if (err)
>+			goto err_active_open;
>+	}
>+
>+	backup_netdev = rtnl_dereference(vbi->backup_netdev);
>+	if (backup_netdev) {
>+		err = dev_open(backup_netdev);
>+		if (err)
>+			goto err_backup_open;
>+	}

This should be moved to bypass module.
See "***" below.

>+
>+	return 0;
>+
>+err_backup_open:
>+	dev_close(active_netdev);
>+err_active_open:
>+	netif_tx_disable(dev);
>+	return err;
>+}
>+
>+static int virtnet_bypass_close(struct net_device *dev)
>+{
>+	struct virtnet_bypass_info *vi = netdev_priv(dev);
>+	struct net_device *child_netdev;
>+
>+	netif_tx_disable(dev);
>+
>+	child_netdev = rtnl_dereference(vi->active_netdev);
>+	if (child_netdev)
>+		dev_close(child_netdev);
>+
>+	child_netdev = rtnl_dereference(vi->backup_netdev);
>+	if (child_netdev)
>+		dev_close(child_netdev);

This should be moved to bypass module.

>+
>+	return 0;
>+}
>+
>+static netdev_tx_t virtnet_bypass_drop_xmit(struct sk_buff *skb,
>+					    struct net_device *dev)
>+{
>+	atomic_long_inc(&dev->tx_dropped);
>+	dev_kfree_skb_any(skb);
>+	return NETDEV_TX_OK;
>+}
>+
>+static bool virtnet_bypass_xmit_ready(struct net_device *dev)
>+{
>+	return netif_running(dev) && netif_carrier_ok(dev);
>+}
>+
>+static netdev_tx_t virtnet_bypass_start_xmit(struct sk_buff *skb,
>+					     struct net_device *dev)
>+{
>+	struct virtnet_bypass_info *vbi = netdev_priv(dev);
>+	struct net_device *xmit_dev;
>+
>+	/* Try xmit via active netdev followed by backup netdev */
>+	xmit_dev = rcu_dereference_bh(vbi->active_netdev);
>+	if (!xmit_dev || !virtnet_bypass_xmit_ready(xmit_dev)) {
>+		xmit_dev = rcu_dereference_bh(vbi->backup_netdev);

This should be moved to bypass module.
	
>+		if (!xmit_dev || !virtnet_bypass_xmit_ready(xmit_dev))
>+			return virtnet_bypass_drop_xmit(skb, dev);
>+	}
>+
>+	skb->dev = xmit_dev;
>+	skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
>+
>+	return dev_queue_xmit(skb);
>+}
>+
>+static u16 virtnet_bypass_select_queue(struct net_device *dev,
>+				       struct sk_buff *skb, void *accel_priv,
>+				       select_queue_fallback_t fallback)
>+{
>+	/* This helper function exists to help dev_pick_tx get the correct
>+	 * destination queue.  Using a helper function skips a call to
>+	 * skb_tx_hash and will put the skbs in the queue we expect on their
>+	 * way down to the bonding driver.
>+	 */
>+	u16 txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;
>+
>+	/* Save the original txq to restore before passing to the driver */
>+	qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;
>+
>+	if (unlikely(txq >= dev->real_num_tx_queues)) {
>+		do {
>+			txq -= dev->real_num_tx_queues;
>+		} while (txq >= dev->real_num_tx_queues);
>+	}
>+
>+	return txq;
>+}
>+
>+/* fold stats, assuming all rtnl_link_stats64 fields are u64, but
>+ * that some drivers can provide 32bit values only.
>+ */
>+static void virtnet_bypass_fold_stats(struct rtnl_link_stats64 *_res,
>+				      const struct rtnl_link_stats64 *_new,
>+				      const struct rtnl_link_stats64 *_old)
>+{
>+	const u64 *new = (const u64 *)_new;
>+	const u64 *old = (const u64 *)_old;
>+	u64 *res = (u64 *)_res;
>+	int i;
>+
>+	for (i = 0; i < sizeof(*_res) / sizeof(u64); i++) {
>+		u64 nv = new[i];
>+		u64 ov = old[i];
>+		s64 delta = nv - ov;
>+
>+		/* detects if this particular field is 32bit only */
>+		if (((nv | ov) >> 32) == 0)
>+			delta = (s64)(s32)((u32)nv - (u32)ov);
>+
>+		/* filter anomalies, some drivers reset their stats
>+		 * at down/up events.
>+		 */
>+		if (delta > 0)
>+			res[i] += delta;
>+	}
>+}
>+
>+static void virtnet_bypass_get_stats(struct net_device *dev,
>+				     struct rtnl_link_stats64 *stats)
>+{
>+	struct virtnet_bypass_info *vbi = netdev_priv(dev);
>+	const struct rtnl_link_stats64 *new;
>+	struct rtnl_link_stats64 temp;
>+	struct net_device *child_netdev;
>+
>+	spin_lock(&vbi->stats_lock);
>+	memcpy(stats, &vbi->bypass_stats, sizeof(*stats));
>+
>+	rcu_read_lock();
>+
>+	child_netdev = rcu_dereference(vbi->active_netdev);
>+	if (child_netdev) {
>+		new = dev_get_stats(child_netdev, &temp);
>+		virtnet_bypass_fold_stats(stats, new, &vbi->active_stats);
>+		memcpy(&vbi->active_stats, new, sizeof(*new));
>+	}
>+
>+	child_netdev = rcu_dereference(vbi->backup_netdev);
>+	if (child_netdev) {
>+		new = dev_get_stats(child_netdev, &temp);
>+		virtnet_bypass_fold_stats(stats, new, &vbi->backup_stats);
>+		memcpy(&vbi->backup_stats, new, sizeof(*new));
>+	}
>+
>+	rcu_read_unlock();
>+
>+	memcpy(&vbi->bypass_stats, stats, sizeof(*stats));
>+	spin_unlock(&vbi->stats_lock);
>+}

This should be moved to bypass module.

>+
>+static int virtnet_bypass_change_mtu(struct net_device *dev, int new_mtu)
>+{
>+	struct virtnet_bypass_info *vbi = netdev_priv(dev);
>+	struct net_device *active_netdev, *backup_netdev;
>+	int ret = 0;
>+
>+	active_netdev = rcu_dereference(vbi->active_netdev);
>+	if (active_netdev) {
>+		ret = dev_set_mtu(active_netdev, new_mtu);
>+		if (ret)
>+			return ret;
>+	}
>+
>+	backup_netdev = rcu_dereference(vbi->backup_netdev);
>+	if (backup_netdev) {
>+		ret = dev_set_mtu(backup_netdev, new_mtu);
>+		if (ret) {
>+			dev_set_mtu(active_netdev, dev->mtu);
>+			return ret;
>+		}
>+	}
>+
>+	dev->mtu = new_mtu;
>+	return 0;
>+}

This should be moved to bypass module.

	
>+
>+static void virtnet_bypass_set_rx_mode(struct net_device *dev)
>+{
>+	struct virtnet_bypass_info *vbi = netdev_priv(dev);
>+	struct net_device *child_netdev;
>+
>+	rcu_read_lock();
>+
>+	child_netdev = rcu_dereference(vbi->active_netdev);
>+	if (child_netdev) {
>+		dev_uc_sync_multiple(child_netdev, dev);
>+		dev_mc_sync_multiple(child_netdev, dev);
>+	}
>+
>+	child_netdev = rcu_dereference(vbi->backup_netdev);
>+	if (child_netdev) {
>+		dev_uc_sync_multiple(child_netdev, dev);
>+		dev_mc_sync_multiple(child_netdev, dev);
>+	}
>+
>+	rcu_read_unlock();
>+}

This should be moved to bypass module.


>+
>+static const struct net_device_ops virtnet_bypass_netdev_ops = {
>+	.ndo_open		= virtnet_bypass_open,
>+	.ndo_stop		= virtnet_bypass_close,
>+	.ndo_start_xmit		= virtnet_bypass_start_xmit,
>+	.ndo_select_queue	= virtnet_bypass_select_queue,
>+	.ndo_get_stats64	= virtnet_bypass_get_stats,
>+	.ndo_change_mtu		= virtnet_bypass_change_mtu,
>+	.ndo_set_rx_mode	= virtnet_bypass_set_rx_mode,
>+	.ndo_validate_addr	= eth_validate_addr,
>+	.ndo_features_check	= passthru_features_check,
>+};
>+
>+static int
>+virtnet_bypass_ethtool_get_link_ksettings(struct net_device *dev,
>+					  struct ethtool_link_ksettings *cmd)
>+{
>+	struct virtnet_bypass_info *vbi = netdev_priv(dev);
>+	struct net_device *child_netdev;
>+
>+	child_netdev = rtnl_dereference(vbi->active_netdev);
>+	if (!child_netdev || !virtnet_bypass_xmit_ready(child_netdev)) {
>+		child_netdev = rtnl_dereference(vbi->backup_netdev);
>+		if (!child_netdev || !virtnet_bypass_xmit_ready(child_netdev)) {
>+			cmd->base.duplex = DUPLEX_UNKNOWN;
>+			cmd->base.port = PORT_OTHER;
>+			cmd->base.speed = SPEED_UNKNOWN;
>+
>+			return 0;
>+		}
>+	}
>+
>+	return __ethtool_get_link_ksettings(child_netdev, cmd);
>+}
>+
>+#define BYPASS_DRV_NAME "virtnet_bypass"
>+#define BYPASS_DRV_VERSION "0.1"
>+
>+static void virtnet_bypass_ethtool_get_drvinfo(struct net_device *dev,
>+					       struct ethtool_drvinfo *drvinfo)
>+{
>+	strlcpy(drvinfo->driver, BYPASS_DRV_NAME, sizeof(drvinfo->driver));
>+	strlcpy(drvinfo->version, BYPASS_DRV_VERSION, sizeof(drvinfo->version));
>+}
>+
>+static const struct ethtool_ops virtnet_bypass_ethtool_ops = {
>+	.get_drvinfo            = virtnet_bypass_ethtool_get_drvinfo,
>+	.get_link               = ethtool_op_get_link,
>+	.get_link_ksettings     = virtnet_bypass_ethtool_get_link_ksettings,
>+};
>+
>+static int virtnet_bypass_join_child(struct net_device *bypass_netdev,
>+				     struct net_device *child_netdev)
>+{
>+	struct virtnet_bypass_info *vbi;
>+	bool backup;
>+
>+	vbi = netdev_priv(bypass_netdev);
>+	backup = (child_netdev->dev.parent == bypass_netdev->dev.parent);
>+	if (backup ? rtnl_dereference(vbi->backup_netdev) :
>+			rtnl_dereference(vbi->active_netdev)) {
>+		netdev_info(bypass_netdev,
>+			    "%s attempting to join bypass dev when %s already present\n",
>+			    child_netdev->name, backup ? "backup" : "active");

Bypass module should check if there is already some other netdev
enslaved and refuse right there.

The active/backup terminology is quite confusing. From the bonding world
that means active is the one which is currently used for tx of the
packets. And it depends on link and other things what netdev is declared
active. However here, it is different. Backup is always the virtio_net
instance even when it is active. Odd. Please change the terminology.
For "active" I suggest to use name "stolen".

*** Also, the 2 slave netdev pointers should be stored in the bypass
module instance, not in the drivers.



>+		return -EEXIST;
>+	}
>+
>+	dev_hold(child_netdev);
>+
>+	if (backup) {
>+		rcu_assign_pointer(vbi->backup_netdev, child_netdev);
>+		dev_get_stats(vbi->backup_netdev, &vbi->backup_stats);
>+	} else {
>+		rcu_assign_pointer(vbi->active_netdev, child_netdev);
>+		dev_get_stats(vbi->active_netdev, &vbi->active_stats);
>+		bypass_netdev->min_mtu = child_netdev->min_mtu;
>+		bypass_netdev->max_mtu = child_netdev->max_mtu;
>+	}
>+
>+	netdev_info(bypass_netdev, "child:%s joined\n", child_netdev->name);
>+
>+	return 0;
>+}
>+
>+static int virtnet_bypass_register_child(struct net_device *bypass_netdev,
>+					 struct net_device *child_netdev)
>+{
>+	struct virtnet_bypass_info *vbi;
>+	bool backup;
>+
>+	vbi = netdev_priv(bypass_netdev);
>+	backup = (child_netdev->dev.parent == bypass_netdev->dev.parent);
>+	if (backup ? rtnl_dereference(vbi->backup_netdev) :
>+			rtnl_dereference(vbi->active_netdev)) {
>+		netdev_info(bypass_netdev,
>+			    "%s attempting to register bypass dev when %s already present\n",
>+			    child_netdev->name, backup ? "backup" : "active");
>+		return -EEXIST;
>+	}
>+
>+	/* Avoid non pci devices as active netdev */
>+	if (!backup && (!child_netdev->dev.parent ||
>+			!dev_is_pci(child_netdev->dev.parent)))
>+		return -EINVAL;
>+
>+	netdev_info(bypass_netdev, "child:%s registered\n", child_netdev->name);
>+
>+	return 0;
>+}
>+
>+static int virtnet_bypass_release_child(struct net_device *bypass_netdev,
>+					struct net_device *child_netdev)
>+{
>+	struct net_device *backup_netdev, *active_netdev;
>+	struct virtnet_bypass_info *vbi;
>+
>+	vbi = netdev_priv(bypass_netdev);
>+	active_netdev = rtnl_dereference(vbi->active_netdev);
>+	backup_netdev = rtnl_dereference(vbi->backup_netdev);
>+
>+	if (child_netdev != active_netdev && child_netdev != backup_netdev)
>+		return -EINVAL;
>+
>+	netdev_info(bypass_netdev, "child:%s released\n", child_netdev->name);
>+
>+	return 0;
>+}
>+
>+static int virtnet_bypass_unregister_child(struct net_device *bypass_netdev,
>+					   struct net_device *child_netdev)
>+{
>+	struct net_device *backup_netdev, *active_netdev;
>+	struct virtnet_bypass_info *vbi;
>+
>+	vbi = netdev_priv(bypass_netdev);
>+	active_netdev = rtnl_dereference(vbi->active_netdev);
>+	backup_netdev = rtnl_dereference(vbi->backup_netdev);
>+
>+	if (child_netdev != active_netdev && child_netdev != backup_netdev)
>+		return -EINVAL;
>+
>+	if (child_netdev == backup_netdev) {
>+		RCU_INIT_POINTER(vbi->backup_netdev, NULL);
>+	} else {
>+		RCU_INIT_POINTER(vbi->active_netdev, NULL);
>+		if (backup_netdev) {
>+			bypass_netdev->min_mtu = backup_netdev->min_mtu;
>+			bypass_netdev->max_mtu = backup_netdev->max_mtu;
>+		}
>+	}
>+
>+	dev_put(child_netdev);
>+
>+	netdev_info(bypass_netdev, "child:%s unregistered\n",
>+		    child_netdev->name);
>+
>+	return 0;
>+}
>+
>+static int virtnet_bypass_update_link(struct net_device *bypass_netdev,
>+				      struct net_device *child_netdev)
>+{
>+	struct net_device *active_netdev, *backup_netdev;
>+	struct virtnet_bypass_info *vbi;
>+
>+	if (!netif_running(bypass_netdev))
>+		return 0;
>+
>+	vbi = netdev_priv(bypass_netdev);
>+
>+	active_netdev = rtnl_dereference(vbi->active_netdev);
>+	backup_netdev = rtnl_dereference(vbi->backup_netdev);
>+
>+	if (child_netdev != active_netdev && child_netdev != backup_netdev)
>+		return -EINVAL;
>+
>+	if ((active_netdev && virtnet_bypass_xmit_ready(active_netdev)) ||
>+	    (backup_netdev && virtnet_bypass_xmit_ready(backup_netdev))) {
>+		netif_carrier_on(bypass_netdev);
>+		netif_tx_wake_all_queues(bypass_netdev);
>+	} else {
>+		netif_carrier_off(bypass_netdev);
>+		netif_tx_stop_all_queues(bypass_netdev);
>+	}
>+
>+	return 0;
>+}
>+
>+/* Called when child dev is injecting data into network stack.
>+ * Change the associated network device from lower dev to virtio.
>+ * note: already called with rcu_read_lock
>+ */
>+static rx_handler_result_t virtnet_bypass_handle_frame(struct sk_buff **pskb)
>+{
>+	struct sk_buff *skb = *pskb;
>+	struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
>+
>+	skb->dev = ndev;
>+
>+	return RX_HANDLER_ANOTHER;
>+}

Hmm, you have the rx_handler defined in drivers and you register it in
bypass module. It is odd because here you assume that the bypass module
passed "ndev" and rx_handler_data. Also, you don't need advanced
features rx_handler provides.
Instead, just register a common rx_handler defined in the bypass module
and have simple skb_rx callback here (static void).


>+
>+static const struct bypass_ops virtnet_bypass_ops = {
>+	.register_child		= virtnet_bypass_register_child,
>+	.join_child		= virtnet_bypass_join_child,
>+	.unregister_child	= virtnet_bypass_unregister_child,
>+	.release_child		= virtnet_bypass_release_child,
>+	.update_link		= virtnet_bypass_update_link,
>+	.handle_frame		= virtnet_bypass_handle_frame,
>+};
>+
>+static struct bypass *virtnet_bypass;
>+
>+static int virtnet_bypass_create(struct virtnet_info *vi)
>+{
>+	struct net_device *backup_netdev = vi->dev;
>+	struct device *dev = &vi->vdev->dev;
>+	struct net_device *bypass_netdev;
>+	int res;
>+
>+	/* Alloc at least 2 queues, for now we are going with 16 assuming
>+	 * that most devices being bonded won't have too many queues.
>+	 */
>+	bypass_netdev = alloc_etherdev_mq(sizeof(struct virtnet_bypass_info),
>+					  16);
>+	if (!bypass_netdev) {
>+		dev_err(dev, "Unable to allocate bypass_netdev!\n");
>+		return -ENOMEM;
>+	}
>+
>+	dev_net_set(bypass_netdev, dev_net(backup_netdev));
>+	SET_NETDEV_DEV(bypass_netdev, dev);
>+
>+	bypass_netdev->netdev_ops = &virtnet_bypass_netdev_ops;
>+	bypass_netdev->ethtool_ops = &virtnet_bypass_ethtool_ops;
>+
>+	/* Initialize the device options */
>+	bypass_netdev->flags |= IFF_MASTER;

I think I pointed that out already. Don't use "IFF_MASTER". That is
specific to bonding. As I suggested in the reply to the patch #2, you
should introduce IFF_BYPASS. Also, this flag should be set by the bypass
module. Just create the netdev and do things specific to virtio and then
call to bypass module, pass the netdev so it can do the rest. I think
that the flags, features etc would be also fine to set there.


>+	bypass_netdev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE;
>+	bypass_netdev->priv_flags &= ~(IFF_XMIT_DST_RELEASE |
>+				       IFF_TX_SKB_SHARING);
>+
>+	/* don't acquire bypass netdev's netif_tx_lock when transmitting */
>+	bypass_netdev->features |= NETIF_F_LLTX;
>+
>+	/* Don't allow bypass devices to change network namespaces. */
>+	bypass_netdev->features |= NETIF_F_NETNS_LOCAL;
>+
>+	bypass_netdev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG |
>+				     NETIF_F_FRAGLIST | NETIF_F_ALL_TSO |
>+				     NETIF_F_HIGHDMA | NETIF_F_LRO;
>+
>+	bypass_netdev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
>+	bypass_netdev->features |= bypass_netdev->hw_features;
>+
>+	/* For now treat bypass netdev as VLAN challenged since we
>+	 * cannot assume VLAN functionality with a VF

Why? I don't see such drivers. But to be 100% correct, you should check
the NETIF_F_VLAN_CHALLENGED feature in bypass module during VF enslave
and forbid to do so if it is on.


>+	 */
>+	bypass_netdev->features |= NETIF_F_VLAN_CHALLENGED;
>+
>+	memcpy(bypass_netdev->dev_addr, backup_netdev->dev_addr,
>+	       bypass_netdev->addr_len);
>+
>+	bypass_netdev->min_mtu = backup_netdev->min_mtu;
>+	bypass_netdev->max_mtu = backup_netdev->max_mtu;
>+
>+	res = register_netdev(bypass_netdev);
>+	if (res < 0) {
>+		dev_err(dev, "Unable to register bypass_netdev!\n");
>+		goto err_register_netdev;
>+	}
>+
>+	netif_carrier_off(bypass_netdev);
>+
>+	res = bypass_register_instance(virtnet_bypass, bypass_netdev);
>+	if (res < 0)
>+		goto err_bypass;
>+
>+	rcu_assign_pointer(vi->bypass_netdev, bypass_netdev);
>+
>+	return 0;
>+
>+err_bypass:
>+	unregister_netdev(bypass_netdev);
>+err_register_netdev:
>+	free_netdev(bypass_netdev);
>+
>+	return res;
>+}
>+
>+static void virtnet_bypass_destroy(struct virtnet_info *vi)
>+{
>+	struct net_device *bypass_netdev;
>+	struct virtnet_bypass_info *vbi;
>+	struct net_device *child_netdev;
>+
>+	bypass_netdev = rcu_dereference(vi->bypass_netdev);
>+	/* no device found, nothing to free */
>+	if (!bypass_netdev)
>+		return;
>+
>+	vbi = netdev_priv(bypass_netdev);
>+
>+	netif_device_detach(bypass_netdev);
>+
>+	rtnl_lock();
>+
>+	child_netdev = rtnl_dereference(vbi->active_netdev);
>+	if (child_netdev)
>+		bypass_unregister_child(child_netdev);
>+
>+	child_netdev = rtnl_dereference(vbi->backup_netdev);
>+	if (child_netdev)
>+		bypass_unregister_child(child_netdev);
>+
>+	unregister_netdevice(bypass_netdev);
>+
>+	bypass_unregister_instance(virtnet_bypass, bypass_netdev);
>+
>+	rtnl_unlock();
>+
>+	free_netdev(bypass_netdev);
>+}
>+
>+/* END of functions supporting VIRTIO_NET_F_BACKUP feature. */
>+
> static int virtnet_probe(struct virtio_device *vdev)
> {
> 	int i, err = -ENOMEM;
>@@ -2839,10 +3432,15 @@ static int virtnet_probe(struct virtio_device *vdev)
> 
> 	virtnet_init_settings(dev);
> 
>+	if (virtio_has_feature(vdev, VIRTIO_NET_F_BACKUP)) {
>+		if (virtnet_bypass_create(vi) != 0)

You need to do:
		err = virtnet_bypass_create(vi);
		if (err)
otherwise you ignore err and virtnet_probe would return 0;


>+			goto free_vqs;
>+	}
>+
> 	err = register_netdev(dev);
> 	if (err) {
> 		pr_debug("virtio_net: registering device failed\n");
>-		goto free_vqs;
>+		goto free_bypass;
> 	}
> 
> 	virtio_device_ready(vdev);
>@@ -2879,6 +3477,8 @@ static int virtnet_probe(struct virtio_device *vdev)
> 	vi->vdev->config->reset(vdev);
> 
> 	unregister_netdev(dev);
>+free_bypass:
>+	virtnet_bypass_destroy(vi);
> free_vqs:
> 	cancel_delayed_work_sync(&vi->refill);
> 	free_receive_page_frags(vi);
>@@ -2913,6 +3513,8 @@ static void virtnet_remove(struct virtio_device *vdev)
> 
> 	unregister_netdev(vi->dev);
> 
>+	virtnet_bypass_destroy(vi);
>+
> 	remove_vq_common(vi);
> 
> 	free_netdev(vi->dev);
>@@ -2996,6 +3598,11 @@ static __init int virtio_net_driver_init(void)
> {
> 	int ret;
> 
>+	virtnet_bypass = bypass_register_driver(&virtnet_bypass_ops,
>+						&virtnet_bypass_netdev_ops);
>+	if (!virtnet_bypass)
>+		return -ENOMEM;

If CONFIG_NET_BYPASS is undefined, you will always return -ENOMEM here.


>+
> 	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
> 				      virtnet_cpu_online,
> 				      virtnet_cpu_down_prep);
>@@ -3010,12 +3617,14 @@ static __init int virtio_net_driver_init(void)
>         ret = register_virtio_driver(&virtio_net_driver);
> 	if (ret)
> 		goto err_virtio;
>+
> 	return 0;
> err_virtio:
> 	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
> err_dead:
> 	cpuhp_remove_multi_state(virtionet_online);
> out:
>+	bypass_unregister_driver(virtnet_bypass);
> 	return ret;
> }
> module_init(virtio_net_driver_init);
>@@ -3025,6 +3634,7 @@ static __exit void virtio_net_driver_exit(void)
> 	unregister_virtio_driver(&virtio_net_driver);
> 	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
> 	cpuhp_remove_multi_state(virtionet_online);
>+	bypass_unregister_driver(virtnet_bypass);
> }
> module_exit(virtio_net_driver_exit);
> 
>-- 
>2.14.3
>

^ permalink raw reply

* [PATCH] ARM: dts: ls1021a: Specify TBIPA register address
From: Esben Haabendal @ 2018-04-06 12:46 UTC (permalink / raw)
  To: netdev
  Cc: Rob Herring, Mark Rutland, devicetree, linux-kernel,
	Claudiu Manoil, David S . Miller, Gustavo A . R . Silva,
	Esben Haabendal
In-Reply-To: <20180406123836.12019-2-esben.haabendal@gmail.com>

From: Esben Haabendal <eha@deif.com>

The current (mildly evil) fsl_pq_mdio code uses an undocumented shadow of
the TBIPA register on LS1021A, which happens to be read-only.
Changing TBI PHY address therefore does not work on LS1021A.

The real (and documented) address of the TBIPA registere lies in the eTSEC
block and not in MDIO/MII, which is read/write, so using that fixes
the problem.

Signed-off-by: Esben Haabendal <eha@deif.com>
---
 arch/arm/boot/dts/ls1021a.dtsi | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/arm/boot/dts/ls1021a.dtsi b/arch/arm/boot/dts/ls1021a.dtsi
index c31dad98f989..728e206009ea 100644
--- a/arch/arm/boot/dts/ls1021a.dtsi
+++ b/arch/arm/boot/dts/ls1021a.dtsi
@@ -587,7 +587,8 @@
 			device_type = "mdio";
 			#address-cells = <1>;
 			#size-cells = <0>;
-			reg = <0x0 0x2d24000 0x0 0x4000>;
+			reg = <0x0 0x2d24000 0x0 0x4000>,
+			      <0x0 0x2d10030 0x0 0x4>;
 		};
 
 		ptp_clock@2d10e00 {
-- 
2.16.3

^ permalink raw reply related

* [PATCH 1/2] net/fsl_pq_mdio: Allow explicit speficition of TBIPA address
From: Esben Haabendal @ 2018-04-06 12:38 UTC (permalink / raw)
  To: netdev
  Cc: Esben Haabendal, Claudiu Manoil, Rob Herring, Mark Rutland,
	David S. Miller, Gustavo A. R. Silva, devicetree, linux-kernel

From: Esben Haabendal <eha@deif.com>

This introduces a simpler and generic method for for finding (and mapping)
the TBIPA register.

Instead of relying of complicated logic for finding the TBIPA register
address based on the MDIO or MII register block base
address, which even in some cases relies on undocumented shadow registers,
a second "reg" entry for the mdio bus devicetree node specifies the TBIPA
register.

Backwards compatibility is kept, as the existing logic is applied when
only a single "reg" mapping is specified.

Signed-off-by: Esben Haabendal <eha@deif.com>
---
 .../devicetree/bindings/net/fsl-tsec-phy.txt       |  6 ++-
 drivers/net/ethernet/freescale/fsl_pq_mdio.c       | 50 +++++++++++++++-------
 2 files changed, 39 insertions(+), 17 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/fsl-tsec-phy.txt b/Documentation/devicetree/bindings/net/fsl-tsec-phy.txt
index 594982c6b9f9..79bf352e659c 100644
--- a/Documentation/devicetree/bindings/net/fsl-tsec-phy.txt
+++ b/Documentation/devicetree/bindings/net/fsl-tsec-phy.txt
@@ -6,7 +6,11 @@ the definition of the PHY node in booting-without-of.txt for an example
 of how to define a PHY.
 
 Required properties:
-  - reg : Offset and length of the register set for the device
+  - reg : Offset and length of the register set for the device, and optionally
+          the offset and length of the TBIPA register (TBI PHY address
+	  register).  If TBIPA register is not specified, the driver will
+	  attempt to infer it from the register set specified (your mileage may
+	  vary).
   - compatible : Should define the compatible device type for the
     mdio. Currently supported strings/devices are:
 	- "fsl,gianfar-tbi"
diff --git a/drivers/net/ethernet/freescale/fsl_pq_mdio.c b/drivers/net/ethernet/freescale/fsl_pq_mdio.c
index 80ad16acf0f1..ac2c3f6a12bc 100644
--- a/drivers/net/ethernet/freescale/fsl_pq_mdio.c
+++ b/drivers/net/ethernet/freescale/fsl_pq_mdio.c
@@ -377,6 +377,38 @@ static const struct of_device_id fsl_pq_mdio_match[] = {
 };
 MODULE_DEVICE_TABLE(of, fsl_pq_mdio_match);
 
+static void set_tbipa(const u32 tbipa_val, struct platform_device *pdev,
+		      uint32_t __iomem * (*get_tbipa)(void __iomem *),
+		      void __iomem *reg_map, struct resource *reg_res)
+{
+	struct device_node *np = pdev->dev.of_node;
+	uint32_t __iomem *tbipa;
+	bool tbipa_mapped;
+
+	tbipa = of_iomap(np, 1);
+	if (tbipa) {
+		tbipa_mapped = true;
+	} else {
+		tbipa_mapped = false;
+		tbipa = (*get_tbipa)(reg_map);
+
+		/*
+		 * Add consistency check to make sure TBI is contained within
+		 * the mapped range (not because we would get a segfault,
+		 * rather to catch bugs in computing TBI address). Print error
+		 * message but continue anyway.
+		 */
+		if ((void *)tbipa > reg_map + resource_size(reg_res) - 4)
+			dev_err(&pdev->dev, "invalid register map (should be at least 0x%04zx to contain TBI address)\n",
+				((void *)tbipa - reg_map) + 4);
+	}
+
+	iowrite32be(be32_to_cpu(tbipa_val), tbipa);
+
+	if (tbipa_mapped)
+		iounmap(tbipa);
+}
+
 static int fsl_pq_mdio_probe(struct platform_device *pdev)
 {
 	const struct of_device_id *id =
@@ -450,8 +482,6 @@ static int fsl_pq_mdio_probe(struct platform_device *pdev)
 
 		if (tbi) {
 			const u32 *prop = of_get_property(tbi, "reg", NULL);
-			uint32_t __iomem *tbipa;
-
 			if (!prop) {
 				dev_err(&pdev->dev,
 					"missing 'reg' property in node %pOF\n",
@@ -459,20 +489,8 @@ static int fsl_pq_mdio_probe(struct platform_device *pdev)
 				err = -EBUSY;
 				goto error;
 			}
-
-			tbipa = data->get_tbipa(priv->map);
-
-			/*
-			 * Add consistency check to make sure TBI is contained
-			 * within the mapped range (not because we would get a
-			 * segfault, rather to catch bugs in computing TBI
-			 * address). Print error message but continue anyway.
-			 */
-			if ((void *)tbipa > priv->map + resource_size(&res) - 4)
-				dev_err(&pdev->dev, "invalid register map (should be at least 0x%04zx to contain TBI address)\n",
-					((void *)tbipa - priv->map) + 4);
-
-			iowrite32be(be32_to_cpup(prop), tbipa);
+			set_tbipa(*prop, pdev,
+				  data->get_tbipa, priv->map, &res);
 		}
 	}
 
-- 
2.16.3

^ permalink raw reply related

* Re: [PATCH v2] net: thunderx: rework mac addresses list to u64 array
From: Vadim Lomovtsev @ 2018-04-06 12:07 UTC (permalink / raw)
  To: Robin Murphy
  Cc: sgoutham, sunil.kovvuri, rric, linux-arm-kernel, netdev,
	linux-kernel, davem, dnelson, Vadim Lomovtsev, gustavo
In-Reply-To: <32d75865-6407-9e9e-a78d-38ba761b2575@arm.com>

Hi Robin,

On Fri, Apr 06, 2018 at 12:48:40PM +0100, Robin Murphy wrote:
> On 06/04/18 12:14, Vadim Lomovtsev wrote:
> > From: Vadim Lomovtsev <Vadim.Lomovtsev@cavium.com>
> > 
> > It is too expensive to pass u64 values via linked list, instead
> > allocate array for them by overall number of mac addresses from netdev.
> > 
> > This eventually removes multiple kmalloc() calls, aviod memory
> > fragmentation and allow to put single null check on kmalloc
> > return value in order to prevent a potential null pointer dereference.
> > 
> > Addresses-Coverity-ID: 1467429 ("Dereference null return value")
> > Fixes: 37c3347eb247 ("net: thunderx: add ndo_set_rx_mode callback implementation for VF")
> > Signed-off-by: Vadim Lomovtsev <Vadim.Lomovtsev@cavium.com>
> > ---
> > Changes from v1 to v2:
> >   - C99 syntax: update xcast_addr_list struct field mc[0] -> mc[];
> > 
> > ---
> >   drivers/net/ethernet/cavium/thunder/nic.h        |  7 +-----
> >   drivers/net/ethernet/cavium/thunder/nicvf_main.c | 28 +++++++++---------------
> >   2 files changed, 11 insertions(+), 24 deletions(-)
> > 
> > diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h
> > index 5fc46c5a4f36..448d1fafc827 100644
> > --- a/drivers/net/ethernet/cavium/thunder/nic.h
> > +++ b/drivers/net/ethernet/cavium/thunder/nic.h
> > @@ -265,14 +265,9 @@ struct nicvf_drv_stats {
> >   struct cavium_ptp;
> > -struct xcast_addr {
> > -	struct list_head list;
> > -	u64              addr;
> > -};
> > -
> >   struct xcast_addr_list {
> > -	struct list_head list;
> >   	int              count;
> > +	u64              mc[];
> >   };
> >   struct nicvf_work {
> > diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
> > index 1e9a31fef729..a26d8bc92e01 100644
> > --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
> > +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
> > @@ -1929,7 +1929,7 @@ static void nicvf_set_rx_mode_task(struct work_struct *work_arg)
> >   						  work.work);
> >   	struct nicvf *nic = container_of(vf_work, struct nicvf, rx_mode_work);
> >   	union nic_mbx mbx = {};
> > -	struct xcast_addr *xaddr, *next;
> > +	u8 idx = 0;
> >   	if (!vf_work)
> >   		return;
> > @@ -1956,16 +1956,10 @@ static void nicvf_set_rx_mode_task(struct work_struct *work_arg)
> >   	/* check if we have any specific MACs to be added to PF DMAC filter */
> >   	if (vf_work->mc) {
> >   		/* now go through kernel list of MACs and add them one by one */
> > -		list_for_each_entry_safe(xaddr, next,
> > -					 &vf_work->mc->list, list) {
> > +		for (idx = 0; idx < vf_work->mc->count; idx++) {
> >   			mbx.xcast.msg = NIC_MBOX_MSG_ADD_MCAST;
> > -			mbx.xcast.data.mac = xaddr->addr;
> > +			mbx.xcast.data.mac = vf_work->mc->mc[idx];
> >   			nicvf_send_msg_to_pf(nic, &mbx);
> > -
> > -			/* after receiving ACK from PF release memory */
> > -			list_del(&xaddr->list);
> > -			kfree(xaddr);
> > -			vf_work->mc->count--;
> >   		}
> >   		kfree(vf_work->mc);
> >   	}
> > @@ -1996,17 +1990,15 @@ static void nicvf_set_rx_mode(struct net_device *netdev)
> >   			mode |= BGX_XCAST_MCAST_FILTER;
> >   			/* here we need to copy mc addrs */
> >   			if (netdev_mc_count(netdev)) {
> > -				struct xcast_addr *xaddr;
> > -
> > -				mc_list = kmalloc(sizeof(*mc_list), GFP_ATOMIC);
> > -				INIT_LIST_HEAD(&mc_list->list);
> > +				mc_list = kmalloc(sizeof(*mc_list) +
> > +						  sizeof(u64) * netdev_mc_count(netdev),
> 
> FWIW if you really wanted to disambiguate that it's a structure and not just
> an array being allocated, then it could be expressed without explicit
> arithmetic:
> 
> 	size = offsetof(typeof(*mc_list), mc[netdev_mc_count(netdev)]);
> 
> but that's probably just a matter of personal preference at this point.
> 
> Robin.
> 

Thanks for reviewing it and for hint. From style perspective, I believe,
I should get rid off direct types names also, and use your suggestion here.

I'll update patch to v3, test and re-post.
Thank you for your time.

WBR,
Vadim

> > +						  GFP_ATOMIC);
> > +				if (unlikely(!mc_list))
> > +					return;
> > +				mc_list->count = 0;
> >   				netdev_hw_addr_list_for_each(ha, &netdev->mc) {
> > -					xaddr = kmalloc(sizeof(*xaddr),
> > -							GFP_ATOMIC);
> > -					xaddr->addr =
> > +					mc_list->mc[mc_list->count] =
> >   						ether_addr_to_u64(ha->addr);
> > -					list_add_tail(&xaddr->list,
> > -						      &mc_list->list);
> >   					mc_list->count++;
> >   				}
> >   			}
> > 

^ permalink raw reply

* Re: TCP one-by-one acking - RFC interpretation question
From: Eric Dumazet @ 2018-04-06 12:01 UTC (permalink / raw)
  To: Michal Kubecek, netdev
In-Reply-To: <20180406100511.oify5ej2zvxr3isg@unicorn.suse.cz>



On 04/06/2018 03:05 AM, Michal Kubecek wrote:
> Hello,
> 
> I encountered a strange behaviour of some (non-linux) TCP stack which
> I believe is incorrect but support engineers from the company producing
> it claim is OK.
> 
> Assume a client (sender, Linux 4.4 kernel) sends a stream of MSS sized
> segments but segments 2, 4 and 6 do not reach the server (receiver):
> 
>          ACK             SAK             SAK             SAK
>       +-------+-------+-------+-------+-------+-------+-------+
>       |   1   |   2   |   3   |   4   |   5   |   6   |   7   |
>       +-------+-------+-------+-------+-------+-------+-------+
>     34273   35701   37129   38557   39985   41413   42841   44269
> 
> When segment 2 is retransmitted after RTO timeout, normal response would
> be ACK-ing segment 3 (38557) with SACK for 5 and 7 (39985-41413 and
> 42841-44269).
> 
> However, this server stack responds with two separate ACKs:
> 
>   - ACK 37129, SACK 37129-38557 39985-41413 42841-44269
>   - ACK 38557, SACK 39985-41413 42841-44269

Hmmm... Yes this seems very very wrong and lazy.

Have you verified behavior of more recent linux kernel to such threats ?

packetdrill test would be relatively easy to write.

Regardless of this broken alien stack, we might be able to work around this faster
than the vendor is able to fix and deploy a new stack.

( https://en.wikipedia.org/wiki/Robustness_principle )
Be conservative in what you do, be liberal in what you accept from others...



> 
> There is no payload from server, no window update and it happens even if
> there is no other packet received by server between those two. The
> result is that as segment 3 was never retransmitted, second ACK is
> interpreted as acking a newly arrived segment by 4.4 kernel so that the
> whole interval between first transmission of segment 3 and this second
> ACK is used for RTT estimator; even worse, when the same happens again
> for segment 5, both timeouts (for 2 and 4) are counted into its RTT.
> The result is RTO growing exponentially until it reaches the maximum
> (120 seconds) and the connection is effectively stalled.
> 
> In my opinion, server behaviour violates the last paragraph of RFC 5681,
> section 4.2:
> 
>   A TCP receiver MUST NOT generate more than one ACK for every incoming
>   segment, other than to update the offered window as the receiving
>   application consumes new data (see [RFC813] and page 42 of [RFC793]).
> 
> Server vendor claims that their behaviour is correct as first ACK is
> sent in response to segment 2 and second ACK in response to segment 3
> (which has just been delayed in the out of order queue).
> 
> Note that SACK doesn't really help here. First SACK block in first ACK
> (37129-38557) is actually invalid as it violates the "the bytes just
> below the block ... have not been received" condition from RFC 2018
> section 3. Therefore Linux 4.4 stack ignores this SACK block, detects
> (spurious) SACK reneging and unmarks the "previously sacked" flag of
> segment 3 so that when second ACK arrives, there is no trace of it
> having been sacked before. They already admitted this SACK block is
> incorrect but there is still disagreement about the "one-by-one acking"
> behaviour in general.
> 
> My question is: is my interpretation correct? If so, is there an even
> less ambiguous statement somewhere that receiver is supposed to send one
> ACK for "everything they got so far" rather than acking the segments one
> by one? While reading the RFCs, I always considered this obvious but
> apparently some people may think otherwise.
> 
> Thanks in advance,
> Michal Kubecek
> 

^ permalink raw reply

* Re: [PATCH v2] net: thunderx: rework mac addresses list to u64 array
From: Vadim Lomovtsev @ 2018-04-06 11:53 UTC (permalink / raw)
  To: Gustavo A. R. Silva
  Cc: sgoutham, sunil.kovvuri, rric, linux-arm-kernel, netdev,
	linux-kernel, davem, dnelson, Vadim Lomovtsev
In-Reply-To: <b2288b61-09d3-0f76-23db-3fed10d3647a@embeddedor.com>

On Fri, Apr 06, 2018 at 06:47:26AM -0500, Gustavo A. R. Silva wrote:
> 
> 
> On 04/06/2018 06:43 AM, Vadim Lomovtsev wrote:
> > Hi Gustavo,
> > 
> > On Fri, Apr 06, 2018 at 06:36:28AM -0500, Gustavo A. R. Silva wrote:
> > > Hi Vadim,
> > > 
> > > On 04/06/2018 06:14 AM, Vadim Lomovtsev wrote:
> > > > From: Vadim Lomovtsev <Vadim.Lomovtsev@cavium.com>
> > > > 
> > > > It is too expensive to pass u64 values via linked list, instead
> > > > allocate array for them by overall number of mac addresses from netdev.
> > > > 
> > > > This eventually removes multiple kmalloc() calls, aviod memory
> > > > fragmentation and allow to put single null check on kmalloc
> > > > return value in order to prevent a potential null pointer dereference.
> > > > 
> > > > Addresses-Coverity-ID: 1467429 ("Dereference null return value")
> > > > Fixes: 37c3347eb247 ("net: thunderx: add ndo_set_rx_mode callback implementation for VF")
> > > 
> > > It'd be nice if you add:
> > > 
> > > Reported-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
> > 
> > Ok, I could do that.
> > 
> > Just to explain .. I didn't do it yet since I get such report from
> > Dan Carpenter intially (https://www.spinics.net/lists/linux-kernel-janitors/msg40720.html)
> > and was working on it when found you patches, so then asking you to give
> > me some time to prepare and test update to driver.
> > 
> 
> Oh I got it. Not big deal. I think in this case you should add Dan's
> Reported-by instead.

Ok then.

> 
> BTW nice patch. :)
>

Thank you for reviewing it.

WBR,
Vadim

> Thanks
> --
> Gustavo
> 
> > So would it be acceptable put two tags 'Reported-by:' then ?
> > 
> > WBR,
> > Vadim
> > 
> > > 
> > > Thanks
> > > --
> > > Gustavo
> > > 
> > > > Signed-off-by: Vadim Lomovtsev <Vadim.Lomovtsev@cavium.com>
> > > > ---
> > > > Changes from v1 to v2:
> > > >    - C99 syntax: update xcast_addr_list struct field mc[0] -> mc[];
> > > > 
> > > > ---
> > > >    drivers/net/ethernet/cavium/thunder/nic.h        |  7 +-----
> > > >    drivers/net/ethernet/cavium/thunder/nicvf_main.c | 28 +++++++++---------------
> > > >    2 files changed, 11 insertions(+), 24 deletions(-)
> > > > 
> > > > diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h
> > > > index 5fc46c5a4f36..448d1fafc827 100644
> > > > --- a/drivers/net/ethernet/cavium/thunder/nic.h
> > > > +++ b/drivers/net/ethernet/cavium/thunder/nic.h
> > > > @@ -265,14 +265,9 @@ struct nicvf_drv_stats {
> > > >    struct cavium_ptp;
> > > > -struct xcast_addr {
> > > > -	struct list_head list;
> > > > -	u64              addr;
> > > > -};
> > > > -
> > > >    struct xcast_addr_list {
> > > > -	struct list_head list;
> > > >    	int              count;
> > > > +	u64              mc[];
> > > >    };
> > > >    struct nicvf_work {
> > > > diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
> > > > index 1e9a31fef729..a26d8bc92e01 100644
> > > > --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
> > > > +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
> > > > @@ -1929,7 +1929,7 @@ static void nicvf_set_rx_mode_task(struct work_struct *work_arg)
> > > >    						  work.work);
> > > >    	struct nicvf *nic = container_of(vf_work, struct nicvf, rx_mode_work);
> > > >    	union nic_mbx mbx = {};
> > > > -	struct xcast_addr *xaddr, *next;
> > > > +	u8 idx = 0;
> > > >    	if (!vf_work)
> > > >    		return;
> > > > @@ -1956,16 +1956,10 @@ static void nicvf_set_rx_mode_task(struct work_struct *work_arg)
> > > >    	/* check if we have any specific MACs to be added to PF DMAC filter */
> > > >    	if (vf_work->mc) {
> > > >    		/* now go through kernel list of MACs and add them one by one */
> > > > -		list_for_each_entry_safe(xaddr, next,
> > > > -					 &vf_work->mc->list, list) {
> > > > +		for (idx = 0; idx < vf_work->mc->count; idx++) {
> > > >    			mbx.xcast.msg = NIC_MBOX_MSG_ADD_MCAST;
> > > > -			mbx.xcast.data.mac = xaddr->addr;
> > > > +			mbx.xcast.data.mac = vf_work->mc->mc[idx];
> > > >    			nicvf_send_msg_to_pf(nic, &mbx);
> > > > -
> > > > -			/* after receiving ACK from PF release memory */
> > > > -			list_del(&xaddr->list);
> > > > -			kfree(xaddr);
> > > > -			vf_work->mc->count--;
> > > >    		}
> > > >    		kfree(vf_work->mc);
> > > >    	}
> > > > @@ -1996,17 +1990,15 @@ static void nicvf_set_rx_mode(struct net_device *netdev)
> > > >    			mode |= BGX_XCAST_MCAST_FILTER;
> > > >    			/* here we need to copy mc addrs */
> > > >    			if (netdev_mc_count(netdev)) {
> > > > -				struct xcast_addr *xaddr;
> > > > -
> > > > -				mc_list = kmalloc(sizeof(*mc_list), GFP_ATOMIC);
> > > > -				INIT_LIST_HEAD(&mc_list->list);
> > > > +				mc_list = kmalloc(sizeof(*mc_list) +
> > > > +						  sizeof(u64) * netdev_mc_count(netdev),
> > > > +						  GFP_ATOMIC);
> > > > +				if (unlikely(!mc_list))
> > > > +					return;
> > > > +				mc_list->count = 0;
> > > >    				netdev_hw_addr_list_for_each(ha, &netdev->mc) {
> > > > -					xaddr = kmalloc(sizeof(*xaddr),
> > > > -							GFP_ATOMIC);
> > > > -					xaddr->addr =
> > > > +					mc_list->mc[mc_list->count] =
> > > >    						ether_addr_to_u64(ha->addr);
> > > > -					list_add_tail(&xaddr->list,
> > > > -						      &mc_list->list);
> > > >    					mc_list->count++;
> > > >    				}
> > > >    			}
> > > > 

^ permalink raw reply

* Re: [PATCH v2] net: thunderx: rework mac addresses list to u64 array
From: Robin Murphy @ 2018-04-06 11:48 UTC (permalink / raw)
  To: Vadim Lomovtsev, sgoutham, sunil.kovvuri, rric, linux-arm-kernel,
	netdev, linux-kernel, davem
  Cc: dnelson, Vadim Lomovtsev, gustavo
In-Reply-To: <20180406111425.14636-1-Vadim.Lomovtsev@caviumnetworks.com>

On 06/04/18 12:14, Vadim Lomovtsev wrote:
> From: Vadim Lomovtsev <Vadim.Lomovtsev@cavium.com>
> 
> It is too expensive to pass u64 values via linked list, instead
> allocate array for them by overall number of mac addresses from netdev.
> 
> This eventually removes multiple kmalloc() calls, aviod memory
> fragmentation and allow to put single null check on kmalloc
> return value in order to prevent a potential null pointer dereference.
> 
> Addresses-Coverity-ID: 1467429 ("Dereference null return value")
> Fixes: 37c3347eb247 ("net: thunderx: add ndo_set_rx_mode callback implementation for VF")
> Signed-off-by: Vadim Lomovtsev <Vadim.Lomovtsev@cavium.com>
> ---
> Changes from v1 to v2:
>   - C99 syntax: update xcast_addr_list struct field mc[0] -> mc[];
> 
> ---
>   drivers/net/ethernet/cavium/thunder/nic.h        |  7 +-----
>   drivers/net/ethernet/cavium/thunder/nicvf_main.c | 28 +++++++++---------------
>   2 files changed, 11 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h
> index 5fc46c5a4f36..448d1fafc827 100644
> --- a/drivers/net/ethernet/cavium/thunder/nic.h
> +++ b/drivers/net/ethernet/cavium/thunder/nic.h
> @@ -265,14 +265,9 @@ struct nicvf_drv_stats {
>   
>   struct cavium_ptp;
>   
> -struct xcast_addr {
> -	struct list_head list;
> -	u64              addr;
> -};
> -
>   struct xcast_addr_list {
> -	struct list_head list;
>   	int              count;
> +	u64              mc[];
>   };
>   
>   struct nicvf_work {
> diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
> index 1e9a31fef729..a26d8bc92e01 100644
> --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
> +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
> @@ -1929,7 +1929,7 @@ static void nicvf_set_rx_mode_task(struct work_struct *work_arg)
>   						  work.work);
>   	struct nicvf *nic = container_of(vf_work, struct nicvf, rx_mode_work);
>   	union nic_mbx mbx = {};
> -	struct xcast_addr *xaddr, *next;
> +	u8 idx = 0;
>   
>   	if (!vf_work)
>   		return;
> @@ -1956,16 +1956,10 @@ static void nicvf_set_rx_mode_task(struct work_struct *work_arg)
>   	/* check if we have any specific MACs to be added to PF DMAC filter */
>   	if (vf_work->mc) {
>   		/* now go through kernel list of MACs and add them one by one */
> -		list_for_each_entry_safe(xaddr, next,
> -					 &vf_work->mc->list, list) {
> +		for (idx = 0; idx < vf_work->mc->count; idx++) {
>   			mbx.xcast.msg = NIC_MBOX_MSG_ADD_MCAST;
> -			mbx.xcast.data.mac = xaddr->addr;
> +			mbx.xcast.data.mac = vf_work->mc->mc[idx];
>   			nicvf_send_msg_to_pf(nic, &mbx);
> -
> -			/* after receiving ACK from PF release memory */
> -			list_del(&xaddr->list);
> -			kfree(xaddr);
> -			vf_work->mc->count--;
>   		}
>   		kfree(vf_work->mc);
>   	}
> @@ -1996,17 +1990,15 @@ static void nicvf_set_rx_mode(struct net_device *netdev)
>   			mode |= BGX_XCAST_MCAST_FILTER;
>   			/* here we need to copy mc addrs */
>   			if (netdev_mc_count(netdev)) {
> -				struct xcast_addr *xaddr;
> -
> -				mc_list = kmalloc(sizeof(*mc_list), GFP_ATOMIC);
> -				INIT_LIST_HEAD(&mc_list->list);
> +				mc_list = kmalloc(sizeof(*mc_list) +
> +						  sizeof(u64) * netdev_mc_count(netdev),

FWIW if you really wanted to disambiguate that it's a structure and not 
just an array being allocated, then it could be expressed without 
explicit arithmetic:

	size = offsetof(typeof(*mc_list), mc[netdev_mc_count(netdev)]);

but that's probably just a matter of personal preference at this point.

Robin.

> +						  GFP_ATOMIC);
> +				if (unlikely(!mc_list))
> +					return;
> +				mc_list->count = 0;
>   				netdev_hw_addr_list_for_each(ha, &netdev->mc) {
> -					xaddr = kmalloc(sizeof(*xaddr),
> -							GFP_ATOMIC);
> -					xaddr->addr =
> +					mc_list->mc[mc_list->count] =
>   						ether_addr_to_u64(ha->addr);
> -					list_add_tail(&xaddr->list,
> -						      &mc_list->list);
>   					mc_list->count++;
>   				}
>   			}
> 

^ permalink raw reply

* Re: [PATCH v2] net: thunderx: rework mac addresses list to u64 array
From: Gustavo A. R. Silva @ 2018-04-06 11:47 UTC (permalink / raw)
  To: Vadim Lomovtsev
  Cc: sgoutham, sunil.kovvuri, rric, linux-arm-kernel, netdev,
	linux-kernel, davem, dnelson, Vadim Lomovtsev
In-Reply-To: <20180406114330.GA14704@localhost.localdomain>



On 04/06/2018 06:43 AM, Vadim Lomovtsev wrote:
> Hi Gustavo,
> 
> On Fri, Apr 06, 2018 at 06:36:28AM -0500, Gustavo A. R. Silva wrote:
>> Hi Vadim,
>>
>> On 04/06/2018 06:14 AM, Vadim Lomovtsev wrote:
>>> From: Vadim Lomovtsev <Vadim.Lomovtsev@cavium.com>
>>>
>>> It is too expensive to pass u64 values via linked list, instead
>>> allocate array for them by overall number of mac addresses from netdev.
>>>
>>> This eventually removes multiple kmalloc() calls, aviod memory
>>> fragmentation and allow to put single null check on kmalloc
>>> return value in order to prevent a potential null pointer dereference.
>>>
>>> Addresses-Coverity-ID: 1467429 ("Dereference null return value")
>>> Fixes: 37c3347eb247 ("net: thunderx: add ndo_set_rx_mode callback implementation for VF")
>>
>> It'd be nice if you add:
>>
>> Reported-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
> 
> Ok, I could do that.
> 
> Just to explain .. I didn't do it yet since I get such report from
> Dan Carpenter intially (https://www.spinics.net/lists/linux-kernel-janitors/msg40720.html)
> and was working on it when found you patches, so then asking you to give
> me some time to prepare and test update to driver.
> 

Oh I got it. Not big deal. I think in this case you should add Dan's 
Reported-by instead.

BTW nice patch. :)

Thanks
--
Gustavo

> So would it be acceptable put two tags 'Reported-by:' then ?
> 
> WBR,
> Vadim
> 
>>
>> Thanks
>> --
>> Gustavo
>>
>>> Signed-off-by: Vadim Lomovtsev <Vadim.Lomovtsev@cavium.com>
>>> ---
>>> Changes from v1 to v2:
>>>    - C99 syntax: update xcast_addr_list struct field mc[0] -> mc[];
>>>
>>> ---
>>>    drivers/net/ethernet/cavium/thunder/nic.h        |  7 +-----
>>>    drivers/net/ethernet/cavium/thunder/nicvf_main.c | 28 +++++++++---------------
>>>    2 files changed, 11 insertions(+), 24 deletions(-)
>>>
>>> diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h
>>> index 5fc46c5a4f36..448d1fafc827 100644
>>> --- a/drivers/net/ethernet/cavium/thunder/nic.h
>>> +++ b/drivers/net/ethernet/cavium/thunder/nic.h
>>> @@ -265,14 +265,9 @@ struct nicvf_drv_stats {
>>>    struct cavium_ptp;
>>> -struct xcast_addr {
>>> -	struct list_head list;
>>> -	u64              addr;
>>> -};
>>> -
>>>    struct xcast_addr_list {
>>> -	struct list_head list;
>>>    	int              count;
>>> +	u64              mc[];
>>>    };
>>>    struct nicvf_work {
>>> diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
>>> index 1e9a31fef729..a26d8bc92e01 100644
>>> --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
>>> +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
>>> @@ -1929,7 +1929,7 @@ static void nicvf_set_rx_mode_task(struct work_struct *work_arg)
>>>    						  work.work);
>>>    	struct nicvf *nic = container_of(vf_work, struct nicvf, rx_mode_work);
>>>    	union nic_mbx mbx = {};
>>> -	struct xcast_addr *xaddr, *next;
>>> +	u8 idx = 0;
>>>    	if (!vf_work)
>>>    		return;
>>> @@ -1956,16 +1956,10 @@ static void nicvf_set_rx_mode_task(struct work_struct *work_arg)
>>>    	/* check if we have any specific MACs to be added to PF DMAC filter */
>>>    	if (vf_work->mc) {
>>>    		/* now go through kernel list of MACs and add them one by one */
>>> -		list_for_each_entry_safe(xaddr, next,
>>> -					 &vf_work->mc->list, list) {
>>> +		for (idx = 0; idx < vf_work->mc->count; idx++) {
>>>    			mbx.xcast.msg = NIC_MBOX_MSG_ADD_MCAST;
>>> -			mbx.xcast.data.mac = xaddr->addr;
>>> +			mbx.xcast.data.mac = vf_work->mc->mc[idx];
>>>    			nicvf_send_msg_to_pf(nic, &mbx);
>>> -
>>> -			/* after receiving ACK from PF release memory */
>>> -			list_del(&xaddr->list);
>>> -			kfree(xaddr);
>>> -			vf_work->mc->count--;
>>>    		}
>>>    		kfree(vf_work->mc);
>>>    	}
>>> @@ -1996,17 +1990,15 @@ static void nicvf_set_rx_mode(struct net_device *netdev)
>>>    			mode |= BGX_XCAST_MCAST_FILTER;
>>>    			/* here we need to copy mc addrs */
>>>    			if (netdev_mc_count(netdev)) {
>>> -				struct xcast_addr *xaddr;
>>> -
>>> -				mc_list = kmalloc(sizeof(*mc_list), GFP_ATOMIC);
>>> -				INIT_LIST_HEAD(&mc_list->list);
>>> +				mc_list = kmalloc(sizeof(*mc_list) +
>>> +						  sizeof(u64) * netdev_mc_count(netdev),
>>> +						  GFP_ATOMIC);
>>> +				if (unlikely(!mc_list))
>>> +					return;
>>> +				mc_list->count = 0;
>>>    				netdev_hw_addr_list_for_each(ha, &netdev->mc) {
>>> -					xaddr = kmalloc(sizeof(*xaddr),
>>> -							GFP_ATOMIC);
>>> -					xaddr->addr =
>>> +					mc_list->mc[mc_list->count] =
>>>    						ether_addr_to_u64(ha->addr);
>>> -					list_add_tail(&xaddr->list,
>>> -						      &mc_list->list);
>>>    					mc_list->count++;
>>>    				}
>>>    			}
>>>

^ permalink raw reply

* Re: [PATCH v2] net: thunderx: rework mac addresses list to u64 array
From: Vadim Lomovtsev @ 2018-04-06 11:43 UTC (permalink / raw)
  To: Gustavo A. R. Silva
  Cc: sgoutham, sunil.kovvuri, rric, linux-arm-kernel, netdev,
	linux-kernel, davem, dnelson, Vadim Lomovtsev
In-Reply-To: <18c48972-ef34-5b14-f91b-0ceffc286769@embeddedor.com>

Hi Gustavo,

On Fri, Apr 06, 2018 at 06:36:28AM -0500, Gustavo A. R. Silva wrote:
> Hi Vadim,
> 
> On 04/06/2018 06:14 AM, Vadim Lomovtsev wrote:
> > From: Vadim Lomovtsev <Vadim.Lomovtsev@cavium.com>
> > 
> > It is too expensive to pass u64 values via linked list, instead
> > allocate array for them by overall number of mac addresses from netdev.
> > 
> > This eventually removes multiple kmalloc() calls, aviod memory
> > fragmentation and allow to put single null check on kmalloc
> > return value in order to prevent a potential null pointer dereference.
> > 
> > Addresses-Coverity-ID: 1467429 ("Dereference null return value")
> > Fixes: 37c3347eb247 ("net: thunderx: add ndo_set_rx_mode callback implementation for VF")
> 
> It'd be nice if you add:
> 
> Reported-by: Gustavo A. R. Silva <gustavo@embeddedor.com>

Ok, I could do that.

Just to explain .. I didn't do it yet since I get such report from
Dan Carpenter intially (https://www.spinics.net/lists/linux-kernel-janitors/msg40720.html)
and was working on it when found you patches, so then asking you to give
me some time to prepare and test update to driver.

So would it be acceptable put two tags 'Reported-by:' then ?

WBR,
Vadim

> 
> Thanks
> --
> Gustavo
> 
> > Signed-off-by: Vadim Lomovtsev <Vadim.Lomovtsev@cavium.com>
> > ---
> > Changes from v1 to v2:
> >   - C99 syntax: update xcast_addr_list struct field mc[0] -> mc[];
> > 
> > ---
> >   drivers/net/ethernet/cavium/thunder/nic.h        |  7 +-----
> >   drivers/net/ethernet/cavium/thunder/nicvf_main.c | 28 +++++++++---------------
> >   2 files changed, 11 insertions(+), 24 deletions(-)
> > 
> > diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h
> > index 5fc46c5a4f36..448d1fafc827 100644
> > --- a/drivers/net/ethernet/cavium/thunder/nic.h
> > +++ b/drivers/net/ethernet/cavium/thunder/nic.h
> > @@ -265,14 +265,9 @@ struct nicvf_drv_stats {
> >   struct cavium_ptp;
> > -struct xcast_addr {
> > -	struct list_head list;
> > -	u64              addr;
> > -};
> > -
> >   struct xcast_addr_list {
> > -	struct list_head list;
> >   	int              count;
> > +	u64              mc[];
> >   };
> >   struct nicvf_work {
> > diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
> > index 1e9a31fef729..a26d8bc92e01 100644
> > --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
> > +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
> > @@ -1929,7 +1929,7 @@ static void nicvf_set_rx_mode_task(struct work_struct *work_arg)
> >   						  work.work);
> >   	struct nicvf *nic = container_of(vf_work, struct nicvf, rx_mode_work);
> >   	union nic_mbx mbx = {};
> > -	struct xcast_addr *xaddr, *next;
> > +	u8 idx = 0;
> >   	if (!vf_work)
> >   		return;
> > @@ -1956,16 +1956,10 @@ static void nicvf_set_rx_mode_task(struct work_struct *work_arg)
> >   	/* check if we have any specific MACs to be added to PF DMAC filter */
> >   	if (vf_work->mc) {
> >   		/* now go through kernel list of MACs and add them one by one */
> > -		list_for_each_entry_safe(xaddr, next,
> > -					 &vf_work->mc->list, list) {
> > +		for (idx = 0; idx < vf_work->mc->count; idx++) {
> >   			mbx.xcast.msg = NIC_MBOX_MSG_ADD_MCAST;
> > -			mbx.xcast.data.mac = xaddr->addr;
> > +			mbx.xcast.data.mac = vf_work->mc->mc[idx];
> >   			nicvf_send_msg_to_pf(nic, &mbx);
> > -
> > -			/* after receiving ACK from PF release memory */
> > -			list_del(&xaddr->list);
> > -			kfree(xaddr);
> > -			vf_work->mc->count--;
> >   		}
> >   		kfree(vf_work->mc);
> >   	}
> > @@ -1996,17 +1990,15 @@ static void nicvf_set_rx_mode(struct net_device *netdev)
> >   			mode |= BGX_XCAST_MCAST_FILTER;
> >   			/* here we need to copy mc addrs */
> >   			if (netdev_mc_count(netdev)) {
> > -				struct xcast_addr *xaddr;
> > -
> > -				mc_list = kmalloc(sizeof(*mc_list), GFP_ATOMIC);
> > -				INIT_LIST_HEAD(&mc_list->list);
> > +				mc_list = kmalloc(sizeof(*mc_list) +
> > +						  sizeof(u64) * netdev_mc_count(netdev),
> > +						  GFP_ATOMIC);
> > +				if (unlikely(!mc_list))
> > +					return;
> > +				mc_list->count = 0;
> >   				netdev_hw_addr_list_for_each(ha, &netdev->mc) {
> > -					xaddr = kmalloc(sizeof(*xaddr),
> > -							GFP_ATOMIC);
> > -					xaddr->addr =
> > +					mc_list->mc[mc_list->count] =
> >   						ether_addr_to_u64(ha->addr);
> > -					list_add_tail(&xaddr->list,
> > -						      &mc_list->list);
> >   					mc_list->count++;
> >   				}
> >   			}
> > 

^ permalink raw reply

* Re: [PATCH v2] net: thunderx: rework mac addresses list to u64 array
From: Gustavo A. R. Silva @ 2018-04-06 11:36 UTC (permalink / raw)
  To: Vadim Lomovtsev, sgoutham, sunil.kovvuri, rric, linux-arm-kernel,
	netdev, linux-kernel, davem
  Cc: dnelson, Vadim Lomovtsev
In-Reply-To: <20180406111425.14636-1-Vadim.Lomovtsev@caviumnetworks.com>

Hi Vadim,

On 04/06/2018 06:14 AM, Vadim Lomovtsev wrote:
> From: Vadim Lomovtsev <Vadim.Lomovtsev@cavium.com>
> 
> It is too expensive to pass u64 values via linked list, instead
> allocate array for them by overall number of mac addresses from netdev.
> 
> This eventually removes multiple kmalloc() calls, aviod memory
> fragmentation and allow to put single null check on kmalloc
> return value in order to prevent a potential null pointer dereference.
> 
> Addresses-Coverity-ID: 1467429 ("Dereference null return value")
> Fixes: 37c3347eb247 ("net: thunderx: add ndo_set_rx_mode callback implementation for VF")

It'd be nice if you add:

Reported-by: Gustavo A. R. Silva <gustavo@embeddedor.com>

Thanks
--
Gustavo

> Signed-off-by: Vadim Lomovtsev <Vadim.Lomovtsev@cavium.com>
> ---
> Changes from v1 to v2:
>   - C99 syntax: update xcast_addr_list struct field mc[0] -> mc[];
> 
> ---
>   drivers/net/ethernet/cavium/thunder/nic.h        |  7 +-----
>   drivers/net/ethernet/cavium/thunder/nicvf_main.c | 28 +++++++++---------------
>   2 files changed, 11 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h
> index 5fc46c5a4f36..448d1fafc827 100644
> --- a/drivers/net/ethernet/cavium/thunder/nic.h
> +++ b/drivers/net/ethernet/cavium/thunder/nic.h
> @@ -265,14 +265,9 @@ struct nicvf_drv_stats {
>   
>   struct cavium_ptp;
>   
> -struct xcast_addr {
> -	struct list_head list;
> -	u64              addr;
> -};
> -
>   struct xcast_addr_list {
> -	struct list_head list;
>   	int              count;
> +	u64              mc[];
>   };
>   
>   struct nicvf_work {
> diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
> index 1e9a31fef729..a26d8bc92e01 100644
> --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
> +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
> @@ -1929,7 +1929,7 @@ static void nicvf_set_rx_mode_task(struct work_struct *work_arg)
>   						  work.work);
>   	struct nicvf *nic = container_of(vf_work, struct nicvf, rx_mode_work);
>   	union nic_mbx mbx = {};
> -	struct xcast_addr *xaddr, *next;
> +	u8 idx = 0;
>   
>   	if (!vf_work)
>   		return;
> @@ -1956,16 +1956,10 @@ static void nicvf_set_rx_mode_task(struct work_struct *work_arg)
>   	/* check if we have any specific MACs to be added to PF DMAC filter */
>   	if (vf_work->mc) {
>   		/* now go through kernel list of MACs and add them one by one */
> -		list_for_each_entry_safe(xaddr, next,
> -					 &vf_work->mc->list, list) {
> +		for (idx = 0; idx < vf_work->mc->count; idx++) {
>   			mbx.xcast.msg = NIC_MBOX_MSG_ADD_MCAST;
> -			mbx.xcast.data.mac = xaddr->addr;
> +			mbx.xcast.data.mac = vf_work->mc->mc[idx];
>   			nicvf_send_msg_to_pf(nic, &mbx);
> -
> -			/* after receiving ACK from PF release memory */
> -			list_del(&xaddr->list);
> -			kfree(xaddr);
> -			vf_work->mc->count--;
>   		}
>   		kfree(vf_work->mc);
>   	}
> @@ -1996,17 +1990,15 @@ static void nicvf_set_rx_mode(struct net_device *netdev)
>   			mode |= BGX_XCAST_MCAST_FILTER;
>   			/* here we need to copy mc addrs */
>   			if (netdev_mc_count(netdev)) {
> -				struct xcast_addr *xaddr;
> -
> -				mc_list = kmalloc(sizeof(*mc_list), GFP_ATOMIC);
> -				INIT_LIST_HEAD(&mc_list->list);
> +				mc_list = kmalloc(sizeof(*mc_list) +
> +						  sizeof(u64) * netdev_mc_count(netdev),
> +						  GFP_ATOMIC);
> +				if (unlikely(!mc_list))
> +					return;
> +				mc_list->count = 0;
>   				netdev_hw_addr_list_for_each(ha, &netdev->mc) {
> -					xaddr = kmalloc(sizeof(*xaddr),
> -							GFP_ATOMIC);
> -					xaddr->addr =
> +					mc_list->mc[mc_list->count] =
>   						ether_addr_to_u64(ha->addr);
> -					list_add_tail(&xaddr->list,
> -						      &mc_list->list);
>   					mc_list->count++;
>   				}
>   			}
> 

^ permalink raw reply

* [PATCH iproute2] bridge: fix typo in hairpin error message
From: Guillaume Nault @ 2018-04-06 11:33 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger

No 'g' to hairpin.

Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
---
 bridge/link.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bridge/link.c b/bridge/link.c
index 579d57e7..8d89aca2 100644
--- a/bridge/link.c
+++ b/bridge/link.c
@@ -312,7 +312,7 @@ static int brlink_modify(int argc, char **argv)
 				return -1;
 		} else if (strcmp(*argv, "hairpin") == 0) {
 			NEXT_ARG();
-			if (!on_off("hairping", &hairpin, *argv))
+			if (!on_off("hairpin", &hairpin, *argv))
 				return -1;
 		} else if (strcmp(*argv, "fastleave") == 0) {
 			NEXT_ARG();
-- 
2.17.0

^ permalink raw reply related

* [PATCH v2] net: thunderx: rework mac addresses list to u64 array
From: Vadim Lomovtsev @ 2018-04-06 11:14 UTC (permalink / raw)
  To: sgoutham, sunil.kovvuri, rric, linux-arm-kernel, netdev,
	linux-kernel, davem
  Cc: dnelson, gustavo, Vadim Lomovtsev
In-Reply-To: <20180405145756.12633-1-Vadim.Lomovtsev@caviumnetworks.com>

From: Vadim Lomovtsev <Vadim.Lomovtsev@cavium.com>

It is too expensive to pass u64 values via linked list, instead
allocate array for them by overall number of mac addresses from netdev.

This eventually removes multiple kmalloc() calls, aviod memory
fragmentation and allow to put single null check on kmalloc
return value in order to prevent a potential null pointer dereference.

Addresses-Coverity-ID: 1467429 ("Dereference null return value")
Fixes: 37c3347eb247 ("net: thunderx: add ndo_set_rx_mode callback implementation for VF")
Signed-off-by: Vadim Lomovtsev <Vadim.Lomovtsev@cavium.com>
---
Changes from v1 to v2:
 - C99 syntax: update xcast_addr_list struct field mc[0] -> mc[];

---
 drivers/net/ethernet/cavium/thunder/nic.h        |  7 +-----
 drivers/net/ethernet/cavium/thunder/nicvf_main.c | 28 +++++++++---------------
 2 files changed, 11 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h
index 5fc46c5a4f36..448d1fafc827 100644
--- a/drivers/net/ethernet/cavium/thunder/nic.h
+++ b/drivers/net/ethernet/cavium/thunder/nic.h
@@ -265,14 +265,9 @@ struct nicvf_drv_stats {
 
 struct cavium_ptp;
 
-struct xcast_addr {
-	struct list_head list;
-	u64              addr;
-};
-
 struct xcast_addr_list {
-	struct list_head list;
 	int              count;
+	u64              mc[];
 };
 
 struct nicvf_work {
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 1e9a31fef729..a26d8bc92e01 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -1929,7 +1929,7 @@ static void nicvf_set_rx_mode_task(struct work_struct *work_arg)
 						  work.work);
 	struct nicvf *nic = container_of(vf_work, struct nicvf, rx_mode_work);
 	union nic_mbx mbx = {};
-	struct xcast_addr *xaddr, *next;
+	u8 idx = 0;
 
 	if (!vf_work)
 		return;
@@ -1956,16 +1956,10 @@ static void nicvf_set_rx_mode_task(struct work_struct *work_arg)
 	/* check if we have any specific MACs to be added to PF DMAC filter */
 	if (vf_work->mc) {
 		/* now go through kernel list of MACs and add them one by one */
-		list_for_each_entry_safe(xaddr, next,
-					 &vf_work->mc->list, list) {
+		for (idx = 0; idx < vf_work->mc->count; idx++) {
 			mbx.xcast.msg = NIC_MBOX_MSG_ADD_MCAST;
-			mbx.xcast.data.mac = xaddr->addr;
+			mbx.xcast.data.mac = vf_work->mc->mc[idx];
 			nicvf_send_msg_to_pf(nic, &mbx);
-
-			/* after receiving ACK from PF release memory */
-			list_del(&xaddr->list);
-			kfree(xaddr);
-			vf_work->mc->count--;
 		}
 		kfree(vf_work->mc);
 	}
@@ -1996,17 +1990,15 @@ static void nicvf_set_rx_mode(struct net_device *netdev)
 			mode |= BGX_XCAST_MCAST_FILTER;
 			/* here we need to copy mc addrs */
 			if (netdev_mc_count(netdev)) {
-				struct xcast_addr *xaddr;
-
-				mc_list = kmalloc(sizeof(*mc_list), GFP_ATOMIC);
-				INIT_LIST_HEAD(&mc_list->list);
+				mc_list = kmalloc(sizeof(*mc_list) +
+						  sizeof(u64) * netdev_mc_count(netdev),
+						  GFP_ATOMIC);
+				if (unlikely(!mc_list))
+					return;
+				mc_list->count = 0;
 				netdev_hw_addr_list_for_each(ha, &netdev->mc) {
-					xaddr = kmalloc(sizeof(*xaddr),
-							GFP_ATOMIC);
-					xaddr->addr =
+					mc_list->mc[mc_list->count] =
 						ether_addr_to_u64(ha->addr);
-					list_add_tail(&xaddr->list,
-						      &mc_list->list);
 					mc_list->count++;
 				}
 			}
-- 
2.14.3

^ permalink raw reply related

* [RFC bpf-next] bpf: document eBPF helpers and add a script to generate man page
From: Quentin Monnet @ 2018-04-06 11:11 UTC (permalink / raw)
  To: daniel, ast; +Cc: netdev, oss-drivers, quentin.monnet, linux-doc, linux-man

eBPF helper functions can be called from within eBPF programs to perform
a variety of tasks that would be otherwise hard or impossible to do with
eBPF itself. There is a growing number of such helper functions in the
kernel, but documentation is scarce. The main user space header file
does contain a short commented description of most helpers, but it is
somewhat outdated and not complete. It is more a "cheat sheet" than a
real documentation accessible to new eBPF developers.

This commit attempts to improve the situation by replacing the existing
overview for the helpers with a more developed description. Furthermore,
a Python script is added to generate a manual page for eBPF helpers. The
workflow is the following, and requires the rst2man utility:

    $ ./scripts/bpf_helpers_doc.py \
            --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
    $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
    $ man /tmp/bpf-helpers.7

The objective is to keep all documentation related to the helpers in a
single place, and to be able to generate from here a manual page that
could be packaged in the man-pages repository and shipped with most
distributions [1].

Additionally, parsing the prototypes of the helper functions could
hopefully be reused, with a different Printer object, to generate
header files needed in some eBPF-related projects.

Regarding the description of each helper, it comprises several items:

- The function prototype.
- A description of the function and of its arguments (except for a
  couple of cases, when there are no arguments and the return value
  makes the function usage really obvious).
- A description of return values (if not void).
- A listing of eBPF program types (if relevant, map types) compatible
  with the helper.
- Information about the helper being restricted to GPL programs, or not.
- The kernel version in which the helper was introduced.
- The commit that introduced the helper (this is mostly to have it in
  the source of the man page, as it can be used to track changes and
  update the page).

For several helpers, descriptions are inspired (at times, nearly copied)
from the commit logs introducing them in the kernel--Many thanks to
their respective authors! They were completed as much as possible, the
objective being to have something easily accessible even for people just
starting with eBPF. There is probably a bit more work to do in this
direction for some helpers.

Some RST formatting is used in the descriptions (not in function
prototypes, to keep them readable, but the Python script provided in
order to generate the RST for the manual page does add formatting to
prototypes, to produce something pretty) to get "bold" and "italics" in
manual pages. Hopefully, the descriptions in bpf.h file remains
perfectly readable. Note that the few trailing white spaces are
intentional, removing them would break paragraphs for rst2man.

The descriptions should ideally be updated each time someone adds a new
helper, or updates the behaviour (compatibility extended to new program
types, new socket option supported...) or the interface (new flags
available, ...) of existing ones.

[1] I have not contacted people from the man-pages project prior to
    sending this RFC, so I can offer no guaranty at this time that they
    would accept to take the generated man page.

Cc: linux-doc@vger.kernel.org
Cc: linux-man@vger.kernel.org
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
---
 include/uapi/linux/bpf.h   | 2237 ++++++++++++++++++++++++++++++++++++--------
 scripts/bpf_helpers_doc.py |  568 +++++++++++
 2 files changed, 2429 insertions(+), 376 deletions(-)
 create mode 100755 scripts/bpf_helpers_doc.py

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c5ec89732a8d..f47aeddbbe0a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -367,394 +367,1879 @@ union bpf_attr {
 
 /* BPF helper function descriptions:
  *
- * void *bpf_map_lookup_elem(&map, &key)
- *     Return: Map value or NULL
- *
- * int bpf_map_update_elem(&map, &key, &value, flags)
- *     Return: 0 on success or negative error
- *
- * int bpf_map_delete_elem(&map, &key)
- *     Return: 0 on success or negative error
- *
- * int bpf_probe_read(void *dst, int size, void *src)
- *     Return: 0 on success or negative error
+ * void *bpf_map_lookup_elem(struct bpf_map *map, void *key)
+ * 	Description
+ * 		Perform a lookup in *map* for an entry associated to *key*.
+ * 	Return
+ * 		Map value associated to *key*, or **NULL** if no entry was
+ * 		found.
+ * 	For
+ * 		All types of programs. Limited to maps of types
+ * 		**BPF_MAP_TYPE_HASH**,
+ * 		**BPF_MAP_TYPE_ARRAY**,
+ * 		**BPF_MAP_TYPE_PERCPU_HASH**,
+ * 		**BPF_MAP_TYPE_PERCPU_ARRAY**,
+ * 		**BPF_MAP_TYPE_LRU_HASH**,
+ * 		**BPF_MAP_TYPE_LRU_PERCPU_HASH**,
+ * 		**BPF_MAP_TYPE_LPM_TRIE**,
+ * 		**BPF_MAP_TYPE_ARRAY_OF_MAPS**,
+ * 		and **BPF_MAP_TYPE_HASH_OF_MAPS**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 3.19
+ *
+ * .. commit d0003ec01c66
+ *
+ * int bpf_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags)
+ * 	Description
+ * 		Add or update the value of the entry associated to *key* in
+ * 		*map* with *value*. *flags* is one of:
+ *
+ * 		**BPF_NOEXIST**
+ * 			The entry for *key* must not exist in the map.
+ * 		**BPF_EXIST**
+ * 			The entry for *key* must already exist in the map.
+ * 		**BPF_ANY**
+ * 			No condition on the existence of the entry for *key*.
+ *
+ * 		These flags are only useful for maps of type
+ * 		**BPF_MAP_TYPE_HASH**. For all other map types, **BPF_ANY**
+ * 		should be used.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		All types of programs. Limited to maps of types
+ * 		**BPF_MAP_TYPE_HASH**,
+ * 		**BPF_MAP_TYPE_ARRAY**,
+ * 		**BPF_MAP_TYPE_PERCPU_HASH**,
+ * 		**BPF_MAP_TYPE_PERCPU_ARRAY**,
+ * 		**BPF_MAP_TYPE_LRU_HASH**,
+ * 		**BPF_MAP_TYPE_LRU_PERCPU_HASH**,
+ * 		and **BPF_MAP_TYPE_LPM_TRIE**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 3.19
+ *
+ * .. commit d0003ec01c66
+ *
+ * int bpf_map_delete_elem(struct bpf_map *map, void *key)
+ * 	Description
+ * 		Delete entry with *key* from *map*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		All types of programs. Limited to maps of types
+ * 		**BPF_MAP_TYPE_HASH**,
+ * 		**BPF_MAP_TYPE_ARRAY**,
+ * 		**BPF_MAP_TYPE_PERCPU_HASH**,
+ * 		**BPF_MAP_TYPE_PERCPU_ARRAY**,
+ * 		**BPF_MAP_TYPE_LRU_HASH**,
+ * 		**BPF_MAP_TYPE_LRU_PERCPU_HASH**,
+ * 		and **BPF_MAP_TYPE_LPM_TRIE**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 3.19
+ *
+ * .. commit d0003ec01c66
+ *
+ * int bpf_probe_read(void *dst, u32 size, const void *src)
+ * 	Description
+ * 		For tracing programs, safely attempt to read *size* bytes from
+ * 		address *src* and store the data in *dst*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		*Tracing programs*.
+ * 	GPL only
+ * 		Yes
+ * 	Since
+ * 		Linux 4.1
+ *
+ * .. commit 2541517c32be
  *
  * u64 bpf_ktime_get_ns(void)
- *     Return: current ktime
- *
- * int bpf_trace_printk(const char *fmt, int fmt_size, ...)
- *     Return: length of buffer written or negative error
+ * 	Description
+ * 		Return the time elapsed since system boot, in nanoseconds.
+ * 	Return
+ * 		Current *ktime*.
+ * 	For
+ * 		All program types, except
+ * 		**BPF_PROG_TYPE_CGROUP_DEVICE**.
+ * 	GPL only
+ * 		Yes
+ * 	Since
+ * 		Linux 4.1
+ *
+ * .. commit d9847d310ab4
+ *
+ * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...)
+ * 	Description
+ * 		This helper is a "printk()-like" facility for debugging. It
+ * 		prints a message defined by format *fmt* (of size *fmt_size*)
+ * 		to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if
+ * 		available. It can take up to three additional **u64**
+ * 		arguments (as an eBPF helpers, the total number of arguments is
+ * 		limited to five). Each time the helper is called, it appends a
+ * 		line that looks like the following:
+ *
+ * 		::
+ *
+ * 			telnet-470   [001] .N.. 419421.045894: 0x00000001: BPF command: 2
+ *
+ * 		In the above:
+ *
+ * 			* ``telnet`` is the name of the current task.
+ * 			* ``470`` is the PID of the current task.
+ * 			* ``001`` is the CPU number on which the task is
+ * 			  running.
+ * 			* In ``.N..``, each character refers to a set of
+ * 			  options (whether irqs are enabled, scheduling
+ * 			  options, whether hard/softirqs are running, level of
+ * 			  preempt_disabled respectively). **N** means that
+ * 			  **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED**
+ * 			  are set.
+ * 			* ``419421.045894`` is a timestamp.
+ * 			* ``0x00000001`` is a fake value used by BPF for the
+ * 			  instruction pointer register.
+ * 			* ``BPF command: 2`` is the message formatted with
+ * 			  *fmt*.
+ *
+ * 		The conversion specifiers supported by *fmt* are similar, but
+ * 		more limited than for printk(). They are **%d**, **%i**,
+ * 		**%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**,
+ * 		**%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size
+ * 		of field, padding with zeroes, etc.) is available, and the
+ * 		helper will silently fail if it encounters an unknown
+ * 		specifier.
+ *
+ * 		Also, note that **bpf_trace_printk**\ () is slow, and should
+ * 		only be used for debugging purposes. For passing values to user
+ * 		space, perf events should be preferred.
+ * 	Return
+ * 		The number of bytes written to the buffer, or a negative error
+ * 		in case of failure.
+ * 	For
+ * 		All types of programs.
+ * 	GPL only
+ * 		Yes
+ * 	Since
+ * 		Linux 4.1
+ *
+ * .. commit 9c959c863f82
  *
  * u32 bpf_prandom_u32(void)
- *     Return: random value
- *
- * u32 bpf_raw_smp_processor_id(void)
- *     Return: SMP processor ID
- *
- * int bpf_skb_store_bytes(skb, offset, from, len, flags)
- *     store bytes into packet
- *     @skb: pointer to skb
- *     @offset: offset within packet from skb->mac_header
- *     @from: pointer where to copy bytes from
- *     @len: number of bytes to store into packet
- *     @flags: bit 0 - if true, recompute skb->csum
- *             other bits - reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_l3_csum_replace(skb, offset, from, to, flags)
- *     recompute IP checksum
- *     @skb: pointer to skb
- *     @offset: offset within packet where IP checksum is located
- *     @from: old value of header field
- *     @to: new value of header field
- *     @flags: bits 0-3 - size of header field
- *             other bits - reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_l4_csum_replace(skb, offset, from, to, flags)
- *     recompute TCP/UDP checksum
- *     @skb: pointer to skb
- *     @offset: offset within packet where TCP/UDP checksum is located
- *     @from: old value of header field
- *     @to: new value of header field
- *     @flags: bits 0-3 - size of header field
- *             bit 4 - is pseudo header
- *             other bits - reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_tail_call(ctx, prog_array_map, index)
- *     jump into another BPF program
- *     @ctx: context pointer passed to next program
- *     @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY
- *     @index: 32-bit index inside array that selects specific program to run
- *     Return: 0 on success or negative error
- *
- * int bpf_clone_redirect(skb, ifindex, flags)
- *     redirect to another netdev
- *     @skb: pointer to skb
- *     @ifindex: ifindex of the net device
- *     @flags: bit 0 - if set, redirect to ingress instead of egress
- *             other bits - reserved
- *     Return: 0 on success or negative error
+ * 	Return
+ * 		A random 32-bit unsigned value.
+ * 	For
+ * 		All program types, except **BPF_PROG_TYPE_CGROUP_DEVICE**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.1
+ *
+ * .. commit 03e69b508b6f
+ *
+ * u32 bpf_get_smp_processor_id(void)
+ * 	Return
+ * 		The SMP (Symmetric multiprocessing) processor id.
+ * 	For
+ * 		*Networking programs*.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.1
+ *
+ * .. commit c04167ce2ca0
+ *
+ * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags)
+ * 	Description
+ * 		Store *len* bytes from address *from* into the packet
+ * 		associated to *skb*, at *offset*. *flags* are a combination of
+ * 		**BPF_F_RECOMPUTE_CSUM** (automatically recompute the
+ * 		checksum for the packet after storing the bytes) and
+ * 		**BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\
+ * 		**->swhash** and *skb*\ **->l4hash** to 0).
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**,
+ * 		**BPF_PROG_TYPE_LWT_XMIT**,
+ * 		**BPF_PROG_TYPE_SK_SKB**,
+ * 		**BPF_PROG_TYPE_CGROUP_SKB**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.1
+ *
+ * .. commit 91bc4822c3d6
+ *
+ * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size)
+ * 	Description
+ * 		Recompute the IP checksum for the packet associated to *skb*.
+ * 		Computation is incremental, so the helper must know the former
+ * 		value of the header field that was modified (*from*), the new
+ * 		value of this field (*to*), and the number of bytes (2 or 4)
+ * 		for this field, stored in *size*. Alternatively, it is possible
+ * 		to store the difference between the previous and the new values
+ * 		of the header field in *to*, by setting *from* and *size* to 0.
+ * 		For both methods, *offset* indicates the location of the IP
+ * 		checksum within the packet.
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**,
+ * 		**BPF_PROG_TYPE_LWT_XMIT**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.1
+ *
+ * .. commit 91bc4822c3d6
+ *
+ * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags)
+ * 	Description
+ * 		Recompute the TCP or UDP checksum for the packet associated to
+ * 		*skb*. Computation is incremental, so the helper must know the
+ * 		former value of the header field that was modified (*from*),
+ * 		the new value of this field (*to*), and the number of bytes (2
+ * 		or 4) for this field, stored on the lowest four bits of
+ * 		*flags*. Alternatively, it is possible to store the difference
+ * 		between the previous and the new values of the header field in
+ * 		*to*, by setting *from* and the four lowest bits of *flags* to
+ * 		0. For both methods, *offset* indicates the location of the IP
+ * 		checksum within the packet. In addition to the size of the
+ * 		field, *flags* can be added (bitwise OR) actual flags. With
+ * 		**BPF_F_MARK_MANGLED_0**, a null checksum is left untouched
+ * 		(unless **BPF_F_MARK_ENFORCE** is added as well), and for
+ * 		updates resulting in a null checksum the value is set to
+ * 		**CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR**
+ * 		indicates the checksum is to be computed against a
+ * 		pseudo-header.
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**,
+ * 		**BPF_PROG_TYPE_LWT_XMIT**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.1
+ *
+ * .. commit 91bc4822c3d6
+ *
+ * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index)
+ * 	Description
+ * 		This special helper is used to trigger a "tail call", or in
+ * 		other words, to jump into another eBPF program. The contents of
+ * 		eBPF registers and stack are not modified, the new program
+ * 		"inherits" them from the caller. This mechanism allows for
+ * 		program chaining, either for raising the maximum number of
+ * 		available eBPF instructions, or to execute given programs in
+ * 		conditional blocks. For security reasons, there is an upper
+ * 		limit to the number of successive tail calls that can be
+ * 		performed.
+ *
+ * 		Upon call of this helper, the program attempts to jump into a
+ * 		program referenced at index *index* in *prog_array_map*, a
+ * 		special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes
+ * 		*ctx*, a pointer to the context.
+ *
+ * 		If the call succeeds, the kernel immediately runs the first
+ * 		instruction of the new program. This is not a function call,
+ * 		and it never goes back to the previous program. If the call
+ * 		fails, then the helper has no effect, and the caller continues
+ * 		to run its own instructions. A call can fail if the destination
+ * 		program for the jump does not exist (i.e. *index* is superior
+ * 		to the number of entries in *prog_array_map*), or if the
+ * 		maximum number of tail calls has been reached for this chain of
+ * 		programs. This limit is defined in the kernel by the macro
+ * 		**MAX_TAIL_CALL_CNT** (not accessible to user space), which
+ * 		is currently set to 32.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		All program types, except **BPF_PROG_TYPE_CGROUP_DEVICE**.
+ * 		Limited to maps of type **BPF_MAP_TYPE_PROG_ARRAY**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.2
+ *
+ * .. commit 04fd61ab36ec
+ *
+ * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags)
+ * 	Description
+ * 		Clone and redirect the packet associated to *skb* to another
+ * 		net device of index *ifindex*. The only flag supported for now
+ * 		is **BPF_F_INGRESS**, which indicates the packet is to be
+ * 		redirected to the ingress interface instead of (by default)
+ * 		egress.
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**,
+ * 		**BPF_PROG_TYPE_LWT_XMIT**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.2
+ *
+ * .. commit 3896d655f4d4
  *
  * u64 bpf_get_current_pid_tgid(void)
- *     Return: current->tgid << 32 | current->pid
+ * 	Return
+ * 		A 64-bit integer containing the current tgid and pid, and
+ * 		created as such:
+ * 		*current_task*\ **->tgid << 32 \|**
+ * 		*current_task*\ **->pid**.
+ * 	For
+ * 		*Tracing programs*.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.2
+ *
+ * .. commit ffeedafbf023
  *
  * u64 bpf_get_current_uid_gid(void)
- *     Return: current_gid << 32 | current_uid
- *
- * int bpf_get_current_comm(char *buf, int size_of_buf)
- *     stores current->comm into buf
- *     Return: 0 on success or negative error
- *
- * u32 bpf_get_cgroup_classid(skb)
- *     retrieve a proc's classid
- *     @skb: pointer to skb
- *     Return: classid if != 0
- *
- * int bpf_skb_vlan_push(skb, vlan_proto, vlan_tci)
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_vlan_pop(skb)
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_get_tunnel_key(skb, key, size, flags)
- * int bpf_skb_set_tunnel_key(skb, key, size, flags)
- *     retrieve or populate tunnel metadata
- *     @skb: pointer to skb
- *     @key: pointer to 'struct bpf_tunnel_key'
- *     @size: size of 'struct bpf_tunnel_key'
- *     @flags: room for future extensions
- *     Return: 0 on success or negative error
- *
- * u64 bpf_perf_event_read(map, flags)
- *     read perf event counter value
- *     @map: pointer to perf_event_array map
- *     @flags: index of event in the map or bitmask flags
- *     Return: value of perf event counter read or error code
- *
- * int bpf_redirect(ifindex, flags)
- *     redirect to another netdev
- *     @ifindex: ifindex of the net device
- *     @flags:
- *	  cls_bpf:
- *          bit 0 - if set, redirect to ingress instead of egress
- *          other bits - reserved
- *	  xdp_bpf:
- *	    all bits - reserved
- *     Return: cls_bpf: TC_ACT_REDIRECT on success or TC_ACT_SHOT on error
- *	       xdp_bfp: XDP_REDIRECT on success or XDP_ABORT on error
- * int bpf_redirect_map(map, key, flags)
- *     redirect to endpoint in map
- *     @map: pointer to dev map
- *     @key: index in map to lookup
- *     @flags: --
- *     Return: XDP_REDIRECT on success or XDP_ABORT on error
- *
- * u32 bpf_get_route_realm(skb)
- *     retrieve a dst's tclassid
- *     @skb: pointer to skb
- *     Return: realm if != 0
- *
- * int bpf_perf_event_output(ctx, map, flags, data, size)
- *     output perf raw sample
- *     @ctx: struct pt_regs*
- *     @map: pointer to perf_event_array map
- *     @flags: index of event in the map or bitmask flags
- *     @data: data on stack to be output as raw data
- *     @size: size of data
- *     Return: 0 on success or negative error
- *
- * int bpf_get_stackid(ctx, map, flags)
- *     walk user or kernel stack and return id
- *     @ctx: struct pt_regs*
- *     @map: pointer to stack_trace map
- *     @flags: bits 0-7 - numer of stack frames to skip
- *             bit 8 - collect user stack instead of kernel
- *             bit 9 - compare stacks by hash only
- *             bit 10 - if two different stacks hash into the same stackid
- *                      discard old
- *             other bits - reserved
- *     Return: >= 0 stackid on success or negative error
- *
- * s64 bpf_csum_diff(from, from_size, to, to_size, seed)
- *     calculate csum diff
- *     @from: raw from buffer
- *     @from_size: length of from buffer
- *     @to: raw to buffer
- *     @to_size: length of to buffer
- *     @seed: optional seed
- *     Return: csum result or negative error code
- *
- * int bpf_skb_get_tunnel_opt(skb, opt, size)
- *     retrieve tunnel options metadata
- *     @skb: pointer to skb
- *     @opt: pointer to raw tunnel option data
- *     @size: size of @opt
- *     Return: option size
- *
- * int bpf_skb_set_tunnel_opt(skb, opt, size)
- *     populate tunnel options metadata
- *     @skb: pointer to skb
- *     @opt: pointer to raw tunnel option data
- *     @size: size of @opt
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_change_proto(skb, proto, flags)
- *     Change protocol of the skb. Currently supported is v4 -> v6,
- *     v6 -> v4 transitions. The helper will also resize the skb. eBPF
- *     program is expected to fill the new headers via skb_store_bytes
- *     and lX_csum_replace.
- *     @skb: pointer to skb
- *     @proto: new skb->protocol type
- *     @flags: reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_change_type(skb, type)
- *     Change packet type of skb.
- *     @skb: pointer to skb
- *     @type: new skb->pkt_type type
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_under_cgroup(skb, map, index)
- *     Check cgroup2 membership of skb
- *     @skb: pointer to skb
- *     @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
- *     @index: index of the cgroup in the bpf_map
- *     Return:
- *       == 0 skb failed the cgroup2 descendant test
- *       == 1 skb succeeded the cgroup2 descendant test
- *        < 0 error
- *
- * u32 bpf_get_hash_recalc(skb)
- *     Retrieve and possibly recalculate skb->hash.
- *     @skb: pointer to skb
- *     Return: hash
+ * 	Return
+ * 		A 64-bit integer containing the current GID and UID, and
+ * 		created as such: *current_gid* **<< 32 \|** *current_uid*.
+ * 	For
+ * 		All tracing programs, as well as programs of types
+ * 		**BPF_PROG_TYPE_CGROUP_DEVICE**,
+ * 		**BPF_PROG_TYPE_CGROUP_SOCK**,
+ * 		**BPF_PROG_TYPE_SOCK_OPS**,
+ * 		**BPF_PROG_TYPE_CGROUP_SOCK_ADDR**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.2
+ *
+ * .. commit ffeedafbf023
+ *
+ * int bpf_get_current_comm(char *buf, u32 size_of_buf)
+ * 	Description
+ * 		Copy the **comm** attribute of the current task into *buf* of
+ * 		*size_of_buf*. The **comm** attribute contains the name of
+ * 		the executable (excluding the path) for the current task. The
+ * 		*size_of_buf* must be strictly positive. On success, the
+ * 		helper makes sure that the *buf* is NUL-terminated. On failure,
+ * 		it is filled with zeroes.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		*Tracing programs*.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.2
+ *
+ * .. commit ffeedafbf023
+ *
+ * u32 bpf_get_cgroup_classid(struct sk_buff *skb)
+ * 	Description
+ * 		Retrieve the classid for the current task, i.e. for the
+ * 		net_cls (network classifier) cgroup to which *skb* belongs.
+ * 	Return
+ * 		The classid, or 0 for the default unconfigured classid.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**,
+ * 		**BPF_PROG_TYPE_LWT_IN**, **BPF_PROG_TYPE_LWT_OUT**,
+ * 		**BPF_PROG_TYPE_LWT_XMIT**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.3
+ *
+ * .. commit 8d20aabe1c76
+ *
+ * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
+ * 	Description
+ * 		Push a *vlan_tci* (VLAN tag control information) of protocol
+ * 		*vlan_proto* to the packet associated to *skb*, then update
+ * 		the checksum. Note that if *vlan_proto* is different from
+ * 		**ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to
+ * 		be **ETH_P_8021Q**.
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.3
+ *
+ * .. commit 4e10df9a60d9
+ *
+ * int bpf_skb_vlan_pop(struct sk_buff *skb)
+ * 	Description
+ * 		Pop a VLAN header from the packet associated to *skb*.
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.3
+ *
+ * .. commit 4e10df9a60d9
+ *
+ * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
+ * 	Description
+ * 		Get tunnel metadata. This helper takes a pointer *key* to an
+ * 		empty **struct bpf_tunnel_key** of **size**, that will be
+ * 		filled with tunnel metadata for the packet associated to *skb*.
+ * 		The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which
+ * 		indicates that the tunnel is based on IPv6 protocol instead of
+ * 		IPv4.
+ *
+ * 		This is typically used on the receive path to perform a lookup
+ * 		or a packet redirection based on the value of *key*:
+ *
+ * 		::
+ *
+ * 			struct bpf_tunnel_key key = {};
+ * 			bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
+ * 			     lookup or redirect based on key ...
+ *
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**,
+ * 		**BPF_PROG_TYPE_LWT_XMIT**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.3
+ *
+ * .. commit d3aa45ce6b94
+ *
+ * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
+ * 	Description
+ * 		Populate tunnel metadata for packet associated to *skb.* The
+ * 		tunnel metadata is set to the contents of *key*, of *size*. The
+ * 		*flags* can be set to a combination of the following values:
+ *
+ * 		**BPF_F_TUNINFO_IPV6**
+ * 			Indicate that the tunnel is based on IPv6 protocol
+ * 			instead of IPv4.
+ * 		**BPF_F_ZERO_CSUM_TX**
+ * 			For IPv4 packets, add a flag to tunnel metadata
+ * 			indicating that checksum computation should be skipped
+ * 			and checksum set to zeroes.
+ * 		**BPF_F_DONT_FRAGMENT**
+ * 			Add a flag to tunnel metadata indicating that the
+ * 			packet should not be fragmented.
+ * 		**BPF_F_SEQ_NUMBER**
+ * 			Add a flag to tunnel metadata indicating that a
+ * 			sequence number should be added to tunnel header before
+ * 			sending the packet. This flag was added for GRE
+ * 			encapsulation, but might be used with other protocols
+ * 			as well in the future.
+ *
+ * 		Here is a typical usage on the transmit path:
+ *
+ * 		::
+ *
+ * 			struct bpf_tunnel_key key;
+ * 			     populate key ...
+ * 			bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0);
+ * 			bpf_clone_redirect(skb, vxlan_dev_ifindex, 0);
+ *
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**,
+ * 		**BPF_PROG_TYPE_LWT_XMIT**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.3
+ *
+ * .. commit d3aa45ce6b94
+ *
+ * u64 bpf_perf_event_read(struct bpf_map *map, u64 flags)
+ * 	Description
+ * 		Read the value of a perf event counter. This helper relies on a
+ * 		*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The
+ * 		nature of the perf event counter is selected at the creation of
+ * 		the *map*. The *map* is an array whose size is the number of
+ * 		available CPU cores, and each cell contains a value relative to
+ * 		one core. The value to retrieve is indicated by *flags*, that
+ * 		contains the index of the core to look up, masked with
+ * 		**BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
+ * 		**BPF_F_CURRENT_CPU** to indicate that the value for the
+ * 		current CPU core should be retrieved.
+ *
+ * 		Note that before Linux 4.13, only hardware perf event can be
+ * 		retrieved.
+ * 	Return
+ * 		The value of the perf event counter read from the map, or a
+ * 		negative error code in case of failure.
+ * 	For
+ * 		*Tracing programs*. Limited to maps of type
+ * 		**BPF_MAP_TYPE_PERF_EVENT_ARRAY**.
+ * 	GPL only
+ * 		Yes
+ * 	Since
+ * 		Linux 4.3
+ *
+ * .. commit 35578d798400
+ *
+ * int bpf_redirect(u32 ifindex, u64 flags)
+ * 	Description
+ * 		Redirect the packet to another net device of index *ifindex*.
+ * 		This helper is somewhat similar to **bpf_clone_redirect**\
+ * 		(), except that the packet is not cloned, which provides
+ * 		increased performance.
+ *
+ * 		For hooks other than XDP, *flags* can be set to
+ * 		**BPF_F_INGRESS**, which indicates the packet is to be
+ * 		redirected to the ingress interface instead of (by default)
+ * 		egress. Currently, XDP does not support any flag.
+ * 	Return
+ * 		For XDP, the helper returns **XDP_REDIRECT** on success or
+ * 		**XDP_ABORT** on error. For other program types, the values
+ * 		are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on
+ * 		error.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**,
+ * 		**BPF_PROG_TYPE_LWT_XMIT**,
+ * 		**BPF_PROG_TYPE_XDP**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.4 for most program types, but XDP support was only
+ * 		added in Linux 4.14.
+ *
+ * .. commit 27b29f63058d
+ *
+ * u32 bpf_get_route_realm(struct sk_buff *skb)
+ * 	Description
+ * 		Retrieve the realm or the route, that is to say the
+ * 		**tclassid** field of the destination for the *skb*.
+ * 	Return
+ * 		The realm of the route for the packet associated to *sdb*, or 0
+ * 		if none was found.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**,
+ * 		**BPF_PROG_TYPE_LWT_IN**,
+ * 		**BPF_PROG_TYPE_LWT_OUT**,
+ * 		**BPF_PROG_TYPE_LWT_XMIT**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.4
+ *
+ * .. commit c46646d0484f
+ *
+ * int bpf_perf_event_output(struct pt_reg *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * 	Description
+ * 		Write perf raw sample into a perf event held by *map* of type
+ * 		**BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf event must
+ * 		have the following attributes: **PERF_SAMPLE_RAW** as
+ * 		**sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
+ * 		**PERF_COUNT_SW_BPF_OUTPUT** as **config**.
+ *
+ * 		The *flags* are used to indicate the index in *map* for which
+ * 		the value must be put, masked with **BPF_F_INDEX_MASK**.
+ * 		Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
+ * 		to indicate that the index of the current CPU core should be
+ * 		used.
+ *
+ * 		The value to write, of *size*, is passed through eBPF stack and
+ * 		pointed by *data*.
+ *
+ * 		The context of the program *ctx* needs also be passed to the
+ * 		helper, and will get interpreted as a pointer to a **struct
+ * 		pt_reg**.
+ *
+ * 		On user space, a program willing to read the values needs to
+ * 		call **perf_event_open**\ () on the perf event (either for
+ * 		one or for all CPUs) and to store the file descriptor into the
+ * 		*map*. This must be done before the eBPF program can send data
+ * 		into it. An example is available in file
+ * 		*samples/bpf/trace_output_user.c* in the Linux kernel source
+ * 		tree (the eBPF program counterpart is in
+ * 		*samples/bpf/trace_output_kern.c*). It looks like the
+ * 		following snippet:
+ *
+ * 		::
+ *
+ * 			volatile struct perf_event_mmap_page *header;
+ * 			struct perf_event_attr attr = {
+ * 			        .sample_type = PERF_SAMPLE_RAW,
+ * 			        .type = PERF_TYPE_SOFTWARE,
+ * 			        .config = PERF_COUNT_SW_BPF_OUTPUT,
+ * 			};
+ * 			int page_size;
+ * 			int mmap_size;
+ * 			int key = 0;
+ * 			int pmu_fd;
+ * 			void *base;
+ * 			
+ * 			if (load_bpf_file(filename))
+ * 			        return -1;
+ * 			
+ * 			pmu_fd = sys_perf_event_open(&attr,
+ * 			                             -1, // pid
+ * 			                              0, // cpu
+ * 			                             -1, // group_fd
+ * 			                              0);
+ * 			
+ * 			assert(pmu_fd >= 0);
+ * 			assert(bpf_map_update_elem(map_fd[0], &key,
+ * 			                           &pmu_fd, BPF_ANY) == 0);
+ * 			assert(ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0) == 0);
+ * 			
+ * 			page_size = getpagesize();
+ * 			mmap_size = page_size * (page_cnt + 1);
+ * 			
+ * 			base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+ * 			            MAP_SHARED, fd, 0);
+ * 			if (base == MAP_FAILED)
+ * 			        return -1;
+ * 			
+ * 			header = base;
+ *
+ * 		**bpf_perf_event_output**\ () achieves better performance
+ * 		than **bpf_trace_printk**\ () for sharing data with user
+ * 		space, and is much better suitable for streaming data from eBPF
+ * 		programs.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		*Tracing programs*, and programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**,
+ * 		**BPF_PROG_TYPE_LWT_IN**,
+ * 		**BPF_PROG_TYPE_LWT_OUT**,
+ * 		**BPF_PROG_TYPE_LWT_XMIT**,
+ * 		**BPF_PROG_TYPE_XDP**.
+ * 		Limited to maps of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**.
+ * 	GPL only
+ * 		Yes
+ * 	Since
+ * 		Linux 4.4
+ *
+ * .. commit a43eec304259
+ *
+ * int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
+ * 	Description
+ * 		This helper was provided as an easy way to load data from a
+ * 		packet. It can be used to load *len* bytes from *offset* from
+ * 		the packet associated to *skb*, into the buffer pointed by
+ * 		*to*.
+ *
+ * 		Since Linux 4.7, this helper is deprecated in favor of
+ * 		"direct packet access", enabling packet data to be manipulated
+ * 		with *skb*\ **->data** and *skb*\ **->data_end** pointing
+ * 		respectively to the first byte of packet data and to the byte
+ * 		after the last byte of packet data.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SOCKET_FILTER**,
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**,
+ * 		**BPF_PROG_TYPE_LWT_IN**,
+ * 		**BPF_PROG_TYPE_LWT_OUT**,
+ * 		**BPF_PROG_TYPE_LWT_XMIT**,
+ * 		**BPF_PROG_TYPE_SK_SKB**,
+ * 		**BPF_PROG_TYPE_CGROUP_SKB**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.5
+ *
+ * .. commit 05c74e5e53f6
+ *
+ * int bpf_get_stackid(struct pt_reg *ctx, struct bpf_map *map, u64 flags)
+ * 	Description
+ * 		Walk a user or a kernel stack and return its id. To achieve
+ * 		this, the helper needs *ctx*, which is a pointer to the context
+ * 		on which the tracing program is executed, and a pointer to a
+ * 		*map* of type **BPF_MAP_TYPE_STACK_TRACE**.
+ *
+ * 		The last argument, *flags*, holds the number of stack frames to
+ * 		skip (from 0 to 255), masked with
+ * 		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ * 		a combination of the following flags:
+ *
+ * 		**BPF_F_USER_STACK**
+ * 			Collect a user space stack instead of a kernel stack.
+ * 		**BPF_F_FAST_STACK_CMP**
+ * 			Compare stacks by hash only.
+ * 		**BPF_F_REUSE_STACKID**
+ * 			If two different stacks hash into the same *stackid*,
+ * 			discard the old one.
+ *
+ * 		The stack id retrieved is a 32 bit long integer handle which
+ * 		can be further combined with other data (including other stack
+ * 		ids) and used as a key into maps. This can be useful for
+ * 		generating a variety of graphs (such as flame graphs or off-cpu
+ * 		graphs).
+ *
+ * 		For walking a stack, this helper is an improvement over
+ * 		**bpf_probe_read**\ (), which can be used with unrolled loops
+ * 		but is not efficient and consumes a lot of eBPF instructions.
+ * 		Instead, **bpf_get_stackid**\ () can collect up to
+ * 		**PERF_MAX_STACK_DEPTH** both kernel and user frames.
+ * 	Return
+ * 		The positive or null stack id on success, or a negative error
+ * 		in case of failure.
+ * 	For
+ * 		*Tracing programs*. Limited to maps of type
+ * 		**BPF_MAP_TYPE_STACK_TRACE**.
+ * 	GPL only
+ * 		Yes
+ * 	Since
+ * 		Linux 4.6
+ *
+ * .. commit d5a3b1f69186
+ *
+ * s64 bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed)
+ * 	Description
+ * 		Compute a checksum difference, from the raw buffer pointed by
+ * 		*from*, of length *from_size* (that must be a multiple of 4),
+ * 		towards the raw buffer pointed by *to*, of size *to_size*
+ * 		(same remark). An optional *seed* can be added to the value.
+ *
+ * 		This is flexible enough to be used in several ways:
+ *
+ * 		* With *from_size* == 0, *to_size* > 0 and *seed* set to
+ * 		  checksum, it can be used when pushing new data.
+ * 		* With *from_size* > 0, *to_size* == 0 and *seed* set to
+ * 		  checksum, it can be used when removing data from a packet.
+ * 		* With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it
+ * 		  can be used to compute a diff. Note that *from_size* and
+ * 		  *to_size* do not need to be equal.
+ * 	Return
+ * 		The checksum result, or a negative error code in case of
+ * 		failure.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**,
+ * 		**BPF_PROG_TYPE_LWT_IN**,
+ * 		**BPF_PROG_TYPE_LWT_OUT**,
+ * 		**BPF_PROG_TYPE_LWT_XMIT**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.6
+ *
+ * .. commit 7d672345ed29
+ *
+ * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
+ * 	Description
+ * 		Retrieve tunnel options metadata for the packet associated to
+ * 		*skb*, and store the raw tunnel option data to the buffer *opt*
+ * 		of *size*.
+ * 	Return
+ * 		The size of the option data retrieved.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**,
+ * 		**BPF_PROG_TYPE_LWT_XMIT**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.6
+ *
+ * .. commit 14ca0751c96f
+ *
+ * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
+ * 	Description
+ * 		Set tunnel options metadata for the packet associated to *skb*
+ * 		to the option data contained in the raw buffer *opt* of *size*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**,
+ * 		**BPF_PROG_TYPE_LWT_XMIT**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.6
+ *
+ * .. commit 14ca0751c96f
+ *
+ * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags)
+ * 	Description
+ * 		Change the protocol of the *skb* to *proto*. Currently
+ * 		supported are transition from IPv4 to IPv6, and from IPv6 to
+ * 		IPv4. The helper takes care of the groundwork for the
+ * 		transition, including resizing the socket buffer. The eBPF
+ * 		program is expected to fill the new headers, if any, via
+ * 		**skb_store_bytes**\ () and to recompute the checksums with
+ * 		**bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\
+ * 		().
+ *
+ * 		Internally, the GSO type is marked as dodgy so that headers are
+ * 		checked and segments are recalculated by the GSO/GRO engine.
+ * 		The size for GSO target is adapted as well.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.8
+ *
+ * .. commit 6578171a7ff0
+ *
+ * int bpf_skb_change_type(struct sk_buff *skb, u32 type)
+ * 	Description
+ * 		Change the packet type for the packet associated to *skb*. This
+ * 		comes down to setting *skb*\ **->pkt_type** to *type*, except
+ * 		the eBPF program does not have a write access to *skb*\
+ * 		**->pkt_type** beside this helper. Using a helper here allows
+ * 		for graceful handling of errors.
+ *
+ * 		The major use case is to change incoming *skb*s to
+ * 		**PACKET_HOST** in a programmatic way instead of having to
+ * 		recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for
+ * 		example.
+ *
+ * 		Note that *type* only allows certain values. At this time, they
+ * 		are:
+ *
+ * 		**PACKET_HOST**
+ * 			Packet is for us.
+ * 		**PACKET_BROADCAST**
+ * 			Send packet to all.
+ * 		**PACKET_MULTICAST**
+ * 			Send packet to group.
+ * 		**PACKET_OTHERHOST**
+ * 			Send packet to someone else.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.8
+ *
+ * .. commit d2485c4242a82
+ *
+ * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index)
+ * 	Description
+ * 		Check whether *skb* is a descendant of the cgroup2 held by
+ * 		*map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ * 	Return
+ * 		The return value depends on the result of the test, and can be:
+ *
+ * 		* 0, if the *skb* failed the cgroup2 descendant test.
+ * 		* 1, if the *skb* succeeded the cgroup2 descendant test.
+ * 		* A negative error code, if an error occurred.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**,
+ * 		**BPF_PROG_TYPE_LWT_IN**,
+ * 		**BPF_PROG_TYPE_LWT_OUT**,
+ * 		**BPF_PROG_TYPE_LWT_XMIT**.
+ * 		Limited to maps of type **BPF_MAP_TYPE_CGROUP_ARRAY**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.8
+ *
+ * .. commit 4a482f34afcc
+ *
+ * u32 bpf_get_hash_recalc(struct sk_buff *skb)
+ * 	Description
+ * 		Retrieve the hash of the packet, *skb*\ **->hash**. If it is
+ * 		not set, in particular if the hash was cleared due to mangling,
+ * 		recompute this hash. Later accesses to the hash can be done
+ * 		directly with *skb*\ **->hash**.
+ *
+ * 		Calling **bpf_set_hash_invalid**\ (), changing a packet
+ * 		prototype with **bpf_skb_change_proto**\ (), or calling
+ * 		**bpf_skb_store_bytes**\ () with the
+ * 		**BPF_F_INVALIDATE_HASH** are actions susceptible to clear
+ * 		the hash and to trigger a new computation for the next call to
+ * 		**bpf_get_hash_recalc**\ ().
+ * 	Return
+ * 		The 32-bit hash.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**,
+ * 		**BPF_PROG_TYPE_LWT_IN**,
+ * 		**BPF_PROG_TYPE_LWT_OUT**,
+ * 		**BPF_PROG_TYPE_LWT_XMIT**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.8
+ *
+ * .. commit 13c5c240f789
  *
  * u64 bpf_get_current_task(void)
- *     Returns current task_struct
- *     Return: current
- *
- * int bpf_probe_write_user(void *dst, void *src, int len)
- *     safely attempt to write to a location
- *     @dst: destination address in userspace
- *     @src: source address on stack
- *     @len: number of bytes to copy
- *     Return: 0 on success or negative error
- *
- * int bpf_current_task_under_cgroup(map, index)
- *     Check cgroup2 membership of current task
- *     @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
- *     @index: index of the cgroup in the bpf_map
- *     Return:
- *       == 0 current failed the cgroup2 descendant test
- *       == 1 current succeeded the cgroup2 descendant test
- *        < 0 error
- *
- * int bpf_skb_change_tail(skb, len, flags)
- *     The helper will resize the skb to the given new size, to be used f.e.
- *     with control messages.
- *     @skb: pointer to skb
- *     @len: new skb length
- *     @flags: reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_pull_data(skb, len)
- *     The helper will pull in non-linear data in case the skb is non-linear
- *     and not all of len are part of the linear section. Only needed for
- *     read/write with direct packet access.
- *     @skb: pointer to skb
- *     @len: len to make read/writeable
- *     Return: 0 on success or negative error
- *
- * s64 bpf_csum_update(skb, csum)
- *     Adds csum into skb->csum in case of CHECKSUM_COMPLETE.
- *     @skb: pointer to skb
- *     @csum: csum to add
- *     Return: csum on success or negative error
- *
- * void bpf_set_hash_invalid(skb)
- *     Invalidate current skb->hash.
- *     @skb: pointer to skb
- *
- * int bpf_get_numa_node_id()
- *     Return: Id of current NUMA node.
- *
- * int bpf_skb_change_head()
- *     Grows headroom of skb and adjusts MAC header offset accordingly.
- *     Will extends/reallocae as required automatically.
- *     May change skb data pointer and will thus invalidate any check
- *     performed for direct packet access.
- *     @skb: pointer to skb
- *     @len: length of header to be pushed in front
- *     @flags: Flags (unused for now)
- *     Return: 0 on success or negative error
- *
- * int bpf_xdp_adjust_head(xdp_md, delta)
- *     Adjust the xdp_md.data by delta
- *     @xdp_md: pointer to xdp_md
- *     @delta: An positive/negative integer to be added to xdp_md.data
- *     Return: 0 on success or negative on error
+ * 	Return
+ * 		A pointer to the current task struct.
+ * 	For
+ * 		*Tracing programs*.
+ * 	GPL only
+ * 		Yes
+ * 	Since
+ * 		Linux 4.8
+ *
+ * .. commit 606274c5abd8
+ *
+ * int bpf_probe_write_user(void *dst, const void *src, u32 len)
+ * 	Description
+ * 		Attempt in a safe way to write *len* bytes from the buffer
+ * 		*src* to *dst* in memory. It only works for threads that are in
+ * 		user context.
+ *
+ * 		This helper should not be used to implement any kind of
+ * 		security mechanism because of TOC-TOU attacks, but rather to
+ * 		debug, divert, and manipulate execution of semi-cooperative
+ * 		processes.
+ *
+ * 		Keep in mind that this feature is meant for experiments, and it
+ * 		has a risk of crashing the system and running programs.
+ * 		Therefore, when an eBPF program using this helper is attached,
+ * 		a warning including PID and process name is printed to kernel
+ * 		logs.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		*Tracing programs*.
+ * 	GPL only
+ * 		Yes
+ * 	Since
+ * 		Linux 4.8
+ *
+ * .. commit 96ae52279594
+ *
+ * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index)
+ * 	Description
+ * 		Check whether the probe is being run is the context of a given
+ * 		subset of the cgroup2 hierarchy. The cgroup2 to test is held by
+ * 		*map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ * 	Return
+ * 		The return value depends on the result of the test, and can be:
+ *
+ * 		* 0, if the *skb* task belongs to the cgroup2.
+ * 		* 1, if the *skb* task does not belong to the cgroup2.
+ * 		* A negative error code, if an error occurred.
+ * 	For
+ * 		*Tracing programs*.
+ * 		Limited to maps of type **BPF_MAP_TYPE_CGROUP_ARRAY**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.9
+ *
+ * .. commit 60d20f9195b2
+ *
+ * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
+ * 	Description
+ * 		Resize (trim or grow) the packet associated to *skb* to the
+ * 		new *len*. The *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		The basic idea is that the helper performs the needed work to
+ * 		change the size of the packet, then the eBPF program rewrites
+ * 		the rest via helpers like **bpf_skb_store_bytes**\ (),
+ * 		**bpf_l3_csum_replace**\ (), **bpf_l3_csum_replace**\ ()
+ * 		and others. This helper is a slow path utility intended for
+ * 		replies with control messages. And because it is targeted for
+ * 		slow path, the helper itself can afford to be slow: it
+ * 		implicitly linearizes, unclones and drops offloads from the
+ * 		*skb*.
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**,
+ * 		**BPF_PROG_TYPE_LWT_XMIT**,
+ * 		**BPF_PROG_TYPE_SK_SKB**,
+ * 		**BPF_PROG_TYPE_CGROUP_SKB**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.9
+ *
+ * .. commit 5293efe62df8
+ *
+ * int bpf_skb_pull_data(struct sk_buff *skb, u32 len)
+ * 	Description
+ * 		Pull in non-linear data in case the *skb* is non-linear and not
+ * 		all of *len* are part of the linear section. Make *len* bytes
+ * 		from *skb* readable and writable. If a zero value is passed for
+ * 		*len*, then the whole length of the *skb* is pulled.
+ *
+ * 		This helper is only needed for reading and writing with direct
+ * 		packet access.
+ *
+ * 		For direct packet access, when testing that offsets to access
+ * 		are within packet boundaries (test on *skb*\ **->data_end**)
+ * 		fails, programs just bail out, or, in the direct read case, use
+ * 		**bpf_skb_load_bytes()** as an alternative to overcome this
+ * 		limitation. If such data sits in non-linear parts, it is
+ * 		possible to pull them in once with the new helper, retest and
+ * 		eventually access them.
+ *
+ * 		At the same time, this also makes sure the skb is uncloned,
+ * 		which is a necessary condition for direct write. As this needs
+ * 		to be an invariant for the write part only, the verifier
+ * 		detects writes and adds a prologue that is calling
+ * 		**bpf_skb_pull_data()** to effectively unclone the skb from the
+ * 		very beginning in case it is indeed cloned.
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**,
+ * 		**BPF_PROG_TYPE_LWT_IN**, **BPF_PROG_TYPE_LWT_OUT**,
+ * 		**BPF_PROG_TYPE_LWT_XMIT**,
+ * 		**BPF_PROG_TYPE_SK_SKB**,
+ * 		**BPF_PROG_TYPE_CGROUP_SKB**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.9
+ *
+ * .. commit 36bbef52c7eb
+ *
+ * s64 bpf_csum_update(struct sk_buff *skb, __wsum csum)
+ * 	Description
+ * 		Add the checksum *csum* into *skb*\ **->csum** in case the
+ * 		driver fed us an IP checksum. Return an error otherwise. This
+ * 		header is intended to be used in combination with
+ * 		**bpf_csum_diff()** helper, in particular when the checksum
+ * 		needs to be updated after data has been written into the packet
+ * 		through direct packet access.
+ * 	Return
+ * 		The checksum on success, or a negative error code in case of
+ * 		failure.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**,
+ * 		**BPF_PROG_TYPE_LWT_XMIT**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.9
+ *
+ * .. commit 36bbef52c7eb
+ *
+ * void bpf_set_hash_invalid(struct sk_buff *skb)
+ * 	Description
+ * 		Invalidate the current *skb*\ **->hash**. It can be used after
+ * 		mangling on headers through direct packet access, in order to
+ * 		indicate that the hash is outdated and to trigger a
+ * 		recalculation the next time the kernel tries to access this
+ * 		hash.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**,
+ * 		**BPF_PROG_TYPE_LWT_XMIT**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.9
+ *
+ * .. commit 7a4b28c6cc9f
+ *
+ * int bpf_get_numa_node_id(void)
+ * 	Description
+ * 		Return the id of the current NUMA node. The primary use case
+ * 		for this helper is the selection of sockets for the local NUMA
+ * 		node, when the program is attached to sockets using the
+ * 		**SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**).
+ * 	Return
+ * 		The id of current NUMA node.
+ * 	For
+ * 		All program types, except
+ * 		**BPF_PROG_TYPE_CGROUP_DEVICE**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.10
+ *
+ * .. commit 2d0e30c30f84
+ *
+ * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags)
+ * 	Description
+ * 		Grows headroom of packet associated to *skb* and adjusts the
+ * 		offset of the MAC header accordingly, adding *len* bytes of
+ * 		space. It automatically extends and reallocates memory as
+ * 		required.
+ *
+ * 		This helper can be used on a layer 3 *skb* to push a MAC header
+ * 		for redirection into a layer 2 device.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_LWT_XMIT**,
+ * 		**BPF_PROG_TYPE_SK_SKB**,
+ * 		**BPF_PROG_TYPE_CGROUP_SKB**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.10
+ *
+ * .. commit 3a0af8fd61f9
+ *
+ * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta)
+ * 	Description
+ * 		Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that
+ * 		it is possible to use a negative value for *delta*. This helper
+ * 		can be used to prepare the packet for pushing or popping
+ * 		headers.
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of type **BPF_PROG_TYPE_XDP**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.10
+ *
+ * .. commit 17bedab27231
  *
  * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
- *     Copy a NUL terminated string from unsafe address. In case the string
- *     length is smaller than size, the target is not padded with further NUL
- *     bytes. In case the string length is larger than size, just count-1
- *     bytes are copied and the last byte is set to NUL.
- *     @dst: destination address
- *     @size: maximum number of bytes to copy, including the trailing NUL
- *     @unsafe_ptr: unsafe address
- *     Return:
- *       > 0 length of the string including the trailing NUL on success
- *       < 0 error
- *
- * u64 bpf_get_socket_cookie(skb)
- *     Get the cookie for the socket stored inside sk_buff.
- *     @skb: pointer to skb
- *     Return: 8 Bytes non-decreasing number on success or 0 if the socket
- *     field is missing inside sk_buff
- *
- * u32 bpf_get_socket_uid(skb)
- *     Get the owner uid of the socket stored inside sk_buff.
- *     @skb: pointer to skb
- *     Return: uid of the socket owner on success or overflowuid if failed.
- *
- * u32 bpf_set_hash(skb, hash)
- *     Set full skb->hash.
- *     @skb: pointer to skb
- *     @hash: hash to set
- *
- * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen)
- *     Calls setsockopt. Not all opts are available, only those with
- *     integer optvals plus TCP_CONGESTION.
- *     Supported levels: SOL_SOCKET and IPPROTO_TCP
- *     @bpf_socket: pointer to bpf_socket
- *     @level: SOL_SOCKET or IPPROTO_TCP
- *     @optname: option name
- *     @optval: pointer to option value
- *     @optlen: length of optval in bytes
- *     Return: 0 or negative error
- *
- * int bpf_getsockopt(bpf_socket, level, optname, optval, optlen)
- *     Calls getsockopt. Not all opts are available.
- *     Supported levels: IPPROTO_TCP
- *     @bpf_socket: pointer to bpf_socket
- *     @level: IPPROTO_TCP
- *     @optname: option name
- *     @optval: pointer to option value
- *     @optlen: length of optval in bytes
- *     Return: 0 or negative error
- *
- * int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags)
- *     Set callback flags for sock_ops
- *     @bpf_sock_ops: pointer to bpf_sock_ops_kern struct
- *     @flags: flags value
- *     Return: 0 for no error
- *             -EINVAL if there is no full tcp socket
- *             bits in flags that are not supported by current kernel
- *
- * int bpf_skb_adjust_room(skb, len_diff, mode, flags)
- *     Grow or shrink room in sk_buff.
- *     @skb: pointer to skb
- *     @len_diff: (signed) amount of room to grow/shrink
- *     @mode: operation mode (enum bpf_adj_room_mode)
- *     @flags: reserved for future use
- *     Return: 0 on success or negative error code
- *
- * int bpf_sk_redirect_map(map, key, flags)
- *     Redirect skb to a sock in map using key as a lookup key for the
- *     sock in map.
- *     @map: pointer to sockmap
- *     @key: key to lookup sock in map
- *     @flags: reserved for future use
- *     Return: SK_PASS
- *
- * int bpf_sock_map_update(skops, map, key, flags)
- *	@skops: pointer to bpf_sock_ops
- *	@map: pointer to sockmap to update
- *	@key: key to insert/update sock in map
- *	@flags: same flags as map update elem
- *
- * int bpf_xdp_adjust_meta(xdp_md, delta)
- *     Adjust the xdp_md.data_meta by delta
- *     @xdp_md: pointer to xdp_md
- *     @delta: An positive/negative integer to be added to xdp_md.data_meta
- *     Return: 0 on success or negative on error
- *
- * int bpf_perf_event_read_value(map, flags, buf, buf_size)
- *     read perf event counter value and perf event enabled/running time
- *     @map: pointer to perf_event_array map
- *     @flags: index of event in the map or bitmask flags
- *     @buf: buf to fill
- *     @buf_size: size of the buf
- *     Return: 0 on success or negative error code
- *
- * int bpf_perf_prog_read_value(ctx, buf, buf_size)
- *     read perf prog attached perf event counter and enabled/running time
- *     @ctx: pointer to ctx
- *     @buf: buf to fill
- *     @buf_size: size of the buf
- *     Return : 0 on success or negative error code
- *
- * int bpf_override_return(pt_regs, rc)
- *	@pt_regs: pointer to struct pt_regs
- *	@rc: the return value to set
- *
- * int bpf_msg_redirect_map(map, key, flags)
- *     Redirect msg to a sock in map using key as a lookup key for the
- *     sock in map.
- *     @map: pointer to sockmap
- *     @key: key to lookup sock in map
- *     @flags: reserved for future use
- *     Return: SK_PASS
- *
- * int bpf_bind(ctx, addr, addr_len)
- *     Bind socket to address. Only binding to IP is supported, no port can be
- *     set in addr.
- *     @ctx: pointer to context of type bpf_sock_addr
- *     @addr: pointer to struct sockaddr to bind socket to
- *     @addr_len: length of sockaddr structure
- *     Return: 0 on success or negative error code
+ * 	Description
+ * 		Copy a NUL terminated string from an unsafe address
+ * 		*unsafe_ptr* to *dst*. The *size* should include the
+ * 		terminating NUL byte. In case the string length is smaller than
+ * 		*size*, the target is not padded with further NUL bytes. If the
+ * 		string length is larger than *size*, just *size*-1 bytes are
+ * 		copied and the last byte is set to NUL.
+ *
+ * 		On success, the length of the copied string is returned. This
+ * 		makes this helper useful in tracing programs for reading
+ * 		strings, and more importantly to get its length at runtime. See
+ * 		the following snippet:
+ *
+ * 		::
+ *
+ * 			SEC("kprobe/sys_open")
+ * 			void bpf_sys_open(struct pt_regs *ctx)
+ * 			{
+ * 			        char buf[PATHLEN]; // PATHLEN is defined to 256
+ * 			        int res = bpf_probe_read_str(buf, sizeof(buf),
+ * 				                             ctx->di);
+ *
+ * 				// Consume buf, for example push it to
+ * 				// userspace via bpf_perf_event_output(); we
+ * 				// can use res (the string length) as event
+ * 				// size, after checking its boundaries.
+ * 			}
+ *
+ * 		In comparison, using **bpf_probe_read()** helper here instead
+ * 		to read the string would require to estimate the length at
+ * 		compile time, and would often result in copying more memory
+ * 		than necessary.
+ *
+ * 		Another useful use case is when parsing individual process
+ * 		arguments or individual environment variables navigating
+ * 		*current*\ **->mm->arg_start** and *current*\
+ * 		**->mm->env_start**: using this helper and the return value,
+ * 		one can quickly iterate at the right offset of the memory area.
+ * 	Return
+ * 		On success, the strictly positive length of the string,
+ * 		including the trailing NUL character. On error, a negative
+ * 		value.
+ * 	For
+ * 		*Tracing programs*.
+ * 	GPL only
+ * 		Yes
+ * 	Since
+ * 		Linux 4.11
+ *
+ * .. commit a5e8c07059d0
+ *
+ * u64 bpf_get_socket_cookie(struct sk_buff *skb)
+ * 	Description
+ * 		Retrieve the socket cookie generated by the kernel from a
+ * 		**struct sk_buff** with a known socket. If none has been set
+ * 		yet, generate a new cookie. This helper can be useful for
+ * 		monitoring per socket networking traffic statistics as it
+ * 		provides a unique socket identifier per namespace.
+ * 	Return
+ * 		A 8-byte long non-decreasing number on success, or 0 if the
+ * 		socket field is missing inside *skb*.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SOCKET_FILTER**,
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**,
+ * 		**BPF_PROG_TYPE_SK_SKB**,
+ * 		**BPF_PROG_TYPE_CGROUP_SKB**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.12
+ *
+ * .. commit 91b8270f2a4d
+ *
+ * u32 bpf_get_socket_uid(struct sk_buff *skb)
+ * 	Return
+ * 		The owner UID of the socket associated to *skb*. If the socket
+ * 		is **NULL**, or if it is not a full socket (i.e. if it is a
+ * 		time-wait or a request socket instead), **overflowuid** value
+ * 		is returned (note that **overflowuid** might also be the actual
+ * 		UID value for the socket).
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SOCKET_FILTER**,
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**,
+ * 		**BPF_PROG_TYPE_SK_SKB**,
+ * 		**BPF_PROG_TYPE_CGROUP_SKB**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.12
+ *
+ * .. commit 6acc5c291068
+ *
+ * u32 bpf_set_hash(struct sk_buff *skb, u32 hash)
+ * 	Description
+ * 		Set the full hash for *skb* (set the field *skb*\ **->hash**)
+ * 		to value *hash*.
+ * 	Return
+ * 		0
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.13
+ *
+ * .. commit ded092cd73c2c
+ *
+ * int bpf_setsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
+ * 	Description
+ * 		Emulate a call to **setsockopt()** on the socket associated to
+ * 		*bpf_socket*, which must be a full socket. The *level* at
+ * 		which the option resides and the name *optname* of the option
+ * 		must be specified, see **setsockopt(2)** for more information.
+ * 		The option value of length *optlen* is pointed by *optval*.
+ *
+ * 		This helper actually implements a subset of **setsockopt()**.
+ * 		It supports the following *level*\ s:
+ *
+ * 		* **SOL_SOCKET**, which supports the following *optname*\ s:
+ * 		  **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**,
+ * 		  **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**.
+ * 		* **IPPROTO_TCP**, which supports the following *optname*\ s:
+ * 		  **TCP_CONGESTION**, **TCP_BPF_IW**,
+ * 		  **TCP_BPF_SNDCWND_CLAMP**.
+ * 		* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
+ * 		* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SOCK_OPS**,
+ * 		**BPF_PROG_TYPE_CGROUP_SOCK**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.13
+ *
+ * .. commit 8c4b4c7e9ff04
+ *
+ * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags)
+ * 	Description
+ * 		Grow or shrink the room for data in the packet associated to
+ * 		*skb* by *len_diff*, and according to the selected *mode*.
+ *
+ * 		There is a single supported mode at this time:
+ *
+ * 		* **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
+ * 		  (room space is added or removed below the layer 3 header).
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SCHED_CLS**,
+ * 		**BPF_PROG_TYPE_SCHED_ACT**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.13
+ *
+ * .. commit 2be7e212d541
+ *
+ * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags)
+ * 	Description
+ * 		Redirect the packet to the endpoint referenced by *map* at
+ * 		index *key*. Depending on its type, his *map* can contain
+ * 		references to net devices (for forwarding packets through other
+ * 		ports), or to CPUs (for redirecting XDP frames to another CPU;
+ * 		but this is not fully implemented as of this writing).
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ * 	Return
+ * 		**XDP_REDIRECT** on success, or **XDP_ABORT** on error.
+ * 	For
+ * 		Programs of type **BPF_PROG_TYPE_XDP**.
+ * 		Limited to maps of types
+ * 		**BPF_MAP_TYPE_DEVMAP** and
+ * 		**BPF_MAP_TYPE_CPUMAP**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.14
+ *
+ * .. commit 97f91a7cf04ff
+ *
+ * int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags)
+ * 	Description
+ * 		Redirect the packet to the socket referenced by *map* (of type
+ * 		**BPF_MAP_TYPE_SOCKMAP**) at index *key*. The only flag
+ * 		supported for now is **BPF_F_INGRESS**, which indicates the
+ * 		packet is to be redirected to the ingress side of the socket
+ * 		instead of (by default) egress.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ * 	Return
+ * 		**SK_PASS** on success, or **SK_DROP** on error.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SK_SKB**,
+ * 		**BPF_PROG_TYPE_CGROUP_SKB**.
+ * 		Limited to maps of type **BPF_MAP_TYPE_SOCKMAP**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.14
+ *
+ * .. commit 174a79ff9515
+ *
+ * int bpf_sock_map_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
+ * 	Description
+ * 		Add an entry to, or update a *map* referencing sockets. The
+ * 		*skops* is used as a new value for the entry associated to
+ * 		*key*. *flags* is one of:
+ *
+ * 		**BPF_NOEXIST**
+ * 			The entry for *key* must not exist in the map.
+ * 		**BPF_EXIST**
+ * 			The entry for *key* must already exist in the map.
+ * 		**BPF_ANY**
+ * 			No condition on the existence of the entry for *key*.
+ *
+ * 		If the *map* has eBPF programs (parser and verdict), those will
+ * 		be inherited by the socket being added. If the socket is
+ * 		already attached to eBPF programs, this results in an error.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SOCK_OPS**,
+ * 		**BPF_PROG_TYPE_CGROUP_SOCK**.
+ * 		Limited to maps of type **BPF_MAP_TYPE_SOCKMAP**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.14
+ *
+ * .. commit 174a79ff9515
+ *
+ * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta)
+ * 	Description
+ * 		Adjust the address pointed by *xdp_md*\ **->data_meta** by
+ * 		*delta* (which can be positive or negative). Note that this
+ * 		operation modifies the address stored in *xdp_md*\ **->data**,
+ * 		so the latter must be loaded only after the helper has been
+ * 		called.
+ *
+ * 		The use of *xdp_md*\ **->data_meta** is optional and programs
+ * 		are not required to use it. The rationale is that when the
+ * 		packet is processed with XDP (e.g. as DoS filter), it is
+ * 		possible to push further meta data along with it before passing
+ * 		to the stack, and to give the guarantee that an ingress eBPF
+ * 		program attached as a TC classifier on the same device can pick
+ * 		this up for further post-processing. Since TC works with socket
+ * 		buffers, it remains possible to set from XDP the **mark** or
+ * 		**priority** pointers, or other pointers for the socket buffer.
+ * 		Having this scratch space generic and programmable allows for
+ * 		more flexibility as the user is free to store whatever meta
+ * 		data they need.
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of type **BPF_PROG_TYPE_XDP**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.15
+ *
+ * .. commit 908432ca84fc
+ *
+ * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size)
+ * 	Description
+ * 		Read the value of a perf event counter, and store it into *buf*
+ * 		of size *buf_size*. This helper relies on a *map* of type
+ * 		**BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf
+ * 		event counter is selected at the creation of the *map*. The
+ * 		*map* is an array whose size is the number of available CPU
+ * 		cores, and each cell contains a value relative to one core. The
+ * 		value to retrieve is indicated by *flags*, that contains the
+ * 		index of the core to look up, masked with
+ * 		**BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
+ * 		**BPF_F_CURRENT_CPU** to indicate that the value for the
+ * 		current CPU core should be retrieved.
+ *
+ * 		This helper behaves in a way close to
+ * 		**bpf_perf_event_read**\ () helper, save that instead of
+ * 		just returning the value observed, it fills the *buf*
+ * 		structure. This allows for additional data to be retrieved: in
+ * 		particular, the enabled and running times (in *buf*\
+ * 		**->enabled** and *buf*\ **->running**, respectively) are
+ * 		copied.
+ *
+ * 		These values are interesting, because hardware PMU (Performance
+ * 		Monitoring Unit) counters are limited resources. When there are
+ * 		more PMU based perf events opened than available counters,
+ * 		kernel will multiplex these events so each event gets certain
+ * 		percentage (but not all) of the PMU time. In case that
+ * 		multiplexing happens, the number of samples or counter value
+ * 		will not reflect the case compared to when no multiplexing
+ * 		occurs. This makes comparison between different runs difficult.
+ * 		Typically, the counter value should be normalized before
+ * 		comparing to other experiments. The usual normalization is done
+ * 		as follows.
+ *
+ * 		::
+ *
+ * 			normalized_counter = counter * t_enabled / t_running
+ *
+ * 		Where t_enabled is the time enabled for event and t_running is
+ * 		the time running for event since last normalization. The
+ * 		enabled and running times are accumulated since the perf event
+ * 		open. To achieve scaling factor between two invocations of an
+ * 		eBPF program, users can can use CPU id as the key (which is
+ * 		typical for perf array usage model) to remember the previous
+ * 		value and do the calculation inside the eBPF program.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of type **BPF_PROG_TYPE_KPROBE**.
+ * 		Limited to maps of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**.
+ * 	GPL only
+ * 		Yes
+ * 	Since
+ * 		Linux 4.15
+ *
+ * .. commit 4bebdc7a85aa4
+ *
+ * int bpf_perf_prog_read_value(struct bpf_perf_event_data_kern *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
+ * 	Description
+ * 		For en eBPF program attached to a perf event, retrieve the
+ * 		value of the event counter associated to *ctx* and store it in
+ * 		the structure pointed by *buf* and of size *buf_size*. Enabled
+ * 		and running times are also stored in the structure (see
+ * 		description of helper **bpf_perf_event_read_value**\ () for
+ * 		more details).
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of type **BPF_PROG_TYPE_PERF_EVENT**.
+ * 	GPL only
+ * 		Yes
+ * 	Since
+ * 		Linux 4.15
+ *
+ * .. commit cd86d1fd21025
+ *
+ * int bpf_getsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
+ * 	Description
+ * 		Emulate a call to **getsockopt()** on the socket associated to
+ * 		*bpf_socket*, which must be a full socket. The *level* at
+ * 		which the option resides and the name *optname* of the option
+ * 		must be specified, see **getsockopt(2)** for more information.
+ * 		The retrieved value is stored in the structure pointed by
+ * 		*opval* and of length *optlen*.
+ *
+ * 		This helper actually implements a subset of **getsockopt()**.
+ * 		It supports the following *level*\ s:
+ *
+ * 		* **IPPROTO_TCP**, which supports *optname*
+ * 		  **TCP_CONGESTION**.
+ * 		* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
+ * 		* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of types
+ * 		**BPF_PROG_TYPE_SOCK_OPS**,
+ * 		**BPF_PROG_TYPE_CGROUP_SOCK**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.15
+ *
+ * .. commit f3edacbd697f9
+ *
+ * int bpf_override_return(struct pt_reg *regs, u64 rc)
+ * 	Description
+ * 		Used for error injection, this helper uses kprobes to override
+ * 		the return value of the probed function, and to set it to *rc*.
+ * 		The first argument is the context *regs* on which the kprobe
+ * 		works.
+ *
+ * 		This helper works by setting setting the PC (program counter)
+ * 		to an override function which is run in place of the original
+ * 		probed function. This means the probed function is not run at
+ * 		all. The replacement function just returns with the required
+ * 		value.
+ *
+ * 		This helper has security implications, and thus is subject to
+ * 		restrictions. It is only available if the kernel was compiled
+ * 		with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration
+ * 		option, and in this case it only works on functions tagged with
+ * 		**ALLOW_ERROR_INJECTION** in the kernel code.
+ *
+ * 		Also, the helper is only available for the architectures having
+ * 		the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing,
+ * 		x86 architecture is the only one to support this feature.
+ * 	Return
+ * 		0
+ * 	For
+ * 		Programs of type **BPF_PROG_TYPE_KPROBE**.
+ * 	GPL only
+ * 		Yes
+ * 	Since
+ * 		Linux 4.16
+ *
+ * .. commit 9802d86585db9
+ *
+ * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops_kern *bpf_sock, int argval)
+ * 	Description
+ * 		Attempt to set the value of the **bpf_sock_ops_cb_flags** field
+ * 		for the full TCP socket associated to *bpf_sock_ops* to
+ * 		*argval*.
+ *
+ * 		The primary use of this field is to determine if there should
+ * 		be calls to eBPF programs of type
+ * 		**BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP
+ * 		code. A program of the same type can change its value, per
+ * 		connection and as necessary, when the connection is
+ * 		established. This field is directly accessible for reading, but
+ * 		this helper must be used for updates in order to return an
+ * 		error if an eBPF program tries to set a callback that is not
+ * 		supported in the current kernel.
+ *
+ * 		The supported callback values that *argval* can combine are:
+ *
+ * 		* **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
+ * 		* **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
+ * 		* **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
+ *
+ * 		Here are some examples of where one could call such eBPF
+ * 		program:
+ *
+ * 		* When RTO fires.
+ * 		* When a packet is retransmitted.
+ * 		* When the connection terminates.
+ * 		* When a packet is sent.
+ * 		* When a packet is received.
+ * 	Return
+ * 		Code **-EINVAL** if the socket is not a full TCP socket;
+ * 		otherwise, a positive number containing the bits that could not
+ * 		be set is returned (which comes down to 0 if all bits were set
+ * 		as required).
+ * 	For
+ * 		Programs of types **BPF_PROG_TYPE_SOCK_OPS**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.16
+ *
+ * .. commit b13d880721729
+ *
+ * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags)
+ * 	Description
+ * 		This helper is used in programs implementing policies at the
+ * 		socket level. If the message *msg* is allowed to pass (i.e. if
+ * 		the verdict eBPF program returns **SK_PASS**), redirect it to
+ * 		the socket referenced by *map* (of type
+ * 		**BPF_MAP_TYPE_SOCKMAP**) at index *key*. The only flag
+ * 		supported for now is **BPF_F_INGRESS**, which indicates the
+ * 		packet is to be redirected to the ingress side of the socket
+ * 		instead of (by default) egress.
+ * 	Return
+ * 		**SK_PASS** on success, or **SK_DROP** on error.
+ * 	For
+ * 		Programs of type **BPF_PROG_TYPE_SK_MSG**.
+ * 		Limited to maps of type **BPF_MAP_TYPE_SOCKMAP**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.17
+ *
+ * .. commit 4f738adba30a7
+ *
+ * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes)
+ * 	Description
+ * 		For socket policies, apply the verdict of the eBPF program to
+ * 		the next *bytes* (number of bytes) of message *msg*.
+ *
+ * 		For example, this helper can be used in the following cases:
+ *
+ * 		* A single **sendmsg**\ () or **sendfile**\ () system call
+ * 		  contains multiple logical messages that the eBPF program is
+ * 		  supposed to read and for which it should apply a verdict.
+ * 		* An eBPF program only cares to read the first *bytes* of a
+ * 		  *msg*. If the message has a large payload, then setting up
+ * 		  and calling the eBPF program repeatedly for all bytes, even
+ * 		  though the verdict is already known, would create unnecessary
+ * 		  overhead.
+ *
+ * 		When called from within an eBPF program, the helper sets a
+ * 		counter internal to the BPF infrastructure, that is used to
+ * 		apply the last verdict to the next *bytes*. If *bytes* is
+ * 		smaller than the current data being processed from a
+ * 		**sendmsg**\ () or **sendfile**\ () system call, the first
+ * 		*bytes* will be sent and the eBPF program will be re-run with
+ * 		the pointer for start of data pointing to byte number *bytes*
+ * 		**+ 1**. If *bytes* is larger than the current data being
+ * 		processed, then the eBPF verdict will be applied to multiple
+ * 		**sendmsg**\ () or **sendfile**\ () calls until *bytes* are
+ * 		consumed.
+ *
+ * 		Note that if a socket closes with the internal counter holding
+ * 		a non-zero value, this is not a problem because data is not
+ * 		being buffered for *bytes* and is sent as it is received.
+ * 	Return
+ * 		0
+ * 	For
+ * 		Programs of type **BPF_PROG_TYPE_SK_MSG**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.17
+ *
+ * .. commit 2a100317c9ebc
+ *
+ * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes)
+ * 	Description
+ * 		For socket policies, prevent the execution of the verdict eBPF
+ * 		program for message *msg* until *bytes* (byte number) have been
+ * 		accumulated.
+ *
+ * 		This can be used when one needs a specific number of bytes
+ * 		before a verdict can be assigned, even if the data spans
+ * 		multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme
+ * 		case would be a user calling **sendmsg**\ () repeatedly with
+ * 		1-byte long message segments. Obviously, this is bad for
+ * 		performance, but it is still valid. If the eBPF program needs
+ * 		*bytes* bytes to validate a header, this helper can be used to
+ * 		prevent the eBPF program to be called again until *bytes* have
+ * 		been accumulated.
+ * 	Return
+ * 		0
+ * 	For
+ * 		Programs of type **BPF_PROG_TYPE_SK_MSG**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.17
+ *
+ * .. commit 91843d540a139
+ *
+ * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags)
+ * 	Description
+ * 		For socket policies, pull in non-linear data from user space
+ * 		for *msg* and set pointers *msg*\ **->data** and *msg*\
+ * 		**->data_end** to *start* and *end* bytes offsets into *msg*,
+ * 		respectively.
+ *
+ * 		If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
+ * 		*msg* it can only parse data that the (**data**, **data_end**)
+ * 		pointers have already consumed. For **sendmsg**\ () hooks this
+ * 		is likely the first scatterlist element. But for calls relying
+ * 		on the **sendpage** handler (e.g. **sendfile**\ ()) this will
+ * 		be the range (**0**, **0**) because the data is shared with
+ * 		user space and by default the objective is to avoid allowing
+ * 		user space to modify data while (or after) eBPF verdict is
+ * 		being decided. This helper can be used to pull in data and to
+ * 		set the start and end pointer to given values. Data will be
+ * 		copied if necessary (i.e. if data was not linear and if start
+ * 		and end pointers do not point to the same chunk).
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of type **BPF_PROG_TYPE_SK_MSG**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.17
+ *
+ * .. commit 015632bb30daa
+ *
+ * int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len)
+ * 	Description
+ * 		Bind the socket associated to *ctx* to the address pointed by
+ * 		*addr*, of length *addr_len*. This allows for making outgoing
+ * 		connection from the desired IP address, which can be useful for
+ * 		example when all processes inside a cgroup should use one
+ * 		single IP address on a host that has multiple IP configured.
+ *
+ * 		This helper works for IPv4 and IPv6, TCP and UDP sockets. The
+ * 		domain (*addr*\ **->sa_family**) must be **AF_INET** (or
+ * 		**AF_INET6**). Looking for a free port to bind to can be
+ * 		expensive, therefore binding to port is not permitted by the
+ * 		helper: *addr*\ **->sin_port** (or **sin6_port**, respectively)
+ * 		must be set to zero.
+ *
+ * 		As for the remote end, both parts of it can be overridden,
+ * 		remote IP and remote port. This can be useful if an application
+ * 		inside a cgroup wants to connect to another application inside
+ * 		the same cgroup or to itself, but knows nothing about the IP
+ * 		address assigned to the cgroup.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ * 	For
+ * 		Programs of type **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**.
+ * 		The attach type of the programs must be
+ * 		**BPF_CGROUP_INET4_CONNECT** or
+ * 		**BPF_CGROUP_INET6_CONNECT**.
+ * 	GPL only
+ * 		No
+ * 	Since
+ * 		Linux 4.17
+ *
+ * .. commit d74bad4e74ee3
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
new file mode 100755
index 000000000000..aed59793a9b9
--- /dev/null
+++ b/scripts/bpf_helpers_doc.py
@@ -0,0 +1,568 @@
+#!/usr/bin/python3
+#
+# Copyright (C) 2018 Netronome Systems, Inc.
+#
+# This software is licensed under the GNU General License Version 2,
+# June 1991 as shown in the file COPYING in the top-level directory of this
+# source tree.
+#
+# THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS"
+# WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
+# BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE
+# OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
+# THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+# In case user attempts to run with Python 2.
+from __future__ import print_function
+
+import argparse
+import re
+import sys, os
+
+class NoHelperFound(BaseException):
+    pass
+
+class ParsingError(BaseException):
+    def __init__(self, line='<line not provided>', reader=None):
+        if reader:
+            BaseException.__init__(self,
+                                   'Error at file offset %d, parsing line: %s' %
+                                   (reader.tell(), line))
+        else:
+            BaseException.__init__(self, 'Error parsing line: %s' % line)
+
+class Helper(object):
+    """
+    An object representing the description of an eBPF helper function.
+    @proto: function prototype of the helper function
+    @desc: textual description of the helper function
+    @ret: description of the return value of the helper function
+    @progtype: listing of the eBPF program types compatible with the helper
+    @gpl: attribute telling if this helper is only compatible with GPL programs
+    @since: Linux kernel version in which this helper was introduced
+    @commit: commit that introduced this helper in the source
+    """
+    def __init__(self, proto='', desc='', ret='', progtype='', gpl='',
+                 since='', commit=''):
+        self.proto = proto
+        self.desc = desc
+        self.ret = ret
+        self.progtype = progtype
+        self.gpl = gpl
+        self.since = since
+        self.commit = commit
+
+    def proto_break_down(self):
+        """
+        Break down helper function protocol into smaller chunks: return type,
+        name, distincts arguments.
+        """
+        arg_re = re.compile('^((const )?(struct )?(\w+|...))( (\**)(\w+))?$')
+        res = {}
+        proto_re = re.compile('^(.+) (\**)(\w+)\(((([^,]+)(, )?){1,5})\)$')
+
+        capture = proto_re.match(self.proto)
+        res['ret_type'] = capture.group(1)
+        res['ret_star'] = capture.group(2)
+        res['name']     = capture.group(3)
+        res['args'] = []
+
+        args    = capture.group(4).split(', ')
+        for a in args:
+            capture = arg_re.match(a)
+            res['args'].append({
+                'type' : capture.group(1),
+                'star' : capture.group(6),
+                'name' : capture.group(7)
+            })
+
+        return res
+
+class HeaderParser(object):
+    """
+    An object used to parse a file in order to extract the documentation of a
+    list of eBPF helper functions. All the helpers that can be retrieved are
+    stored as Helper object, in the self.helpers() array.
+    @filename: name of file to parse, usually include/uapi/linux/bpf.h in the
+               kernel tree
+    """
+    def __init__(self, filename):
+        self.reader = open(filename, 'r')
+        self.line = ''
+        self.helpers = []
+
+    def parse_helper(self):
+        proto    = self.parse_proto()
+        desc     = self.parse_desc()
+        ret      = self.parse_ret()
+        progtype = self.parse_progtype()
+        gpl      = self.parse_gpl()
+        since    = self.parse_since()
+        commit   = self.parse_commit()
+        return Helper(proto=proto, desc=desc, ret=ret, progtype=progtype,
+                      gpl=gpl, since=since, commit=commit)
+
+    def parse_proto(self):
+        # Argument can be of shape:
+        #   - "void"
+        #   - "type  name"
+        #   - "type *name"
+        #   - Same as above, with "const" and/or "struct" in front of type
+        #   - "..." (undefined number of arguments, for bpf_trace_printk())
+        # There is at least one term ("void"), and at most five arguments.
+        p = re.compile('^ \* ((.+) \**\w+\((((const )?(struct )?(\w+|\.\.\.)( \**\w+)?)(, )?){1,5}\))$')
+        capture = p.match(self.line)
+        if not capture:
+            raise NoHelperFound
+        self.line = self.reader.readline()
+        return capture.group(1)
+
+    def parse_desc(self):
+        p = re.compile('^ \* \tDescription$')
+        capture = p.match(self.line)
+        if not capture:
+            # Helper can have empty description and we might be parsing another
+            # attribute: return but do not consume.
+            return ''
+        # Description can be several lines, some of them possibly empty, and it
+        # stops when another subsection title is met.
+        desc = ''
+        while True:
+            self.line = self.reader.readline()
+            if self.line == ' *\n':
+                desc += '\n'
+            else:
+                p = re.compile('^ \* \t\t(.*)')
+                capture = p.match(self.line)
+                if capture:
+                    desc += capture.group(1) + '\n'
+                else:
+                    break
+        return desc
+
+    def parse_ret(self):
+        p = re.compile('^ \* \tReturn$')
+        capture = p.match(self.line)
+        if not capture:
+            # Helper can have empty retval and we might be parsing another
+            # attribute: return but do not consume.
+            return ''
+        # Return value description can be several lines, some of them possibly
+        # empty, and it stops when another subsection title is met.
+        ret = ''
+        while True:
+            self.line = self.reader.readline()
+            if self.line == ' *\n':
+                ret += '\n'
+            else:
+                p = re.compile('^ \* \t\t(.*)')
+                capture = p.match(self.line)
+                if capture:
+                    ret += capture.group(1) + '\n'
+                else:
+                    break
+        return ret
+
+    def parse_progtype(self):
+        p = re.compile('^ \* \tFor$')
+        capture = p.match(self.line)
+        if not capture:
+            raise ParsingError(self.line, self.reader)
+        # List of supported program types can be several lines, and it stops
+        # when another subsection title is met.
+        progtype = ''
+        while True:
+            self.line = self.reader.readline()
+            if self.line == ' *\n':
+                progtype += '\n'
+            else:
+                p = re.compile('^ \* \t\t(.*)')
+                capture = p.match(self.line)
+                if capture:
+                    progtype += capture.group(1) + '\n'
+                else:
+                    break
+        return progtype
+
+    def parse_gpl(self):
+        p = re.compile('^ \* \tGPL only$')
+        capture = p.match(self.line)
+        if not capture:
+            raise ParsingError(self.line, self.reader)
+        # GPL info is "Yes" or "No".
+        self.line = self.reader.readline()
+        p = re.compile('^ \* \t\t(Yes|No)$')
+        capture = p.match(self.line)
+        if not capture:
+            raise ParsingError(self.line, self.reader)
+        self.line = self.reader.readline()
+        return capture.group(1)
+
+    def parse_since(self):
+        p = re.compile('^ \* \tSince$')
+        capture = p.match(self.line)
+        if not capture:
+            raise ParsingError(self.line, self.reader)
+        # Minimal kernel version is usually just "Linux Major.minor"...
+        self.line = self.reader.readline()
+        p = re.compile('^ \* \t\tLinux ([0-9.]+)$')
+        capture = p.match(self.line)
+        if capture:
+            since = capture.group(1)
+            self.line = self.reader.readline()
+            self.line = self.reader.readline()
+        # ... But there are exceptions, spanning on several lines.
+        else:
+            since = ''
+            while True:
+                if self.line == ' *\n':
+                    since += '\n'
+                else:
+                    p = re.compile('^ \* \t\t(.*)')
+                    capture = p.match(self.line)
+                    if capture:
+                        since += capture.group(1) + '\n'
+                    else:
+                        break
+                self.line = self.reader.readline()
+        if not since:
+            raise ParsingError(self.line, self.reader)
+        return since
+
+    def parse_commit(self):
+        # Commit is a in a RST comment.
+        p = re.compile('^ \* .. commit ([a-f0-9]+)$')
+        capture = p.match(self.line)
+        if not capture:
+            raise ParsingError(self.line, self.reader)
+        self.line = self.reader.readline()
+        self.line = self.reader.readline()
+        return capture.group(1)
+
+    def run(self):
+        # Advance to start of helper function descriptions.
+        offset = self.reader.read().find('/* BPF helper function descriptions:')
+        if offset == -1:
+            raise Exception('Could not find start of eBPF helper descriptions list')
+        self.reader.seek(offset)
+        self.reader.readline()
+        self.reader.readline()
+        self.line = self.reader.readline()
+
+        while True:
+            try:
+                helper = self.parse_helper()
+                self.helpers.append(helper)
+            except NoHelperFound:
+                break
+
+        self.reader.close()
+        print('Parsed description of %d helper function(s)' % len(self.helpers),
+              file=sys.stderr)
+
+###############################################################################
+
+class Printer(object):
+    """
+    A generic class for printers. Printers should be created with an array of
+    Helper objects, and implement a way to print them in the desired fashion.
+    @helpers: array of Helper objects to print to standard output
+    """
+    def __init__(self, helpers):
+        self.helpers = helpers
+
+    def print_header(self):
+        pass
+
+    def print_footer(self):
+        pass
+
+    def print_one(self, helper):
+        pass
+
+    def print_all(self):
+        self.print_header()
+        for helper in self.helpers:
+            self.print_one(helper)
+        self.print_footer()
+
+class PrinterRST(Printer):
+    """
+    A printer for dumping collected information about helpers as a ReStructured
+    Text page compatible with the rst2man program, which can be used to
+    generate a manual page for the helpers.
+    @helpers: array of Helper objects to print to standard output
+    """
+    def print_header(self):
+        header = '''\
+.. Copyright (C) 2018 Netronome Systems, Inc.
+.. 
+.. %%%LICENSE_START(VERBATIM)
+.. Permission is granted to make and distribute verbatim copies of this
+.. manual provided the copyright notice and this permission notice are
+.. preserved on all copies.
+.. 
+.. Permission is granted to copy and distribute modified versions of this
+.. manual under the conditions for verbatim copying, provided that the
+.. entire resulting derived work is distributed under the terms of a
+.. permission notice identical to this one.
+.. 
+.. Since the Linux kernel and libraries are constantly changing, this
+.. manual page may be incorrect or out-of-date.  The author(s) assume no
+.. responsibility for errors or omissions, or for damages resulting from
+.. the use of the information contained herein.  The author(s) may not
+.. have taken the same level of care in the production of this manual,
+.. which is licensed free of charge, as they might when working
+.. professionally.
+.. 
+.. Formatted or processed versions of this manual, if unaccompanied by
+.. the source, must acknowledge the copyright and authors of this work.
+.. %%%LICENSE_END
+.. 
+.. Please do not edit this file. It was generated from the documentation
+.. located in file include/uapi/linux/bpf.h of the Linux kernel sources
+.. (helpers description), and from scripts/bpf_helpers_doc.py in the same
+.. repository (header and footer).
+
+===========
+BPF-HELPERS
+===========
+-------------------------------------------------------------------------------
+list of eBPF helper functions
+-------------------------------------------------------------------------------
+
+:Manual section: 7
+
+DESCRIPTION
+===========
+
+The extended Berkeley Packet Filter (eBPF) subsystem consists in programs
+written in a pseudo-assembly language, then attached to one of the several
+kernel hooks and run in reaction of specific events. This framework differs
+from the older, "classic" BPF (or "cBPF") in several aspects, one of them being
+the ability to call special functions (or "helpers") from within a program. For
+security reasons, these functions are restricted to a white-list of helpers
+defined in the kernel.
+
+These helpers are used by eBPF programs to interact with the system, or with
+the context in which they work. For instance, they can be used to print
+debugging messages, to get the time since the system was booted, to interact
+with eBPF maps, or to manipulate network packets metadata. Since there are
+several eBPF program types, and that they do not run in the same context, each
+program type can only call a subset of those helpers.
+
+Due to eBPF conventions, a helper can not have more than five arguments.
+
+This document is an attempt to list and document the helpers available to eBPF
+developers. They are sorted by chronological order (the oldest helpers in the
+kernel at the top).
+
+PROGRAM TYPES
+=============
+
+In the description of the helpers below, "*tracing programs*" stands for
+programs of types:
+**BPF_PROG_TYPE_KPROBE**,
+**BPF_PROG_TYPE_TRACEPOINT**,
+**BPF_PROG_TYPE_PERF_EVENT**,
+and **BPF_PROG_TYPE_RAW_TRACEPOINT**.
+
+The set of "*networking programs*" encompasses programs of types
+**BPF_PROG_TYPE_SOCKET_FILTER**,
+**BPF_PROG_TYPE_SCHED_CLS**,
+**BPF_PROG_TYPE_SCHED_ACT**,
+**BPF_PROG_TYPE_LWT_IN**,
+**BPF_PROG_TYPE_LWT_OUT**,
+**BPF_PROG_TYPE_LWT_XMIT**,
+**BPF_PROG_TYPE_SOCK_OPS**,
+**BPF_PROG_TYPE_SK_SKB**,
+**BPF_PROG_TYPE_CGROUP_SOCK**,
+**BPF_PROG_TYPE_CGROUP_SKB**,
+**BPF_PROG_TYPE_CGROUP_SOCK_ADDR**,
+**BPF_PROG_TYPE_XDP**,
+and **BPF_PROG_TYPE_SK_MSG**.
+
+.. BPF_PROG_TYPE_LWT_SEG6LOCAL
+.. BPF_BTF_LOAD
+
+Program type **BPF_PROG_TYPE_CGROUP_DEVICE** allows very few helpers, and
+enters in none of those categories for the purpose of the description.
+
+LICENCE
+=======
+
+Some helpers are for "GPL-only" eBPF programs. These functions rely on internal
+kernel components that can be only used in accordance to the GNU Privacy
+License (GPL), and so eBPF programs must themselves be compatible with this
+license to be allowed to use such helpers (The licensing rules are the same as
+for kernel modules, so that also dual licenses, such as "Dual BSD/GPL", may be
+used). This requires the programs are loaded with the correct license string
+passed (via **attr**) to the **bpf**\ () system call, and generally translates
+into the C source code of such programs containing a line similar similar to
+the following:
+
+::
+
+	char ____license[] __attribute__((section("license"), used)) = "GPL";
+
+HELPERS
+=======
+'''
+        print(header)
+
+    def print_footer(self):
+        footer = '''
+NOTES
+=====
+
+On the performance side, eBPF programs move to the stack all arguments to pass
+to the helpers, and call directly into the compiled helper functions without
+requiring any foreign-function interface. As a result, calling helpers
+introduce very little overhead.
+
+EXAMPLES
+========
+
+Example usage for most of the eBPF helpers listed in this manual page are
+available within the Linux kernel sources, at the following locations:
+
+* *samples/bpf/*
+* *tools/testing/selftests/bpf/*
+
+IMPLEMENTATION
+==============
+
+This manual page is an effort to document the existing eBPF helper functions.
+But as of this writing, the BPF sub-system is under heavy development. New eBPF
+program or map types are added, along with new helper functions. Some helpers
+are occasionally made available for additional program types. So in spite of
+the efforts of the community, this page might not be up-to-date. If you want to
+check by yourself what helper functions exist in your kernel, or what types of
+programs they can support, here are some files among the kernel tree that you
+may be interested in:
+
+* *include/uapi/linux/bpf.h* contains the full list of all helper functions.
+* *net/core/filter.c* contains the definition of most network-related helper
+  functions, and the list of program types from which they can be used.
+* *kernel/trace/bpf_trace.c* is the equivalent for most tracing program-related
+  helpers.
+* *kernel/bpf/verifier.c* contains the functions used to check that valid types
+  of eBPF maps are used with a given helper function.
+* *kernel/bpf/* directory contains other files in which additional helpers are
+  defined (for cgroups, sockmaps, etc.).
+
+Compatibility between helper functions and program types can generally be found
+in the files where helper functions are defined. Look for the **struct
+bpf_func_proto** objects and for functions returning them: these functions
+contain a list of helpers that a given program type can call. Note that the
+**default:** label of the **switch ... case** used to filter helpers can call
+other functions, themselves allowing access to additional helpers. The
+requirement for GPL license is also in those **struct bpf_func_proto**.
+
+Compatibility between helper functions and map types can be found in the
+**check_map_func_compatibility**\ () function in file *kernel/bpf/verifier.c*.
+
+Helper functions that invalidate the checks on **data** and **data_end**
+pointers for network processing are listed in function
+**bpf_helper_changes_pkt_data**\ () in file *net/core/filter.c*.
+
+SEE ALSO
+========
+
+**bpf**\ (2),
+**cgroups**\ (7),
+**ip**\ (8),
+**perf_event_open**\ (2),
+**sendmsg**\ (2),
+**socket**\ (7),
+**tc-bpf**\ (8)'''
+        print(footer)
+
+    def print_proto(self, helper):
+        """
+        Format function protocol with bold and italics markers. This makes RST
+        file less readable, but gives nice results in the manual page.
+        """
+        proto = helper.proto_break_down()
+
+        print('**%s %s%s(' % (proto['ret_type'],
+                              proto['ret_star'].replace('*', '\\*'),
+                              proto['name']),
+              end='')
+
+        comma = ''
+        for a in proto['args']:
+            one_arg = '{}{}'.format(comma, a['type'])
+            if a['name']:
+                if a['star']:
+                    one_arg += ' {}**\ '.format(a['star'].replace('*', '\\*'))
+                else:
+                    one_arg += '** '
+                one_arg += '*{}*\\ **'.format(a['name'])
+            comma = ', '
+            print(one_arg, end='')
+
+        print(')**')
+
+    def print_one(self, helper):
+        self.print_proto(helper)
+
+        if (helper.desc):
+            print('\tDescription')
+            # Do not strip all newline characters: formatted code at the end of
+            # a section must be followed by a blank line.
+            for line in re.sub('\n$', '', helper.desc, count=1).split('\n'):
+                print('{}{}'.format('\t\t' if line else '', line))
+
+        if (helper.ret):
+            print('\tReturn')
+            for line in helper.ret.rstrip().split('\n'):
+                print('{}{}'.format('\t\t' if line else '', line))
+
+        print('\tFor')
+        for line in helper.progtype.rstrip().split('\n'):
+            print('{}{}'.format('\t\t' if line else '', line))
+
+        print('\tGPL only')
+        print('\t\t%s' % helper.gpl)
+
+        print('\tSince')
+        if re.match(r'^[0-9.]+$', helper.since):
+            print('\t\tLinux %s' % helper.since)
+        else:
+            for line in helper.since.rstrip().split('\n'):
+                print('{}{}'.format('\t\t' if line else '', line))
+
+        print('')
+        print('.. commit %s' % helper.commit)
+        print('')
+
+###############################################################################
+
+# If script is launched from scripts/ from kernel tree and can access
+# ../include/uapi/linux/bpf.h, use it as a default name for the file to parse,
+# otherwise the --filename argument will be required from the command line.
+script = os.path.abspath(sys.argv[0])
+linuxRoot = os.path.dirname(os.path.dirname(script))
+bpfh = os.path.join(linuxRoot, 'include/uapi/linux/bpf.h')
+
+argParser = argparse.ArgumentParser(description="""
+Parse eBPF header file and generate documentation for eBPF helper functions.
+The RST-formatted output produced can be turned into a manual page with the
+rst2man utility.
+""")
+if (os.path.isfile(bpfh)):
+    argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h',
+                           default=bpfh)
+else:
+    argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h')
+args = argParser.parse_args()
+
+# Parse file.
+headerParser = HeaderParser(args.filename)
+headerParser.run()
+
+# Print formatted output to standard output.
+printer = PrinterRST(headerParser.helpers)
+printer.print_all()
-- 
2.14.1

^ permalink raw reply related

* Re: [PATCH 2/2] net: phy: dp83640: Read strapped configuration settings
From: Esben Haabendal @ 2018-04-06 11:05 UTC (permalink / raw)
  To: David Miller
  Cc: andrew, f.fainelli, richardcochran, netdev, linux-kernel,
	rasmus.villemoes
In-Reply-To: <20180405.222437.1573582680811455269.davem@davemloft.net>

David Miller <davem@davemloft.net> writes:

> From: Andrew Lunn <andrew@lunn.ch>
> Date: Thu, 5 Apr 2018 22:40:49 +0200
>
>> Or could it still contain whatever state the last boot of Linux, or
>> maybe the bootloader, left the PHY in?
>
> Right, this is my concern as well.

I don't think that should happen.
With config_init() being called (in phy_init_hw()) after soft_reset(),
any state set by software should be cleared.

>From DP83620 datasheet description of what happens when BMCR_RESET is
set:

    The software reset will reset the device such that all registers
    will be reset to default values and the hardware configuration
    values will be maintained.

But something else that could be a concern is the risk that there is
boards out there with wrong hardware configuration, which works with
current Linux (because it ignores hardware configuration).  Such designs
could break with this patch.

If we need to safeguard against that, maybe we could just keep the
genphy_read_config() function in the kernel, and let board specific code
use it as a phy_fixup where hardware configuration is to be respected.

Would that be a better approach?

/Esben

^ permalink raw reply

* Re: WARNING: CPU: 3 PID: 0 at net/sched/sch_hfsc.c:1388 hfsc_dequeue+0x319/0x350 [sch_hfsc]
From: Marco Berizzi @ 2018-04-06 10:53 UTC (permalink / raw)
  To: Jamal Hadi Salim, Cong Wang; +Cc: Linux Kernel Network Developers, Eric Dumazet
In-Reply-To: <11d36e94-0b07-425d-78d1-dd41b60a586f@mojatatu.com>

> Il 19 marzo 2018 alle 11.07 Jamal Hadi Salim <jhs@mojatatu.com> ha scritto:
> 
> On 18-03-15 08:48 PM, Cong Wang wrote:
> 
> > On Wed, Mar 14, 2018 at 1:10 AM, Marco Berizzi <pupilla@libero.it> wrote:
> > 
> > > > Il 9 marzo 2018 alle 0.14 Cong Wang <xiyou.wangcong@gmail.com> ha scritto:
> > 
> > It has been reported here:
> > https://bugzilla.kernel.org/show_bug.cgi?id=109581
> > 
> > And there is a workaround from Konstantin:
> > https://patchwork.ozlabs.org/patch/803885/
> > 
> > Unfortunately I don't think that is a real fix, we probably need to
> > fix HFSC itself rather than just workaround the qlen==0. It is not
> > trivial since HFSC implementation is not easy to understand.
> > Maybe Jamal knows better than me.
> 
> Sorry for the latency - I looked at this on the plane and it is very
> specific to fq/codel. It is not clear to me why codel needs this but
> i note it has been there from the initial commit and from that
> perspective the patch looks reasonable. In any case:
> Punting it to Eric (on Cc).
> 
> cheers,
> jamal

Hello everyone,

About this bugzilla report https://bugzilla.kernel.org/show_bug.cgi?id=109581

I'm getting this error after 4.16-rc4 (also 4.16.0 is affected).
Till 4.15.7 I did not get this error message (linux 4.15.[1,2,3,4,5,6,7] is fine).

The bugzilla report is about linux 4.3.3

^ permalink raw reply

* Best userspace programming API for XDP features query to kernel?
From: Daniel Borkmann via iovisor-dev @ 2018-04-06 10:36 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: oisf-devel-ZwoEplunGu2j570ONfqVQLVmwVP6tfMwSoIsB4E12gc,
	Jakub Kicinski, Alexei Starovoitov, Victor Julien,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	iovisor-dev-9jONkmmOlFHEE9lA1F8Ukti2O/JbrIOy@public.gmane.org,
	Peter Manev, Jiri Benc, Saeed Mahameed, Eric Leblond,
	Daniel Borkmann
In-Reply-To: <20180405225133.18a09883-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

On 04/05/2018 10:51 PM, Jesper Dangaard Brouer wrote:
> On Thu, 5 Apr 2018 12:37:19 +0200
> Daniel Borkmann <daniel-FeC+5ew28dpmcu3hnIyYJQ@public.gmane.org> wrote:
> 
>> On 04/04/2018 02:28 PM, Jesper Dangaard Brouer via iovisor-dev wrote:
>>> Hi Suricata people,
>>>
>>> When Eric Leblond (and I helped) integrated XDP in Suricata, we ran
>>> into the issue, that at Suricata load/start time, we cannot determine
>>> if the chosen XDP config options, like xdp-cpu-redirect[1], is valid on
>>> this HW (e.g require driver XDP_REDIRECT support and bpf cpumap).
>>>
>>> We would have liked a way to report that suricata.yaml config was
>>> invalid for this hardware/setup.  Now, it just loads, and packets gets
>>> silently dropped by XDP (well a WARN_ONCE and catchable via tracepoints).
>>>
>>> My question to suricata developers: (Q1) Do you already have code that
>>> query the kernel or drivers for features?
>>>
>>> At the IOvisor call (2 weeks ago), we discussed two options of exposing
>>> XDP features avail in a given driver.
>>>
>>> Option#1: Extend existing ethtool -k/-K "offload and other features"
>>> with some XDP features, that userspace can query. (Do you already query
>>> offloads, regarding Q1)
>>>
>>> Option#2: Invent a new 'ip link set xdp' netlink msg with a query option.  
>>
>> I don't really mind if you go via ethtool, as long as we handle this
>> generically from there and e.g. call the dev's ndo_bpf handler such that
>> we keep all the information in one place. This can be a new ndo_bpf command
>> e.g. XDP_QUERY_FEATURES or such.
> 
> Just to be clear: notice as Victor points out[2], they are programmable
> going though the IOCTL (SIOCETHTOOL) and not using cmdline tools.

Sure, that was perfectly clear. (But at the same time if you extend the
ioctl, it's obvious to also add support to actual ethtool cmdline tool.)

> [2] https://github.com/OISF/suricata/blob/master/src/util-ioctl.c#L326
> 
> If you want everything to go through the drivers ndo_bpf call anyway
> (which userspace API is netlink based) then at what point to you

Not really, that's the front end. ndo_bpf itself is a plain netdev op
and has no real tie to netlink.

> want drivers to call their own ndo_bpf, when activated though their
> ethtool_ops ? (Sorry, but I don't follow the flow you are proposing)
> 
> Notice, I'm not directly against using the drivers ndo_bpf call.  I can
> see it does provide kernel more flexibility than the ethtool IOCTL.

What I was saying is that even if you go via ethtool ioctl api, where
you end up in dev_ethtool() and have some new ETHTOOL_* query command,
then instead of adding a new ethtool_ops callback, we can and should
reuse ndo_bpf from there.

[...]
> Here, I want to discuss how drivers expose/tell userspace that they
> support a given feature: Specifically a bit for: XDP_REDIRECT action
> support.
> 
>> Same for meta data,
> 
> Well, not really.  It would be a "nice-to-have", but not strictly
> needed as a feature bit.  XDP meta-data is controlled via a helper.
> And the BPF-prog can detect/see runtime, that the helper bpf_xdp_adjust_meta
> returns -ENOTSUPP (and need to check the ret value anyhow).  Thus,
> there is that not much gained by exposing this to be detected setup
> time, as all drivers should eventually support this, and we can detect
> it runtime.
> 
> The missing XDP_REDIRECT action features bit it different, as the
> BPF-prog cannot detect runtime that this is an unsupported action.
> Plus, setup time we cannot query the driver for supported XDP actions.

Ok, so with the example of meta data, you're arguing that it's okay
to load a native XDP program onto a driver, and run actual traffic on
the NIC in order probe for the availability of the feature when you're
saying that it "can detect/see [at] runtime". I totally agree with you
that all drivers should eventually support this (same with XDP_REDIRECT),
but today there are even differences in drivers on bpf_xdp_adjust_meta()/
bpf_xdp_adjust_head() with regards to how much headroom they have available,
etc (e.g. some of them have none), so right now you can either go and
read the code or do a runtime test with running actual traffic through
the NIC to check whether your BPF prog is supported or not. Theoretically,
you can do the same runtime test with XDP_REDIRECT (taking the warn in
bpf_warn_invalid_xdp_action() aside for a moment), but you do have the
trace_xdp_exception() tracepoint to figure it out, yes, it's a painful
hassle, but overall, it's not that different as you were trying to argue
here. For /both/ cases it would be nice to know at setup time whether
this would be supported or not. Hence, such query is not just limited to
XDP_REDIRECT alone. Eventually once such interface is agreed upon,
undoubtedly the list of feature bits will grow is what I'm trying to say;
only arguing on the XDP_REDIRECT here would be short term.

[...]
>> What about keeping this high level to users? E.g. say you have 2 options
>> that drivers can expose as netdev_features_strings 'xdp-native-full' or
>> 'xdp-native-partial'. If a driver truly supports all XDP features for a
>> given kernel e.g. v4.16, then a query like 'ethtool -k foo' will say
>> 'xdp-native-full', if at least one feature is missing to be feature complete
>> from e.g. above list, then ethtool will tell 'xdp-native-partial', and if
>> not even ndo_bpf callback exists then no 'xdp-native-*' is reported.
> 
> I use-to-be, an advocate for this.  I even think I send patches
> implementing this. Later, I've realized that this model is flawed.
> 
> When e.g. suricata loads it need to look at both "xdp-native-full" and
> the kernel version, to determine if XDP_REDIRECT action is avail.
> Later when a new kernel version gets released, the driver is missing a
> new XDP feature.  Then suricata, which doesn't use/need the new
> feature, need to be updated, to check that kernel below this version,
> with 'xdp-native-partial' and this NIC driver is still okay.  Can you
> see the problem?
> 
> Even if Suricate goes though the pain of keeping track of kernel
> version vs drivers vs xdp-native-full/partial.  Then, they also want to
> run their product on distro kernels.  They might convince some distro,
> to backport some XDP features they need.  So, now they also need to
> keep track of distro kernel minor versions... and all they really
> wanted as a single feature bit saying if the running NIC driver
> supports the XDP_REDIRECT action code.

Yep, agree it's not pretty, not claiming any of this is. You kind of
need to be aware of the underlying kernel, similar to the tracing case.
The underlying problem is effectively the decoupling of program verification
that doesn't have/know the context of where it is being attached to in
this case. Thinking out loud for a sec on couple of other options aside
from feature bits, what about i) providing the target ifindex to the
verifier for XDP programs, such that at verification time you have the
full context similar to nfp offload case today, or ii) populating some
XDP specific auxillary data to the BPF program at verification time such
that the driver can check at program attach time whether the requested
features are possible and if not it will reject and respond with netlink
extack message to the user (as we do in various XDP attach cases already
through XDP_SETUP_PROG{,_HW}).

This would, for example, avoid the need for feature bits, and do actual
rejection of the program while retaining flexibility (and avoiding to
expose bits that over time hopefully will deprecate anyway due to all
XDP aware drivers implementing them). For both cases i) and ii), it
would mean we make the verifier a bit smarter with regards to keeping
track of driver related (XDP) requirements. Program return code checking
is already present in the verifier's check_return_code() and we could
extend it for XDP as well, for example. Seems cleaner and more extensible
than feature bits, imho.

Thanks,
Daniel

^ permalink raw reply

* TCP one-by-one acking - RFC interpretation question
From: Michal Kubecek @ 2018-04-06 10:05 UTC (permalink / raw)
  To: netdev

Hello,

I encountered a strange behaviour of some (non-linux) TCP stack which
I believe is incorrect but support engineers from the company producing
it claim is OK.

Assume a client (sender, Linux 4.4 kernel) sends a stream of MSS sized
segments but segments 2, 4 and 6 do not reach the server (receiver):

         ACK             SAK             SAK             SAK
      +-------+-------+-------+-------+-------+-------+-------+
      |   1   |   2   |   3   |   4   |   5   |   6   |   7   |
      +-------+-------+-------+-------+-------+-------+-------+
    34273   35701   37129   38557   39985   41413   42841   44269

When segment 2 is retransmitted after RTO timeout, normal response would
be ACK-ing segment 3 (38557) with SACK for 5 and 7 (39985-41413 and
42841-44269).

However, this server stack responds with two separate ACKs:

  - ACK 37129, SACK 37129-38557 39985-41413 42841-44269
  - ACK 38557, SACK 39985-41413 42841-44269

There is no payload from server, no window update and it happens even if
there is no other packet received by server between those two. The
result is that as segment 3 was never retransmitted, second ACK is
interpreted as acking a newly arrived segment by 4.4 kernel so that the
whole interval between first transmission of segment 3 and this second
ACK is used for RTT estimator; even worse, when the same happens again
for segment 5, both timeouts (for 2 and 4) are counted into its RTT.
The result is RTO growing exponentially until it reaches the maximum
(120 seconds) and the connection is effectively stalled.

In my opinion, server behaviour violates the last paragraph of RFC 5681,
section 4.2:

  A TCP receiver MUST NOT generate more than one ACK for every incoming
  segment, other than to update the offered window as the receiving
  application consumes new data (see [RFC813] and page 42 of [RFC793]).

Server vendor claims that their behaviour is correct as first ACK is
sent in response to segment 2 and second ACK in response to segment 3
(which has just been delayed in the out of order queue).

Note that SACK doesn't really help here. First SACK block in first ACK
(37129-38557) is actually invalid as it violates the "the bytes just
below the block ... have not been received" condition from RFC 2018
section 3. Therefore Linux 4.4 stack ignores this SACK block, detects
(spurious) SACK reneging and unmarks the "previously sacked" flag of
segment 3 so that when second ACK arrives, there is no trace of it
having been sacked before. They already admitted this SACK block is
incorrect but there is still disagreement about the "one-by-one acking"
behaviour in general.

My question is: is my interpretation correct? If so, is there an even
less ambiguous statement somewhere that receiver is supposed to send one
ACK for "everything they got so far" rather than acking the segments one
by one? While reading the RFCs, I always considered this obvious but
apparently some people may think otherwise.

Thanks in advance,
Michal Kubecek

^ permalink raw reply

* tcp hang when socket fills up ?
From: Dominique Martinet @ 2018-04-06  9:07 UTC (permalink / raw)
  To: netdev

(current kernel: vanilla 4.14.29)

I've been running into troubles recently where a sockets "fills up" (as
in, select() will no longer return it in its outfd / attempting to write
to it after setting it to NONBLOCK will return -EWOULDBLOCK) and it
doesn't seem to ever "unfill" until the tcp connexion timeout.

The previous time I pushed it down to the application for not trying to
read the socket either as I assume the buffers could be shared and
just waiting won't take data out, but this time I'm a bit more
skeptical as socat waits for the fd in both read and write...

Let me take a minute to describe my setup, I don't think that how the
socket was created matters but it might be interesting:
 - I have two computers behind NATs, no port forwarding on either side
 - One (call it C for client) runs ssh with a proxycommand ncat/socat to
control the source port, e.g.
$ ssh -o ProxyCommand="socat stdio tcp:<server public ip>:<port1>,sourceport=<port2>" server
 - The server runs another socat to connect to that and forwards to ssh
locally, e.g.
$ socat tcp:<client public ip>:<port2>,sourceport=<port1> tcp:127.0.0.1:22

(yes, both are connect() calls, and that just works™ thanks to initial
syn replay and conntrack on routers)

When things stall, the first socat is in a select with both fd in
reading, so it's waiting data.
The second socat has data coming from ssh and is in a select with the
client-facing socket in both read and write - that select never returns
so the socket is not available for reading or writing and does not free
up until the connection eventually times out a few minutes later.

At this point, I only see tcp replays in tcpdump/wireshark. I've
compared dumps from both sides and there are no lost packets, only
reordering - there always is a batch of acks that were sent regularily
coming in shortly before the hang. Here's the trace on the server:

16:49:26.735042 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 70476:71850, ack 4190, win 307, options [nop,nop,TS val 1313937641 ecr 1617129473], length 1374
16:49:26.735046 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 71850:73224, ack 4190, win 307, options [nop,nop,TS val 1313937641 ecr 1617129473], length 1374
16:49:26.735334 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 41622, win 918, options [nop,nop,TS val 1617129478 ecr 1313937609], length 0
16:49:26.736005 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 42996, win 940, options [nop,nop,TS val 1617129478 ecr 1313937609], length 0
16:49:26.736402 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 73224:74598, ack 4190, win 307, options [nop,nop,TS val 1313937643 ecr 1617129473], length 1374
16:49:26.736408 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 74598:75972, ack 4190, win 307, options [nop,nop,TS val 1313937643 ecr 1617129473], length 1374
16:49:26.738561 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 44370, win 963, options [nop,nop,TS val 1617129482 ecr 1313937616], length 0
16:49:26.739539 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 45744, win 986, options [nop,nop,TS val 1617129482 ecr 1313937616], length 0
16:49:26.739882 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 47118, win 1008, options [nop,nop,TS val 1617129484 ecr 1313937617], length 0
16:49:26.740255 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 48492, win 1031, options [nop,nop,TS val 1617129484 ecr 1313937617], length 0
16:49:26.746756 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 49866, win 1053, options [nop,nop,TS val 1617129493 ecr 1313937627], length 0
16:49:26.747923 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 51240, win 1076, options [nop,nop,TS val 1617129494 ecr 1313937627], length 0
16:49:26.749083 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 52614, win 1099, options [nop,nop,TS val 1617129495 ecr 1313937629], length 0
16:49:26.750171 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 53988, win 1121, options [nop,nop,TS val 1617129496 ecr 1313937629], length 0
16:49:26.750808 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 55362, win 1144, options [nop,nop,TS val 1617129497 ecr 1313937629], length 0
16:49:26.754648 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 56736, win 1167, options [nop,nop,TS val 1617129500 ecr 1313937629], length 0
16:49:26.755985 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 58110, win 1189, options [nop,nop,TS val 1617129501 ecr 1313937630], length 0
16:49:26.758513 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 59484, win 1212, options [nop,nop,TS val 1617129502 ecr 1313937630], length 0
16:49:26.759096 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 60858, win 1234, options [nop,nop,TS val 1617129503 ecr 1313937635], length 0
16:49:26.759421 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 62232, win 1257, options [nop,nop,TS val 1617129503 ecr 1313937635], length 0
16:49:26.759755 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 63606, win 1280, options [nop,nop,TS val 1617129504 ecr 1313937636], length 0
16:49:26.760653 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 64980, win 1302, options [nop,nop,TS val 1617129505 ecr 1313937636], length 0
16:49:26.761453 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 66354, win 1325, options [nop,nop,TS val 1617129506 ecr 1313937638], length 0
16:49:26.762199 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 67728, win 1348, options [nop,nop,TS val 1617129507 ecr 1313937638], length 0
16:49:26.763547 IP <client public ip>.31872 > <server local ip>.13317: Flags [P.], seq 4190:4226, ack 67728, win 1348, options [nop,nop,TS val 1617129507 ecr 1313937638], length 36
16:49:26.763553 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 70476, win 1393, options [nop,nop,TS val 1617129508 ecr 1313937639], length 0
16:49:26.764298 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 73224, win 1438, options [nop,nop,TS val 1617129509 ecr 1313937641], length 0
16:49:26.764676 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 75972, win 1444, options [nop,nop,TS val 1617129510 ecr 1313937643], length 0
16:49:26.807754 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 75972:77346, ack 4190, win 307, options [nop,nop,TS val 1313937714 ecr 1617129473], length 1374
16:49:26.876467 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 77346, win 1444, options [nop,nop,TS val 1617129620 ecr 1313937714], length 0
16:49:27.048760 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 32004:33378, ack 4190, win 307, options [nop,nop,TS val 1313937955 ecr 1617129473], length 1374
16:49:27.051791 IP <client public ip>.31872 > <server local ip>.13317: Flags [P.], seq 4190:4226, ack 77346, win 1444, options [nop,nop,TS val 1617129762 ecr 1313937714], length 36
16:49:27.076444 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 77346, win 1444, options [nop,nop,TS val 1617129822 ecr 1313937714,nop,nop,sack 1 {32004:33378}], length 0
16:49:27.371182 IP <client public ip>.31872 > <server local ip>.13317: Flags [P.], seq 4190:4226, ack 77346, win 1444, options [nop,nop,TS val 1617130018 ecr 1313937714], length 36
16:49:27.519862 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 32004:33378, ack 4190, win 307, options [nop,nop,TS val 1313938426 ecr 1617129473], length 1374
16:49:27.547662 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 77346, win 1444, options [nop,nop,TS val 1617130293 ecr 1313937714,nop,nop,sack 1 {32004:33378}], length 0
16:49:27.883372 IP <client public ip>.31872 > <server local ip>.13317: Flags [P.], seq 4190:4226, ack 77346, win 1444, options [nop,nop,TS val 1617130530 ecr 1313937714], length 36
16:49:28.511861 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 32004:33378, ack 4190, win 307, options [nop,nop,TS val 1313939418 ecr 1617129473], length 1374
16:49:28.538891 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 77346, win 1444, options [nop,nop,TS val 1617131285 ecr 1313937714,nop,nop,sack 1 {32004:33378}], length 0
16:49:28.907197 IP <client public ip>.31872 > <server local ip>.13317: Flags [P.], seq 4190:4226, ack 77346, win 1444, options [nop,nop,TS val 1617131554 ecr 1313937714], length 36
16:49:30.431864 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 32004:33378, ack 4190, win 307, options [nop,nop,TS val 1313941338 ecr 1617129473], length 1374
16:49:30.459127 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 77346, win 1444, options [nop,nop,TS val 1617133204 ecr 1313937714,nop,nop,sack 1 {32004:33378}], length 0
16:49:30.955388 IP <client public ip>.31872 > <server local ip>.13317: Flags [P.], seq 4190:4226, ack 77346, win 1444, options [nop,nop,TS val 1617133602 ecr 1313937714], length 36
16:49:34.207879 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 32004:33378, ack 4190, win 307, options [nop,nop,TS val 1313945114 ecr 1617129473], length 1374
16:49:34.235726 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 77346, win 1444, options [nop,nop,TS val 1617136981 ecr 1313937714,nop,nop,sack 1 {32004:33378}], length 0
16:49:35.256285 IP <client public ip>.31872 > <server local ip>.13317: Flags [P.], seq 4190:4226, ack 77346, win 1444, options [nop,nop,TS val 1617137954 ecr 1313937714], length 36
16:49:42.143864 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 32004:33378, ack 4190, win 307, options [nop,nop,TS val 1313953050 ecr 1617129473], length 1374
16:49:42.171531 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 77346, win 1444, options [nop,nop,TS val 1617144917 ecr 1313937714,nop,nop,sack 1 {32004:33378}], length 0
16:49:43.448262 IP <client public ip>.31872 > <server local ip>.13317: Flags [P.], seq 4190:4226, ack 77346, win 1444, options [nop,nop,TS val 1617146146 ecr 1313937714], length 36

(I still have the pcap files if someone wants to see them, but I'd
rather not give my work IP publicly so will send it privately on
request)


At this point, only the same 3 packets keep being replayed over and
over... According to 'ss' the Send-Q isn't empty on either side, the
client has some ~1k to send but the server has much more (87k)
After increasing the window size through net.*wmem sysctl it got stuck
with over 1MB in Send-Q, which makes sense because the socket is full...


So, what I don't get is, why are these acks continuously replayed? Given
the timing it looks like the server doesn't take the client acks into
account, despite having received that precise 33378 ack earlier and I
believe it should accept a higher ack value anyway ?

The ultimate question being, how can I go about debugging that?
I'm working on getting perf probe/crash to work on the server so I can
look at the kernel side now, I can reproduce this semi-easily so I'm
sure I'll get down to it eventually, but if anyone has an idea I'm all
ears.


Thanks!
-- 
Dominique Martinet | Asmadeus

^ permalink raw reply

* Re: [RFC] ethtool: Support for driver private ioctl's
From: Michal Kubecek @ 2018-04-06  9:07 UTC (permalink / raw)
  To: netdev
  Cc: Florian Fainelli, Jose Abreu, David Miller, Jakub Jelinek,
	Jeff Garzik, Tim Hockin, Eli Kupermann, Chris Leech,
	Scott Feldman, Ben Hutchings, Joao Pinto
In-Reply-To: <df7773c4-8695-38dd-bedd-39b88f4aaec1@gmail.com>

On Thu, Apr 05, 2018 at 08:50:49AM -0700, Florian Fainelli wrote:
> On 04/05/2018 03:47 AM, Jose Abreu wrote:
> > Background: Synopsys Ethernet IP's have a certain number of
> > features which can be reconfigured at runtime. Giving you two
> > examples: One of the most recent one is the safety features,
> > which can be enabled/disabled and forced at runtime. Another one
> > is a Flexible RX Parser which can route specific packets to
> > specific RX DMA channels. Given that these are features specific
> > to our IP's it would not be useful to add an uniform API for this
> > because the users would only be one or two drivers ...
> 
> Parsing of packets and directing the matched packets to specific
> queues/channels can be done through ethtool rxnfc API, tc/cls_flower as
> well, so you should really check whether those APIs don't already allow
> you to do what you want.
> 
> ethtool already supports a concept of private  flags, not ioctl() though
> which allows you to toggle boolean values for instance (or technically
> up to how many bits a "flag" is used to represent) is that enough or do
> you need to turn on/off the feature as well as pass configuration
> parameters?

Perhaps introducing "driver/device specific tunables" (i.e. something
like tunables or PHY tunables but specific to a particular device) could
be a way. But it could get out of control quickly and users wouldn't be
happy if they had to set the same (or almost the same) parameter under
five different names for five NIC vendors.

Michal Kubecek

^ permalink raw reply

* Re: Best userspace programming API for XDP features query to kernel?
From: Jesper Dangaard Brouer via iovisor-dev @ 2018-04-06  8:47 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: oisf-devel-ZwoEplunGu2j570ONfqVQLVmwVP6tfMwSoIsB4E12gc,
	Alexei Starovoitov, Victor Julien, netdev-u79uwXL29TY76Z2rM5mHXA,
	iovisor-dev-9jONkmmOlFHEE9lA1F8Ukti2O/JbrIOy@public.gmane.org,
	Peter Manev, Jiri Benc, Saeed Mahameed, Eric Leblond,
	Daniel Borkmann
In-Reply-To: <20180405144716.2bdd4214-68UzVGuGftmUSpRRplVxJ1aTQe2KTcn/@public.gmane.org>

On Thu, 5 Apr 2018 14:47:16 -0700
Jakub Kicinski <jakub.kicinski-wFxRvT7yatFl57MIdRCFDg@public.gmane.org> wrote:

> On Thu, 5 Apr 2018 22:51:33 +0200, Jesper Dangaard Brouer wrote:
> > > What about nfp in terms of XDP
> > > offload capabilities, should they be included as well or is probing to load
> > > the program and see if it loads/JITs as we do today just fine (e.g. you'd
> > > otherwise end up with extra flags on a per BPF helper basis)?    
> > 
> > No, flags per BPF helper basis. As I've described above, helper belong
> > to the BPF core, not the driver.  Here I want to know what the specific
> > driver support.  
> 
> I think Daniel meant for nfp offload.  The offload restrictions are
> quite involved, are we going to be able to express those?

Let's keep thing separate.

I'm requesting something really simple.  I want the driver to tell me
what XDP actions it supports.  We/I can implement an XDP_QUERY_ACTIONS
via ndo_bpf, problem solved.  It is small, specific and simple.

For my other use-case of enabling XDP-xmit on a device, I can
implement another ndo_bpf extension. Current approach today is loading
a dummy XDP prog via ndo_bpf anyway (which is awkward). Again a
specific change that let us move one-step further.

For your nfp offload use-case, you/we have to find a completely
different solution.  You have hit a design choice made by BPF.
Which is that BPF is part of the core kernel, and helpers cannot be
loaded as kernel modules.  As we cannot remove or add helpers after the
verifier certified the program.  And basically your nfp offload driver
comes as a kernel module.
 (Details: and you basically already solved your issue by modifying the
core verifier to do a call back to bpf_prog_offload_verify_insn()).
Point being this is very different from what I'm requesting.  Thus, for
offloading you already have a solution, to my setup time detect
problem, as your program gets rejected setup/load time by the verifier.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox