[net-next PATCH v2 1/2] e1000: add initial XDP support

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [net-next PATCH v2 1/2] e1000: add initial XDP support
@ 2016-09-09 21:29 John Fastabend
  2016-09-09 21:29 ` [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines John Fastabend
                   ` (2 more replies)
  0 siblings, 3 replies; 24+ messages in thread
From: John Fastabend @ 2016-09-09 21:29 UTC (permalink / raw)
  To: bblanco, john.fastabend, alexei.starovoitov, jeffrey.t.kirsher,
	brouer, davem
  Cc: xiyou.wangcong, intel-wired-lan, u9012063, netdev

From: Alexei Starovoitov <ast@fb.com>

This patch adds initial support for XDP on e1000 driver. Note e1000
driver does not support page recycling in general which could be
added as a further improvement. However XDP_DROP case will recycle.
XDP_TX and XDP_PASS do not support recycling yet.

I tested this patch running e1000 in a VM using KVM over a tap
device.

CC: William Tu <u9012063@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 drivers/net/ethernet/intel/e1000/e1000.h      |    2 
 drivers/net/ethernet/intel/e1000/e1000_main.c |  171 +++++++++++++++++++++++++
 2 files changed, 170 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000/e1000.h b/drivers/net/ethernet/intel/e1000/e1000.h
index d7bdea7..5cf8a0a 100644
--- a/drivers/net/ethernet/intel/e1000/e1000.h
+++ b/drivers/net/ethernet/intel/e1000/e1000.h
@@ -150,6 +150,7 @@ struct e1000_adapter;
  */
 struct e1000_tx_buffer {
 	struct sk_buff *skb;
+	struct page *page;
 	dma_addr_t dma;
 	unsigned long time_stamp;
 	u16 length;
@@ -279,6 +280,7 @@ struct e1000_adapter {
 			     struct e1000_rx_ring *rx_ring,
 			     int cleaned_count);
 	struct e1000_rx_ring *rx_ring;      /* One per active queue */
+	struct bpf_prog *prog;
 	struct napi_struct napi;
 
 	int num_tx_queues;
diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
index f42129d..91d5c87 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_main.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
@@ -32,6 +32,7 @@
 #include <linux/prefetch.h>
 #include <linux/bitops.h>
 #include <linux/if_vlan.h>
+#include <linux/bpf.h>
 
 char e1000_driver_name[] = "e1000";
 static char e1000_driver_string[] = "Intel(R) PRO/1000 Network Driver";
@@ -842,6 +843,44 @@ static int e1000_set_features(struct net_device *netdev,
 	return 0;
 }
 
+static int e1000_xdp_set(struct net_device *netdev, struct bpf_prog *prog)
+{
+	struct e1000_adapter *adapter = netdev_priv(netdev);
+	struct bpf_prog *old_prog;
+
+	old_prog = xchg(&adapter->prog, prog);
+	if (old_prog) {
+		synchronize_net();
+		bpf_prog_put(old_prog);
+	}
+
+	if (netif_running(netdev))
+		e1000_reinit_locked(adapter);
+	else
+		e1000_reset(adapter);
+	return 0;
+}
+
+static bool e1000_xdp_attached(struct net_device *dev)
+{
+	struct e1000_adapter *priv = netdev_priv(dev);
+
+	return !!priv->prog;
+}
+
+static int e1000_xdp(struct net_device *dev, struct netdev_xdp *xdp)
+{
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return e1000_xdp_set(dev, xdp->prog);
+	case XDP_QUERY_PROG:
+		xdp->prog_attached = e1000_xdp_attached(dev);
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
 static const struct net_device_ops e1000_netdev_ops = {
 	.ndo_open		= e1000_open,
 	.ndo_stop		= e1000_close,
@@ -860,6 +899,7 @@ static const struct net_device_ops e1000_netdev_ops = {
 #endif
 	.ndo_fix_features	= e1000_fix_features,
 	.ndo_set_features	= e1000_set_features,
+	.ndo_xdp		= e1000_xdp,
 };
 
 /**
@@ -1276,6 +1316,9 @@ static void e1000_remove(struct pci_dev *pdev)
 	e1000_down_and_stop(adapter);
 	e1000_release_manageability(adapter);
 
+	if (adapter->prog)
+		bpf_prog_put(adapter->prog);
+
 	unregister_netdev(netdev);
 
 	e1000_phy_hw_reset(hw);
@@ -1859,7 +1902,7 @@ static void e1000_configure_rx(struct e1000_adapter *adapter)
 	struct e1000_hw *hw = &adapter->hw;
 	u32 rdlen, rctl, rxcsum;
 
-	if (adapter->netdev->mtu > ETH_DATA_LEN) {
+	if (adapter->netdev->mtu > ETH_DATA_LEN || adapter->prog) {
 		rdlen = adapter->rx_ring[0].count *
 			sizeof(struct e1000_rx_desc);
 		adapter->clean_rx = e1000_clean_jumbo_rx_irq;
@@ -1973,6 +2016,11 @@ e1000_unmap_and_free_tx_resource(struct e1000_adapter *adapter,
 		dev_kfree_skb_any(buffer_info->skb);
 		buffer_info->skb = NULL;
 	}
+	if (buffer_info->page) {
+		put_page(buffer_info->page);
+		buffer_info->page = NULL;
+	}
+
 	buffer_info->time_stamp = 0;
 	/* buffer_info must be completely set up in the transmit path */
 }
@@ -3298,6 +3346,63 @@ static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb,
 	return NETDEV_TX_OK;
 }
 
+static void e1000_tx_map_rxpage(struct e1000_tx_ring *tx_ring,
+				struct e1000_rx_buffer *rx_buffer_info,
+				unsigned int len)
+{
+	struct e1000_tx_buffer *buffer_info;
+	unsigned int i = tx_ring->next_to_use;
+
+	buffer_info = &tx_ring->buffer_info[i];
+
+	buffer_info->length = len;
+	buffer_info->time_stamp = jiffies;
+	buffer_info->mapped_as_page = false;
+	buffer_info->dma = rx_buffer_info->dma;
+	buffer_info->next_to_watch = i;
+	buffer_info->page = rx_buffer_info->rxbuf.page;
+
+	tx_ring->buffer_info[i].skb = NULL;
+	tx_ring->buffer_info[i].segs = 1;
+	tx_ring->buffer_info[i].bytecount = len;
+	tx_ring->buffer_info[i].next_to_watch = i;
+}
+
+static void e1000_xmit_raw_frame(struct e1000_rx_buffer *rx_buffer_info,
+				 unsigned int len,
+				 struct net_device *netdev,
+				 struct e1000_adapter *adapter)
+{
+	struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
+	struct e1000_hw *hw = &adapter->hw;
+	struct e1000_tx_ring *tx_ring;
+
+	if (len > E1000_MAX_DATA_PER_TXD)
+		return;
+
+	/* e1000 only support a single txq at the moment so the queue is being
+	 * shared with stack. To support this requires locking to ensure the
+	 * stack and XDP are not running at the same time. Devices with
+	 * multiple queues should allocate a separate queue space.
+	 */
+	HARD_TX_LOCK(netdev, txq, smp_processor_id());
+
+	tx_ring = adapter->tx_ring;
+
+	if (E1000_DESC_UNUSED(tx_ring) < 2) {
+		HARD_TX_UNLOCK(netdev, txq);
+		return;
+	}
+
+	e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
+	e1000_tx_queue(adapter, tx_ring, 0/*tx_flags*/, 1);
+
+	writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
+	mmiowb();
+
+	HARD_TX_UNLOCK(netdev, txq);
+}
+
 #define NUM_REGS 38 /* 1 based count */
 static void e1000_regdump(struct e1000_adapter *adapter)
 {
@@ -4142,6 +4247,19 @@ static struct sk_buff *e1000_alloc_rx_skb(struct e1000_adapter *adapter,
 	return skb;
 }
 
+static inline int e1000_call_bpf(struct bpf_prog *prog, void *data,
+				 unsigned int length)
+{
+	struct xdp_buff xdp;
+	int ret;
+
+	xdp.data = data;
+	xdp.data_end = data + length;
+	ret = BPF_PROG_RUN(prog, (void *)&xdp);
+
+	return ret;
+}
+
 /**
  * e1000_clean_jumbo_rx_irq - Send received data up the network stack; legacy
  * @adapter: board private structure
@@ -4160,12 +4278,15 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
 	struct pci_dev *pdev = adapter->pdev;
 	struct e1000_rx_desc *rx_desc, *next_rxd;
 	struct e1000_rx_buffer *buffer_info, *next_buffer;
+	struct bpf_prog *prog;
 	u32 length;
 	unsigned int i;
 	int cleaned_count = 0;
 	bool cleaned = false;
 	unsigned int total_rx_bytes = 0, total_rx_packets = 0;
 
+	rcu_read_lock(); /* rcu lock needed here to protect xdp programs */
+	prog = READ_ONCE(adapter->prog);
 	i = rx_ring->next_to_clean;
 	rx_desc = E1000_RX_DESC(*rx_ring, i);
 	buffer_info = &rx_ring->buffer_info[i];
@@ -4191,12 +4312,55 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
 
 		cleaned = true;
 		cleaned_count++;
+		length = le16_to_cpu(rx_desc->length);
+
+		if (prog) {
+			struct page *p = buffer_info->rxbuf.page;
+			dma_addr_t dma = buffer_info->dma;
+			int act;
+
+			if (unlikely(!(status & E1000_RXD_STAT_EOP))) {
+				/* attached bpf disallows larger than page
+				 * packets, so this is hw error or corruption
+				 */
+				pr_info_once("%s buggy !eop\n", netdev->name);
+				break;
+			}
+			if (unlikely(rx_ring->rx_skb_top)) {
+				pr_info_once("%s ring resizing bug\n",
+					     netdev->name);
+				break;
+			}
+			dma_sync_single_for_cpu(&pdev->dev, dma,
+						length, DMA_FROM_DEVICE);
+			act = e1000_call_bpf(prog, page_address(p), length);
+			switch (act) {
+			case XDP_PASS:
+				break;
+			case XDP_TX:
+				dma_sync_single_for_device(&pdev->dev,
+							   dma,
+							   length,
+							   DMA_TO_DEVICE);
+				e1000_xmit_raw_frame(buffer_info, length,
+						     netdev, adapter);
+				buffer_info->rxbuf.page = NULL;
+			case XDP_DROP:
+			default:
+				/* re-use mapped page. keep buffer_info->dma
+				 * as-is, so that e1000_alloc_jumbo_rx_buffers
+				 * only needs to put it back into rx ring
+				 */
+				total_rx_bytes += length;
+				total_rx_packets++;
+				goto next_desc;
+			}
+		}
+
 		dma_unmap_page(&pdev->dev, buffer_info->dma,
 			       adapter->rx_buffer_len, DMA_FROM_DEVICE);
 		buffer_info->dma = 0;
 
-		length = le16_to_cpu(rx_desc->length);
-
 		/* errors is only valid for DD + EOP descriptors */
 		if (unlikely((status & E1000_RXD_STAT_EOP) &&
 		    (rx_desc->errors & E1000_RXD_ERR_FRAME_ERR_MASK))) {
@@ -4330,6 +4494,7 @@ next_desc:
 		rx_desc = next_rxd;
 		buffer_info = next_buffer;
 	}
+	rcu_read_unlock();
 	rx_ring->next_to_clean = i;
 
 	cleaned_count = E1000_DESC_UNUSED(rx_ring);

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines
  2016-09-09 21:29 [net-next PATCH v2 1/2] e1000: add initial XDP support John Fastabend
@ 2016-09-09 21:29 ` John Fastabend
  2016-09-09 23:37   ` John Fastabend
                     ` (3 more replies)
  2016-09-09 22:04 ` [net-next PATCH v2 1/2] e1000: add initial XDP support Eric Dumazet
  2016-09-21  4:26 ` zhuyj
  2 siblings, 4 replies; 24+ messages in thread
From: John Fastabend @ 2016-09-09 21:29 UTC (permalink / raw)
  To: bblanco, john.fastabend, alexei.starovoitov, jeffrey.t.kirsher,
	brouer, davem
  Cc: xiyou.wangcong, intel-wired-lan, u9012063, netdev

e1000 supports a single TX queue so it is being shared with the stack
when XDP runs XDP_TX action. This requires taking the xmit lock to
ensure we don't corrupt the tx ring. To avoid taking and dropping the
lock per packet this patch adds a bundling implementation to submit
a bundle of packets to the xmit routine.

I tested this patch running e1000 in a VM using KVM over a tap
device using pktgen to generate traffic along with 'ping -f -l 100'.

Suggested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 drivers/net/ethernet/intel/e1000/e1000.h      |   10 +++
 drivers/net/ethernet/intel/e1000/e1000_main.c |   81 +++++++++++++++++++------
 2 files changed, 71 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000/e1000.h b/drivers/net/ethernet/intel/e1000/e1000.h
index 5cf8a0a..877b377 100644
--- a/drivers/net/ethernet/intel/e1000/e1000.h
+++ b/drivers/net/ethernet/intel/e1000/e1000.h
@@ -133,6 +133,8 @@ struct e1000_adapter;
 #define E1000_TX_QUEUE_WAKE	16
 /* How many Rx Buffers do we bundle into one write to the hardware ? */
 #define E1000_RX_BUFFER_WRITE	16 /* Must be power of 2 */
+/* How many XDP XMIT buffers to bundle into one xmit transaction */
+#define E1000_XDP_XMIT_BUNDLE_MAX E1000_RX_BUFFER_WRITE
 
 #define AUTO_ALL_MODES		0
 #define E1000_EEPROM_82544_APM	0x0004
@@ -168,6 +170,11 @@ struct e1000_rx_buffer {
 	dma_addr_t dma;
 };
 
+struct e1000_rx_buffer_bundle {
+	struct e1000_rx_buffer *buffer;
+	u32 length;
+};
+
 struct e1000_tx_ring {
 	/* pointer to the descriptor ring memory */
 	void *desc;
@@ -206,6 +213,9 @@ struct e1000_rx_ring {
 	struct e1000_rx_buffer *buffer_info;
 	struct sk_buff *rx_skb_top;
 
+	/* array of XDP buffer information structs */
+	struct e1000_rx_buffer_bundle *xdp_buffer;
+
 	/* cpu for rx queue */
 	int cpu;
 
diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
index 91d5c87..b985271 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_main.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
@@ -1738,10 +1738,18 @@ static int e1000_setup_rx_resources(struct e1000_adapter *adapter,
 	struct pci_dev *pdev = adapter->pdev;
 	int size, desc_len;
 
+	size = sizeof(struct e1000_rx_buffer_bundle) *
+			E1000_XDP_XMIT_BUNDLE_MAX;
+	rxdr->xdp_buffer = vzalloc(size);
+	if (!rxdr->xdp_buffer)
+		return -ENOMEM;
+
 	size = sizeof(struct e1000_rx_buffer) * rxdr->count;
 	rxdr->buffer_info = vzalloc(size);
-	if (!rxdr->buffer_info)
+	if (!rxdr->buffer_info) {
+		vfree(rxdr->xdp_buffer);
 		return -ENOMEM;
+	}
 
 	desc_len = sizeof(struct e1000_rx_desc);
 
@@ -1754,6 +1762,7 @@ static int e1000_setup_rx_resources(struct e1000_adapter *adapter,
 					GFP_KERNEL);
 	if (!rxdr->desc) {
 setup_rx_desc_die:
+		vfree(rxdr->xdp_buffer);
 		vfree(rxdr->buffer_info);
 		return -ENOMEM;
 	}
@@ -2087,6 +2096,9 @@ static void e1000_free_rx_resources(struct e1000_adapter *adapter,
 
 	e1000_clean_rx_ring(adapter, rx_ring);
 
+	vfree(rx_ring->xdp_buffer);
+	rx_ring->xdp_buffer = NULL;
+
 	vfree(rx_ring->buffer_info);
 	rx_ring->buffer_info = NULL;
 
@@ -3369,33 +3381,52 @@ static void e1000_tx_map_rxpage(struct e1000_tx_ring *tx_ring,
 }
 
 static void e1000_xmit_raw_frame(struct e1000_rx_buffer *rx_buffer_info,
-				 unsigned int len,
-				 struct net_device *netdev,
-				 struct e1000_adapter *adapter)
+				 __u32 len,
+				 struct e1000_adapter *adapter,
+				 struct e1000_tx_ring *tx_ring)
 {
-	struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
-	struct e1000_hw *hw = &adapter->hw;
-	struct e1000_tx_ring *tx_ring;
-
 	if (len > E1000_MAX_DATA_PER_TXD)
 		return;
 
+	if (E1000_DESC_UNUSED(tx_ring) < 2)
+		return;
+
+	e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
+	e1000_tx_queue(adapter, tx_ring, 0/*tx_flags*/, 1);
+}
+
+static void e1000_xdp_xmit_bundle(struct e1000_rx_buffer_bundle *buffer_info,
+				  struct net_device *netdev,
+				  struct e1000_adapter *adapter)
+{
+	struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
+	struct e1000_tx_ring *tx_ring = adapter->tx_ring;
+	struct e1000_hw *hw = &adapter->hw;
+	int i = 0;
+
 	/* e1000 only support a single txq at the moment so the queue is being
 	 * shared with stack. To support this requires locking to ensure the
 	 * stack and XDP are not running at the same time. Devices with
 	 * multiple queues should allocate a separate queue space.
+	 *
+	 * To amortize the locking cost e1000 bundles the xmits and sends as
+	 * many as possible until either running out of descriptors or failing.
 	 */
 	HARD_TX_LOCK(netdev, txq, smp_processor_id());
 
-	tx_ring = adapter->tx_ring;
-
-	if (E1000_DESC_UNUSED(tx_ring) < 2) {
-		HARD_TX_UNLOCK(netdev, txq);
-		return;
+	for (; i < E1000_XDP_XMIT_BUNDLE_MAX && buffer_info[i].buffer; i++) {
+		e1000_xmit_raw_frame(buffer_info[i].buffer,
+				     buffer_info[i].length,
+				     adapter, tx_ring);
+		buffer_info[i].buffer->rxbuf.page = NULL;
+		buffer_info[i].buffer = NULL;
+		buffer_info[i].length = 0;
+		i++;
 	}
 
-	e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
-	e1000_tx_queue(adapter, tx_ring, 0/*tx_flags*/, 1);
+	/* kick hardware to send bundle and return control back to the stack */
+	writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
+	mmiowb();
 
 	writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
 	mmiowb();
@@ -4281,9 +4312,10 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
 	struct bpf_prog *prog;
 	u32 length;
 	unsigned int i;
-	int cleaned_count = 0;
+	int cleaned_count = 0, xdp_xmit = 0;
 	bool cleaned = false;
 	unsigned int total_rx_bytes = 0, total_rx_packets = 0;
+	struct e1000_rx_buffer_bundle *xdp_bundle = rx_ring->xdp_buffer;
 
 	rcu_read_lock(); /* rcu lock needed here to protect xdp programs */
 	prog = READ_ONCE(adapter->prog);
@@ -4338,13 +4370,13 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
 			case XDP_PASS:
 				break;
 			case XDP_TX:
+				xdp_bundle[xdp_xmit].buffer = buffer_info;
+				xdp_bundle[xdp_xmit].length = length;
 				dma_sync_single_for_device(&pdev->dev,
 							   dma,
 							   length,
 							   DMA_TO_DEVICE);
-				e1000_xmit_raw_frame(buffer_info, length,
-						     netdev, adapter);
-				buffer_info->rxbuf.page = NULL;
+				xdp_xmit++;
 			case XDP_DROP:
 			default:
 				/* re-use mapped page. keep buffer_info->dma
@@ -4486,8 +4518,14 @@ next_desc:
 
 		/* return some buffers to hardware, one at a time is too slow */
 		if (unlikely(cleaned_count >= E1000_RX_BUFFER_WRITE)) {
+			if (xdp_xmit)
+				e1000_xdp_xmit_bundle(xdp_bundle,
+						      netdev,
+						      adapter);
+
 			adapter->alloc_rx_buf(adapter, rx_ring, cleaned_count);
 			cleaned_count = 0;
+			xdp_xmit = 0;
 		}
 
 		/* use prefetched values */
@@ -4498,8 +4536,11 @@ next_desc:
 	rx_ring->next_to_clean = i;
 
 	cleaned_count = E1000_DESC_UNUSED(rx_ring);
-	if (cleaned_count)
+	if (cleaned_count) {
+		if (xdp_xmit)
+			e1000_xdp_xmit_bundle(xdp_bundle, netdev, adapter);
 		adapter->alloc_rx_buf(adapter, rx_ring, cleaned_count);
+	}
 
 	adapter->total_rx_packets += total_rx_packets;
 	adapter->total_rx_bytes += total_rx_bytes;

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [net-next PATCH v2 1/2] e1000: add initial XDP support
  2016-09-09 21:29 [net-next PATCH v2 1/2] e1000: add initial XDP support John Fastabend
  2016-09-09 21:29 ` [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines John Fastabend
@ 2016-09-09 22:04 ` Eric Dumazet
  2016-09-09 23:33   ` John Fastabend
  2016-09-21  4:26 ` zhuyj
  2 siblings, 1 reply; 24+ messages in thread
From: Eric Dumazet @ 2016-09-09 22:04 UTC (permalink / raw)
  To: John Fastabend
  Cc: bblanco, alexei.starovoitov, jeffrey.t.kirsher, brouer, davem,
	xiyou.wangcong, intel-wired-lan, u9012063, netdev

On Fri, 2016-09-09 at 14:29 -0700, John Fastabend wrote:
> From: Alexei Starovoitov <ast@fb.com>
> 


So it looks like e1000_xmit_raw_frame() can return early,
say if there is no available descriptor.

> +static void e1000_xmit_raw_frame(struct e1000_rx_buffer *rx_buffer_info,
> +				 unsigned int len,
> +				 struct net_device *netdev,
> +				 struct e1000_adapter *adapter)
> +{
> +	struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
> +	struct e1000_hw *hw = &adapter->hw;
> +	struct e1000_tx_ring *tx_ring;
> +
> +	if (len > E1000_MAX_DATA_PER_TXD)
> +		return;
> +
> +	/* e1000 only support a single txq at the moment so the queue is being
> +	 * shared with stack. To support this requires locking to ensure the
> +	 * stack and XDP are not running at the same time. Devices with
> +	 * multiple queues should allocate a separate queue space.
> +	 */
> +	HARD_TX_LOCK(netdev, txq, smp_processor_id());
> +
> +	tx_ring = adapter->tx_ring;
> +
> +	if (E1000_DESC_UNUSED(tx_ring) < 2) {
> +		HARD_TX_UNLOCK(netdev, txq);
> +		return;
> +	}
> +
> +	e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
> +	e1000_tx_queue(adapter, tx_ring, 0/*tx_flags*/, 1);
> +
> +	writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
> +	mmiowb();
> +
> +	HARD_TX_UNLOCK(netdev, txq);
> +}
> +
>  #define NUM_REGS 38 /* 1 based count */
>  static void e1000_regdump(struct e1000_adapter *adapter)
>  {
> @@ -4142,6 +4247,19 @@ static struct sk_buff *e1000_alloc_rx_skb(struct e1000_adapter *adapter,
>  	return skb;
>  }
> +			act = e1000_call_bpf(prog, page_address(p), length);
> +			switch (act) {
> +			case XDP_PASS:
> +				break;
> +			case XDP_TX:
> +				dma_sync_single_for_device(&pdev->dev,
> +							   dma,
> +							   length,
> +							   DMA_TO_DEVICE);
> +				e1000_xmit_raw_frame(buffer_info, length,
> +						     netdev, adapter);
> +				buffer_info->rxbuf.page = NULL;


So I am trying to understand how pages are not leaked ?

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [net-next PATCH v2 1/2] e1000: add initial XDP support
  2016-09-09 22:04 ` [net-next PATCH v2 1/2] e1000: add initial XDP support Eric Dumazet
@ 2016-09-09 23:33   ` John Fastabend
  0 siblings, 0 replies; 24+ messages in thread
From: John Fastabend @ 2016-09-09 23:33 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: bblanco, alexei.starovoitov, jeffrey.t.kirsher, brouer, davem,
	xiyou.wangcong, intel-wired-lan, u9012063, netdev

On 16-09-09 03:04 PM, Eric Dumazet wrote:
> On Fri, 2016-09-09 at 14:29 -0700, John Fastabend wrote:
>> From: Alexei Starovoitov <ast@fb.com>
>>
> 
> 
> So it looks like e1000_xmit_raw_frame() can return early,
> say if there is no available descriptor.
> 
>> +static void e1000_xmit_raw_frame(struct e1000_rx_buffer *rx_buffer_info,
>> +				 unsigned int len,
>> +				 struct net_device *netdev,
>> +				 struct e1000_adapter *adapter)
>> +{
>> +	struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
>> +	struct e1000_hw *hw = &adapter->hw;
>> +	struct e1000_tx_ring *tx_ring;
>> +
>> +	if (len > E1000_MAX_DATA_PER_TXD)
>> +		return;
>> +
>> +	/* e1000 only support a single txq at the moment so the queue is being
>> +	 * shared with stack. To support this requires locking to ensure the
>> +	 * stack and XDP are not running at the same time. Devices with
>> +	 * multiple queues should allocate a separate queue space.
>> +	 */
>> +	HARD_TX_LOCK(netdev, txq, smp_processor_id());
>> +
>> +	tx_ring = adapter->tx_ring;
>> +
>> +	if (E1000_DESC_UNUSED(tx_ring) < 2) {
>> +		HARD_TX_UNLOCK(netdev, txq);
>> +		return;
>> +	}
>> +
>> +	e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
>> +	e1000_tx_queue(adapter, tx_ring, 0/*tx_flags*/, 1);
>> +
>> +	writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
>> +	mmiowb();
>> +
>> +	HARD_TX_UNLOCK(netdev, txq);
>> +}
>> +
>>  #define NUM_REGS 38 /* 1 based count */
>>  static void e1000_regdump(struct e1000_adapter *adapter)
>>  {
>> @@ -4142,6 +4247,19 @@ static struct sk_buff *e1000_alloc_rx_skb(struct e1000_adapter *adapter,
>>  	return skb;
>>  }
>> +			act = e1000_call_bpf(prog, page_address(p), length);
>> +			switch (act) {
>> +			case XDP_PASS:
>> +				break;
>> +			case XDP_TX:
>> +				dma_sync_single_for_device(&pdev->dev,
>> +							   dma,
>> +							   length,
>> +							   DMA_TO_DEVICE);
>> +				e1000_xmit_raw_frame(buffer_info, length,
>> +						     netdev, adapter);
>> +				buffer_info->rxbuf.page = NULL;
> 
> 
> So I am trying to understand how pages are not leaked ?
> 
> 

Pages are being leaked thanks! v3 coming soon.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines
  2016-09-09 21:29 ` [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines John Fastabend
@ 2016-09-09 23:37   ` John Fastabend
  2016-09-09 23:44   ` Tom Herbert
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 24+ messages in thread
From: John Fastabend @ 2016-09-09 23:37 UTC (permalink / raw)
  To: bblanco, alexei.starovoitov, jeffrey.t.kirsher, brouer, davem
  Cc: xiyou.wangcong, intel-wired-lan, u9012063, netdev

On 16-09-09 02:29 PM, John Fastabend wrote:
> e1000 supports a single TX queue so it is being shared with the stack
> when XDP runs XDP_TX action. This requires taking the xmit lock to
> ensure we don't corrupt the tx ring. To avoid taking and dropping the
> lock per packet this patch adds a bundling implementation to submit
> a bundle of packets to the xmit routine.
> 
> I tested this patch running e1000 in a VM using KVM over a tap
> device using pktgen to generate traffic along with 'ping -f -l 100'.
> 
> Suggested-by: Jesper Dangaard Brouer <brouer@redhat.com>
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---

This patch is a bit bogus in a few spots as well...


> -
> -	if (E1000_DESC_UNUSED(tx_ring) < 2) {
> -		HARD_TX_UNLOCK(netdev, txq);
> -		return;
> +	for (; i < E1000_XDP_XMIT_BUNDLE_MAX && buffer_info[i].buffer; i++) {
> +		e1000_xmit_raw_frame(buffer_info[i].buffer,
> +				     buffer_info[i].length,
> +				     adapter, tx_ring);
> +		buffer_info[i].buffer->rxbuf.page = NULL;
> +		buffer_info[i].buffer = NULL;
> +		buffer_info[i].length = 0;
> +		i++;
              ^^^^^^^^

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines
  2016-09-09 21:29 ` [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines John Fastabend
  2016-09-09 23:37   ` John Fastabend
@ 2016-09-09 23:44   ` Tom Herbert
  2016-09-10  0:01     ` John Fastabend
  2016-09-10 15:36   ` Tom Herbert
  2016-09-12 12:17   ` Jesper Dangaard Brouer
  3 siblings, 1 reply; 24+ messages in thread
From: Tom Herbert @ 2016-09-09 23:44 UTC (permalink / raw)
  To: John Fastabend
  Cc: Brenden Blanco, Alexei Starovoitov, Jeff Kirsher,
	Jesper Dangaard Brouer, David S. Miller, Cong Wang,
	intel-wired-lan, u9012063, Linux Kernel Network Developers

On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:
> e1000 supports a single TX queue so it is being shared with the stack
> when XDP runs XDP_TX action. This requires taking the xmit lock to
> ensure we don't corrupt the tx ring. To avoid taking and dropping the
> lock per packet this patch adds a bundling implementation to submit
> a bundle of packets to the xmit routine.
>
> I tested this patch running e1000 in a VM using KVM over a tap
> device using pktgen to generate traffic along with 'ping -f -l 100'.
>
Hi John,

How does this interact with BQL on e1000?

Tom

> Suggested-by: Jesper Dangaard Brouer <brouer@redhat.com>
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---
>  drivers/net/ethernet/intel/e1000/e1000.h      |   10 +++
>  drivers/net/ethernet/intel/e1000/e1000_main.c |   81 +++++++++++++++++++------
>  2 files changed, 71 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/e1000/e1000.h b/drivers/net/ethernet/intel/e1000/e1000.h
> index 5cf8a0a..877b377 100644
> --- a/drivers/net/ethernet/intel/e1000/e1000.h
> +++ b/drivers/net/ethernet/intel/e1000/e1000.h
> @@ -133,6 +133,8 @@ struct e1000_adapter;
>  #define E1000_TX_QUEUE_WAKE    16
>  /* How many Rx Buffers do we bundle into one write to the hardware ? */
>  #define E1000_RX_BUFFER_WRITE  16 /* Must be power of 2 */
> +/* How many XDP XMIT buffers to bundle into one xmit transaction */
> +#define E1000_XDP_XMIT_BUNDLE_MAX E1000_RX_BUFFER_WRITE
>
>  #define AUTO_ALL_MODES         0
>  #define E1000_EEPROM_82544_APM 0x0004
> @@ -168,6 +170,11 @@ struct e1000_rx_buffer {
>         dma_addr_t dma;
>  };
>
> +struct e1000_rx_buffer_bundle {
> +       struct e1000_rx_buffer *buffer;
> +       u32 length;
> +};
> +
>  struct e1000_tx_ring {
>         /* pointer to the descriptor ring memory */
>         void *desc;
> @@ -206,6 +213,9 @@ struct e1000_rx_ring {
>         struct e1000_rx_buffer *buffer_info;
>         struct sk_buff *rx_skb_top;
>
> +       /* array of XDP buffer information structs */
> +       struct e1000_rx_buffer_bundle *xdp_buffer;
> +
>         /* cpu for rx queue */
>         int cpu;
>
> diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
> index 91d5c87..b985271 100644
> --- a/drivers/net/ethernet/intel/e1000/e1000_main.c
> +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
> @@ -1738,10 +1738,18 @@ static int e1000_setup_rx_resources(struct e1000_adapter *adapter,
>         struct pci_dev *pdev = adapter->pdev;
>         int size, desc_len;
>
> +       size = sizeof(struct e1000_rx_buffer_bundle) *
> +                       E1000_XDP_XMIT_BUNDLE_MAX;
> +       rxdr->xdp_buffer = vzalloc(size);
> +       if (!rxdr->xdp_buffer)
> +               return -ENOMEM;
> +
>         size = sizeof(struct e1000_rx_buffer) * rxdr->count;
>         rxdr->buffer_info = vzalloc(size);
> -       if (!rxdr->buffer_info)
> +       if (!rxdr->buffer_info) {
> +               vfree(rxdr->xdp_buffer);
>                 return -ENOMEM;
> +       }
>
>         desc_len = sizeof(struct e1000_rx_desc);
>
> @@ -1754,6 +1762,7 @@ static int e1000_setup_rx_resources(struct e1000_adapter *adapter,
>                                         GFP_KERNEL);
>         if (!rxdr->desc) {
>  setup_rx_desc_die:
> +               vfree(rxdr->xdp_buffer);
>                 vfree(rxdr->buffer_info);
>                 return -ENOMEM;
>         }
> @@ -2087,6 +2096,9 @@ static void e1000_free_rx_resources(struct e1000_adapter *adapter,
>
>         e1000_clean_rx_ring(adapter, rx_ring);
>
> +       vfree(rx_ring->xdp_buffer);
> +       rx_ring->xdp_buffer = NULL;
> +
>         vfree(rx_ring->buffer_info);
>         rx_ring->buffer_info = NULL;
>
> @@ -3369,33 +3381,52 @@ static void e1000_tx_map_rxpage(struct e1000_tx_ring *tx_ring,
>  }
>
>  static void e1000_xmit_raw_frame(struct e1000_rx_buffer *rx_buffer_info,
> -                                unsigned int len,
> -                                struct net_device *netdev,
> -                                struct e1000_adapter *adapter)
> +                                __u32 len,
> +                                struct e1000_adapter *adapter,
> +                                struct e1000_tx_ring *tx_ring)
>  {
> -       struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
> -       struct e1000_hw *hw = &adapter->hw;
> -       struct e1000_tx_ring *tx_ring;
> -
>         if (len > E1000_MAX_DATA_PER_TXD)
>                 return;
>
> +       if (E1000_DESC_UNUSED(tx_ring) < 2)
> +               return;
> +
> +       e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
> +       e1000_tx_queue(adapter, tx_ring, 0/*tx_flags*/, 1);
> +}
> +
> +static void e1000_xdp_xmit_bundle(struct e1000_rx_buffer_bundle *buffer_info,
> +                                 struct net_device *netdev,
> +                                 struct e1000_adapter *adapter)
> +{
> +       struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
> +       struct e1000_tx_ring *tx_ring = adapter->tx_ring;
> +       struct e1000_hw *hw = &adapter->hw;
> +       int i = 0;
> +
>         /* e1000 only support a single txq at the moment so the queue is being
>          * shared with stack. To support this requires locking to ensure the
>          * stack and XDP are not running at the same time. Devices with
>          * multiple queues should allocate a separate queue space.
> +        *
> +        * To amortize the locking cost e1000 bundles the xmits and sends as
> +        * many as possible until either running out of descriptors or failing.
>          */
>         HARD_TX_LOCK(netdev, txq, smp_processor_id());
>
> -       tx_ring = adapter->tx_ring;
> -
> -       if (E1000_DESC_UNUSED(tx_ring) < 2) {
> -               HARD_TX_UNLOCK(netdev, txq);
> -               return;
> +       for (; i < E1000_XDP_XMIT_BUNDLE_MAX && buffer_info[i].buffer; i++) {
> +               e1000_xmit_raw_frame(buffer_info[i].buffer,
> +                                    buffer_info[i].length,
> +                                    adapter, tx_ring);
> +               buffer_info[i].buffer->rxbuf.page = NULL;
> +               buffer_info[i].buffer = NULL;
> +               buffer_info[i].length = 0;
> +               i++;
>         }
>
> -       e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
> -       e1000_tx_queue(adapter, tx_ring, 0/*tx_flags*/, 1);
> +       /* kick hardware to send bundle and return control back to the stack */
> +       writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
> +       mmiowb();
>
>         writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
>         mmiowb();
> @@ -4281,9 +4312,10 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
>         struct bpf_prog *prog;
>         u32 length;
>         unsigned int i;
> -       int cleaned_count = 0;
> +       int cleaned_count = 0, xdp_xmit = 0;
>         bool cleaned = false;
>         unsigned int total_rx_bytes = 0, total_rx_packets = 0;
> +       struct e1000_rx_buffer_bundle *xdp_bundle = rx_ring->xdp_buffer;
>
>         rcu_read_lock(); /* rcu lock needed here to protect xdp programs */
>         prog = READ_ONCE(adapter->prog);
> @@ -4338,13 +4370,13 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
>                         case XDP_PASS:
>                                 break;
>                         case XDP_TX:
> +                               xdp_bundle[xdp_xmit].buffer = buffer_info;
> +                               xdp_bundle[xdp_xmit].length = length;
>                                 dma_sync_single_for_device(&pdev->dev,
>                                                            dma,
>                                                            length,
>                                                            DMA_TO_DEVICE);
> -                               e1000_xmit_raw_frame(buffer_info, length,
> -                                                    netdev, adapter);
> -                               buffer_info->rxbuf.page = NULL;
> +                               xdp_xmit++;
>                         case XDP_DROP:
>                         default:
>                                 /* re-use mapped page. keep buffer_info->dma
> @@ -4486,8 +4518,14 @@ next_desc:
>
>                 /* return some buffers to hardware, one at a time is too slow */
>                 if (unlikely(cleaned_count >= E1000_RX_BUFFER_WRITE)) {
> +                       if (xdp_xmit)
> +                               e1000_xdp_xmit_bundle(xdp_bundle,
> +                                                     netdev,
> +                                                     adapter);
> +
>                         adapter->alloc_rx_buf(adapter, rx_ring, cleaned_count);
>                         cleaned_count = 0;
> +                       xdp_xmit = 0;
>                 }
>
>                 /* use prefetched values */
> @@ -4498,8 +4536,11 @@ next_desc:
>         rx_ring->next_to_clean = i;
>
>         cleaned_count = E1000_DESC_UNUSED(rx_ring);
> -       if (cleaned_count)
> +       if (cleaned_count) {
> +               if (xdp_xmit)
> +                       e1000_xdp_xmit_bundle(xdp_bundle, netdev, adapter);
>                 adapter->alloc_rx_buf(adapter, rx_ring, cleaned_count);
> +       }
>
>         adapter->total_rx_packets += total_rx_packets;
>         adapter->total_rx_bytes += total_rx_bytes;
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines
  2016-09-09 23:44   ` Tom Herbert
@ 2016-09-10  0:01     ` John Fastabend
  2016-09-10  1:04       ` Tom Herbert
  0 siblings, 1 reply; 24+ messages in thread
From: John Fastabend @ 2016-09-10  0:01 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Brenden Blanco, Alexei Starovoitov, Jeff Kirsher,
	Jesper Dangaard Brouer, David S. Miller, Cong Wang,
	intel-wired-lan, u9012063, Linux Kernel Network Developers

On 16-09-09 04:44 PM, Tom Herbert wrote:
> On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>> e1000 supports a single TX queue so it is being shared with the stack
>> when XDP runs XDP_TX action. This requires taking the xmit lock to
>> ensure we don't corrupt the tx ring. To avoid taking and dropping the
>> lock per packet this patch adds a bundling implementation to submit
>> a bundle of packets to the xmit routine.
>>
>> I tested this patch running e1000 in a VM using KVM over a tap
>> device using pktgen to generate traffic along with 'ping -f -l 100'.
>>
> Hi John,
> 
> How does this interact with BQL on e1000?
> 
> Tom
> 

Let me check if I have the API correct. When we enqueue a packet to
be sent we must issue a netdev_sent_queue() call and then on actual
transmission issue a netdev_completed_queue().

The patch attached here missed a few things though.

But it looks like I just need to call netdev_sent_queue() from the
e1000_xmit_raw_frame() routine and then let the tx completion logic
kick in which will call netdev_completed_queue() correctly.

I'll need to add a check for the queue state as well. So if I do these
three things,

	check __QUEUE_STATE_XOFF before sending
	netdev_sent_queue() -> on XDP_TX
	netdev_completed_queue()

It should work agree? Now should we do this even when XDP owns the
queue? Or is this purely an issue with sharing the queue between
XDP and stack.

.John

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines
  2016-09-10  0:01     ` John Fastabend
@ 2016-09-10  1:04       ` Tom Herbert
  2016-09-10  1:12         ` John Fastabend
  0 siblings, 1 reply; 24+ messages in thread
From: Tom Herbert @ 2016-09-10  1:04 UTC (permalink / raw)
  To: John Fastabend
  Cc: Brenden Blanco, Alexei Starovoitov, Jeff Kirsher,
	Jesper Dangaard Brouer, David S. Miller, Cong Wang,
	intel-wired-lan, u9012063, Linux Kernel Network Developers

On Fri, Sep 9, 2016 at 5:01 PM, John Fastabend <john.fastabend@gmail.com> wrote:
> On 16-09-09 04:44 PM, Tom Herbert wrote:
>> On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>> e1000 supports a single TX queue so it is being shared with the stack
>>> when XDP runs XDP_TX action. This requires taking the xmit lock to
>>> ensure we don't corrupt the tx ring. To avoid taking and dropping the
>>> lock per packet this patch adds a bundling implementation to submit
>>> a bundle of packets to the xmit routine.
>>>
>>> I tested this patch running e1000 in a VM using KVM over a tap
>>> device using pktgen to generate traffic along with 'ping -f -l 100'.
>>>
>> Hi John,
>>
>> How does this interact with BQL on e1000?
>>
>> Tom
>>
>
> Let me check if I have the API correct. When we enqueue a packet to
> be sent we must issue a netdev_sent_queue() call and then on actual
> transmission issue a netdev_completed_queue().
>
> The patch attached here missed a few things though.
>
> But it looks like I just need to call netdev_sent_queue() from the
> e1000_xmit_raw_frame() routine and then let the tx completion logic
> kick in which will call netdev_completed_queue() correctly.
>
> I'll need to add a check for the queue state as well. So if I do these
> three things,
>
>         check __QUEUE_STATE_XOFF before sending
>         netdev_sent_queue() -> on XDP_TX
>         netdev_completed_queue()
>
> It should work agree? Now should we do this even when XDP owns the
> queue? Or is this purely an issue with sharing the queue between
> XDP and stack.
>
But what is the action for XDP_TX if the queue is stopped? There is no
qdisc to back pressure in the XDP path. Would we just start dropping
packets then?

Tom

> .John
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines
  2016-09-10  1:04       ` Tom Herbert
@ 2016-09-10  1:12         ` John Fastabend
  2016-09-10  1:19           ` Tom Herbert
  0 siblings, 1 reply; 24+ messages in thread
From: John Fastabend @ 2016-09-10  1:12 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Brenden Blanco, Alexei Starovoitov, Jeff Kirsher,
	Jesper Dangaard Brouer, David S. Miller, Cong Wang,
	intel-wired-lan, u9012063, Linux Kernel Network Developers

On 16-09-09 06:04 PM, Tom Herbert wrote:
> On Fri, Sep 9, 2016 at 5:01 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>> On 16-09-09 04:44 PM, Tom Herbert wrote:
>>> On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>>> e1000 supports a single TX queue so it is being shared with the stack
>>>> when XDP runs XDP_TX action. This requires taking the xmit lock to
>>>> ensure we don't corrupt the tx ring. To avoid taking and dropping the
>>>> lock per packet this patch adds a bundling implementation to submit
>>>> a bundle of packets to the xmit routine.
>>>>
>>>> I tested this patch running e1000 in a VM using KVM over a tap
>>>> device using pktgen to generate traffic along with 'ping -f -l 100'.
>>>>
>>> Hi John,
>>>
>>> How does this interact with BQL on e1000?
>>>
>>> Tom
>>>
>>
>> Let me check if I have the API correct. When we enqueue a packet to
>> be sent we must issue a netdev_sent_queue() call and then on actual
>> transmission issue a netdev_completed_queue().
>>
>> The patch attached here missed a few things though.
>>
>> But it looks like I just need to call netdev_sent_queue() from the
>> e1000_xmit_raw_frame() routine and then let the tx completion logic
>> kick in which will call netdev_completed_queue() correctly.
>>
>> I'll need to add a check for the queue state as well. So if I do these
>> three things,
>>
>>         check __QUEUE_STATE_XOFF before sending
>>         netdev_sent_queue() -> on XDP_TX
>>         netdev_completed_queue()
>>
>> It should work agree? Now should we do this even when XDP owns the
>> queue? Or is this purely an issue with sharing the queue between
>> XDP and stack.
>>
> But what is the action for XDP_TX if the queue is stopped? There is no
> qdisc to back pressure in the XDP path. Would we just start dropping
> packets then?

Yep that is what the patch does if there is any sort of error packets
get dropped on the floor. I don't think there is anything else that
can be done.

> 
> Tom
> 
>> .John
>>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines
  2016-09-10  1:12         ` John Fastabend
@ 2016-09-10  1:19           ` Tom Herbert
  2016-09-10  1:40             ` Alexei Starovoitov
  2016-09-12 11:56             ` Jesper Dangaard Brouer
  0 siblings, 2 replies; 24+ messages in thread
From: Tom Herbert @ 2016-09-10  1:19 UTC (permalink / raw)
  To: John Fastabend
  Cc: Brenden Blanco, Alexei Starovoitov, Jeff Kirsher,
	Jesper Dangaard Brouer, David S. Miller, Cong Wang,
	intel-wired-lan, William Tu, Linux Kernel Network Developers

On Fri, Sep 9, 2016 at 6:12 PM, John Fastabend <john.fastabend@gmail.com> wrote:
> On 16-09-09 06:04 PM, Tom Herbert wrote:
>> On Fri, Sep 9, 2016 at 5:01 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>> On 16-09-09 04:44 PM, Tom Herbert wrote:
>>>> On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>>>> e1000 supports a single TX queue so it is being shared with the stack
>>>>> when XDP runs XDP_TX action. This requires taking the xmit lock to
>>>>> ensure we don't corrupt the tx ring. To avoid taking and dropping the
>>>>> lock per packet this patch adds a bundling implementation to submit
>>>>> a bundle of packets to the xmit routine.
>>>>>
>>>>> I tested this patch running e1000 in a VM using KVM over a tap
>>>>> device using pktgen to generate traffic along with 'ping -f -l 100'.
>>>>>
>>>> Hi John,
>>>>
>>>> How does this interact with BQL on e1000?
>>>>
>>>> Tom
>>>>
>>>
>>> Let me check if I have the API correct. When we enqueue a packet to
>>> be sent we must issue a netdev_sent_queue() call and then on actual
>>> transmission issue a netdev_completed_queue().
>>>
>>> The patch attached here missed a few things though.
>>>
>>> But it looks like I just need to call netdev_sent_queue() from the
>>> e1000_xmit_raw_frame() routine and then let the tx completion logic
>>> kick in which will call netdev_completed_queue() correctly.
>>>
>>> I'll need to add a check for the queue state as well. So if I do these
>>> three things,
>>>
>>>         check __QUEUE_STATE_XOFF before sending
>>>         netdev_sent_queue() -> on XDP_TX
>>>         netdev_completed_queue()
>>>
>>> It should work agree? Now should we do this even when XDP owns the
>>> queue? Or is this purely an issue with sharing the queue between
>>> XDP and stack.
>>>
>> But what is the action for XDP_TX if the queue is stopped? There is no
>> qdisc to back pressure in the XDP path. Would we just start dropping
>> packets then?
>
> Yep that is what the patch does if there is any sort of error packets
> get dropped on the floor. I don't think there is anything else that
> can be done.
>
That probably means that the stack will always win out under load.
Trying to used the same queue where half of the packets are well
managed by a qdisc and half aren't is going to leave someone unhappy.
Maybe in the this case where we have to share the qdisc we can
allocate the skb on on returning XDP_TX and send through the normal
qdisc for the device.

Tom

>>
>> Tom
>>
>>> .John
>>>
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines
  2016-09-10  1:19           ` Tom Herbert
@ 2016-09-10  1:40             ` Alexei Starovoitov
  2016-09-10  3:12               ` Tom Herbert
  2016-09-12 11:56             ` Jesper Dangaard Brouer
  1 sibling, 1 reply; 24+ messages in thread
From: Alexei Starovoitov @ 2016-09-10  1:40 UTC (permalink / raw)
  To: Tom Herbert
  Cc: John Fastabend, Brenden Blanco, Jeff Kirsher,
	Jesper Dangaard Brouer, David S. Miller, Cong Wang,
	intel-wired-lan, William Tu, Linux Kernel Network Developers

On Fri, Sep 09, 2016 at 06:19:56PM -0700, Tom Herbert wrote:
> On Fri, Sep 9, 2016 at 6:12 PM, John Fastabend <john.fastabend@gmail.com> wrote:
> > On 16-09-09 06:04 PM, Tom Herbert wrote:
> >> On Fri, Sep 9, 2016 at 5:01 PM, John Fastabend <john.fastabend@gmail.com> wrote:
> >>> On 16-09-09 04:44 PM, Tom Herbert wrote:
> >>>> On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:
> >>>>> e1000 supports a single TX queue so it is being shared with the stack
> >>>>> when XDP runs XDP_TX action. This requires taking the xmit lock to
> >>>>> ensure we don't corrupt the tx ring. To avoid taking and dropping the
> >>>>> lock per packet this patch adds a bundling implementation to submit
> >>>>> a bundle of packets to the xmit routine.
> >>>>>
> >>>>> I tested this patch running e1000 in a VM using KVM over a tap
> >>>>> device using pktgen to generate traffic along with 'ping -f -l 100'.
> >>>>>
> >>>> Hi John,
> >>>>
> >>>> How does this interact with BQL on e1000?
> >>>>
> >>>> Tom
> >>>>
> >>>
> >>> Let me check if I have the API correct. When we enqueue a packet to
> >>> be sent we must issue a netdev_sent_queue() call and then on actual
> >>> transmission issue a netdev_completed_queue().
> >>>
> >>> The patch attached here missed a few things though.
> >>>
> >>> But it looks like I just need to call netdev_sent_queue() from the
> >>> e1000_xmit_raw_frame() routine and then let the tx completion logic
> >>> kick in which will call netdev_completed_queue() correctly.
> >>>
> >>> I'll need to add a check for the queue state as well. So if I do these
> >>> three things,
> >>>
> >>>         check __QUEUE_STATE_XOFF before sending
> >>>         netdev_sent_queue() -> on XDP_TX
> >>>         netdev_completed_queue()
> >>>
> >>> It should work agree? Now should we do this even when XDP owns the
> >>> queue? Or is this purely an issue with sharing the queue between
> >>> XDP and stack.
> >>>
> >> But what is the action for XDP_TX if the queue is stopped? There is no
> >> qdisc to back pressure in the XDP path. Would we just start dropping
> >> packets then?
> >
> > Yep that is what the patch does if there is any sort of error packets
> > get dropped on the floor. I don't think there is anything else that
> > can be done.
> >
> That probably means that the stack will always win out under load.
> Trying to used the same queue where half of the packets are well
> managed by a qdisc and half aren't is going to leave someone unhappy.
> Maybe in the this case where we have to share the qdisc we can
> allocate the skb on on returning XDP_TX and send through the normal
> qdisc for the device.

I wouldn't go to such extremes for e1k.
The only reason to have xdp in e1k is to use it for testing
of xdp programs. Nothing else. e1k is, best case, 1Gbps adapter.
Existing stack with skb is perfectly fine as it is.
No need to do recycling, batching or any other complex things.
xdp for e1k cannot be used as an example for other drivers either,
since there is only one tx ring and any high performance adapter
has more which makes the driver support quite different.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines
  2016-09-10  1:40             ` Alexei Starovoitov
@ 2016-09-10  3:12               ` Tom Herbert
  2016-09-10  3:26                 ` John Fastabend
  2016-09-10  3:56                 ` Alexei Starovoitov
  0 siblings, 2 replies; 24+ messages in thread
From: Tom Herbert @ 2016-09-10  3:12 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: John Fastabend, Brenden Blanco, Jeff Kirsher,
	Jesper Dangaard Brouer, David S. Miller, Cong Wang,
	intel-wired-lan, William Tu, Linux Kernel Network Developers

On Fri, Sep 9, 2016 at 6:40 PM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Fri, Sep 09, 2016 at 06:19:56PM -0700, Tom Herbert wrote:
>> On Fri, Sep 9, 2016 at 6:12 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>> > On 16-09-09 06:04 PM, Tom Herbert wrote:
>> >> On Fri, Sep 9, 2016 at 5:01 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>> >>> On 16-09-09 04:44 PM, Tom Herbert wrote:
>> >>>> On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>> >>>>> e1000 supports a single TX queue so it is being shared with the stack
>> >>>>> when XDP runs XDP_TX action. This requires taking the xmit lock to
>> >>>>> ensure we don't corrupt the tx ring. To avoid taking and dropping the
>> >>>>> lock per packet this patch adds a bundling implementation to submit
>> >>>>> a bundle of packets to the xmit routine.
>> >>>>>
>> >>>>> I tested this patch running e1000 in a VM using KVM over a tap
>> >>>>> device using pktgen to generate traffic along with 'ping -f -l 100'.
>> >>>>>
>> >>>> Hi John,
>> >>>>
>> >>>> How does this interact with BQL on e1000?
>> >>>>
>> >>>> Tom
>> >>>>
>> >>>
>> >>> Let me check if I have the API correct. When we enqueue a packet to
>> >>> be sent we must issue a netdev_sent_queue() call and then on actual
>> >>> transmission issue a netdev_completed_queue().
>> >>>
>> >>> The patch attached here missed a few things though.
>> >>>
>> >>> But it looks like I just need to call netdev_sent_queue() from the
>> >>> e1000_xmit_raw_frame() routine and then let the tx completion logic
>> >>> kick in which will call netdev_completed_queue() correctly.
>> >>>
>> >>> I'll need to add a check for the queue state as well. So if I do these
>> >>> three things,
>> >>>
>> >>>         check __QUEUE_STATE_XOFF before sending
>> >>>         netdev_sent_queue() -> on XDP_TX
>> >>>         netdev_completed_queue()
>> >>>
>> >>> It should work agree? Now should we do this even when XDP owns the
>> >>> queue? Or is this purely an issue with sharing the queue between
>> >>> XDP and stack.
>> >>>
>> >> But what is the action for XDP_TX if the queue is stopped? There is no
>> >> qdisc to back pressure in the XDP path. Would we just start dropping
>> >> packets then?
>> >
>> > Yep that is what the patch does if there is any sort of error packets
>> > get dropped on the floor. I don't think there is anything else that
>> > can be done.
>> >
>> That probably means that the stack will always win out under load.
>> Trying to used the same queue where half of the packets are well
>> managed by a qdisc and half aren't is going to leave someone unhappy.
>> Maybe in the this case where we have to share the qdisc we can
>> allocate the skb on on returning XDP_TX and send through the normal
>> qdisc for the device.
>
> I wouldn't go to such extremes for e1k.
> The only reason to have xdp in e1k is to use it for testing
> of xdp programs. Nothing else. e1k is, best case, 1Gbps adapter.

I imagine someone may want this for the non-forwarding use cases like
early drop for DOS mitigation. Regardless of the use case, I don't
think we can break the fundamental assumptions made for qdiscs or the
rest of the transmit path. If XDP must transmit on a queue shared with
the stack we need to abide by the stack's rules for transmitting on
the queue-- which would mean alloc skbuff and go through qdisc (which
really shouldn't be difficult to implement). Emulating various
functions of the stack in the XDP TX path, like this patch seems to be
doing for XMIT_MORE, potentially gets us into a wack-a-mole situation
trying to keep things coherent.

> Existing stack with skb is perfectly fine as it is.
> No need to do recycling, batching or any other complex things.
> xdp for e1k cannot be used as an example for other drivers either,
> since there is only one tx ring and any high performance adapter
> has more which makes the driver support quite different.
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines
  2016-09-10  3:12               ` Tom Herbert
@ 2016-09-10  3:26                 ` John Fastabend
  2016-09-10  4:13                   ` Tom Herbert
  2016-09-10  3:56                 ` Alexei Starovoitov
  1 sibling, 1 reply; 24+ messages in thread
From: John Fastabend @ 2016-09-10  3:26 UTC (permalink / raw)
  To: Tom Herbert, Alexei Starovoitov
  Cc: Brenden Blanco, Jeff Kirsher, Jesper Dangaard Brouer,
	David S. Miller, Cong Wang, intel-wired-lan, William Tu,
	Linux Kernel Network Developers

On 16-09-09 08:12 PM, Tom Herbert wrote:
> On Fri, Sep 9, 2016 at 6:40 PM, Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
>> On Fri, Sep 09, 2016 at 06:19:56PM -0700, Tom Herbert wrote:
>>> On Fri, Sep 9, 2016 at 6:12 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>>> On 16-09-09 06:04 PM, Tom Herbert wrote:
>>>>> On Fri, Sep 9, 2016 at 5:01 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>>>>> On 16-09-09 04:44 PM, Tom Herbert wrote:
>>>>>>> On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>>>>>>> e1000 supports a single TX queue so it is being shared with the stack
>>>>>>>> when XDP runs XDP_TX action. This requires taking the xmit lock to
>>>>>>>> ensure we don't corrupt the tx ring. To avoid taking and dropping the
>>>>>>>> lock per packet this patch adds a bundling implementation to submit
>>>>>>>> a bundle of packets to the xmit routine.
>>>>>>>>
>>>>>>>> I tested this patch running e1000 in a VM using KVM over a tap
>>>>>>>> device using pktgen to generate traffic along with 'ping -f -l 100'.
>>>>>>>>
>>>>>>> Hi John,
>>>>>>>
>>>>>>> How does this interact with BQL on e1000?
>>>>>>>
>>>>>>> Tom
>>>>>>>
>>>>>>
>>>>>> Let me check if I have the API correct. When we enqueue a packet to
>>>>>> be sent we must issue a netdev_sent_queue() call and then on actual
>>>>>> transmission issue a netdev_completed_queue().
>>>>>>
>>>>>> The patch attached here missed a few things though.
>>>>>>
>>>>>> But it looks like I just need to call netdev_sent_queue() from the
>>>>>> e1000_xmit_raw_frame() routine and then let the tx completion logic
>>>>>> kick in which will call netdev_completed_queue() correctly.
>>>>>>
>>>>>> I'll need to add a check for the queue state as well. So if I do these
>>>>>> three things,
>>>>>>
>>>>>>         check __QUEUE_STATE_XOFF before sending
>>>>>>         netdev_sent_queue() -> on XDP_TX
>>>>>>         netdev_completed_queue()
>>>>>>
>>>>>> It should work agree? Now should we do this even when XDP owns the
>>>>>> queue? Or is this purely an issue with sharing the queue between
>>>>>> XDP and stack.
>>>>>>
>>>>> But what is the action for XDP_TX if the queue is stopped? There is no
>>>>> qdisc to back pressure in the XDP path. Would we just start dropping
>>>>> packets then?
>>>>
>>>> Yep that is what the patch does if there is any sort of error packets
>>>> get dropped on the floor. I don't think there is anything else that
>>>> can be done.
>>>>
>>> That probably means that the stack will always win out under load.
>>> Trying to used the same queue where half of the packets are well
>>> managed by a qdisc and half aren't is going to leave someone unhappy.
>>> Maybe in the this case where we have to share the qdisc we can
>>> allocate the skb on on returning XDP_TX and send through the normal
>>> qdisc for the device.
>>
>> I wouldn't go to such extremes for e1k.
>> The only reason to have xdp in e1k is to use it for testing
>> of xdp programs. Nothing else. e1k is, best case, 1Gbps adapter.
> 
> I imagine someone may want this for the non-forwarding use cases like
> early drop for DOS mitigation. Regardless of the use case, I don't
> think we can break the fundamental assumptions made for qdiscs or the
> rest of the transmit path. If XDP must transmit on a queue shared with
> the stack we need to abide by the stack's rules for transmitting on
> the queue-- which would mean alloc skbuff and go through qdisc (which

If we require XDP_TX to go up to qdisc layer its best not to implement
it at all and just handle it in normal ingress path. That said I think
users have to expect that XDP will interfere with qdisc schemes. Even
with its own tx queue its going to interfere at the hardware level with
bandwidth as the hardware round robins through the queues or uses
whatever hardware strategy it is configured to use. Additionally it
will bypass things like BQL, etc.

> really shouldn't be difficult to implement). Emulating various
> functions of the stack in the XDP TX path, like this patch seems to be
> doing for XMIT_MORE, potentially gets us into a wack-a-mole situation
> trying to keep things coherent.

I think bundling tx xmits is fair game as an internal optimization and
doesn't need to be exposed at the XDP layer. Drivers already do this
type of optimizations for allocating buffers. It likely doesn't matter
much at the e1k level but doing a tail update on every pkt with the
40gbps drivers likely will be noticeable is my gut feeling.


> 
>> Existing stack with skb is perfectly fine as it is.
>> No need to do recycling, batching or any other complex things.
>> xdp for e1k cannot be used as an example for other drivers either,
>> since there is only one tx ring and any high performance adapter
>> has more which makes the driver support quite different.
>>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines
  2016-09-10  3:12               ` Tom Herbert
  2016-09-10  3:26                 ` John Fastabend
@ 2016-09-10  3:56                 ` Alexei Starovoitov
  1 sibling, 0 replies; 24+ messages in thread
From: Alexei Starovoitov @ 2016-09-10  3:56 UTC (permalink / raw)
  To: Tom Herbert
  Cc: John Fastabend, Brenden Blanco, Jeff Kirsher,
	Jesper Dangaard Brouer, David S. Miller, Cong Wang,
	intel-wired-lan, William Tu, Linux Kernel Network Developers

On Fri, Sep 09, 2016 at 08:12:52PM -0700, Tom Herbert wrote:
> >> That probably means that the stack will always win out under load.
> >> Trying to used the same queue where half of the packets are well
> >> managed by a qdisc and half aren't is going to leave someone unhappy.
> >> Maybe in the this case where we have to share the qdisc we can
> >> allocate the skb on on returning XDP_TX and send through the normal
> >> qdisc for the device.
> >
> > I wouldn't go to such extremes for e1k.
> > The only reason to have xdp in e1k is to use it for testing
> > of xdp programs. Nothing else. e1k is, best case, 1Gbps adapter.
> 
> I imagine someone may want this for the non-forwarding use cases like
> early drop for DOS mitigation.

Sure and they will be doing it on the NICs that they have in their servers.
e1k is not that nic. xdp e1k is for debugging xdp programs in KVM only.
Performance of such xdp programs on e1k is irrelevant.
There is absolutely no need to complicate the driver and the patches.
All other drivers is a different story.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines
  2016-09-10  3:26                 ` John Fastabend
@ 2016-09-10  4:13                   ` Tom Herbert
  2016-09-12  3:15                     ` John Fastabend
  0 siblings, 1 reply; 24+ messages in thread
From: Tom Herbert @ 2016-09-10  4:13 UTC (permalink / raw)
  To: John Fastabend
  Cc: Alexei Starovoitov, Brenden Blanco, Jeff Kirsher,
	Jesper Dangaard Brouer, David S. Miller, Cong Wang,
	intel-wired-lan, William Tu, Linux Kernel Network Developers

On Fri, Sep 9, 2016 at 8:26 PM, John Fastabend <john.fastabend@gmail.com> wrote:
> On 16-09-09 08:12 PM, Tom Herbert wrote:
>> On Fri, Sep 9, 2016 at 6:40 PM, Alexei Starovoitov
>> <alexei.starovoitov@gmail.com> wrote:
>>> On Fri, Sep 09, 2016 at 06:19:56PM -0700, Tom Herbert wrote:
>>>> On Fri, Sep 9, 2016 at 6:12 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>>>> On 16-09-09 06:04 PM, Tom Herbert wrote:
>>>>>> On Fri, Sep 9, 2016 at 5:01 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>>>>>> On 16-09-09 04:44 PM, Tom Herbert wrote:
>>>>>>>> On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>>>>>>>> e1000 supports a single TX queue so it is being shared with the stack
>>>>>>>>> when XDP runs XDP_TX action. This requires taking the xmit lock to
>>>>>>>>> ensure we don't corrupt the tx ring. To avoid taking and dropping the
>>>>>>>>> lock per packet this patch adds a bundling implementation to submit
>>>>>>>>> a bundle of packets to the xmit routine.
>>>>>>>>>
>>>>>>>>> I tested this patch running e1000 in a VM using KVM over a tap
>>>>>>>>> device using pktgen to generate traffic along with 'ping -f -l 100'.
>>>>>>>>>
>>>>>>>> Hi John,
>>>>>>>>
>>>>>>>> How does this interact with BQL on e1000?
>>>>>>>>
>>>>>>>> Tom
>>>>>>>>
>>>>>>>
>>>>>>> Let me check if I have the API correct. When we enqueue a packet to
>>>>>>> be sent we must issue a netdev_sent_queue() call and then on actual
>>>>>>> transmission issue a netdev_completed_queue().
>>>>>>>
>>>>>>> The patch attached here missed a few things though.
>>>>>>>
>>>>>>> But it looks like I just need to call netdev_sent_queue() from the
>>>>>>> e1000_xmit_raw_frame() routine and then let the tx completion logic
>>>>>>> kick in which will call netdev_completed_queue() correctly.
>>>>>>>
>>>>>>> I'll need to add a check for the queue state as well. So if I do these
>>>>>>> three things,
>>>>>>>
>>>>>>>         check __QUEUE_STATE_XOFF before sending
>>>>>>>         netdev_sent_queue() -> on XDP_TX
>>>>>>>         netdev_completed_queue()
>>>>>>>
>>>>>>> It should work agree? Now should we do this even when XDP owns the
>>>>>>> queue? Or is this purely an issue with sharing the queue between
>>>>>>> XDP and stack.
>>>>>>>
>>>>>> But what is the action for XDP_TX if the queue is stopped? There is no
>>>>>> qdisc to back pressure in the XDP path. Would we just start dropping
>>>>>> packets then?
>>>>>
>>>>> Yep that is what the patch does if there is any sort of error packets
>>>>> get dropped on the floor. I don't think there is anything else that
>>>>> can be done.
>>>>>
>>>> That probably means that the stack will always win out under load.
>>>> Trying to used the same queue where half of the packets are well
>>>> managed by a qdisc and half aren't is going to leave someone unhappy.
>>>> Maybe in the this case where we have to share the qdisc we can
>>>> allocate the skb on on returning XDP_TX and send through the normal
>>>> qdisc for the device.
>>>
>>> I wouldn't go to such extremes for e1k.
>>> The only reason to have xdp in e1k is to use it for testing
>>> of xdp programs. Nothing else. e1k is, best case, 1Gbps adapter.
>>
>> I imagine someone may want this for the non-forwarding use cases like
>> early drop for DOS mitigation. Regardless of the use case, I don't
>> think we can break the fundamental assumptions made for qdiscs or the
>> rest of the transmit path. If XDP must transmit on a queue shared with
>> the stack we need to abide by the stack's rules for transmitting on
>> the queue-- which would mean alloc skbuff and go through qdisc (which
>
> If we require XDP_TX to go up to qdisc layer its best not to implement
> it at all and just handle it in normal ingress path. That said I think
> users have to expect that XDP will interfere with qdisc schemes. Even
> with its own tx queue its going to interfere at the hardware level with
> bandwidth as the hardware round robins through the queues or uses
> whatever hardware strategy it is configured to use. Additionally it
> will bypass things like BQL, etc.
>
Right, but not all use cases involve XDP_TX (like DOS mitigation as I
pointed out). Since you've already done 95% of the work, can you take
a look at creating the skbuff and injecting into the stack for XDP_TX
so we can evaluate the performance and impact of that :-)

With separate TX queues it's explicit which queues are managed by the
stack. This is no different than what kernel bypass gives use, we are
relying on HW to do something reasonable in scheduling MQ.

>> really shouldn't be difficult to implement). Emulating various
>> functions of the stack in the XDP TX path, like this patch seems to be
>> doing for XMIT_MORE, potentially gets us into a wack-a-mole situation
>> trying to keep things coherent.
>
> I think bundling tx xmits is fair game as an internal optimization and
> doesn't need to be exposed at the XDP layer. Drivers already do this
> type of optimizations for allocating buffers. It likely doesn't matter
> much at the e1k level but doing a tail update on every pkt with the
> 40gbps drivers likely will be noticeable is my gut feeling.
>
How is this different than what XMIT_MORE gives us?

Tom

>
>>
>>> Existing stack with skb is perfectly fine as it is.
>>> No need to do recycling, batching or any other complex things.
>>> xdp for e1k cannot be used as an example for other drivers either,
>>> since there is only one tx ring and any high performance adapter
>>> has more which makes the driver support quite different.
>>>
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines
  2016-09-09 21:29 ` [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines John Fastabend
  2016-09-09 23:37   ` John Fastabend
  2016-09-09 23:44   ` Tom Herbert
@ 2016-09-10 15:36   ` Tom Herbert
  2016-09-12  3:07     ` John Fastabend
  2016-09-12 12:17   ` Jesper Dangaard Brouer
  3 siblings, 1 reply; 24+ messages in thread
From: Tom Herbert @ 2016-09-10 15:36 UTC (permalink / raw)
  To: John Fastabend
  Cc: Brenden Blanco, Alexei Starovoitov, Jeff Kirsher,
	Jesper Dangaard Brouer, David S. Miller, Cong Wang,
	intel-wired-lan, William Tu, Linux Kernel Network Developers

On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:
> e1000 supports a single TX queue so it is being shared with the stack
> when XDP runs XDP_TX action. This requires taking the xmit lock to
> ensure we don't corrupt the tx ring. To avoid taking and dropping the
> lock per packet this patch adds a bundling implementation to submit
> a bundle of packets to the xmit routine.
>
> I tested this patch running e1000 in a VM using KVM over a tap
> device using pktgen to generate traffic along with 'ping -f -l 100'.
>
> Suggested-by: Jesper Dangaard Brouer <brouer@redhat.com>
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---
>  drivers/net/ethernet/intel/e1000/e1000.h      |   10 +++
>  drivers/net/ethernet/intel/e1000/e1000_main.c |   81 +++++++++++++++++++------
>  2 files changed, 71 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/e1000/e1000.h b/drivers/net/ethernet/intel/e1000/e1000.h
> index 5cf8a0a..877b377 100644
> --- a/drivers/net/ethernet/intel/e1000/e1000.h
> +++ b/drivers/net/ethernet/intel/e1000/e1000.h
> @@ -133,6 +133,8 @@ struct e1000_adapter;
>  #define E1000_TX_QUEUE_WAKE    16
>  /* How many Rx Buffers do we bundle into one write to the hardware ? */
>  #define E1000_RX_BUFFER_WRITE  16 /* Must be power of 2 */
> +/* How many XDP XMIT buffers to bundle into one xmit transaction */
> +#define E1000_XDP_XMIT_BUNDLE_MAX E1000_RX_BUFFER_WRITE
>
>  #define AUTO_ALL_MODES         0
>  #define E1000_EEPROM_82544_APM 0x0004
> @@ -168,6 +170,11 @@ struct e1000_rx_buffer {
>         dma_addr_t dma;
>  };
>
> +struct e1000_rx_buffer_bundle {
> +       struct e1000_rx_buffer *buffer;
> +       u32 length;
> +};
> +
>  struct e1000_tx_ring {
>         /* pointer to the descriptor ring memory */
>         void *desc;
> @@ -206,6 +213,9 @@ struct e1000_rx_ring {
>         struct e1000_rx_buffer *buffer_info;
>         struct sk_buff *rx_skb_top;
>
> +       /* array of XDP buffer information structs */
> +       struct e1000_rx_buffer_bundle *xdp_buffer;
> +
>         /* cpu for rx queue */
>         int cpu;
>
> diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
> index 91d5c87..b985271 100644
> --- a/drivers/net/ethernet/intel/e1000/e1000_main.c
> +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
> @@ -1738,10 +1738,18 @@ static int e1000_setup_rx_resources(struct e1000_adapter *adapter,
>         struct pci_dev *pdev = adapter->pdev;
>         int size, desc_len;
>
> +       size = sizeof(struct e1000_rx_buffer_bundle) *
> +                       E1000_XDP_XMIT_BUNDLE_MAX;
> +       rxdr->xdp_buffer = vzalloc(size);
> +       if (!rxdr->xdp_buffer)
> +               return -ENOMEM;
> +
>         size = sizeof(struct e1000_rx_buffer) * rxdr->count;
>         rxdr->buffer_info = vzalloc(size);
> -       if (!rxdr->buffer_info)
> +       if (!rxdr->buffer_info) {
> +               vfree(rxdr->xdp_buffer);

This could be deferred until an XDP program is added.

>                 return -ENOMEM;
> +       }
>
>         desc_len = sizeof(struct e1000_rx_desc);
>
> @@ -1754,6 +1762,7 @@ static int e1000_setup_rx_resources(struct e1000_adapter *adapter,
>                                         GFP_KERNEL);
>         if (!rxdr->desc) {
>  setup_rx_desc_die:
> +               vfree(rxdr->xdp_buffer);
>                 vfree(rxdr->buffer_info);
>                 return -ENOMEM;
>         }
> @@ -2087,6 +2096,9 @@ static void e1000_free_rx_resources(struct e1000_adapter *adapter,
>
>         e1000_clean_rx_ring(adapter, rx_ring);
>
> +       vfree(rx_ring->xdp_buffer);
> +       rx_ring->xdp_buffer = NULL;
> +
>         vfree(rx_ring->buffer_info);
>         rx_ring->buffer_info = NULL;
>
> @@ -3369,33 +3381,52 @@ static void e1000_tx_map_rxpage(struct e1000_tx_ring *tx_ring,
>  }
>
>  static void e1000_xmit_raw_frame(struct e1000_rx_buffer *rx_buffer_info,
> -                                unsigned int len,
> -                                struct net_device *netdev,
> -                                struct e1000_adapter *adapter)
> +                                __u32 len,
> +                                struct e1000_adapter *adapter,
> +                                struct e1000_tx_ring *tx_ring)
>  {
> -       struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
> -       struct e1000_hw *hw = &adapter->hw;
> -       struct e1000_tx_ring *tx_ring;
> -
>         if (len > E1000_MAX_DATA_PER_TXD)
>                 return;
>
> +       if (E1000_DESC_UNUSED(tx_ring) < 2)
> +               return;
> +
> +       e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
> +       e1000_tx_queue(adapter, tx_ring, 0/*tx_flags*/, 1);
> +}
> +
> +static void e1000_xdp_xmit_bundle(struct e1000_rx_buffer_bundle *buffer_info,
> +                                 struct net_device *netdev,
> +                                 struct e1000_adapter *adapter)
> +{
> +       struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
> +       struct e1000_tx_ring *tx_ring = adapter->tx_ring;
> +       struct e1000_hw *hw = &adapter->hw;
> +       int i = 0;
> +
>         /* e1000 only support a single txq at the moment so the queue is being
>          * shared with stack. To support this requires locking to ensure the
>          * stack and XDP are not running at the same time. Devices with
>          * multiple queues should allocate a separate queue space.
> +        *
> +        * To amortize the locking cost e1000 bundles the xmits and sends as
> +        * many as possible until either running out of descriptors or failing.

Up to E1000_XDP_XMIT_BUNDLE_MAX  at least...

>          */
>         HARD_TX_LOCK(netdev, txq, smp_processor_id());
>
> -       tx_ring = adapter->tx_ring;
> -
> -       if (E1000_DESC_UNUSED(tx_ring) < 2) {
> -               HARD_TX_UNLOCK(netdev, txq);
> -               return;
> +       for (; i < E1000_XDP_XMIT_BUNDLE_MAX && buffer_info[i].buffer; i++) {
> +               e1000_xmit_raw_frame(buffer_info[i].buffer,
> +                                    buffer_info[i].length,
> +                                    adapter, tx_ring);
> +               buffer_info[i].buffer->rxbuf.page = NULL;
> +               buffer_info[i].buffer = NULL;
> +               buffer_info[i].length = 0;
> +               i++;
>         }
>
> -       e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
> -       e1000_tx_queue(adapter, tx_ring, 0/*tx_flags*/, 1);
> +       /* kick hardware to send bundle and return control back to the stack */
> +       writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
> +       mmiowb();
>
>         writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
>         mmiowb();
> @@ -4281,9 +4312,10 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
>         struct bpf_prog *prog;
>         u32 length;
>         unsigned int i;
> -       int cleaned_count = 0;
> +       int cleaned_count = 0, xdp_xmit = 0;
>         bool cleaned = false;
>         unsigned int total_rx_bytes = 0, total_rx_packets = 0;
> +       struct e1000_rx_buffer_bundle *xdp_bundle = rx_ring->xdp_buffer;
>
>         rcu_read_lock(); /* rcu lock needed here to protect xdp programs */
>         prog = READ_ONCE(adapter->prog);
> @@ -4338,13 +4370,13 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
>                         case XDP_PASS:
>                                 break;
>                         case XDP_TX:
> +                               xdp_bundle[xdp_xmit].buffer = buffer_info;
> +                               xdp_bundle[xdp_xmit].length = length;
>                                 dma_sync_single_for_device(&pdev->dev,
>                                                            dma,
>                                                            length,
>                                                            DMA_TO_DEVICE);
> -                               e1000_xmit_raw_frame(buffer_info, length,
> -                                                    netdev, adapter);
> -                               buffer_info->rxbuf.page = NULL;
> +                               xdp_xmit++;
>                         case XDP_DROP:
>                         default:
>                                 /* re-use mapped page. keep buffer_info->dma
> @@ -4486,8 +4518,14 @@ next_desc:
>
>                 /* return some buffers to hardware, one at a time is too slow */
>                 if (unlikely(cleaned_count >= E1000_RX_BUFFER_WRITE)) {
> +                       if (xdp_xmit)
> +                               e1000_xdp_xmit_bundle(xdp_bundle,
> +                                                     netdev,
> +                                                     adapter);
> +
>                         adapter->alloc_rx_buf(adapter, rx_ring, cleaned_count);
>                         cleaned_count = 0;
> +                       xdp_xmit = 0;
>                 }
>
>                 /* use prefetched values */
> @@ -4498,8 +4536,11 @@ next_desc:
>         rx_ring->next_to_clean = i;
>
>         cleaned_count = E1000_DESC_UNUSED(rx_ring);
> -       if (cleaned_count)
> +       if (cleaned_count) {
> +               if (xdp_xmit)
> +                       e1000_xdp_xmit_bundle(xdp_bundle, netdev, adapter);
>                 adapter->alloc_rx_buf(adapter, rx_ring, cleaned_count);
> +       }

Looks good for XDP path. Is this something we can abstract out into a
library for use by other drivers?


>
>         adapter->total_rx_packets += total_rx_packets;
>         adapter->total_rx_bytes += total_rx_bytes;
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines
  2016-09-10 15:36   ` Tom Herbert
@ 2016-09-12  3:07     ` John Fastabend
  0 siblings, 0 replies; 24+ messages in thread
From: John Fastabend @ 2016-09-12  3:07 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Brenden Blanco, Alexei Starovoitov, Jeff Kirsher,
	Jesper Dangaard Brouer, David S. Miller, Cong Wang,
	intel-wired-lan, William Tu, Linux Kernel Network Developers

On 16-09-10 08:36 AM, Tom Herbert wrote:
> On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>> e1000 supports a single TX queue so it is being shared with the stack
>> when XDP runs XDP_TX action. This requires taking the xmit lock to
>> ensure we don't corrupt the tx ring. To avoid taking and dropping the
>> lock per packet this patch adds a bundling implementation to submit
>> a bundle of packets to the xmit routine.
>>
>> I tested this patch running e1000 in a VM using KVM over a tap
>> device using pktgen to generate traffic along with 'ping -f -l 100'.
>>
>> Suggested-by: Jesper Dangaard Brouer <brouer@redhat.com>
>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
>> ---

[...]

>> diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
>> index 91d5c87..b985271 100644
>> --- a/drivers/net/ethernet/intel/e1000/e1000_main.c
>> +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
>> @@ -1738,10 +1738,18 @@ static int e1000_setup_rx_resources(struct e1000_adapter *adapter,
>>         struct pci_dev *pdev = adapter->pdev;
>>         int size, desc_len;
>>
>> +       size = sizeof(struct e1000_rx_buffer_bundle) *
>> +                       E1000_XDP_XMIT_BUNDLE_MAX;
>> +       rxdr->xdp_buffer = vzalloc(size);
>> +       if (!rxdr->xdp_buffer)
>> +               return -ENOMEM;
>> +
>>         size = sizeof(struct e1000_rx_buffer) * rxdr->count;
>>         rxdr->buffer_info = vzalloc(size);
>> -       if (!rxdr->buffer_info)
>> +       if (!rxdr->buffer_info) {
>> +               vfree(rxdr->xdp_buffer);
> 
> This could be deferred until an XDP program is added.

Yep that would be best to avoid overhead in the normal non-XDP case.
Also I'll move the xdp prog pointer into the rx ring per Jespers comment
that I missed in this rev.

[...]

>> +
>> +static void e1000_xdp_xmit_bundle(struct e1000_rx_buffer_bundle *buffer_info,
>> +                                 struct net_device *netdev,
>> +                                 struct e1000_adapter *adapter)
>> +{
>> +       struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
>> +       struct e1000_tx_ring *tx_ring = adapter->tx_ring;
>> +       struct e1000_hw *hw = &adapter->hw;
>> +       int i = 0;
>> +
>>         /* e1000 only support a single txq at the moment so the queue is being
>>          * shared with stack. To support this requires locking to ensure the
>>          * stack and XDP are not running at the same time. Devices with
>>          * multiple queues should allocate a separate queue space.
>> +        *
>> +        * To amortize the locking cost e1000 bundles the xmits and sends as
>> +        * many as possible until either running out of descriptors or failing.
> 
> Up to E1000_XDP_XMIT_BUNDLE_MAX  at least...

Yep will fix comment.

[...]

>>
>>                 /* use prefetched values */
>> @@ -4498,8 +4536,11 @@ next_desc:
>>         rx_ring->next_to_clean = i;
>>
>>         cleaned_count = E1000_DESC_UNUSED(rx_ring);
>> -       if (cleaned_count)
>> +       if (cleaned_count) {
>> +               if (xdp_xmit)
>> +                       e1000_xdp_xmit_bundle(xdp_bundle, netdev, adapter);
>>                 adapter->alloc_rx_buf(adapter, rx_ring, cleaned_count);
>> +       }
> 
> Looks good for XDP path. Is this something we can abstract out into a
> library for use by other drivers?
> 

I'm not really sure it can be abstracted much its a bit intertwined with
the normal rx receive path. But it should probably be a pattern that
gets copied so we avoid unnecessary tx work.

> 
>>
>>         adapter->total_rx_packets += total_rx_packets;
>>         adapter->total_rx_bytes += total_rx_bytes;
>>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines
  2016-09-10  4:13                   ` Tom Herbert
@ 2016-09-12  3:15                     ` John Fastabend
  2016-09-12  4:12                       ` Alexei Starovoitov
  0 siblings, 1 reply; 24+ messages in thread
From: John Fastabend @ 2016-09-12  3:15 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Alexei Starovoitov, Brenden Blanco, Jeff Kirsher,
	Jesper Dangaard Brouer, David S. Miller, Cong Wang,
	intel-wired-lan, William Tu, Linux Kernel Network Developers

On 16-09-09 09:13 PM, Tom Herbert wrote:
> On Fri, Sep 9, 2016 at 8:26 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>> On 16-09-09 08:12 PM, Tom Herbert wrote:
>>> On Fri, Sep 9, 2016 at 6:40 PM, Alexei Starovoitov
>>> <alexei.starovoitov@gmail.com> wrote:
>>>> On Fri, Sep 09, 2016 at 06:19:56PM -0700, Tom Herbert wrote:
>>>>> On Fri, Sep 9, 2016 at 6:12 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>>>>> On 16-09-09 06:04 PM, Tom Herbert wrote:
>>>>>>> On Fri, Sep 9, 2016 at 5:01 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>>>>>>> On 16-09-09 04:44 PM, Tom Herbert wrote:
>>>>>>>>> On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>>>>>>>>> e1000 supports a single TX queue so it is being shared with the stack
>>>>>>>>>> when XDP runs XDP_TX action. This requires taking the xmit lock to
>>>>>>>>>> ensure we don't corrupt the tx ring. To avoid taking and dropping the
>>>>>>>>>> lock per packet this patch adds a bundling implementation to submit
>>>>>>>>>> a bundle of packets to the xmit routine.
>>>>>>>>>>
>>>>>>>>>> I tested this patch running e1000 in a VM using KVM over a tap
>>>>>>>>>> device using pktgen to generate traffic along with 'ping -f -l 100'.
>>>>>>>>>>
>>>>>>>>> Hi John,
>>>>>>>>>
>>>>>>>>> How does this interact with BQL on e1000?
>>>>>>>>>
>>>>>>>>> Tom
>>>>>>>>>
>>>>>>>>
>>>>>>>> Let me check if I have the API correct. When we enqueue a packet to
>>>>>>>> be sent we must issue a netdev_sent_queue() call and then on actual
>>>>>>>> transmission issue a netdev_completed_queue().
>>>>>>>>
>>>>>>>> The patch attached here missed a few things though.
>>>>>>>>
>>>>>>>> But it looks like I just need to call netdev_sent_queue() from the
>>>>>>>> e1000_xmit_raw_frame() routine and then let the tx completion logic
>>>>>>>> kick in which will call netdev_completed_queue() correctly.
>>>>>>>>
>>>>>>>> I'll need to add a check for the queue state as well. So if I do these
>>>>>>>> three things,
>>>>>>>>
>>>>>>>>         check __QUEUE_STATE_XOFF before sending
>>>>>>>>         netdev_sent_queue() -> on XDP_TX
>>>>>>>>         netdev_completed_queue()
>>>>>>>>
>>>>>>>> It should work agree? Now should we do this even when XDP owns the
>>>>>>>> queue? Or is this purely an issue with sharing the queue between
>>>>>>>> XDP and stack.
>>>>>>>>
>>>>>>> But what is the action for XDP_TX if the queue is stopped? There is no
>>>>>>> qdisc to back pressure in the XDP path. Would we just start dropping
>>>>>>> packets then?
>>>>>>
>>>>>> Yep that is what the patch does if there is any sort of error packets
>>>>>> get dropped on the floor. I don't think there is anything else that
>>>>>> can be done.
>>>>>>
>>>>> That probably means that the stack will always win out under load.
>>>>> Trying to used the same queue where half of the packets are well
>>>>> managed by a qdisc and half aren't is going to leave someone unhappy.
>>>>> Maybe in the this case where we have to share the qdisc we can
>>>>> allocate the skb on on returning XDP_TX and send through the normal
>>>>> qdisc for the device.
>>>>
>>>> I wouldn't go to such extremes for e1k.
>>>> The only reason to have xdp in e1k is to use it for testing
>>>> of xdp programs. Nothing else. e1k is, best case, 1Gbps adapter.
>>>
>>> I imagine someone may want this for the non-forwarding use cases like
>>> early drop for DOS mitigation. Regardless of the use case, I don't
>>> think we can break the fundamental assumptions made for qdiscs or the
>>> rest of the transmit path. If XDP must transmit on a queue shared with
>>> the stack we need to abide by the stack's rules for transmitting on
>>> the queue-- which would mean alloc skbuff and go through qdisc (which
>>
>> If we require XDP_TX to go up to qdisc layer its best not to implement
>> it at all and just handle it in normal ingress path. That said I think
>> users have to expect that XDP will interfere with qdisc schemes. Even
>> with its own tx queue its going to interfere at the hardware level with
>> bandwidth as the hardware round robins through the queues or uses
>> whatever hardware strategy it is configured to use. Additionally it
>> will bypass things like BQL, etc.
>>
> Right, but not all use cases involve XDP_TX (like DOS mitigation as I
> pointed out). Since you've already done 95% of the work, can you take
> a look at creating the skbuff and injecting into the stack for XDP_TX
> so we can evaluate the performance and impact of that :-)
> 
> With separate TX queues it's explicit which queues are managed by the
> stack. This is no different than what kernel bypass gives use, we are
> relying on HW to do something reasonable in scheduling MQ.
> 

How about instead of dropping packets on xdp errors we make the
behavior to send the packet to the stack by default. Then the stack can
decide what to do with it. This is easier from the drivers perspective
and avoids creating a qdisc inject path for XDP. We could set the mark
field if the stack wants to handle XDP exceptions somehow differently.

If we really want XDP to have an inject path I think we should add
another action XDP_QDISC_INJECT. And add some way for XDP to run
programs on exceptions. Perhaps via an exception map.

In this flow when an exception occurs in some path the exception map
is consulted and the exception handler is run. I think its better to
be very explicit when falling back to the stack vs doing it
transparently.

Notice even in the dedicated queue case errors may occur when
descriptors are exhausted and other transient errors occur.

>>> really shouldn't be difficult to implement). Emulating various
>>> functions of the stack in the XDP TX path, like this patch seems to be
>>> doing for XMIT_MORE, potentially gets us into a wack-a-mole situation
>>> trying to keep things coherent.
>>
>> I think bundling tx xmits is fair game as an internal optimization and
>> doesn't need to be exposed at the XDP layer. Drivers already do this
>> type of optimizations for allocating buffers. It likely doesn't matter
>> much at the e1k level but doing a tail update on every pkt with the
>> 40gbps drivers likely will be noticeable is my gut feeling.
>>
> How is this different than what XMIT_MORE gives us?
> 

Its not really. Except there is no call signaling. The code path just
bundles up as many packets as it can and throws them onto the xmit
routine.

> Tom
> 
>>
>>>
>>>> Existing stack with skb is perfectly fine as it is.
>>>> No need to do recycling, batching or any other complex things.
>>>> xdp for e1k cannot be used as an example for other drivers either,
>>>> since there is only one tx ring and any high performance adapter
>>>> has more which makes the driver support quite different.
>>>>
>>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines
  2016-09-12  3:15                     ` John Fastabend
@ 2016-09-12  4:12                       ` Alexei Starovoitov
  0 siblings, 0 replies; 24+ messages in thread
From: Alexei Starovoitov @ 2016-09-12  4:12 UTC (permalink / raw)
  To: John Fastabend
  Cc: Tom Herbert, Brenden Blanco, Jeff Kirsher, Jesper Dangaard Brouer,
	David S. Miller, Cong Wang, intel-wired-lan, William Tu,
	Linux Kernel Network Developers

On Sun, Sep 11, 2016 at 08:15:28PM -0700, John Fastabend wrote:
> >>>>>>>>
> >>>>>>> But what is the action for XDP_TX if the queue is stopped? There is no
> >>>>>>> qdisc to back pressure in the XDP path. Would we just start dropping
> >>>>>>> packets then?
> >>>>>>
> >>>>>> Yep that is what the patch does if there is any sort of error packets
> >>>>>> get dropped on the floor. I don't think there is anything else that
> >>>>>> can be done.
> >>>>>>
> >>>>> That probably means that the stack will always win out under load.
> >>>>> Trying to used the same queue where half of the packets are well
> >>>>> managed by a qdisc and half aren't is going to leave someone unhappy.
> >>>>> Maybe in the this case where we have to share the qdisc we can
> >>>>> allocate the skb on on returning XDP_TX and send through the normal
> >>>>> qdisc for the device.
> >>>>
> >>>> I wouldn't go to such extremes for e1k.
> >>>> The only reason to have xdp in e1k is to use it for testing
> >>>> of xdp programs. Nothing else. e1k is, best case, 1Gbps adapter.
> >>>
> >>> I imagine someone may want this for the non-forwarding use cases like
> >>> early drop for DOS mitigation. Regardless of the use case, I don't
> >>> think we can break the fundamental assumptions made for qdiscs or the
> >>> rest of the transmit path. If XDP must transmit on a queue shared with
> >>> the stack we need to abide by the stack's rules for transmitting on
> >>> the queue-- which would mean alloc skbuff and go through qdisc (which
> >>
> >> If we require XDP_TX to go up to qdisc layer its best not to implement
> >> it at all and just handle it in normal ingress path. That said I think
> >> users have to expect that XDP will interfere with qdisc schemes. Even
> >> with its own tx queue its going to interfere at the hardware level with
> >> bandwidth as the hardware round robins through the queues or uses
> >> whatever hardware strategy it is configured to use. Additionally it
> >> will bypass things like BQL, etc.
> >>
> > Right, but not all use cases involve XDP_TX (like DOS mitigation as I
> > pointed out). Since you've already done 95% of the work, can you take
> > a look at creating the skbuff and injecting into the stack for XDP_TX
> > so we can evaluate the performance and impact of that :-)
> > 
> > With separate TX queues it's explicit which queues are managed by the
> > stack. This is no different than what kernel bypass gives use, we are
> > relying on HW to do something reasonable in scheduling MQ.
> > 
> 
> How about instead of dropping packets on xdp errors we make the
> behavior to send the packet to the stack by default. Then the stack can
> decide what to do with it. This is easier from the drivers perspective
> and avoids creating a qdisc inject path for XDP. We could set the mark
> field if the stack wants to handle XDP exceptions somehow differently.
> 
> If we really want XDP to have an inject path I think we should add
> another action XDP_QDISC_INJECT. And add some way for XDP to run
> programs on exceptions. Perhaps via an exception map.

Nack for any new features just for e1k.
I don't like where this discussion is going.
I've been hacking xdp support for e1k only to be able to debug
xdp programs in kvm instead of messing with physical hosts where
every bpf program mistake kills ssh connection.
Please stop this overdesign. I'd rather not have xdp for e1k
instead of going into this crazy new action codes and random
punt to stack. If there is a conflict between stack and xdp, just
drop the packet. e1k is _not_ an example for any other drivers.
When high performance NIC will have such tx ring sharing issues
only then we'd need to come with a solution. Currently that's not
the case, so there is no need to come up with anything but
the simplest approach.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines
  2016-09-10  1:19           ` Tom Herbert
  2016-09-10  1:40             ` Alexei Starovoitov
@ 2016-09-12 11:56             ` Jesper Dangaard Brouer
  1 sibling, 0 replies; 24+ messages in thread
From: Jesper Dangaard Brouer @ 2016-09-12 11:56 UTC (permalink / raw)
  To: Tom Herbert
  Cc: John Fastabend, Brenden Blanco, Alexei Starovoitov, Jeff Kirsher,
	David S. Miller, Cong Wang, intel-wired-lan, William Tu,
	Linux Kernel Network Developers, brouer


On Fri, 9 Sep 2016 18:19:56 -0700 Tom Herbert <tom@herbertland.com> wrote:
> On Fri, Sep 9, 2016 at 6:12 PM, John Fastabend <john.fastabend@gmail.com> wrote:
> > On 16-09-09 06:04 PM, Tom Herbert wrote:  
> >> On Fri, Sep 9, 2016 at 5:01 PM, John Fastabend <john.fastabend@gmail.com> wrote:  
> >>> On 16-09-09 04:44 PM, Tom Herbert wrote:  
> >>>> On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:  
> >>>>> e1000 supports a single TX queue so it is being shared with the stack
> >>>>> when XDP runs XDP_TX action. This requires taking the xmit lock to
> >>>>> ensure we don't corrupt the tx ring. To avoid taking and dropping the
> >>>>> lock per packet this patch adds a bundling implementation to submit
> >>>>> a bundle of packets to the xmit routine.
> >>>>>
> >>>>> I tested this patch running e1000 in a VM using KVM over a tap
> >>>>> device using pktgen to generate traffic along with 'ping -f -l 100'.
> >>>>>  
> >>>> Hi John,
> >>>>
> >>>> How does this interact with BQL on e1000?
> >>>>
> >>>> Tom
> >>>>  
> >>>
> >>> Let me check if I have the API correct. When we enqueue a packet to
> >>> be sent we must issue a netdev_sent_queue() call and then on actual
> >>> transmission issue a netdev_completed_queue().
> >>>
> >>> The patch attached here missed a few things though.
> >>>
> >>> But it looks like I just need to call netdev_sent_queue() from the
> >>> e1000_xmit_raw_frame() routine and then let the tx completion logic
> >>> kick in which will call netdev_completed_queue() correctly.
> >>>
> >>> I'll need to add a check for the queue state as well. So if I do these
> >>> three things,
> >>>
> >>>         check __QUEUE_STATE_XOFF before sending
> >>>         netdev_sent_queue() -> on XDP_TX
> >>>         netdev_completed_queue()
> >>>
> >>> It should work agree? Now should we do this even when XDP owns the
> >>> queue? Or is this purely an issue with sharing the queue between
> >>> XDP and stack.
> >>>  
> >> But what is the action for XDP_TX if the queue is stopped? There is no
> >> qdisc to back pressure in the XDP path. Would we just start dropping
> >> packets then?  
> >
> > Yep that is what the patch does if there is any sort of error packets
> > get dropped on the floor. I don't think there is anything else that
> > can be done.

I agree, the only option is the drop the packet. For a DDoS use-case,
this is good, because this "switch" XDP into a more efficient mode
(direct recycling pages).

> >  
> That probably means that the stack will always win out under load.

Why would the stack win? Wouldn't XDP_TX win?

> Trying to used the same queue where half of the packets are well
> managed by a qdisc and half aren't is going to leave someone unhappy.
> Maybe in the this case where we have to share the qdisc we can
> allocate the skb on on returning XDP_TX and send through the normal
> qdisc for the device.

Hmmm. I'm not sure I like the approach of allocating an SKB, and
injecting into the qdisc.  Most of the performance gain goes out the
window.  Unless, we (1) bulk alloc SKBs, and (2) can avoid initializing
the entire SKB, and (3) bulk enqueue into qdisc.  It would be an
interesting "tool" for a zoom-in benchmark, what would allow us to
determine the cost/overhead of the network stack between RX to
qdisc-enqueue.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines
  2016-09-09 21:29 ` [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines John Fastabend
                     ` (2 preceding siblings ...)
  2016-09-10 15:36   ` Tom Herbert
@ 2016-09-12 12:17   ` Jesper Dangaard Brouer
  2016-09-12 18:11     ` John Fastabend
  3 siblings, 1 reply; 24+ messages in thread
From: Jesper Dangaard Brouer @ 2016-09-12 12:17 UTC (permalink / raw)
  To: John Fastabend
  Cc: bblanco, alexei.starovoitov, jeffrey.t.kirsher, davem,
	xiyou.wangcong, intel-wired-lan, u9012063, netdev, brouer

On Fri, 09 Sep 2016 14:29:38 -0700
John Fastabend <john.fastabend@gmail.com> wrote:

> e1000 supports a single TX queue so it is being shared with the stack
> when XDP runs XDP_TX action. This requires taking the xmit lock to
> ensure we don't corrupt the tx ring. To avoid taking and dropping the
> lock per packet this patch adds a bundling implementation to submit
> a bundle of packets to the xmit routine.
> 
> I tested this patch running e1000 in a VM using KVM over a tap
> device using pktgen to generate traffic along with 'ping -f -l 100'.
> 
> Suggested-by: Jesper Dangaard Brouer <brouer@redhat.com>

Thank you for actually implementing this! :-)

> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---
[...]
> diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
> index 91d5c87..b985271 100644
> --- a/drivers/net/ethernet/intel/e1000/e1000_main.c
> +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
>  
[...]
> @@ -3369,33 +3381,52 @@ static void e1000_tx_map_rxpage(struct e1000_tx_ring *tx_ring,
>  }
>  
>  static void e1000_xmit_raw_frame(struct e1000_rx_buffer *rx_buffer_info,
> -				 unsigned int len,
> -				 struct net_device *netdev,
> -				 struct e1000_adapter *adapter)
> +				 __u32 len,
> +				 struct e1000_adapter *adapter,
> +				 struct e1000_tx_ring *tx_ring)
>  {
> -	struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
> -	struct e1000_hw *hw = &adapter->hw;
> -	struct e1000_tx_ring *tx_ring;
> -
>  	if (len > E1000_MAX_DATA_PER_TXD)
>  		return;
>  
> +	if (E1000_DESC_UNUSED(tx_ring) < 2)
> +		return;
> +
> +	e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
> +	e1000_tx_queue(adapter, tx_ring, 0/*tx_flags*/, 1);
> +}
> +
> +static void e1000_xdp_xmit_bundle(struct e1000_rx_buffer_bundle *buffer_info,
> +				  struct net_device *netdev,
> +				  struct e1000_adapter *adapter)
> +{
> +	struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
> +	struct e1000_tx_ring *tx_ring = adapter->tx_ring;
> +	struct e1000_hw *hw = &adapter->hw;
> +	int i = 0;
> +
>  	/* e1000 only support a single txq at the moment so the queue is being
>  	 * shared with stack. To support this requires locking to ensure the
>  	 * stack and XDP are not running at the same time. Devices with
>  	 * multiple queues should allocate a separate queue space.
> +	 *
> +	 * To amortize the locking cost e1000 bundles the xmits and sends as
> +	 * many as possible until either running out of descriptors or failing.
>  	 */
>  	HARD_TX_LOCK(netdev, txq, smp_processor_id());
>  
> -	tx_ring = adapter->tx_ring;
> -
> -	if (E1000_DESC_UNUSED(tx_ring) < 2) {
> -		HARD_TX_UNLOCK(netdev, txq);
> -		return;
> +	for (; i < E1000_XDP_XMIT_BUNDLE_MAX && buffer_info[i].buffer; i++) {
                                                                       ^^^
> +		e1000_xmit_raw_frame(buffer_info[i].buffer,
> +				     buffer_info[i].length,
> +				     adapter, tx_ring);
> +		buffer_info[i].buffer->rxbuf.page = NULL;
> +		buffer_info[i].buffer = NULL;
> +		buffer_info[i].length = 0;
> +		i++;
                ^^^
Looks like "i" is incremented twice, is that correct?

>  	}
>  
> -	e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
> -	e1000_tx_queue(adapter, tx_ring, 0/*tx_flags*/, 1);
> +	/* kick hardware to send bundle and return control back to the stack */
> +	writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
> +	mmiowb();
>  
>  	writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
>  	mmiowb();
> @@ -4281,9 +4312,10 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
>  	struct bpf_prog *prog;
>  	u32 length;
>  	unsigned int i;
> -	int cleaned_count = 0;
> +	int cleaned_count = 0, xdp_xmit = 0;
>  	bool cleaned = false;
>  	unsigned int total_rx_bytes = 0, total_rx_packets = 0;
> +	struct e1000_rx_buffer_bundle *xdp_bundle = rx_ring->xdp_buffer;
>  
>  	rcu_read_lock(); /* rcu lock needed here to protect xdp programs */
>  	prog = READ_ONCE(adapter->prog);
> @@ -4338,13 +4370,13 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
>  			case XDP_PASS:
>  				break;
>  			case XDP_TX:
> +				xdp_bundle[xdp_xmit].buffer = buffer_info;
> +				xdp_bundle[xdp_xmit].length = length;
>  				dma_sync_single_for_device(&pdev->dev,
>  							   dma,
>  							   length,
>  							   DMA_TO_DEVICE);
> -				e1000_xmit_raw_frame(buffer_info, length,
> -						     netdev, adapter);
> -				buffer_info->rxbuf.page = NULL;
> +				xdp_xmit++;
>  			case XDP_DROP:
>  			default:
>  				/* re-use mapped page. keep buffer_info->dma

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines
  2016-09-12 12:17   ` Jesper Dangaard Brouer
@ 2016-09-12 18:11     ` John Fastabend
  0 siblings, 0 replies; 24+ messages in thread
From: John Fastabend @ 2016-09-12 18:11 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: bblanco, alexei.starovoitov, jeffrey.t.kirsher, davem,
	xiyou.wangcong, intel-wired-lan, u9012063, netdev

On 16-09-12 05:17 AM, Jesper Dangaard Brouer wrote:
> On Fri, 09 Sep 2016 14:29:38 -0700
> John Fastabend <john.fastabend@gmail.com> wrote:
> 
>> e1000 supports a single TX queue so it is being shared with the stack
>> when XDP runs XDP_TX action. This requires taking the xmit lock to
>> ensure we don't corrupt the tx ring. To avoid taking and dropping the
>> lock per packet this patch adds a bundling implementation to submit
>> a bundle of packets to the xmit routine.
>>
>> I tested this patch running e1000 in a VM using KVM over a tap
>> device using pktgen to generate traffic along with 'ping -f -l 100'.
>>
>> Suggested-by: Jesper Dangaard Brouer <brouer@redhat.com>
> 
> Thank you for actually implementing this! :-)
> 

Yep no problem the effects are minimal on e1000 but should be
noticeable at 10/40/100gbps nics.

>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
>> ---
> [...]


[...]

>> +static void e1000_xdp_xmit_bundle(struct e1000_rx_buffer_bundle *buffer_info,
>> +				  struct net_device *netdev,
>> +				  struct e1000_adapter *adapter)
>> +{
>> +	struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
>> +	struct e1000_tx_ring *tx_ring = adapter->tx_ring;
>> +	struct e1000_hw *hw = &adapter->hw;
>> +	int i = 0;
>> +
>>  	/* e1000 only support a single txq at the moment so the queue is being
>>  	 * shared with stack. To support this requires locking to ensure the
>>  	 * stack and XDP are not running at the same time. Devices with
>>  	 * multiple queues should allocate a separate queue space.
>> +	 *
>> +	 * To amortize the locking cost e1000 bundles the xmits and sends as
>> +	 * many as possible until either running out of descriptors or failing.
>>  	 */
>>  	HARD_TX_LOCK(netdev, txq, smp_processor_id());
>>  
>> -	tx_ring = adapter->tx_ring;
>> -
>> -	if (E1000_DESC_UNUSED(tx_ring) < 2) {
>> -		HARD_TX_UNLOCK(netdev, txq);
>> -		return;
>> +	for (; i < E1000_XDP_XMIT_BUNDLE_MAX && buffer_info[i].buffer; i++) {
>                                                                        ^^^
>> +		e1000_xmit_raw_frame(buffer_info[i].buffer,
>> +				     buffer_info[i].length,
>> +				     adapter, tx_ring);
>> +		buffer_info[i].buffer->rxbuf.page = NULL;
>> +		buffer_info[i].buffer = NULL;
>> +		buffer_info[i].length = 0;
>> +		i++;
>                 ^^^
> Looks like "i" is incremented twice, is that correct?
> 
>>  	}

Yep this and a couple other issues are resolved in v3 which I'll send
out in a moment.

Also in v3 I kept the program in the adapter structure. Moving it into
the ring structure made the code a bit uglier IMO. I agree with the
logic but practically only one program can exist for e1000.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [net-next PATCH v2 1/2] e1000: add initial XDP support
  2016-09-09 21:29 [net-next PATCH v2 1/2] e1000: add initial XDP support John Fastabend
  2016-09-09 21:29 ` [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines John Fastabend
  2016-09-09 22:04 ` [net-next PATCH v2 1/2] e1000: add initial XDP support Eric Dumazet
@ 2016-09-21  4:26 ` zhuyj
  2016-09-21  4:30   ` John Fastabend
  2 siblings, 1 reply; 24+ messages in thread
From: zhuyj @ 2016-09-21  4:26 UTC (permalink / raw)
  To: John Fastabend
  Cc: bblanco, alexei.starovoitov, Jeff Kirsher, brouer,
	David S. Miller, Cong Wang, intel-wired-lan, u9012063, netdev

 +static int e1000_xdp_set(struct net_device *netdev, struct bpf_prog *prog)
+{
+       struct e1000_adapter *adapter = netdev_priv(netdev);
+       struct bpf_prog *old_prog;
+
+       old_prog = xchg(&adapter->prog, prog);
+       if (old_prog) {
+               synchronize_net();
+               bpf_prog_put(old_prog);
+       }
+
+       if (netif_running(netdev))
+               e1000_reinit_locked(adapter);
+       else
+               e1000_reset(adapter);
+       return 0;
+}

To this function, is it better to use "static void
e1000_xdp_set(struct net_device *netdev, struct bpf_prog *prog)"?
since it is always to return 0.


On Sat, Sep 10, 2016 at 5:29 AM, John Fastabend
<john.fastabend@gmail.com> wrote:
> From: Alexei Starovoitov <ast@fb.com>
>
> This patch adds initial support for XDP on e1000 driver. Note e1000
> driver does not support page recycling in general which could be
> added as a further improvement. However XDP_DROP case will recycle.
> XDP_TX and XDP_PASS do not support recycling yet.
>
> I tested this patch running e1000 in a VM using KVM over a tap
> device.
>
> CC: William Tu <u9012063@gmail.com>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---
>  drivers/net/ethernet/intel/e1000/e1000.h      |    2
>  drivers/net/ethernet/intel/e1000/e1000_main.c |  171 +++++++++++++++++++++++++
>  2 files changed, 170 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/e1000/e1000.h b/drivers/net/ethernet/intel/e1000/e1000.h
> index d7bdea7..5cf8a0a 100644
> --- a/drivers/net/ethernet/intel/e1000/e1000.h
> +++ b/drivers/net/ethernet/intel/e1000/e1000.h
> @@ -150,6 +150,7 @@ struct e1000_adapter;
>   */
>  struct e1000_tx_buffer {
>         struct sk_buff *skb;
> +       struct page *page;
>         dma_addr_t dma;
>         unsigned long time_stamp;
>         u16 length;
> @@ -279,6 +280,7 @@ struct e1000_adapter {
>                              struct e1000_rx_ring *rx_ring,
>                              int cleaned_count);
>         struct e1000_rx_ring *rx_ring;      /* One per active queue */
> +       struct bpf_prog *prog;
>         struct napi_struct napi;
>
>         int num_tx_queues;
> diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
> index f42129d..91d5c87 100644
> --- a/drivers/net/ethernet/intel/e1000/e1000_main.c
> +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
> @@ -32,6 +32,7 @@
>  #include <linux/prefetch.h>
>  #include <linux/bitops.h>
>  #include <linux/if_vlan.h>
> +#include <linux/bpf.h>
>
>  char e1000_driver_name[] = "e1000";
>  static char e1000_driver_string[] = "Intel(R) PRO/1000 Network Driver";
> @@ -842,6 +843,44 @@ static int e1000_set_features(struct net_device *netdev,
>         return 0;
>  }
>
> +static int e1000_xdp_set(struct net_device *netdev, struct bpf_prog *prog)
> +{
> +       struct e1000_adapter *adapter = netdev_priv(netdev);
> +       struct bpf_prog *old_prog;
> +
> +       old_prog = xchg(&adapter->prog, prog);
> +       if (old_prog) {
> +               synchronize_net();
> +               bpf_prog_put(old_prog);
> +       }
> +
> +       if (netif_running(netdev))
> +               e1000_reinit_locked(adapter);
> +       else
> +               e1000_reset(adapter);
> +       return 0;
> +}
> +
> +static bool e1000_xdp_attached(struct net_device *dev)
> +{
> +       struct e1000_adapter *priv = netdev_priv(dev);
> +
> +       return !!priv->prog;
> +}
> +
> +static int e1000_xdp(struct net_device *dev, struct netdev_xdp *xdp)
> +{
> +       switch (xdp->command) {
> +       case XDP_SETUP_PROG:
> +               return e1000_xdp_set(dev, xdp->prog);
> +       case XDP_QUERY_PROG:
> +               xdp->prog_attached = e1000_xdp_attached(dev);
> +               return 0;
> +       default:
> +               return -EINVAL;
> +       }
> +}
> +
>  static const struct net_device_ops e1000_netdev_ops = {
>         .ndo_open               = e1000_open,
>         .ndo_stop               = e1000_close,
> @@ -860,6 +899,7 @@ static const struct net_device_ops e1000_netdev_ops = {
>  #endif
>         .ndo_fix_features       = e1000_fix_features,
>         .ndo_set_features       = e1000_set_features,
> +       .ndo_xdp                = e1000_xdp,
>  };
>
>  /**
> @@ -1276,6 +1316,9 @@ static void e1000_remove(struct pci_dev *pdev)
>         e1000_down_and_stop(adapter);
>         e1000_release_manageability(adapter);
>
> +       if (adapter->prog)
> +               bpf_prog_put(adapter->prog);
> +
>         unregister_netdev(netdev);
>
>         e1000_phy_hw_reset(hw);
> @@ -1859,7 +1902,7 @@ static void e1000_configure_rx(struct e1000_adapter *adapter)
>         struct e1000_hw *hw = &adapter->hw;
>         u32 rdlen, rctl, rxcsum;
>
> -       if (adapter->netdev->mtu > ETH_DATA_LEN) {
> +       if (adapter->netdev->mtu > ETH_DATA_LEN || adapter->prog) {
>                 rdlen = adapter->rx_ring[0].count *
>                         sizeof(struct e1000_rx_desc);
>                 adapter->clean_rx = e1000_clean_jumbo_rx_irq;
> @@ -1973,6 +2016,11 @@ e1000_unmap_and_free_tx_resource(struct e1000_adapter *adapter,
>                 dev_kfree_skb_any(buffer_info->skb);
>                 buffer_info->skb = NULL;
>         }
> +       if (buffer_info->page) {
> +               put_page(buffer_info->page);
> +               buffer_info->page = NULL;
> +       }
> +
>         buffer_info->time_stamp = 0;
>         /* buffer_info must be completely set up in the transmit path */
>  }
> @@ -3298,6 +3346,63 @@ static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb,
>         return NETDEV_TX_OK;
>  }
>
> +static void e1000_tx_map_rxpage(struct e1000_tx_ring *tx_ring,
> +                               struct e1000_rx_buffer *rx_buffer_info,
> +                               unsigned int len)
> +{
> +       struct e1000_tx_buffer *buffer_info;
> +       unsigned int i = tx_ring->next_to_use;
> +
> +       buffer_info = &tx_ring->buffer_info[i];
> +
> +       buffer_info->length = len;
> +       buffer_info->time_stamp = jiffies;
> +       buffer_info->mapped_as_page = false;
> +       buffer_info->dma = rx_buffer_info->dma;
> +       buffer_info->next_to_watch = i;
> +       buffer_info->page = rx_buffer_info->rxbuf.page;
> +
> +       tx_ring->buffer_info[i].skb = NULL;
> +       tx_ring->buffer_info[i].segs = 1;
> +       tx_ring->buffer_info[i].bytecount = len;
> +       tx_ring->buffer_info[i].next_to_watch = i;
> +}
> +
> +static void e1000_xmit_raw_frame(struct e1000_rx_buffer *rx_buffer_info,
> +                                unsigned int len,
> +                                struct net_device *netdev,
> +                                struct e1000_adapter *adapter)
> +{
> +       struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
> +       struct e1000_hw *hw = &adapter->hw;
> +       struct e1000_tx_ring *tx_ring;
> +
> +       if (len > E1000_MAX_DATA_PER_TXD)
> +               return;
> +
> +       /* e1000 only support a single txq at the moment so the queue is being
> +        * shared with stack. To support this requires locking to ensure the
> +        * stack and XDP are not running at the same time. Devices with
> +        * multiple queues should allocate a separate queue space.
> +        */
> +       HARD_TX_LOCK(netdev, txq, smp_processor_id());
> +
> +       tx_ring = adapter->tx_ring;
> +
> +       if (E1000_DESC_UNUSED(tx_ring) < 2) {
> +               HARD_TX_UNLOCK(netdev, txq);
> +               return;
> +       }
> +
> +       e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
> +       e1000_tx_queue(adapter, tx_ring, 0/*tx_flags*/, 1);
> +
> +       writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
> +       mmiowb();
> +
> +       HARD_TX_UNLOCK(netdev, txq);
> +}
> +
>  #define NUM_REGS 38 /* 1 based count */
>  static void e1000_regdump(struct e1000_adapter *adapter)
>  {
> @@ -4142,6 +4247,19 @@ static struct sk_buff *e1000_alloc_rx_skb(struct e1000_adapter *adapter,
>         return skb;
>  }
>
> +static inline int e1000_call_bpf(struct bpf_prog *prog, void *data,
> +                                unsigned int length)
> +{
> +       struct xdp_buff xdp;
> +       int ret;
> +
> +       xdp.data = data;
> +       xdp.data_end = data + length;
> +       ret = BPF_PROG_RUN(prog, (void *)&xdp);
> +
> +       return ret;
> +}
> +
>  /**
>   * e1000_clean_jumbo_rx_irq - Send received data up the network stack; legacy
>   * @adapter: board private structure
> @@ -4160,12 +4278,15 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
>         struct pci_dev *pdev = adapter->pdev;
>         struct e1000_rx_desc *rx_desc, *next_rxd;
>         struct e1000_rx_buffer *buffer_info, *next_buffer;
> +       struct bpf_prog *prog;
>         u32 length;
>         unsigned int i;
>         int cleaned_count = 0;
>         bool cleaned = false;
>         unsigned int total_rx_bytes = 0, total_rx_packets = 0;
>
> +       rcu_read_lock(); /* rcu lock needed here to protect xdp programs */
> +       prog = READ_ONCE(adapter->prog);
>         i = rx_ring->next_to_clean;
>         rx_desc = E1000_RX_DESC(*rx_ring, i);
>         buffer_info = &rx_ring->buffer_info[i];
> @@ -4191,12 +4312,55 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
>
>                 cleaned = true;
>                 cleaned_count++;
> +               length = le16_to_cpu(rx_desc->length);
> +
> +               if (prog) {
> +                       struct page *p = buffer_info->rxbuf.page;
> +                       dma_addr_t dma = buffer_info->dma;
> +                       int act;
> +
> +                       if (unlikely(!(status & E1000_RXD_STAT_EOP))) {
> +                               /* attached bpf disallows larger than page
> +                                * packets, so this is hw error or corruption
> +                                */
> +                               pr_info_once("%s buggy !eop\n", netdev->name);
> +                               break;
> +                       }
> +                       if (unlikely(rx_ring->rx_skb_top)) {
> +                               pr_info_once("%s ring resizing bug\n",
> +                                            netdev->name);
> +                               break;
> +                       }
> +                       dma_sync_single_for_cpu(&pdev->dev, dma,
> +                                               length, DMA_FROM_DEVICE);
> +                       act = e1000_call_bpf(prog, page_address(p), length);
> +                       switch (act) {
> +                       case XDP_PASS:
> +                               break;
> +                       case XDP_TX:
> +                               dma_sync_single_for_device(&pdev->dev,
> +                                                          dma,
> +                                                          length,
> +                                                          DMA_TO_DEVICE);
> +                               e1000_xmit_raw_frame(buffer_info, length,
> +                                                    netdev, adapter);
> +                               buffer_info->rxbuf.page = NULL;
> +                       case XDP_DROP:
> +                       default:
> +                               /* re-use mapped page. keep buffer_info->dma
> +                                * as-is, so that e1000_alloc_jumbo_rx_buffers
> +                                * only needs to put it back into rx ring
> +                                */
> +                               total_rx_bytes += length;
> +                               total_rx_packets++;
> +                               goto next_desc;
> +                       }
> +               }
> +
>                 dma_unmap_page(&pdev->dev, buffer_info->dma,
>                                adapter->rx_buffer_len, DMA_FROM_DEVICE);
>                 buffer_info->dma = 0;
>
> -               length = le16_to_cpu(rx_desc->length);
> -
>                 /* errors is only valid for DD + EOP descriptors */
>                 if (unlikely((status & E1000_RXD_STAT_EOP) &&
>                     (rx_desc->errors & E1000_RXD_ERR_FRAME_ERR_MASK))) {
> @@ -4330,6 +4494,7 @@ next_desc:
>                 rx_desc = next_rxd;
>                 buffer_info = next_buffer;
>         }
> +       rcu_read_unlock();
>         rx_ring->next_to_clean = i;
>
>         cleaned_count = E1000_DESC_UNUSED(rx_ring);
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [net-next PATCH v2 1/2] e1000: add initial XDP support
  2016-09-21  4:26 ` zhuyj
@ 2016-09-21  4:30   ` John Fastabend
  0 siblings, 0 replies; 24+ messages in thread
From: John Fastabend @ 2016-09-21  4:30 UTC (permalink / raw)
  To: zhuyj
  Cc: bblanco, alexei.starovoitov, Jeff Kirsher, brouer,
	David S. Miller, Cong Wang, intel-wired-lan, u9012063, netdev

On 16-09-20 09:26 PM, zhuyj wrote:
>  +static int e1000_xdp_set(struct net_device *netdev, struct bpf_prog *prog)
> +{
> +       struct e1000_adapter *adapter = netdev_priv(netdev);
> +       struct bpf_prog *old_prog;
> +
> +       old_prog = xchg(&adapter->prog, prog);
> +       if (old_prog) {
> +               synchronize_net();
> +               bpf_prog_put(old_prog);
> +       }
> +
> +       if (netif_running(netdev))
> +               e1000_reinit_locked(adapter);
> +       else
> +               e1000_reset(adapter);
> +       return 0;
> +}
> 
> To this function, is it better to use "static void
> e1000_xdp_set(struct net_device *netdev, struct bpf_prog *prog)"?
> since it is always to return 0.
> 

In general try to avoid top posting.

Yes making it void would be reasonable and probably a good idea. I'll
do it in v3.

[...]

Thanks,
John

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2016-09-21  4:30 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2016-09-09 21:29 [net-next PATCH v2 1/2] e1000: add initial XDP support John Fastabend
2016-09-09 21:29 ` [net-next PATCH v2 2/2] e1000: bundle xdp xmit routines John Fastabend
2016-09-09 23:37   ` John Fastabend
2016-09-09 23:44   ` Tom Herbert
2016-09-10  0:01     ` John Fastabend
2016-09-10  1:04       ` Tom Herbert
2016-09-10  1:12         ` John Fastabend
2016-09-10  1:19           ` Tom Herbert
2016-09-10  1:40             ` Alexei Starovoitov
2016-09-10  3:12               ` Tom Herbert
2016-09-10  3:26                 ` John Fastabend
2016-09-10  4:13                   ` Tom Herbert
2016-09-12  3:15                     ` John Fastabend
2016-09-12  4:12                       ` Alexei Starovoitov
2016-09-10  3:56                 ` Alexei Starovoitov
2016-09-12 11:56             ` Jesper Dangaard Brouer
2016-09-10 15:36   ` Tom Herbert
2016-09-12  3:07     ` John Fastabend
2016-09-12 12:17   ` Jesper Dangaard Brouer
2016-09-12 18:11     ` John Fastabend
2016-09-09 22:04 ` [net-next PATCH v2 1/2] e1000: add initial XDP support Eric Dumazet
2016-09-09 23:33   ` John Fastabend
2016-09-21  4:26 ` zhuyj
2016-09-21  4:30   ` John Fastabend

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).