Netdev List
 help / color / mirror / Atom feed
* [net-next 08/12] ixgbe: introduce a helper to simplify code
From: Jeff Kirsher @ 2018-06-04 17:56 UTC (permalink / raw)
  To: davem; +Cc: YueHaibing, netdev, nhorman, sassmann, jogreene, Jeff Kirsher
In-Reply-To: <20180604175644.24293-1-jeffrey.t.kirsher@intel.com>

From: YueHaibing <yuehaibing@huawei.com>

ixgbe_dbg_reg_ops_read and ixgbe_dbg_netdev_ops_read contain the same
copy-pasted code, differing only in the buffer they print
(ixgbe_dbg_reg_ops_buf vs. ixgbe_dbg_netdev_ops_buf), so introduce a
helper, ixgbe_dbg_common_ops_read, to remove the redundant code.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 .../net/ethernet/intel/ixgbe/ixgbe_debugfs.c  | 57 +++++++------------
 1 file changed, 21 insertions(+), 36 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c
index 55fe8114fe99..50dfb02fa34c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c
@@ -10,15 +10,9 @@ static struct dentry *ixgbe_dbg_root;
 
 static char ixgbe_dbg_reg_ops_buf[256] = "";
 
-/**
- * ixgbe_dbg_reg_ops_read - read for reg_ops datum
- * @filp: the opened file
- * @buffer: where to write the data for the user to read
- * @count: the size of the user's buffer
- * @ppos: file position offset
- **/
-static ssize_t ixgbe_dbg_reg_ops_read(struct file *filp, char __user *buffer,
-				    size_t count, loff_t *ppos)
+static ssize_t ixgbe_dbg_common_ops_read(struct file *filp, char __user *buffer,
+					 size_t count, loff_t *ppos,
+					 char *dbg_buf)
 {
 	struct ixgbe_adapter *adapter = filp->private_data;
 	char *buf;
@@ -29,8 +23,7 @@ static ssize_t ixgbe_dbg_reg_ops_read(struct file *filp, char __user *buffer,
 		return 0;
 
 	buf = kasprintf(GFP_KERNEL, "%s: %s\n",
-			adapter->netdev->name,
-			ixgbe_dbg_reg_ops_buf);
+			adapter->netdev->name, dbg_buf);
 	if (!buf)
 		return -ENOMEM;
 
@@ -45,6 +38,20 @@ static ssize_t ixgbe_dbg_reg_ops_read(struct file *filp, char __user *buffer,
 	return len;
 }
 
+/**
+ * ixgbe_dbg_reg_ops_read - read for reg_ops datum
+ * @filp: the opened file
+ * @buffer: where to write the data for the user to read
+ * @count: the size of the user's buffer
+ * @ppos: file position offset
+ **/
+static ssize_t ixgbe_dbg_reg_ops_read(struct file *filp, char __user *buffer,
+				      size_t count, loff_t *ppos)
+{
+	return ixgbe_dbg_common_ops_read(filp, buffer, count, ppos,
+					 ixgbe_dbg_reg_ops_buf);
+}
+
 /**
  * ixgbe_dbg_reg_ops_write - write into reg_ops datum
  * @filp: the opened file
@@ -121,33 +128,11 @@ static char ixgbe_dbg_netdev_ops_buf[256] = "";
  * @count: the size of the user's buffer
  * @ppos: file position offset
  **/
-static ssize_t ixgbe_dbg_netdev_ops_read(struct file *filp,
-					 char __user *buffer,
+static ssize_t ixgbe_dbg_netdev_ops_read(struct file *filp, char __user *buffer,
 					 size_t count, loff_t *ppos)
 {
-	struct ixgbe_adapter *adapter = filp->private_data;
-	char *buf;
-	int len;
-
-	/* don't allow partial reads */
-	if (*ppos != 0)
-		return 0;
-
-	buf = kasprintf(GFP_KERNEL, "%s: %s\n",
-			adapter->netdev->name,
-			ixgbe_dbg_netdev_ops_buf);
-	if (!buf)
-		return -ENOMEM;
-
-	if (count < strlen(buf)) {
-		kfree(buf);
-		return -ENOSPC;
-	}
-
-	len = simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
-
-	kfree(buf);
-	return len;
+	return ixgbe_dbg_common_ops_read(filp, buffer, count, ppos,
+					 ixgbe_dbg_netdev_ops_buf);
 }
 
 /**
-- 
2.17.1

^ permalink raw reply related

* [net-next 07/12] ixgbevf: fix possible race in the reset subtask
From: Jeff Kirsher @ 2018-06-04 17:56 UTC (permalink / raw)
  To: davem; +Cc: Emil Tantilov, netdev, nhorman, sassmann, jogreene, Jeff Kirsher
In-Reply-To: <20180604175644.24293-1-jeffrey.t.kirsher@intel.com>

From: Emil Tantilov <emil.s.tantilov@intel.com>

Extend the RTNL lock in ixgbevf_reset_subtask() to protect the state bits
check in addition to the call to ixgbevf_reinit_locked().

This is to make sure that we get the most up-to-date values for the bits
and avoid a possible race when going down.

Suggested-by: Zhiping du <zhipingdu@tencent.com>
Signed-off-by: Emil Tantilov <emil.s.tantilov@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 2d5a706c3c29..59416eddd840 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -3141,15 +3141,17 @@ static void ixgbevf_reset_subtask(struct ixgbevf_adapter *adapter)
 	if (!test_and_clear_bit(__IXGBEVF_RESET_REQUESTED, &adapter->state))
 		return;
 
+	rtnl_lock();
 	/* If we're already down or resetting, just bail */
 	if (test_bit(__IXGBEVF_DOWN, &adapter->state) ||
 	    test_bit(__IXGBEVF_REMOVING, &adapter->state) ||
-	    test_bit(__IXGBEVF_RESETTING, &adapter->state))
+	    test_bit(__IXGBEVF_RESETTING, &adapter->state)) {
+		rtnl_unlock();
 		return;
+	}
 
 	adapter->tx_timeout_count++;
 
-	rtnl_lock();
 	ixgbevf_reinit_locked(adapter);
 	rtnl_unlock();
 }
-- 
2.17.1

^ permalink raw reply related

* [net-next 06/12] ixgbevf: Fix coexistence of malicious driver detection with XDP
From: Jeff Kirsher @ 2018-06-04 17:56 UTC (permalink / raw)
  To: davem; +Cc: Alexander Duyck, netdev, nhorman, sassmann, jogreene,
	Jeff Kirsher
In-Reply-To: <20180604175644.24293-1-jeffrey.t.kirsher@intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>

In the case of the VF driver it is supposed to provide a context descriptor
that allows us to provide information about the header offsets inside of
the frame. However in the case of XDP we don't really have any of that
information since the data is minimally processed. As a result we were
seeing malicious driver detection (MDD) events being triggered when the PF
had that functionality enabled.

To address this I have added a bit of new code that will "prime" the XDP
ring by providing one context descriptor that assumes the minimal setup of
an Ethernet frame which is an L2 header length of 14. With just that we can
provide enough information to make the hardware happy so that we don't
trigger MDD events.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h  |  1 +
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 36 +++++++++++++++----
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
index 70c75681495f..56a1031dcc07 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
@@ -76,6 +76,7 @@ enum ixgbevf_ring_state_t {
 	__IXGBEVF_TX_DETECT_HANG,
 	__IXGBEVF_HANG_CHECK_ARMED,
 	__IXGBEVF_TX_XDP_RING,
+	__IXGBEVF_TX_XDP_RING_PRIMED,
 };
 
 #define ring_is_xdp(ring) \
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 083041129539..2d5a706c3c29 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -991,24 +991,45 @@ static int ixgbevf_xmit_xdp_ring(struct ixgbevf_ring *ring,
 		return IXGBEVF_XDP_CONSUMED;
 
 	/* record the location of the first descriptor for this packet */
-	tx_buffer = &ring->tx_buffer_info[ring->next_to_use];
-	tx_buffer->bytecount = len;
-	tx_buffer->gso_segs = 1;
-	tx_buffer->protocol = 0;
-
 	i = ring->next_to_use;
-	tx_desc = IXGBEVF_TX_DESC(ring, i);
+	tx_buffer = &ring->tx_buffer_info[i];
 
 	dma_unmap_len_set(tx_buffer, len, len);
 	dma_unmap_addr_set(tx_buffer, dma, dma);
 	tx_buffer->data = xdp->data;
-	tx_desc->read.buffer_addr = cpu_to_le64(dma);
+	tx_buffer->bytecount = len;
+	tx_buffer->gso_segs = 1;
+	tx_buffer->protocol = 0;
+
+	/* Populate minimal context descriptor that will provide for the
+	 * fact that we are expected to process Ethernet frames.
+	 */
+	if (!test_bit(__IXGBEVF_TX_XDP_RING_PRIMED, &ring->state)) {
+		struct ixgbe_adv_tx_context_desc *context_desc;
+
+		set_bit(__IXGBEVF_TX_XDP_RING_PRIMED, &ring->state);
+
+		context_desc = IXGBEVF_TX_CTXTDESC(ring, 0);
+		context_desc->vlan_macip_lens	=
+			cpu_to_le32(ETH_HLEN << IXGBE_ADVTXD_MACLEN_SHIFT);
+		context_desc->seqnum_seed	= 0;
+		context_desc->type_tucmd_mlhl	=
+			cpu_to_le32(IXGBE_TXD_CMD_DEXT |
+				    IXGBE_ADVTXD_DTYP_CTXT);
+		context_desc->mss_l4len_idx	= 0;
+
+		i = 1;
+	}
 
 	/* put descriptor type bits */
 	cmd_type = IXGBE_ADVTXD_DTYP_DATA |
 		   IXGBE_ADVTXD_DCMD_DEXT |
 		   IXGBE_ADVTXD_DCMD_IFCS;
 	cmd_type |= len | IXGBE_TXD_CMD;
+
+	tx_desc = IXGBEVF_TX_DESC(ring, i);
+	tx_desc->read.buffer_addr = cpu_to_le64(dma);
+
 	tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type);
 	tx_desc->read.olinfo_status =
 			cpu_to_le32((len << IXGBE_ADVTXD_PAYLEN_SHIFT) |
@@ -1688,6 +1709,7 @@ static void ixgbevf_configure_tx_ring(struct ixgbevf_adapter *adapter,
 	       sizeof(struct ixgbevf_tx_buffer) * ring->count);
 
 	clear_bit(__IXGBEVF_HANG_CHECK_ARMED, &ring->state);
+	clear_bit(__IXGBEVF_TX_XDP_RING_PRIMED, &ring->state);
 
 	IXGBE_WRITE_REG(hw, IXGBE_VFTXDCTL(reg_idx), txdctl);
 
-- 
2.17.1

^ permalink raw reply related

* [net-next 09/12] bpf, i40e: add meta data support
From: Jeff Kirsher @ 2018-06-04 17:56 UTC (permalink / raw)
  To: davem; +Cc: Daniel Borkmann, netdev, nhorman, sassmann, jogreene,
	Jeff Kirsher
In-Reply-To: <20180604175644.24293-1-jeffrey.t.kirsher@intel.com>

From: Daniel Borkmann <daniel@iogearbox.net>

Add support for XDP meta data when using build skb variant of
the i40e driver. Implementation is analogous to the existing
ixgbe and ixgbevf support for meta data from 366a88fe2f40 ("bpf,
ixgbe: add meta data support") and be8333322eff ("ixgbevf: Add
support for meta data"). With the build skb variant we get
192 bytes of extra headroom which can be used for encaps or
meta data.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Tested-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c | 39 ++++++++++++++++-----
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 9b698c5acd05..105a26f447c0 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2032,6 +2032,21 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring,
 #if L1_CACHE_BYTES < 128
 	prefetch(xdp->data + L1_CACHE_BYTES);
 #endif
+	/* Note, we get here by enabling legacy-rx via:
+	 *
+	 *    ethtool --set-priv-flags <dev> legacy-rx on
+	 *
+	 * In this mode, we currently get 0 extra XDP headroom as
+	 * opposed to having legacy-rx off, where we process XDP
+	 * packets going to stack via i40e_build_skb(). The latter
+	 * provides us currently with 192 bytes of headroom.
+	 *
+	 * For i40e_construct_skb() mode it means that the
+	 * xdp->data_meta will always point to xdp->data, since
+	 * the helper cannot expand the head. Should this ever
+	 * change in future for legacy-rx mode on, then lets also
+	 * add xdp->data_meta handling here.
+	 */
 
 	/* allocate a skb to store the frags */
 	skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
@@ -2083,19 +2098,25 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring,
 				      struct i40e_rx_buffer *rx_buffer,
 				      struct xdp_buff *xdp)
 {
-	unsigned int size = xdp->data_end - xdp->data;
+	unsigned int metasize = xdp->data - xdp->data_meta;
 #if (PAGE_SIZE < 8192)
 	unsigned int truesize = i40e_rx_pg_size(rx_ring) / 2;
 #else
 	unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) +
-				SKB_DATA_ALIGN(I40E_SKB_PAD + size);
+				SKB_DATA_ALIGN(I40E_SKB_PAD +
+					       (xdp->data_end -
+						xdp->data_hard_start));
 #endif
 	struct sk_buff *skb;
 
-	/* prefetch first cache line of first page */
-	prefetch(xdp->data);
+	/* Prefetch first cache line of first page. If xdp->data_meta
+	 * is unused, this points exactly as xdp->data, otherwise we
+	 * likely have a consumer accessing first few bytes of meta
+	 * data, and then actual data.
+	 */
+	prefetch(xdp->data_meta);
 #if L1_CACHE_BYTES < 128
-	prefetch(xdp->data + L1_CACHE_BYTES);
+	prefetch(xdp->data_meta + L1_CACHE_BYTES);
 #endif
 	/* build an skb around the page buffer */
 	skb = build_skb(xdp->data_hard_start, truesize);
@@ -2103,8 +2124,10 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring,
 		return NULL;
 
 	/* update pointers within the skb to store the data */
-	skb_reserve(skb, I40E_SKB_PAD);
-	__skb_put(skb, size);
+	skb_reserve(skb, I40E_SKB_PAD + (xdp->data - xdp->data_hard_start));
+	__skb_put(skb, xdp->data_end - xdp->data);
+	if (metasize)
+		skb_metadata_set(skb, metasize);
 
 	/* buffer is used by skb, update page_offset */
 #if (PAGE_SIZE < 8192)
@@ -2341,7 +2364,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
 		if (!skb) {
 			xdp.data = page_address(rx_buffer->page) +
 				   rx_buffer->page_offset;
-			xdp_set_data_meta_invalid(&xdp);
+			xdp.data_meta = xdp.data;
 			xdp.data_hard_start = xdp.data -
 					      i40e_rx_offset(rx_ring);
 			xdp.data_end = xdp.data + size;
-- 
2.17.1

^ permalink raw reply related

* [net-next 05/12] igb: Wait 10ms just once after TX queues reset
From: Jeff Kirsher @ 2018-06-04 17:56 UTC (permalink / raw)
  To: davem; +Cc: Sergey Nemov, netdev, nhorman, sassmann, jogreene, Jeff Kirsher
In-Reply-To: <20180604175644.24293-1-jeffrey.t.kirsher@intel.com>

From: Sergey Nemov <sergey.nemov@intel.com>

Move the 10ms sleep out of the function that resets a TX queue.
Reset all the TX queues in one pass and
wait for all of them just once.

Use usleep_range() instead of mdelay() in order not to
affect transmission on other interfaces.

Signed-off-by: Sergey Nemov <sergey.nemov@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 20b728218d20..c33821d2afb3 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -4055,11 +4055,6 @@ void igb_configure_tx_ring(struct igb_adapter *adapter,
 	u64 tdba = ring->dma;
 	int reg_idx = ring->reg_idx;
 
-	/* disable the queue */
-	wr32(E1000_TXDCTL(reg_idx), 0);
-	wrfl();
-	mdelay(10);
-
 	wr32(E1000_TDLEN(reg_idx),
 	     ring->count * sizeof(union e1000_adv_tx_desc));
 	wr32(E1000_TDBAL(reg_idx),
@@ -4090,8 +4085,16 @@ void igb_configure_tx_ring(struct igb_adapter *adapter,
  **/
 static void igb_configure_tx(struct igb_adapter *adapter)
 {
+	struct e1000_hw *hw = &adapter->hw;
 	int i;
 
+	/* disable the queues */
+	for (i = 0; i < adapter->num_tx_queues; i++)
+		wr32(E1000_TXDCTL(adapter->tx_ring[i]->reg_idx), 0);
+
+	wrfl();
+	usleep_range(10000, 20000);
+
 	for (i = 0; i < adapter->num_tx_queues; i++)
 		igb_configure_tx_ring(adapter, adapter->tx_ring[i]);
 }
-- 
2.17.1

^ permalink raw reply related

* [net-next 04/12] igb: Clear TSICR interrupts together with ICR
From: Jeff Kirsher @ 2018-06-04 17:56 UTC (permalink / raw)
  To: davem; +Cc: Joanna Yurdal, netdev, nhorman, sassmann, jogreene, Jeff Kirsher
In-Reply-To: <20180604175644.24293-1-jeffrey.t.kirsher@intel.com>

From: Joanna Yurdal <jyu@trackman.com>

Issuing "ip link set up/down" can block TSICR interrupts, which results in
missing PTP Tx timestamps and no PPS pulse generation.

The problem happens when the link is set up while TSICR interrupts are
pending. ICR is cleared before enabling interrupts, but TSICR is not. If all
TSICR interrupts are pending at that moment, the time_sync interrupt will
never be generated. TSICR should be cleared as well.

In order to reproduce the issue:
1. Set up Linux with IEEE 1588 grandmaster and PPS output enabled
2. Repeatedly set the link up/down with random intervals between commands
3. Wait until PPS is no longer generated (only one pulse is generated and
PPS dies), and ptp4l complains constantly about Tx timeout.

Signed-off-by: Joanna Yurdal <jyu@trackman.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 78574c06635b..20b728218d20 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -2058,6 +2058,7 @@ int igb_up(struct igb_adapter *adapter)
 		igb_assign_vector(adapter->q_vector[0], 0);
 
 	/* Clear any pending interrupts. */
+	rd32(E1000_TSICR);
 	rd32(E1000_ICR);
 	igb_irq_enable(adapter);
 
@@ -3865,6 +3866,7 @@ static int __igb_open(struct net_device *netdev, bool resuming)
 		napi_enable(&(adapter->q_vector[i]->napi));
 
 	/* Clear any pending interrupts. */
+	rd32(E1000_TSICR);
 	rd32(E1000_ICR);
 
 	igb_irq_enable(adapter);
-- 
2.17.1

^ permalink raw reply related

* [net-next 03/12] Documentation: e1000: Update kernel documentation
From: Jeff Kirsher @ 2018-06-04 17:56 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, nhorman, sassmann, jogreene
In-Reply-To: <20180604175644.24293-1-jeffrey.t.kirsher@intel.com>

Updated the e1000.txt kernel documentation with the latest information.

Also convert the text file to reStructuredText (RST) format, since the
Linux kernel documentation now uses this format for documentation.

Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
---
 .../networking/{e1000.txt => e1000.rst}       | 59 ++++---------------
 Documentation/networking/index.rst            |  1 +
 MAINTAINERS                                   |  2 +-
 3 files changed, 12 insertions(+), 50 deletions(-)
 rename Documentation/networking/{e1000.txt => e1000.rst} (89%)

diff --git a/Documentation/networking/e1000.txt b/Documentation/networking/e1000.rst
similarity index 89%
rename from Documentation/networking/e1000.txt
rename to Documentation/networking/e1000.rst
index 1f6ed848363d..616848940e63 100644
--- a/Documentation/networking/e1000.txt
+++ b/Documentation/networking/e1000.rst
@@ -154,7 +154,7 @@ NOTE:  When e1000 is loaded with default settings and multiple adapters
        are in use simultaneously, the CPU utilization may increase non-
        linearly.  In order to limit the CPU utilization without impacting
        the overall throughput, we recommend that you load the driver as
-       follows:
+       follows::
 
            modprobe e1000 InterruptThrottleRate=3000,3000,3000
 
@@ -167,8 +167,8 @@ NOTE:  When e1000 is loaded with default settings and multiple adapters
 
 RxDescriptors
 -------------
-Valid Range:   80-256 for 82542 and 82543-based adapters
-               80-4096 for all other supported adapters
+Valid Range:   48-256 for 82542 and 82543-based adapters
+               48-4096 for all other supported adapters
 Default Value: 256
 
 This value specifies the number of receive buffer descriptors allocated
@@ -230,8 +230,8 @@ speed.  Duplex should also be set when Speed is set to either 10 or 100.
 
 TxDescriptors
 -------------
-Valid Range:   80-256 for 82542 and 82543-based adapters
-               80-4096 for all other supported adapters
+Valid Range:   48-256 for 82542 and 82543-based adapters
+               48-4096 for all other supported adapters
 Default Value: 256
 
 This value is the number of transmit descriptors allocated by the driver.
@@ -242,41 +242,10 @@ NOTE:  Depending on the available system resources, the request for a
        higher number of transmit descriptors may be denied.  In this case,
        use a lower number.
 
-TxDescriptorStep
-----------------
-Valid Range:    1 (use every Tx Descriptor)
-                4 (use every 4th Tx Descriptor)
-
-Default Value:  1 (use every Tx Descriptor)
-
-On certain non-Intel architectures, it has been observed that intense TX
-traffic bursts of short packets may result in an improper descriptor
-writeback. If this occurs, the driver will report a "TX Timeout" and reset
-the adapter, after which the transmit flow will restart, though data may
-have stalled for as much as 10 seconds before it resumes.
-
-The improper writeback does not occur on the first descriptor in a system
-memory cache-line, which is typically 32 bytes, or 4 descriptors long.
-
-Setting TxDescriptorStep to a value of 4 will ensure that all TX descriptors
-are aligned to the start of a system memory cache line, and so this problem
-will not occur.
-
-NOTES: Setting TxDescriptorStep to 4 effectively reduces the number of
-       TxDescriptors available for transmits to 1/4 of the normal allocation.
-       This has a possible negative performance impact, which may be
-       compensated for by allocating more descriptors using the TxDescriptors
-       module parameter.
-
-       There are other conditions which may result in "TX Timeout", which will
-       not be resolved by the use of the TxDescriptorStep parameter. As the
-       issue addressed by this parameter has never been observed on Intel
-       Architecture platforms, it should not be used on Intel platforms.
-
 TxIntDelay
 ----------
 Valid Range:   0-65535 (0=off)
-Default Value: 64
+Default Value: 8
 
 This value delays the generation of transmit interrupts in units of
 1.024 microseconds.  Transmit interrupt reduction can improve CPU
@@ -288,7 +257,7 @@ TxAbsIntDelay
 -------------
 (This parameter is supported only on 82540, 82545 and later adapters.)
 Valid Range:   0-65535 (0=off)
-Default Value: 64
+Default Value: 32
 
 This value, in units of 1.024 microseconds, limits the delay in which a
 transmit interrupt is generated.  Useful only if TxIntDelay is non-zero,
@@ -310,7 +279,7 @@ Copybreak
 ---------
 Valid Range:   0-xxxxxxx (0=off)
 Default Value: 256
-Usage: insmod e1000.ko copybreak=128
+Usage: modprobe e1000 copybreak=128
 
 Driver copies all packets below or equaling this size to a fresh RX
 buffer before handing it up the stack.
@@ -328,14 +297,6 @@ Default Value:  0 (disabled)
 Allows PHY to turn off in lower power states. The user can turn off
 this parameter in supported chipsets.
 
-KumeranLockLoss
----------------
-Valid Range: 0-1
-Default Value: 1 (enabled)
-
-This workaround skips resetting the PHY at shutdown for the initial
-silicon releases of ICH8 systems.
-
 Speed and Duplex Configuration
 ==============================
 
@@ -397,12 +358,12 @@ Additional Configurations
   ------------
   Jumbo Frames support is enabled by changing the MTU to a value larger than
   the default of 1500.  Use the ifconfig command to increase the MTU size.
-  For example:
+  For example::
 
        ifconfig eth<x> mtu 9000 up
 
   This setting is not saved across reboots.  It can be made permanent if
-  you add:
+  you add::
 
        MTU=9000
 
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index d11a62977edd..fec8588a588e 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -11,6 +11,7 @@ Contents:
    can
    dpaa2/index
    e100
+   e1000
    kapi
    z8530book
    msg_zerocopy
diff --git a/MAINTAINERS b/MAINTAINERS
index d68981ca9896..32472fbf4d6e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7090,7 +7090,7 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net-queue.git
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue.git
 S:	Supported
 F:	Documentation/networking/e100.rst
-F:	Documentation/networking/e1000.txt
+F:	Documentation/networking/e1000.rst
 F:	Documentation/networking/e1000e.txt
 F:	Documentation/networking/igb.txt
 F:	Documentation/networking/igbvf.txt
-- 
2.17.1

^ permalink raw reply related

* [net-next 01/12] e1000e: Ignore TSYNCRXCTL when getting I219 clock attributes
From: Jeff Kirsher @ 2018-06-04 17:56 UTC (permalink / raw)
  To: davem; +Cc: Benjamin Poirier, netdev, nhorman, sassmann, jogreene,
	Jeff Kirsher
In-Reply-To: <20180604175644.24293-1-jeffrey.t.kirsher@intel.com>

From: Benjamin Poirier <bpoirier@suse.com>

There have been multiple reports of crashes that look like
kernel: RIP: 0010:[<ffffffff8110303f>] timecounter_read+0xf/0x50
[...]
kernel: Call Trace:
kernel:  [<ffffffffa0806b0f>] e1000e_phc_gettime+0x2f/0x60 [e1000e]
kernel:  [<ffffffffa0806c5d>] e1000e_systim_overflow_work+0x1d/0x80 [e1000e]
kernel:  [<ffffffff810992c5>] process_one_work+0x155/0x440
kernel:  [<ffffffff81099e16>] worker_thread+0x116/0x4b0
kernel:  [<ffffffff8109f422>] kthread+0xd2/0xf0
kernel:  [<ffffffff8163184f>] ret_from_fork+0x3f/0x70

These can be traced back to the fact that e1000e_systim_reset() skips the
timecounter_init() call if e1000e_get_base_timinca() returns -EINVAL, which
leads to a null deref in timecounter_read().

Commit 83129b37ef35 ("e1000e: fix systim issues", v4.2-rc1) reworked
e1000e_get_base_timinca() in such a way that it can return -EINVAL for
e1000_pch_spt if the SYSCFI bit is not set in TSYNCRXCTL.

Some experimentation has shown that on I219 (e1000_pch_spt, "MAC: 12")
adapters, the E1000_TSYNCRXCTL_SYSCFI flag is unstable; TSYNCRXCTL reads
sometimes don't have the SYSCFI bit set. Retrying the read shortly after
finds the bit to be set. This was observed at boot (probe) but also link up
and link down.

Moreover, the phc (PTP Hardware Clock) seems to operate normally even after
reads where SYSCFI=0. Therefore, remove this register read and
unconditionally set the clock parameters.

Reported-by: Achim Mildenberger <admin@fph.physik.uni-karlsruhe.de>
Message-Id: <20180425065243.g5mqewg5irkwgwgv@f2>
Bugzilla: https://bugzilla.suse.com/show_bug.cgi?id=1075876
Fixes: 83129b37ef35 ("e1000e: fix systim issues")
Signed-off-by: Benjamin Poirier <bpoirier@suse.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000e/netdev.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index d3fef7fefea8..acf1e8b52b8e 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -3527,15 +3527,12 @@ s32 e1000e_get_base_timinca(struct e1000_adapter *adapter, u32 *timinca)
 		}
 		break;
 	case e1000_pch_spt:
-		if (er32(TSYNCRXCTL) & E1000_TSYNCRXCTL_SYSCFI) {
-			/* Stable 24MHz frequency */
-			incperiod = INCPERIOD_24MHZ;
-			incvalue = INCVALUE_24MHZ;
-			shift = INCVALUE_SHIFT_24MHZ;
-			adapter->cc.shift = shift;
-			break;
-		}
-		return -EINVAL;
+		/* Stable 24MHz frequency */
+		incperiod = INCPERIOD_24MHZ;
+		incvalue = INCVALUE_24MHZ;
+		shift = INCVALUE_SHIFT_24MHZ;
+		adapter->cc.shift = shift;
+		break;
 	case e1000_pch_cnp:
 		if (er32(TSYNCRXCTL) & E1000_TSYNCRXCTL_SYSCFI) {
 			/* Stable 24MHz frequency */
-- 
2.17.1

^ permalink raw reply related

* [net-next 02/12] Documentation: e100: Update the Intel 10/100 driver doc
From: Jeff Kirsher @ 2018-06-04 17:56 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, nhorman, sassmann, jogreene
In-Reply-To: <20180604175644.24293-1-jeffrey.t.kirsher@intel.com>

Over the years, several of the links have changed or are no longer valid
so update them.  In addition, the default values were incorrect for a
couple of parameters.

Converted the text file to the reStructuredText (RST) format, since the
Linux kernel documentation now uses this format for documentation.

Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
---
 .../networking/{e100.txt => e100.rst}         | 60 +++++++++----------
 Documentation/networking/index.rst            |  1 +
 MAINTAINERS                                   |  2 +-
 3 files changed, 29 insertions(+), 34 deletions(-)
 rename Documentation/networking/{e100.txt => e100.rst} (79%)

diff --git a/Documentation/networking/e100.txt b/Documentation/networking/e100.rst
similarity index 79%
rename from Documentation/networking/e100.txt
rename to Documentation/networking/e100.rst
index 54810b82c01a..d4d837027925 100644
--- a/Documentation/networking/e100.txt
+++ b/Documentation/networking/e100.rst
@@ -1,7 +1,7 @@
 Linux* Base Driver for the Intel(R) PRO/100 Family of Adapters
 ==============================================================
 
-March 15, 2011
+June 1, 2018
 
 Contents
 ========
@@ -36,16 +36,9 @@ Channel Bonding documentation can be found in the Linux kernel source:
 Identifying Your Adapter
 ========================
 
-For more information on how to identify your adapter, go to the Adapter &
-Driver ID Guide at:
-
-  http://support.intel.com/support/network/adapter/pro100/21397.htm
-
-For the latest Intel network drivers for Linux, refer to the following
-website. In the search field, enter your adapter name or type, or use the
-networking link on the left to search for your adapter:
-
-  http://downloadfinder.intel.com/scripts-df/support_intel.asp
+For information on how to identify your adapter, and for the latest Intel
+network drivers, refer to the Intel Support website:
+http://www.intel.com/support
 
 Driver Configuration Parameters
 ===============================
@@ -57,22 +50,26 @@ Rx Descriptors: Number of receive descriptors. A receive descriptor is a data
    structure that describes a receive buffer and its attributes to the network
    controller. The data in the descriptor is used by the controller to write
    data from the controller to host memory. In the 3.x.x driver the valid range
-   for this parameter is 64-256. The default value is 64. This parameter can be
-   changed using the command:
+   for this parameter is 64-256. The default value is 256. This parameter can be
+   changed using the command::
 
-   ethtool -G eth? rx n, where n is the number of desired rx descriptors.
+   ethtool -G eth? rx n
+
+   Where n is the number of desired Rx descriptors.
 
 Tx Descriptors: Number of transmit descriptors. A transmit descriptor is a data
    structure that describes a transmit buffer and its attributes to the network
    controller. The data in the descriptor is used by the controller to read
    data from the host memory to the controller. In the 3.x.x driver the valid
-   range for this parameter is 64-256. The default value is 64. This parameter
-   can be changed using the command:
+   range for this parameter is 64-256. The default value is 128. This parameter
+   can be changed using the command::
+
+   ethtool -G eth? tx n
 
-   ethtool -G eth? tx n, where n is the number of desired tx descriptors.
+   Where n is the number of desired Tx descriptors.
 
 Speed/Duplex: The driver auto-negotiates the link speed and duplex settings by
-   default. The ethtool utility can be used as follows to force speed/duplex.
+   default. The ethtool utility can be used as follows to force speed/duplex::
 
    ethtool -s eth?  autoneg off speed {10|100} duplex {full|half}
 
@@ -81,7 +78,7 @@ Speed/Duplex: The driver auto-negotiates the link speed and duplex settings by
 
 Event Log Message Level:  The driver uses the message level flag to log events
    to syslog. The message level can be set at driver load time. It can also be
-   set using the command:
+   set using the command::
 
    ethtool -s eth? msglvl n
 
@@ -112,9 +109,9 @@ Additional Configurations
   ---------------------
   In order to see link messages and other Intel driver information on your
   console, you must set the dmesg level up to six. This can be done by
-  entering the following on the command line before loading the e100 driver:
+  entering the following on the command line before loading the e100 driver::
 
-       dmesg -n 8
+       dmesg -n 6
 
   If you wish to see all messages issued by the driver, including debug
   messages, set the dmesg level to eight.
@@ -146,7 +143,8 @@ Additional Configurations
 
   NAPI (Rx polling mode) is supported in the e100 driver.
 
-  See www.cyberus.ca/~hadi/usenix-paper.tgz for more information on NAPI.
+  See https://wiki.linuxfoundation.org/networking/napi for more information
+  on NAPI.
 
   Multiple Interfaces on Same Ethernet Broadcast Network
   ------------------------------------------------------
@@ -160,7 +158,7 @@ Additional Configurations
   If you have multiple interfaces in a server, either turn on ARP
   filtering by
 
-  (1) entering: echo 1 > /proc/sys/net/ipv4/conf/all/arp_filter
+  (1) entering:: echo 1 > /proc/sys/net/ipv4/conf/all/arp_filter
       (this only works if your kernel's version is higher than 2.4.5), or
 
   (2) installing the interfaces in separate broadcast domains (either
@@ -169,15 +167,11 @@ Additional Configurations
 
 Support
 =======
-
 For general information, go to the Intel support website at:
+http://www.intel.com/support/
 
-    http://support.intel.com
-
-    or the Intel Wired Networking project hosted by Sourceforge at:
-
-    http://sourceforge.net/projects/e1000
-
-If an issue is identified with the released source code on the supported
-kernel with a supported adapter, email the specific information related to the
-issue to e1000-devel@lists.sourceforge.net.
+or the Intel Wired Networking project hosted by Sourceforge at:
+http://sourceforge.net/projects/e1000
+If an issue is identified with the released source code on a supported kernel
+with a supported adapter, email the specific information related to the issue
+to e1000-devel@lists.sf.net.
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index cbd9bdd4a79e..d11a62977edd 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -10,6 +10,7 @@ Contents:
    batman-adv
    can
    dpaa2/index
+   e100
    kapi
    z8530book
    msg_zerocopy
diff --git a/MAINTAINERS b/MAINTAINERS
index 0ae0dbf0e15e..d68981ca9896 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7089,7 +7089,7 @@ Q:	http://patchwork.ozlabs.org/project/intel-wired-lan/list/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net-queue.git
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue.git
 S:	Supported
-F:	Documentation/networking/e100.txt
+F:	Documentation/networking/e100.rst
 F:	Documentation/networking/e1000.txt
 F:	Documentation/networking/e1000e.txt
 F:	Documentation/networking/igb.txt
-- 
2.17.1

^ permalink raw reply related

* [net-next 00/12][pull request] Intel Wired LAN Driver Updates 2018-06-04
From: Jeff Kirsher @ 2018-06-04 17:56 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, nhorman, sassmann, jogreene

This series contains a smorgasbord of updates to documentation, e1000e,
igb, ixgbe, ixgbevf and i40e.

Benjamin Poirier fixes a potential kernel crash due to NULL pointer
dereference in e1000e.

Jeff updates the kernel documentation for e100 and e1000 to correct
default values and URLs which were incorrect in the documentation.  Also
took the time to update these to the new reStructured text format for
kernel documentation.

Joanna Yurdal fixes a missing PTP transmit timestamp by ensuring that
TSICR gets cleared when ICR is cleared.

Sergey updates igb to reset all the transmit queues at one time so that
we only have to wait once for all the queues to be reset.

Alex fixes ixgbevf so that malicious driver detection (MDD) can co-exist
with XDP.

Emil and Tony extend the RTNL lock to ensure we get the most up-to-date
values for the bits and avoid a possible race condition when going down.

YueHaibing from Huawei introduces a helper function in ixgbe for
operation reads to simplify the code a bit more.

Daniel Borkmann adds support for XDP meta data when using build SKB
for i40e.

Shannon Nelson provides two fixes for the IPSec code in ixgbe, first is
to make sure we do not try to offload the decryption of any incoming
packet that is destined for the management engine.  The other fix is to
resolve a cast problem introduced by a sparse cleanup patch.

The following are changes since commit 8284fd4cb85577eecca024fe1e7a35b39ed0f3f5:
  Merge branch 'selftests-net-various'
and are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue 10GbE

Alexander Duyck (1):
  ixgbevf: Fix coexistence of malicious driver detection with XDP

Benjamin Poirier (1):
  e1000e: Ignore TSYNCRXCTL when getting I219 clock attributes

Daniel Borkmann (1):
  bpf, i40e: add meta data support

Emil Tantilov (1):
  ixgbevf: fix possible race in the reset subtask

Jeff Kirsher (2):
  Documentation: e100: Update the Intel 10/100 driver doc
  Documentation: e1000: Update kernel documentation

Joanna Yurdal (1):
  igb: Clear TSICR interrupts together with ICR

Sergey Nemov (1):
  igb: Wait 10ms just once after TX queues reset

Shannon Nelson (2):
  ixgbe: check ipsec ip addr against mgmt filters
  ixgbe: fix broken ipsec Rx with proper cast on spi

Tony Nguyen (1):
  ixgbe: fix possible race in reset subtask

YueHaibing (1):
  ixgbe: introduce a helper to simplify code

 .../networking/{e100.txt => e100.rst}         | 60 ++++++-------
 .../networking/{e1000.txt => e1000.rst}       | 59 +++---------
 Documentation/networking/index.rst            |  2 +
 MAINTAINERS                                   |  4 +-
 drivers/net/ethernet/intel/e1000e/netdev.c    | 15 ++--
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 39 ++++++--
 drivers/net/ethernet/intel/igb/igb_main.c     | 15 ++--
 .../net/ethernet/intel/ixgbe/ixgbe_debugfs.c  | 57 +++++-------
 .../net/ethernet/intel/ixgbe/ixgbe_ipsec.c    | 90 ++++++++++++++++++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  6 +-
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h  |  1 +
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 42 +++++++--
 12 files changed, 236 insertions(+), 154 deletions(-)
 rename Documentation/networking/{e100.txt => e100.rst} (79%)
 rename Documentation/networking/{e1000.txt => e1000.rst} (89%)

-- 
2.17.1

^ permalink raw reply

* Re: [PATCH net-next V2 2/2] cls_flower: Fix comparing of old filter mask with new filter
From: Simon Horman @ 2018-06-04 17:52 UTC (permalink / raw)
  To: Paul Blakey
  Cc: Jiri Pirko, Cong Wang, Jamal Hadi Salim, David Miller, netdev,
	Yevgeny Kliteynik, Roi Dayan, Shahar Klein, Mark Bloch,
	Or Gerlitz
In-Reply-To: <1528009574-63306-2-git-send-email-paulb@mellanox.com>

On Sun, Jun 03, 2018 at 10:06:14AM +0300, Paul Blakey wrote:
> We incorrectly compare the mask and the result is that we can't modify
> an already existing rule.
> 
> Fix that by comparing correctly.
> 
> Fixes: 05cd271fd61a ("cls_flower: Support multiple masks per priority")
> Reported-by: Vlad Buslov <vladbu@mellanox.com>
> Reviewed-by: Roi Dayan <roid@mellanox.com>
> Reviewed-by: Jiri Pirko <jiri@mellanox.com>
> Signed-off-by: Paul Blakey <paulb@mellanox.com>

Reviewed-by: Simon Horman <simon.horman@netronome.com>

^ permalink raw reply

* Re: [PATCH net-next V2 1/2] cls_flower: Fix missing free of rhashtable
From: Simon Horman @ 2018-06-04 17:51 UTC (permalink / raw)
  To: Paul Blakey
  Cc: Jiri Pirko, Cong Wang, Jamal Hadi Salim, David Miller, netdev,
	Yevgeny Kliteynik, Roi Dayan, Shahar Klein, Mark Bloch,
	Or Gerlitz
In-Reply-To: <1528009574-63306-1-git-send-email-paulb@mellanox.com>

On Sun, Jun 03, 2018 at 10:06:13AM +0300, Paul Blakey wrote:
> When destroying the instance, destroy the head rhashtable.
> 
> Fixes: 05cd271fd61a ("cls_flower: Support multiple masks per priority")
> Reported-by: Vlad Buslov <vladbu@mellanox.com>
> Reviewed-by: Roi Dayan <roid@mellanox.com>
> Reviewed-by: Jiri Pirko <jiri@mellanox.com>
> Signed-off-by: Paul Blakey <paulb@mellanox.com>

Reviewed-by: Simon Horman <simon.horman@netronome.com>

^ permalink raw reply

* Re: [PATCH v5 2/3] media: rc: introduce BPF_PROG_LIRC_MODE2
From: Matthias Reichl @ 2018-06-04 17:47 UTC (permalink / raw)
  To: Sean Young
  Cc: linux-media, linux-kernel, Alexei Starovoitov,
	Mauro Carvalho Chehab, Daniel Borkmann, netdev, Devin Heitmueller,
	Y Song, Quentin Monnet
In-Reply-To: <9f2c54d4956f962f44fcda739a824397ddea132c.1527419762.git.sean@mess.org>

Hi Sean,

I finally found the time to test your patch series and noticed
2 issues - comments are inline

On Sun, May 27, 2018 at 12:24:09PM +0100, Sean Young wrote:
> diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig
> index eb2c3b6eca7f..d5b35a6ba899 100644
> --- a/drivers/media/rc/Kconfig
> +++ b/drivers/media/rc/Kconfig
> @@ -25,6 +25,19 @@ config LIRC
>  	   passes raw IR to and from userspace, which is needed for
>  	   IR transmitting (aka "blasting") and for the lirc daemon.
>  
> +config BPF_LIRC_MODE2
> +	bool "Support for eBPF programs attached to lirc devices"
> +	depends on BPF_SYSCALL
> +	depends on RC_CORE=y

Requiring rc-core to be built into the kernel could become
problematic in the future for people using media_build.

Currently the whole media tree (including rc-core) can be built
as modules so DVB and IR drivers can be replaced by newer versions.
But with rc-core in the kernel things could easily break if internal
data structures are changed.

Maybe we should add a small layer with a stable API/ABI between
bpf-lirc and rc-core to decouple them? Or would it be possible
to build rc-core with bpf support as a module?

> +	depends on LIRC
> +	help
> +	   Allow attaching eBPF programs to a lirc device using the bpf(2)
> +	   syscall command BPF_PROG_ATTACH. This is supported for raw IR
> +	   receivers.
> +
> +	   These eBPF programs can be used to decode IR into scancodes, for
> +	   IR protocols not supported by the kernel decoders.
> +
>  menuconfig RC_DECODERS
>  	bool "Remote controller decoders"
>  	depends on RC_CORE
> [...]
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 388d4feda348..3c104113d040 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -11,6 +11,7 @@
>   */
>  #include <linux/bpf.h>
>  #include <linux/bpf_trace.h>
> +#include <linux/bpf_lirc.h>
>  #include <linux/btf.h>
>  #include <linux/syscalls.h>
>  #include <linux/slab.h>
> @@ -1578,6 +1579,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
>  	case BPF_SK_SKB_STREAM_PARSER:
>  	case BPF_SK_SKB_STREAM_VERDICT:
>  		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true);
> +	case BPF_LIRC_MODE2:
> +		return lirc_prog_attach(attr);
>  	default:
>  		return -EINVAL;
>  	}
> @@ -1648,6 +1651,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
>  	case BPF_SK_SKB_STREAM_PARSER:
>  	case BPF_SK_SKB_STREAM_VERDICT:
>  		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false);
> +	case BPF_LIRC_MODE2:
> +		return lirc_prog_detach(attr);
>  	default:
>  		return -EINVAL;
>  	}
> @@ -1695,6 +1700,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
>  	case BPF_CGROUP_SOCK_OPS:
>  	case BPF_CGROUP_DEVICE:
>  		break;
> +	case BPF_LIRC_MODE2:
> +		return lirc_prog_query(attr, uattr);

When testing this patch series I was wondering why I always got
-EINVAL when trying to query the registered programs.

Closer inspection revealed that bpf_prog_attach/detach/query and
calls to them in the bpf syscall are in "#ifdef CONFIG_CGROUP_BPF"
blocks - and as I built the kernel without CONFIG_CGROUP_BPF
BPF_PROG_ATTACH/DETACH/QUERY weren't handled in the syscall switch
and I got -EINVAL from the bpf syscall function.

I haven't checked in detail yet, but it looks to me like
bpf_prog_attach/detach/query could always be built (or when
either cgroup bpf or lirc bpf are enabled) and the #ifdefs moved
inside the switch(). So lirc bpf could be used without cgroup bpf.
Or am I missing something?

so long,

Hias
>  	default:
>  		return -EINVAL;
>  	}
> -- 
> 2.17.0
> 

^ permalink raw reply

* Re: [PATCH] net: do not allow changing SO_REUSEADDR/SO_REUSEPORT on bound sockets
From: Eric Dumazet @ 2018-06-04 17:24 UTC (permalink / raw)
  To: Maciej Żenczykowski, Maciej Żenczykowski,
	David S . Miller
  Cc: Eric Dumazet, netdev
In-Reply-To: <20180603174705.51802-1-zenczykowski@gmail.com>



On 06/03/2018 10:47 AM, Maciej Żenczykowski wrote:
> From: Maciej Żenczykowski <maze@google.com>
> 
> It is not safe to do so because such sockets are already in the
> hash tables and changing these options can result in invalidating
> the tb->fastreuse(port) caching.
> 

Reviewed-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply

* Re: [RFC PATCH 0/6] net: ethernet: ti: cpsw: add MQPRIO and CBS Qdisc offload
From: Vinicius Costa Gomes @ 2018-06-04 17:23 UTC (permalink / raw)
  To: Ivan Khoronzhuk, grygorii.strashko, davem
  Cc: corbet, akpm, netdev, linux-doc, linux-kernel, linux-omap, henrik,
	jesus.sanchez-palencia, Ivan Khoronzhuk
In-Reply-To: <20180518211510.13341-1-ivan.khoronzhuk@linaro.org>

Hi Ivan,

Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org> writes:

> This series adds MQPRIO and CBS Qdisc offload for TI cpsw driver.
> It potentially can be used in audio video bridging (AVB) and time
> sensitive networking (TSN).
>
> Patchset was tested on AM572x EVM and BBB boards. Last patch from this
> series adds detailed description of configuration with examples. For
> consistency reasons, in role of talker and listener, tools from
> patchset "TSN: Add qdisc based config interface for CBS" were used and
> can be seen here: https://www.spinics.net/lists/netdev/msg460869.html
>
> Based on net-next/master
>

I didn't test this, but it looks fine from my side.

I agree with Grygorii, that if no comments, this should be re-sent as a
patch series next.


> Ivan Khoronzhuk (6):
>   net: ethernet: ti: cpsw: use cpdma channels in backward order for txq
>   net: ethernet: ti: cpdma: fit rated channels in backward order
>   net: ethernet: ti: cpsw: add MQPRIO Qdisc offload
>   net: ethernet: ti: cpsw: add CBS Qdisc offload
>   net: ethernet: ti: cpsw: restore shaper configuration while down/up
>   Documentation: networking: cpsw: add MQPRIO & CBS offload examples
>
>  Documentation/networking/cpsw.txt       | 540 ++++++++++++++++++++++++
>  drivers/net/ethernet/ti/cpsw.c          | 364 +++++++++++++++-
>  drivers/net/ethernet/ti/davinci_cpdma.c |  31 +-
>  3 files changed, 913 insertions(+), 22 deletions(-)
>  create mode 100644 Documentation/networking/cpsw.txt
>
> -- 
> 2.17.0


Cheers,
--
Vinicius

^ permalink raw reply

* Re: [PATCH net v3] ipv6: omit traffic class when calculating flow hash
From: David Miller @ 2018-06-04 17:22 UTC (permalink / raw)
  To: mkubecek; +Cc: netdev, linux-kernel, nicolas.dichtel, tom, dsahern, idosch
In-Reply-To: <20180604095619.7D9C1A09F0@unicorn.suse.cz>

From: Michal Kubecek <mkubecek@suse.cz>
Date: Mon, 4 Jun 2018 11:36:05 +0200

> Some of the code paths calculating flow hash for IPv6 use flowlabel member
> of struct flowi6 which, despite its name, encodes both flow label and
> traffic class. If traffic class changes within a TCP connection (as e.g.
> ssh does), ECMP route can switch between paths. It's also inconsistent with
> other code paths where ip6_flowlabel() (returning only flow label) is used
> to feed the key.
> 
> Use only flow label everywhere, including one place where hash key is set
> using ip6_flowinfo().
> 
> Fixes: 51ebd3181572 ("ipv6: add support of equal cost multipath (ECMP)")
> Fixes: f70ea018da06 ("net: Add functions to get skb->hash based on flow structures")
> Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
> ---
> v2: introduce and use an inline helper as suggested by David Ahern
> v3: keep the cast out of the helper to make future cleanup easier

Applied and queued up for -stable, thank you.

^ permalink raw reply

* Re: [PATCH] net-tcp: extend tcp_tw_reuse sysctl to enable loopback only optimization
From: Eric Dumazet @ 2018-06-04 17:18 UTC (permalink / raw)
  To: Maciej Żenczykowski, Maciej Żenczykowski,
	David S . Miller, Eric Dumazet
  Cc: netdev, Neal Cardwell, Yuchung Cheng, Wei Wang
In-Reply-To: <20180603174117.48539-1-zenczykowski@gmail.com>



On 06/03/2018 10:41 AM, Maciej Żenczykowski wrote:
> From: Maciej Żenczykowski <maze@google.com>
> 
> This changes the /proc/sys/net/ipv4/tcp_tw_reuse from a boolean
> to an integer.
> 
> It now takes the values 0, 1 and 2, where 0 and 1 behave as before,
> while 2 enables timewait socket reuse only for sockets that we can
> prove are loopback connections:
>   ie. bound to 'lo' interface or where one of source or destination
>   IPs is 127.0.0.0/8, ::ffff:127.0.0.0/104 or ::1.
> 
> This enables quicker reuse of ephemeral ports for loopback connections
> - where tcp_tw_reuse is 100% safe from a protocol perspective
> (this assumes no artificially induced packet loss on 'lo').
> 
> This also makes establishing many loopback connections *much* faster
> (allocating ports out of the first half of the ephemeral port range
> is significantly faster than allocating from the second half)
> 
> Without this change in a 32K ephemeral port space my sample program
> (it just establishes and closes [::1]:ephemeral -> [::1]:server_port
> connections in a tight loop) fails after 32765 connections in 24 seconds.
> With it enabled 50000 connections only take 4.7 seconds.
> 
> This is particularly problematic for IPv6 where we only have one local
> address and cannot play tricks with varying source IP from 127.0.0.0/8
> pool.
> 
> Signed-off-by: Maciej Żenczykowski <maze@google.com>
> Cc: Eric Dumazet <edumazet@google.com>
> Cc: Neal Cardwell <ncardwell@google.com>
> Cc: Yuchung Cheng <ycheng@google.com>
> Cc: Wei Wang <weiwan@google.com>

This seems fine, thanks Maciej

Signed-off-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply

* [PATCH net] l2tp: fix refcount leakage on PPPoL2TP sockets
From: Guillaume Nault @ 2018-06-04 16:52 UTC (permalink / raw)
  To: netdev; +Cc: James Chapman

Commit d02ba2a6110c ("l2tp: fix race in pppol2tp_release with session
object destroy") tried to fix a race condition where a PPPoL2TP socket
would disappear while the L2TP session was still using it. However, it
missed the root issue which is that an L2TP session may accept to be
reconnected if its associated socket has entered the release process.

The tentative fix makes the session hold the socket it is connected to.
That saves the kernel from crashing, but introduces refcount leakage,
preventing the socket from completing the release process. Once stalled,
everything the socket depends on can't be released anymore, including
the L2TP session and the l2tp_ppp module.

The root issue is that, when releasing a connected PPPoL2TP socket, the
session's ->sk pointer (RCU-protected) is reset to NULL and we have to
wait for a grace period before destroying the socket. The socket drops
the session in its ->sk_destruct callback function, so the session
will exist until the last reference on the socket is dropped.
Therefore, there is a time frame where pppol2tp_connect() may accept
reconnecting a session, as it only checks ->sk to figure out if the
session is connected. This time frame is shortened by the fact that
pppol2tp_release() calls l2tp_session_delete(), making the session
unreachable before resetting ->sk. However, pppol2tp_connect() may
grab the session before it gets unhashed by l2tp_session_delete(), but
it may test ->sk after the latter got reset. The race is not so hard to
trigger and syzbot found a pretty reliable reproducer:
https://syzkaller.appspot.com/bug?id=418578d2a4389074524e04d641eacb091961b2cf

Before d02ba2a6110c, another race could let pppol2tp_release()
overwrite the ->__sk pointer of an L2TP session, thus tricking
pppol2tp_put_sk() into calling sock_put() on a socket that is different
than the one for which pppol2tp_release() was originally called. To get
there, we had to trigger the race described above, therefore having one
PPPoL2TP socket being released, while the session it is connected to is
reconnecting to a different PPPoL2TP socket. When releasing this new
socket fast enough, pppol2tp_release() overwrites the session's
->__sk pointer with the address of the new socket, before the first
pppol2tp_put_sk() call gets scheduled. Then the pppol2tp_put_sk() call
invoked by the original socket will sock_put() the new socket,
potentially dropping its last reference. When the second
pppol2tp_put_sk() finally runs, its socket has already been freed.

With d02ba2a6110c, the session takes a reference on both sockets.
Furthermore, the session's ->sk pointer is reset in the
pppol2tp_session_close() callback function rather than in
pppol2tp_release(). Therefore, ->__sk can't be overwritten and
pppol2tp_put_sk() is called only once (l2tp_session_delete() will only
run pppol2tp_session_close() once, to protect the session against
concurrent deletion requests). Now pppol2tp_put_sk() will properly
sock_put() the original socket, but the new socket will remain, as
l2tp_session_delete() prevented the release process from completing.
Here, we don't depend on the ->__sk race to trigger the bug. Getting
into the pppol2tp_connect() race is enough to leak the reference, no
matter when the new socket is released.

So it all boils down to pppol2tp_connect() failing to realise that the
session has already been connected. This patch drops the unneeded extra
reference counting (mostly reverting d02ba2a6110c) and checks that
neither ->sk nor ->__sk is set before allowing a session to be
connected.

Fixes: d02ba2a6110c ("l2tp: fix race in pppol2tp_release with session object destroy")
Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
---
 net/l2tp/l2tp_ppp.c | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 1fd9e145076a..466f17646625 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -428,16 +428,6 @@ static void pppol2tp_put_sk(struct rcu_head *head)
  */
 static void pppol2tp_session_close(struct l2tp_session *session)
 {
-	struct pppol2tp_session *ps;
-
-	ps = l2tp_session_priv(session);
-	mutex_lock(&ps->sk_lock);
-	ps->__sk = rcu_dereference_protected(ps->sk,
-					     lockdep_is_held(&ps->sk_lock));
-	RCU_INIT_POINTER(ps->sk, NULL);
-	if (ps->__sk)
-		call_rcu(&ps->rcu, pppol2tp_put_sk);
-	mutex_unlock(&ps->sk_lock);
 }
 
 /* Really kill the session socket. (Called from sock_put() if
@@ -480,15 +470,24 @@ static int pppol2tp_release(struct socket *sock)
 	sock_orphan(sk);
 	sock->sk = NULL;
 
-	/* If the socket is associated with a session,
-	 * l2tp_session_delete will call pppol2tp_session_close which
-	 * will drop the session's ref on the socket.
-	 */
 	session = pppol2tp_sock_to_session(sk);
 	if (session) {
+		struct pppol2tp_session *ps;
+
 		l2tp_session_delete(session);
-		/* drop the ref obtained by pppol2tp_sock_to_session */
-		sock_put(sk);
+
+		ps = l2tp_session_priv(session);
+		mutex_lock(&ps->sk_lock);
+		ps->__sk = rcu_dereference_protected(ps->sk,
+						     lockdep_is_held(&ps->sk_lock));
+		RCU_INIT_POINTER(ps->sk, NULL);
+		mutex_unlock(&ps->sk_lock);
+		call_rcu(&ps->rcu, pppol2tp_put_sk);
+
+		/* Rely on the sock_put() call at the end of the function for
+		 * dropping the reference held by pppol2tp_sock_to_session().
+		 * The last reference will be dropped by pppol2tp_put_sk().
+		 */
 	}
 
 	release_sock(sk);
@@ -742,7 +741,8 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
 		 */
 		mutex_lock(&ps->sk_lock);
 		if (rcu_dereference_protected(ps->sk,
-					      lockdep_is_held(&ps->sk_lock))) {
+					      lockdep_is_held(&ps->sk_lock)) ||
+		    ps->__sk) {
 			mutex_unlock(&ps->sk_lock);
 			error = -EEXIST;
 			goto end;
@@ -803,7 +803,6 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
 
 out_no_ppp:
 	/* This is how we get the session context from the socket. */
-	sock_hold(sk);
 	sk->sk_user_data = session;
 	rcu_assign_pointer(ps->sk, sk);
 	mutex_unlock(&ps->sk_lock);
-- 
2.17.1

^ permalink raw reply related

* [PATCH V2 net] net: hns: Fix the process of adding broadcast addresses to tcam
From: Salil Mehta @ 2018-06-04 16:50 UTC (permalink / raw)
  To: davem
  Cc: salil.mehta, yisen.zhuang, lipeng321, mehta.salil, netdev,
	linux-kernel, linuxarm, Xi Wang

From: Xi Wang <wangxi11@huawei.com>

If the multicast mask value in the device tree is not configured as all
0xff, the broadcast mac will be lost from the tcam table after the
execution of the command 'ifconfig up'. The address is appended by
hns_ae_start, but will be cleared later by hns_nic_set_rx_mode
called in the dev_open process.

This patch fixes it by not using the multicast mask when adding a
broadcast address.

Fixes: b5996f11ea54 ("net: add Hisilicon Network Subsystem basic ethernet support")
Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
---
Patch V2: Fixed comments by David Miller
   Link: https://lkml.org/lkml/2018/6/4/671
Patch V1: Initial Submit
---
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c | 23 +++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c
index e0bc79e..85e1d14 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c
@@ -1648,6 +1648,15 @@ int hns_dsaf_rm_mac_addr(
 				      mac_entry->addr);
 }
 
+static void hns_dsaf_setup_mc_mask(struct dsaf_device *dsaf_dev,
+				   u8 port_num, u8 *mask, u8 *addr)
+{
+	if (MAC_IS_BROADCAST(addr))
+		memset(mask, 0xff, ETH_ALEN);
+	else
+		memcpy(mask, dsaf_dev->mac_cb[port_num]->mc_mask, ETH_ALEN);
+}
+
 static void hns_dsaf_mc_mask_bit_clear(char *dst, const char *src)
 {
 	u16 *a = (u16 *)dst;
@@ -1676,7 +1685,6 @@ int hns_dsaf_add_mac_mc_port(struct dsaf_device *dsaf_dev,
 	struct dsaf_drv_tbl_tcam_key tmp_mac_key;
 	struct dsaf_tbl_tcam_data tcam_data;
 	u8 mc_addr[ETH_ALEN];
-	u8 *mc_mask;
 	int mskid;
 
 	/*chechk mac addr */
@@ -1687,9 +1695,12 @@ int hns_dsaf_add_mac_mc_port(struct dsaf_device *dsaf_dev,
 	}
 
 	ether_addr_copy(mc_addr, mac_entry->addr);
-	mc_mask = dsaf_dev->mac_cb[mac_entry->in_port_num]->mc_mask;
 	if (!AE_IS_VER1(dsaf_dev->dsaf_ver)) {
+		u8 mc_mask[ETH_ALEN];
+
 		/* prepare for key data setting */
+		hns_dsaf_setup_mc_mask(dsaf_dev, mac_entry->in_port_num,
+				       mc_mask, mac_entry->addr);
 		hns_dsaf_mc_mask_bit_clear(mc_addr, mc_mask);
 
 		/* config key mask */
@@ -1844,7 +1855,6 @@ int hns_dsaf_del_mac_mc_port(struct dsaf_device *dsaf_dev,
 	struct dsaf_drv_tbl_tcam_key mask_key, tmp_mac_key;
 	struct dsaf_tbl_tcam_data *pmask_key = NULL;
 	u8 mc_addr[ETH_ALEN];
-	u8 *mc_mask;
 
 	if (!(void *)mac_entry) {
 		dev_err(dsaf_dev->dev,
@@ -1861,14 +1871,17 @@ int hns_dsaf_del_mac_mc_port(struct dsaf_device *dsaf_dev,
 
 	/* always mask vlan_id field */
 	ether_addr_copy(mc_addr, mac_entry->addr);
-	mc_mask = dsaf_dev->mac_cb[mac_entry->in_port_num]->mc_mask;
 
 	if (!AE_IS_VER1(dsaf_dev->dsaf_ver)) {
+		u8 mc_mask[ETH_ALEN];
+
 		/* prepare for key data setting */
+		hns_dsaf_setup_mc_mask(dsaf_dev, mac_entry->in_port_num,
+				       mc_mask, mac_entry->addr);
 		hns_dsaf_mc_mask_bit_clear(mc_addr, mc_mask);
 
 		/* config key mask */
-		hns_dsaf_set_mac_key(dsaf_dev, &mask_key, 0x00, 0xff, mc_addr);
+		hns_dsaf_set_mac_key(dsaf_dev, &mask_key, 0x00, 0xff, mc_mask);
 
 		mask_key.high.val = le32_to_cpu(mask_key.high.val);
 		mask_key.low.val = le32_to_cpu(mask_key.low.val);
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH bpf-next 00/11] AF_XDP: introducing zero-copy support
From: Alexei Starovoitov @ 2018-06-04 16:38 UTC (permalink / raw)
  To: Björn Töpel
  Cc: magnus.karlsson, magnus.karlsson, alexander.h.duyck,
	alexander.duyck, ast, brouer, daniel, netdev, mykyta.iziumtsev,
	Björn Töpel, john.fastabend, willemdebruijn.kernel, mst,
	michael.lundkvist, jesse.brandeburg, anjali.singhai, qi.z.zhang,
	francois.ozog, ilias.apalodimas, brian.brooks, andy, michael.chan,
	intel-wired-lan
In-Reply-To: <20180604120601.18123-1-bjorn.topel@gmail.com>

On Mon, Jun 04, 2018 at 02:05:50PM +0200, Björn Töpel wrote:
> From: Björn Töpel <bjorn.topel@intel.com>
> 
> This patch series introduces zerocopy (ZC) support for
> AF_XDP. Programs using AF_XDP sockets will now receive RX packets
> without any copies and can also transmit packets without incurring any
> copies. No modifications to the application are needed, but the NIC
> driver needs to be modified to support ZC. If ZC is not supported by
> the driver, the modes introduced in the AF_XDP patch will be
> used. Using ZC in our micro benchmarks results in significantly
> improved performance as can be seen in the performance section later
> in this cover letter.
> 
> Note that for an untrusted application, HW packet steering to a
> specific queue pair (the one associated with the application) is a
> requirement when using ZC, as the application would otherwise be able
> to see other user space processes' packets. If the HW cannot support
> the required packet steering you need to use the XDP_SKB mode or the
> XDP_DRV mode without ZC turned on. The XSKMAP introduced in the AF_XDP
> patch set can be used to do load balancing in that case.
> 
> For benchmarking, you can use the xdpsock application from the AF_XDP
> patch set without any modifications. Say that you would like your UDP
> traffic from port 4242 to end up in queue 16, that we will enable
> AF_XDP on. Here, we use ethtool for this:
> 
>       ethtool -N p3p2 rx-flow-hash udp4 fn
>       ethtool -N p3p2 flow-type udp4 src-port 4242 dst-port 4242 \
>           action 16
> 
> Running the rxdrop benchmark in XDP_DRV mode with zerocopy can then be
> done using:
> 
>       samples/bpf/xdpsock -i p3p2 -q 16 -r -N
> 
> We have run some benchmarks on a dual socket system with two Broadwell
> E5 2660 @ 2.0 GHz with hyperthreading turned off. Each socket has 14
> cores which gives a total of 28, but only two cores are used in these
> experiments. One for TX/RX and one for the user space application. The
> memory is DDR4 @ 2133 MT/s (1067 MHz) and the size of each DIMM is
> 8192MB and with 8 of those DIMMs in the system we have 64 GB of total
> memory. The compiler used is gcc (Ubuntu 7.3.0-16ubuntu3) 7.3.0. The
> NIC is Intel I40E 40Gbit/s using the i40e driver.
> 
> Below are the results in Mpps of the I40E NIC benchmark runs for 64
> and 1500 byte packets, generated by a commercial packet generator HW
> outputting packets at full 40 Gbit/s line rate. The results are without
> retpoline so that we can compare against previous numbers. 
> 
> AF_XDP performance 64 byte packets. Results from the AF_XDP V3 patch
> set are also reported for ease of reference. The numbers within
> parentheses are from the RFC V1 ZC patch set.
> Benchmark   XDP_SKB    XDP_DRV    XDP_DRV with zerocopy
> rxdrop       2.9*       9.6*       21.1(21.5)
> txpush       2.6*       -          22.0(21.6)
> l2fwd        1.9*       2.5*       15.3(15.0)
> 
> AF_XDP performance 1500 byte packets:
> Benchmark   XDP_SKB   XDP_DRV     XDP_DRV with zerocopy
> rxdrop       2.1*       3.3*       3.3(3.3)
> l2fwd        1.4*       1.8*       3.1(3.1)
> 
> * From AF_XDP V3 patch set and cover letter.
> 
> So why do we not get higher values for RX similar to the 34 Mpps we
> had in AF_PACKET V4? We made an experiment running the rxdrop
> benchmark without using the xdp_do_redirect/flush infrastructure nor
> using an XDP program (all traffic on a queue goes to one
> socket). Instead the driver acts directly on the AF_XDP socket. With
> this we got 36.9 Mpps, a significant improvement without any change to
> the uapi. So not forcing users to have an XDP program if they do not
> need it, might be a good idea. This measurement is actually higher
> than what we got with AF_PACKET V4.
> 
> XDP performance on our system as a base line:
> 
> 64 byte packets:
> XDP stats       CPU     pps         issue-pps
> XDP-RX CPU      16      32.3M  0
> 
> 1500 byte packets:
> XDP stats       CPU     pps         issue-pps
> XDP-RX CPU      16      3.3M    0
> 
> The structure of the patch set is as follows:
> 
> Patches 1-3: Plumbing for AF_XDP ZC support
> Patches 4-5: AF_XDP ZC for RX
> Patches 6-7: AF_XDP ZC for TX

Acked-by: Alexei Starovoitov <ast@kernel.org>
for above patches

> Patch 8-10: ZC support for i40e.

these also look good to me.
would be great if i40e experts take a look at them asap.

If there are no major objections we'd like to merge all of it
for this merge window.

Thanks!

^ permalink raw reply

* Re: [PATCH net-next 0/2] cls_flower: Various fixes
From: Cong Wang @ 2018-06-04 16:45 UTC (permalink / raw)
  To: Roi Dayan
  Cc: Jiri Pirko, Paul Blakey, Jiri Pirko, Jamal Hadi Salim,
	David Miller, Linux Kernel Network Developers, Yevgeny Kliteynik,
	Shahar Klein, Mark Bloch, Or Gerlitz
In-Reply-To: <695d484a-b981-054a-1da0-e63719bc9100@mellanox.com>

On Mon, Jun 4, 2018 at 12:35 AM, Roi Dayan <roid@mellanox.com> wrote:
>
>
> On 03/06/2018 22:39, Jiri Pirko wrote:
>>
>> Sun, Jun 03, 2018 at 08:33:25PM CEST, xiyou.wangcong@gmail.com wrote:
>>>
>>> On Wed, May 30, 2018 at 1:17 AM, Paul Blakey <paulb@mellanox.com> wrote:
>>>>
>>>> Two of the fixes are for my multiple mask patch
>>>>
>>>> Paul Blakey (2):
>>>>    cls_flower: Fix missing free of rhashtable
>>>>    cls_flower: Fix comparing of old filter mask with new filter
>>>
>>>
>>> Both are bug fixes and one-line fixes, so definitely should go
>>> to -net tree and -stable tree.
>>
>>
>> I agree.
>>
>
> it's because the commit they fix doesn't exists in net yet.
>

Oh, sorry, my bad, I thought Apr 30 is in a previous release...

Anyway, v2 should be applied cleanly to -net-next or -net after
net-next is merged into it.

^ permalink raw reply

* [PATCH bpf-next v2 2/2] samples/bpf: Add xdp_sample_pkts example
From: Toke Høiland-Jørgensen @ 2018-06-04 16:33 UTC (permalink / raw)
  To: netdev
In-Reply-To: <152813003609.3465.618891361534945522.stgit@alrua-kau>

This adds an example program showing how to sample packets from XDP using
the perf event buffer. The example userspace program just prints the
ethernet header for every packet sampled.

The example sets up a perf file descriptor per CPU, allowing the XDP
program to pass BPF_F_CURRENT_CPU and work no matter which CPU handles the
packet.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
 samples/bpf/Makefile               |    4 +
 samples/bpf/xdp_sample_pkts_kern.c |   62 ++++++++++++++
 samples/bpf/xdp_sample_pkts_user.c |  162 ++++++++++++++++++++++++++++++++++++
 3 files changed, 228 insertions(+)
 create mode 100644 samples/bpf/xdp_sample_pkts_kern.c
 create mode 100644 samples/bpf/xdp_sample_pkts_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 1303af10e54d..6f0c6d276a86 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -52,6 +52,7 @@ hostprogs-y += xdp_adjust_tail
 hostprogs-y += xdpsock
 hostprogs-y += xdp_fwd
 hostprogs-y += task_fd_query
+hostprogs-y += xdp_sample_pkts
 
 # Libbpf dependencies
 LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
@@ -107,6 +108,7 @@ xdp_adjust_tail-objs := xdp_adjust_tail_user.o
 xdpsock-objs := bpf_load.o xdpsock_user.o
 xdp_fwd-objs := bpf_load.o xdp_fwd_user.o
 task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
+xdp_sample_pkts-objs := bpf_load.o xdp_sample_pkts_user.o $(TRACE_HELPERS)
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -163,6 +165,7 @@ always += xdp_adjust_tail_kern.o
 always += xdpsock_kern.o
 always += xdp_fwd_kern.o
 always += task_fd_query_kern.o
+always += xdp_sample_pkts_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -179,6 +182,7 @@ HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_task_fd_query_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_xdp_sample_pkts_user.o += -I$(srctree)/tools/lib/bpf/
 
 HOST_LOADLIBES		+= $(LIBBPF) -lelf
 HOSTLOADLIBES_tracex4		+= -lrt
diff --git a/samples/bpf/xdp_sample_pkts_kern.c b/samples/bpf/xdp_sample_pkts_kern.c
new file mode 100644
index 000000000000..4560522ca015
--- /dev/null
+++ b/samples/bpf/xdp_sample_pkts_kern.c
@@ -0,0 +1,62 @@
+#include <linux/ptrace.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+#define SAMPLE_SIZE 64ul
+#define MAX_CPUS 24
+
+#define bpf_printk(fmt, ...)					\
+({								\
+	       char ____fmt[] = fmt;				\
+	       bpf_trace_printk(____fmt, sizeof(____fmt),	\
+				##__VA_ARGS__);			\
+})
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(u32),
+	.max_entries = MAX_CPUS,
+};
+
+SEC("xdp_sample")
+int xdp_sample_prog(struct xdp_md *ctx)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data = (void *)(long)ctx->data;
+
+        /* Metadata will be in the perf event before the packet data. */
+	struct S {
+		u16 cookie;
+		u16 pkt_len;
+	} __attribute__((packed)) metadata;
+
+	if (data + SAMPLE_SIZE < data_end) {
+		/* The XDP perf_event_output handler will use the upper 32 bits
+		 * of the flags argument as a number of bytes to include of the
+		 * packet payload in the event data. If the size is too big, the
+		 * call to bpf_perf_event_output will fail and return -EFAULT.
+		 *
+		 * See bpf_xdp_event_output in net/core/filter.c.
+		 *
+		 * The BPF_F_CURRENT_CPU flag means that the event output fd
+		 * will be indexed by the CPU number in the event map.
+		 */
+		u64 flags = (SAMPLE_SIZE << 32) | BPF_F_CURRENT_CPU;
+		int ret;
+
+		metadata.cookie = 0xdead;
+		metadata.pkt_len = (u16)(data_end - data);
+
+		ret = bpf_perf_event_output(ctx, &my_map, flags,
+				      &metadata, sizeof(metadata));
+		if(ret)
+			bpf_printk("perf_event_output failed: %d\n", ret);
+	}
+
+	return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/xdp_sample_pkts_user.c b/samples/bpf/xdp_sample_pkts_user.c
new file mode 100644
index 000000000000..35c5dd953f48
--- /dev/null
+++ b/samples/bpf/xdp_sample_pkts_user.c
@@ -0,0 +1,162 @@
+/* This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <linux/perf_event.h>
+#include <linux/bpf.h>
+#include <net/if.h>
+#include <errno.h>
+#include <assert.h>
+#include <sys/sysinfo.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <signal.h>
+#include <libbpf.h>
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include <bpf/bpf.h>
+
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+#define MAX_CPUS 24
+static int pmu_fds[MAX_CPUS], if_idx = 0;
+static struct perf_event_mmap_page *headers[MAX_CPUS];
+static char *if_name;
+
+static int do_attach(int idx, int fd, const char *name)
+{
+	int err;
+
+	err = bpf_set_link_xdp_fd(idx, fd, 0);
+	if (err < 0)
+		printf("ERROR: failed to attach program to %s\n", name);
+
+	return err;
+}
+
+static int do_detach(int idx, const char *name)
+{
+	int err;
+
+	err = bpf_set_link_xdp_fd(idx, -1, 0);
+	if (err < 0)
+		printf("ERROR: failed to detach program from %s\n", name);
+
+	return err;
+}
+
+#define SAMPLE_SIZE 64
+
+static int print_bpf_output(void *data, int size)
+{
+	struct {
+		__u16 cookie;
+		__u16 pkt_len;
+		__u8  pkt_data[SAMPLE_SIZE];
+	} __attribute__((packed)) *e = data;
+	int i;
+
+	if (e->cookie != 0xdead) {
+		printf("BUG cookie %x sized %d\n",
+		       e->cookie, size);
+		return LIBBPF_PERF_EVENT_ERROR;
+	}
+
+	printf("Pkt len: %-5d bytes. Ethernet hdr: ", e->pkt_len);
+	for (i = 0; i < 14 && i < e->pkt_len; i++)
+		printf("%02x ", e->pkt_data[i]);
+	printf("\n");
+
+	return LIBBPF_PERF_EVENT_CONT;
+}
+
+static void test_bpf_perf_event(int num)
+{
+	struct perf_event_attr attr = {
+		.sample_type = PERF_SAMPLE_RAW,
+		.type = PERF_TYPE_SOFTWARE,
+		.config = PERF_COUNT_SW_BPF_OUTPUT,
+		.wakeup_events = 1, /* get an fd notification for every event */
+	};
+	int i;
+
+	for (i = 0; i < num; i++) {
+		int key = i;
+
+		pmu_fds[i] = sys_perf_event_open(&attr, -1/*pid*/, i/*cpu*/, -1/*group_fd*/, 0);
+
+		assert(pmu_fds[i] >= 0);
+		assert(bpf_map_update_elem(map_fd[0], &key, &pmu_fds[i], BPF_ANY) == 0);
+		ioctl(pmu_fds[i], PERF_EVENT_IOC_ENABLE, 0);
+	}
+}
+
+static void sig_handler(int signo)
+{
+	do_detach(if_idx, if_name);
+	exit(0);
+}
+
+int main(int argc, char **argv)
+{
+	char filename[256];
+	int ret, err;
+	int numcpus;
+	int i;
+
+	if (argc < 2) {
+		printf("Usage: %s <ifname>\n", argv[0]);
+		return 1;
+	}
+
+	numcpus = get_nprocs();
+	if (numcpus > MAX_CPUS)
+		numcpus = MAX_CPUS;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	if_idx = if_nametoindex(argv[1]);
+	if (!if_idx)
+		if_idx = strtoul(argv[1], NULL, 0);
+
+	if (!if_idx) {
+		fprintf(stderr, "Invalid ifname\n");
+		return 1;
+	}
+	if_name = argv[1];
+	err = do_attach(if_idx, prog_fd[0], argv[1]);
+	if (err)
+		return err;
+
+	if (signal(SIGINT, sig_handler) ||
+	    signal(SIGHUP, sig_handler) ||
+	    signal(SIGTERM, sig_handler)) {
+		perror("signal");
+		return 1;
+	}
+
+	test_bpf_perf_event(numcpus);
+
+	for (i = 0; i < numcpus; i++)
+		if (perf_event_mmap_header(pmu_fds[i], &headers[i]) < 0)
+			return 1;
+
+	ret = perf_event_poller_multi(pmu_fds, headers, numcpus, print_bpf_output);
+	kill(0, SIGINT);
+	return ret;
+}

^ permalink raw reply related

* [PATCH bpf-next v2 1/2] trace_helpers.c: Add helpers to poll multiple perf FDs for events
From: Toke Høiland-Jørgensen @ 2018-06-04 16:33 UTC (permalink / raw)
  To: netdev

This adds two new helper functions to trace_helpers that support polling
multiple perf file descriptors for events. These are used by the XDP
perf_event_output example, which needs to work with one perf fd per CPU.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
 tools/testing/selftests/bpf/trace_helpers.c |   47 ++++++++++++++++++++++++++-
 tools/testing/selftests/bpf/trace_helpers.h |    4 ++
 2 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
index 3868dcb63420..1e62d89f34cf 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -88,7 +88,7 @@ static int page_size;
 static int page_cnt = 8;
 static struct perf_event_mmap_page *header;
 
-int perf_event_mmap(int fd)
+int perf_event_mmap_header(int fd, struct perf_event_mmap_page **header)
 {
 	void *base;
 	int mmap_size;
@@ -102,10 +102,15 @@ int perf_event_mmap(int fd)
 		return -1;
 	}
 
-	header = base;
+	*header = base;
 	return 0;
 }
 
+int perf_event_mmap(int fd)
+{
+	return perf_event_mmap_header(fd, &header);
+}
+
 static int perf_event_poll(int fd)
 {
 	struct pollfd pfd = { .fd = fd, .events = POLLIN };
@@ -163,3 +168,41 @@ int perf_event_poller(int fd, perf_event_print_fn output_fn)
 
 	return ret;
 }
+
+int perf_event_poller_multi(int *fds, struct perf_event_mmap_page **headers,
+			    int num_fds, perf_event_print_fn output_fn)
+{
+	enum bpf_perf_event_ret ret;
+	struct pollfd *pfds;
+	void *buf = NULL;
+	size_t len = 0;
+	int i;
+
+	pfds = malloc(sizeof(*pfds) * num_fds);
+	if (!pfds)
+		return -1;
+
+	memset(pfds, 0, sizeof(*pfds) * num_fds);
+	for (i = 0; i < num_fds; i++) {
+		pfds[i].fd = fds[i];
+		pfds[i].events = POLLIN;
+	}
+
+	for (;;) {
+		poll(pfds, num_fds, 1000);
+		for (i = 0; i < num_fds; i++) {
+			if (pfds[i].revents) {
+				ret = bpf_perf_event_read_simple(headers[i], page_cnt * page_size,
+								page_size, &buf, &len,
+								bpf_perf_event_print,
+								output_fn);
+				if (ret != LIBBPF_PERF_EVENT_CONT)
+					break;
+			}
+		}
+	}
+	free(buf);
+	free(pfds);
+
+	return ret;
+}
diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h
index 3b4bcf7f5084..18924f23db1b 100644
--- a/tools/testing/selftests/bpf/trace_helpers.h
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -3,6 +3,7 @@
 #define __TRACE_HELPER_H
 
 #include <libbpf.h>
+#include <linux/perf_event.h>
 
 struct ksym {
 	long addr;
@@ -16,6 +17,9 @@ long ksym_get_addr(const char *name);
 typedef enum bpf_perf_event_ret (*perf_event_print_fn)(void *data, int size);
 
 int perf_event_mmap(int fd);
+int perf_event_mmap_header(int fd, struct perf_event_mmap_page **header);
 /* return LIBBPF_PERF_EVENT_DONE or LIBBPF_PERF_EVENT_ERROR */
 int perf_event_poller(int fd, perf_event_print_fn output_fn);
+int perf_event_poller_multi(int *fds, struct perf_event_mmap_page **headers,
+			    int num_fds, perf_event_print_fn output_fn);
 #endif

^ permalink raw reply related

* Re: [PATCH bpf-next 0/5] AF_XDP: bug fixes and descriptor changes
From: Alexei Starovoitov @ 2018-06-04 16:24 UTC (permalink / raw)
  To: Björn Töpel
  Cc: magnus.karlsson, magnus.karlsson, alexander.h.duyck,
	alexander.duyck, ast, brouer, daniel, netdev, mykyta.iziumtsev,
	Björn Töpel, john.fastabend, willemdebruijn.kernel, mst,
	michael.lundkvist, jesse.brandeburg, anjali.singhai, qi.z.zhang,
	francois.ozog, ilias.apalodimas, brian.brooks, andy, michael.chan
In-Reply-To: <20180604115715.17895-1-bjorn.topel@gmail.com>

On Mon, Jun 04, 2018 at 01:57:10PM +0200, Björn Töpel wrote:
> From: Björn Töpel <bjorn.topel@intel.com>
> 
> An issue with the current AF_XDP uapi raised by Mykyta Iziumtsev (see
> https://www.spinics.net/lists/netdev/msg503664.html) is that it does
> not support NICs that have a "type-writer" model in an efficient
> way. In this model, a memory window is passed to the hardware and
> multiple frames might be filled into that window, instead of just one
> that we have in the current fixed frame-size model.
> 
> This patch set fixes two bugs in the current implementation and then
> changes the uapi so that the type-writer model can be supported
> efficiently by a possible future extension of AF_XDP.
> 
> These are the uapi changes in this patch:
> 
> * Change the "u32 idx" in the descriptors to "u64 addr". The current
>   idx based format does NOT work for the type-writer model (as packets
>   can start anywhere within a frame) but that a relative address
>   pointer (the u64 addr) works well for both models in the prototype
>   code we have that supports both models. We increased it from u32 to
>   u64 to support umems larger than 4G. We have also removed the u16
>   offset when having a "u64 addr" since that information is already
>   carried in the least significant bits of the address.
> 
> * We want to use "u8 padding[5]" for something useful in the future
>   (since we are not allowed to change its name), so we now call it
>   just options so it can be extended for various purposes in the
>   future. It is a u32 as that is what is left of the 16 byte
>   descriptor.
> 
> * We changed the name of frame_size in the UMEM_REG setsockopt to
>   chunk_size since this naming also makes sense to the type-writer
>   model.
> 
> With these changes to the uapi, we believe the type-writer model can
> be supported without having to resort to a new descriptor format. The
> type-writer model could then be supported, from the uapi point of
> view, by setting a flag at bind time and providing a new flag bit in
> the options field of the descriptor that signals to user space that
> all packets have been written in a chunk. Or with a new chunk
> completion queue as suggested by Mykyta in his latest feedback mail on
> the list.

for the set:
Acked-by: Alexei Starovoitov <ast@kernel.org>
Thank you for these fixes.
According to unofficial feedback from brcm and netronome folks
the descriptor format should work for these nics too.
At some point we may consider second format, but I think SW
should drive HW requirements and not the other way around.

^ permalink raw reply

* Re: [PATCH net] VSOCK: check sk state before receive
From: Jorgen S. Hansen @ 2018-06-04 16:02 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: Hangbin Liu, netdev@vger.kernel.org, David S. Miller
In-Reply-To: <20180530091727.GF14623@stefanha-x1.localdomain>


> On May 30, 2018, at 11:17 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> On Sun, May 27, 2018 at 11:29:45PM +0800, Hangbin Liu wrote:
>> Hmm...Although I won't reproduce this bug with my reproducer after
>> apply my patch. I could still get a similar issue with syzkaller sock vnet test.
>> 
>> It looks this patch is not complete. Here is the KASAN call trace with my patch.
>> I can also reproduce it without my patch.
> 
> Seems like a race between vmci_datagram_destroy_handle() and the
> delayed callback, vmci_transport_recv_dgram_cb().
> 
> I don't know the VMCI transport well so I'll leave this to Jorgen.

Yes, it looks like we are calling the delayed callback after we return from vmci_datagram_destroy_handle(). I’ll take a closer look at the VMCI side here - the refcounting of VMCI datagram endpoints should guard against this, since the delayed callback does a get on the datagram resource, so this could be a VMCI driver issue, and not a problem in the VMCI transport for AF_VSOCK.

> 
>> ==================================================================
>> BUG: KASAN: use-after-free in vmci_transport_allow_dgram.part.7+0x155/0x1a0 [vmw_vsock_vmci_transport]
>> Read of size 4 at addr ffff880026a3a914 by task kworker/0:2/96
>> 
>> CPU: 0 PID: 96 Comm: kworker/0:2 Not tainted 4.17.0-rc6.vsock+ #28
>> Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
>> Workqueue: events dg_delayed_dispatch [vmw_vmci]
>> Call Trace:
>> __dump_stack lib/dump_stack.c:77 [inline]
>> dump_stack+0xdd/0x18e lib/dump_stack.c:113
>> print_address_description+0x7a/0x3e0 mm/kasan/report.c:256
>> kasan_report_error mm/kasan/report.c:354 [inline]
>> kasan_report+0x1dd/0x460 mm/kasan/report.c:412
>> vmci_transport_allow_dgram.part.7+0x155/0x1a0 [vmw_vsock_vmci_transport]
>> vmci_transport_recv_dgram_cb+0x5d/0x200 [vmw_vsock_vmci_transport]
>> dg_delayed_dispatch+0x99/0x1b0 [vmw_vmci]
>> process_one_work+0xa4e/0x1720 kernel/workqueue.c:2145
>> worker_thread+0x1df/0x1400 kernel/workqueue.c:2279
>> kthread+0x343/0x4b0 kernel/kthread.c:240
>> ret_from_fork+0x35/0x40 arch/x86/entry/entry_64.S:412
>> 
>> Allocated by task 2684:
>> set_track mm/kasan/kasan.c:460 [inline]
>> kasan_kmalloc+0xa0/0xd0 mm/kasan/kasan.c:553
>> slab_post_alloc_hook mm/slab.h:444 [inline]
>> slab_alloc_node mm/slub.c:2741 [inline]
>> slab_alloc mm/slub.c:2749 [inline]
>> kmem_cache_alloc+0x105/0x330 mm/slub.c:2754
>> sk_prot_alloc+0x6a/0x2c0 net/core/sock.c:1468
>> sk_alloc+0xc9/0xbb0 net/core/sock.c:1528
>> __vsock_create+0xc8/0x9b0 [vsock]
>> vsock_create+0xfd/0x1a0 [vsock]
>> __sock_create+0x310/0x690 net/socket.c:1285
>> sock_create net/socket.c:1325 [inline]
>> __sys_socket+0x101/0x240 net/socket.c:1355
>> __do_sys_socket net/socket.c:1364 [inline]
>> __se_sys_socket net/socket.c:1362 [inline]
>> __x64_sys_socket+0x7d/0xd0 net/socket.c:1362
>> do_syscall_64+0x175/0x630 arch/x86/entry/common.c:287
>> entry_SYSCALL_64_after_hwframe+0x44/0xa9
>> 
>> Freed by task 2684:
>> set_track mm/kasan/kasan.c:460 [inline]
>> __kasan_slab_free+0x130/0x180 mm/kasan/kasan.c:521
>> slab_free_hook mm/slub.c:1388 [inline]
>> slab_free_freelist_hook mm/slub.c:1415 [inline]
>> slab_free mm/slub.c:2988 [inline]
>> kmem_cache_free+0xce/0x410 mm/slub.c:3004
>> sk_prot_free net/core/sock.c:1509 [inline]
>> __sk_destruct+0x629/0x940 net/core/sock.c:1593
>> sk_destruct+0x4e/0x90 net/core/sock.c:1601
>> __sk_free+0xd3/0x320 net/core/sock.c:1612
>> sk_free+0x2a/0x30 net/core/sock.c:1623
>> __vsock_release+0x431/0x610 [vsock]
>> vsock_release+0x3c/0xc0 [vsock]
>> sock_release+0x91/0x200 net/socket.c:594
>> sock_close+0x17/0x20 net/socket.c:1149
>> __fput+0x368/0xa20 fs/file_table.c:209
>> task_work_run+0x1c5/0x2a0 kernel/task_work.c:113
>> exit_task_work include/linux/task_work.h:22 [inline]
>> do_exit+0x1876/0x26c0 kernel/exit.c:865
>> do_group_exit+0x159/0x3e0 kernel/exit.c:968
>> get_signal+0x65a/0x1780 kernel/signal.c:2482
>> do_signal+0xa4/0x1fe0 arch/x86/kernel/signal.c:810
>> exit_to_usermode_loop+0x1b8/0x260 arch/x86/entry/common.c:162
>> prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline]
>> syscall_return_slowpath arch/x86/entry/common.c:265 [inline]
>> do_syscall_64+0x505/0x630 arch/x86/entry/common.c:290
>> entry_SYSCALL_64_after_hwframe+0x44/0xa9
>> 
>> The buggy address belongs to the object at ffff880026a3a600
>> which belongs to the cache AF_VSOCK of size 1056
>> The buggy address is located 788 bytes inside of
>> 1056-byte region [ffff880026a3a600, ffff880026a3aa20)
>> The buggy address belongs to the page:
>> page:ffffea00009a8e00 count:1 mapcount:0 mapping:0000000000000000 index:0x0 compound_mapcount: 0
>> flags: 0xfffffc0008100(slab|head)
>> raw: 000fffffc0008100 0000000000000000 0000000000000000 00000001000d000d
>> raw: dead000000000100 dead000000000200 ffff880034471a40 0000000000000000
>> page dumped because: kasan: bad access detected
>> 
>> Memory state around the buggy address:
>> ffff880026a3a800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>> ffff880026a3a880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>>> ffff880026a3a900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>>                         ^
>> ffff880026a3a980: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>> ffff880026a3aa00: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc
>> ==================================================================


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox