Netdev List

Netdev List
 help / color / mirror / Atom feed

* [next PATCH 04/11] ixgbe: Update code to better handle incrementing page count
From: Alexander Duyck @ 2017-01-06 16:06 UTC (permalink / raw)
  To: intel-wired-lan, jeffrey.t.kirsher; +Cc: netdev
In-Reply-To: <20170106155448.1501.31298.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

Batch the page count updates instead of doing them one at a time.  By doing
this we can improve the overall performance as the atomic increment
operations can be expensive due to the fact that on x86 they are locked
operations which can cause stalls.  By doing bulk updates we can
consolidate the stall which should help to improve the overall receive
performance.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h      |    7 ++++
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   39 ++++++++++++++++---------
 2 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 97e74deecae2..717c65b0deb2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -198,7 +198,12 @@ struct ixgbe_rx_buffer {
 	struct sk_buff *skb;
 	dma_addr_t dma;
 	struct page *page;
-	unsigned int page_offset;
+#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
+	__u32 page_offset;
+#else
+	__u16 page_offset;
+#endif
+	__u16 pagecnt_bias;
 };
 
 struct ixgbe_queue_stats {
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 062b984ffdf4..519b6a2b65c1 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1602,6 +1602,7 @@ static bool ixgbe_alloc_mapped_page(struct ixgbe_ring *rx_ring,
 	bi->dma = dma;
 	bi->page = page;
 	bi->page_offset = 0;
+	bi->pagecnt_bias = 1;
 
 	return true;
 }
@@ -1959,13 +1960,15 @@ static bool ixgbe_can_reuse_rx_page(struct ixgbe_rx_buffer *rx_buffer,
 	unsigned int last_offset = ixgbe_rx_pg_size(rx_ring) -
 				   ixgbe_rx_bufsz(rx_ring);
 #endif
+	unsigned int pagecnt_bias = rx_buffer->pagecnt_bias--;
+
 	/* avoid re-using remote pages */
 	if (unlikely(ixgbe_page_is_reserved(page)))
 		return false;
 
 #if (PAGE_SIZE < 8192)
 	/* if we are only owner of page we can reuse it */
-	if (unlikely(page_count(page) != 1))
+	if (unlikely(page_count(page) != pagecnt_bias))
 		return false;
 
 	/* flip page offset to other buffer */
@@ -1978,10 +1981,14 @@ static bool ixgbe_can_reuse_rx_page(struct ixgbe_rx_buffer *rx_buffer,
 		return false;
 #endif
 
-	/* Even if we own the page, we are not allowed to use atomic_set()
-	 * This would break get_page_unless_zero() users.
+	/* If we have drained the page fragment pool we need to update
+	 * the pagecnt_bias and page count so that we fully restock the
+	 * number of references the driver holds.
 	 */
-	page_ref_inc(page);
+	if (unlikely(pagecnt_bias == 1)) {
+		page_ref_add(page, USHRT_MAX);
+		rx_buffer->pagecnt_bias = USHRT_MAX;
+	}
 
 	return true;
 }
@@ -2025,7 +2032,6 @@ static bool ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring,
 			return true;
 
 		/* this page cannot be reused so discard it */
-		__free_pages(page, ixgbe_rx_pg_order(rx_ring));
 		return false;
 	}
 
@@ -2104,15 +2110,19 @@ static struct sk_buff *ixgbe_fetch_rx_buffer(struct ixgbe_ring *rx_ring,
 	if (ixgbe_add_rx_frag(rx_ring, rx_buffer, size, skb)) {
 		/* hand second half of page back to the ring */
 		ixgbe_reuse_rx_page(rx_ring, rx_buffer);
-	} else if (IXGBE_CB(skb)->dma == rx_buffer->dma) {
-		/* the page has been released from the ring */
-		IXGBE_CB(skb)->page_released = true;
 	} else {
-		/* we are not reusing the buffer so unmap it */
-		dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma,
-				     ixgbe_rx_pg_size(rx_ring),
-				     DMA_FROM_DEVICE,
-				     IXGBE_RX_DMA_ATTR);
+		if (IXGBE_CB(skb)->dma == rx_buffer->dma) {
+			/* the page has been released from the ring */
+			IXGBE_CB(skb)->page_released = true;
+		} else {
+			/* we are not reusing the buffer so unmap it */
+			dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma,
+					     ixgbe_rx_pg_size(rx_ring),
+					     DMA_FROM_DEVICE,
+					     IXGBE_RX_DMA_ATTR);
+		}
+		__page_frag_cache_drain(page,
+					rx_buffer->pagecnt_bias);
 	}
 
 	/* clear contents of buffer_info */
@@ -4972,7 +4982,8 @@ static void ixgbe_clean_rx_ring(struct ixgbe_ring *rx_ring)
 				     ixgbe_rx_pg_size(rx_ring),
 				     DMA_FROM_DEVICE,
 				     IXGBE_RX_DMA_ATTR);
-		__free_pages(rx_buffer->page, ixgbe_rx_pg_order(rx_ring));
+		__page_frag_cache_drain(rx_buffer->page,
+					rx_buffer->pagecnt_bias);
 
 		rx_buffer->page = NULL;
 	}

^ permalink raw reply related

* [next PATCH 05/11] ixgbe: Make use of order 1 pages and 3K buffers independent of FCoE
From: Alexander Duyck @ 2017-01-06 16:07 UTC (permalink / raw)
  To: intel-wired-lan, jeffrey.t.kirsher; +Cc: netdev
In-Reply-To: <20170106155448.1501.31298.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

In order to support build_skb with jumbo frames it will be necessary to use
3K buffers for the Rx path with 8K pages backing them.  This is needed on
architectures that implement 4K pages because we can't support 2K buffers
plus padding in a 4K page.

In the case of systems that support page sizes larger than 4K the 3K
attribute will only be applied to FCoE as we can fall back to using just 2K
buffers and adding the padding.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h      |   20 +++++++++-----------
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   19 +++++++++++++------
 2 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 717c65b0deb2..80328e657d6a 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -234,13 +234,14 @@ struct ixgbe_rx_queue_stats {
 #define IXGBE_TS_HDR_LEN 8
 
 enum ixgbe_ring_state_t {
+	__IXGBE_RX_3K_BUFFER,
+	__IXGBE_RX_RSC_ENABLED,
+	__IXGBE_RX_CSUM_UDP_ZERO_ERR,
+	__IXGBE_RX_FCOE,
 	__IXGBE_TX_FDIR_INIT_DONE,
 	__IXGBE_TX_XPS_INIT_DONE,
 	__IXGBE_TX_DETECT_HANG,
 	__IXGBE_HANG_CHECK_ARMED,
-	__IXGBE_RX_RSC_ENABLED,
-	__IXGBE_RX_CSUM_UDP_ZERO_ERR,
-	__IXGBE_RX_FCOE,
 };
 
 struct ixgbe_fwd_adapter {
@@ -352,19 +353,16 @@ struct ixgbe_ring_feature {
  */
 static inline unsigned int ixgbe_rx_bufsz(struct ixgbe_ring *ring)
 {
-#ifdef IXGBE_FCOE
-	if (test_bit(__IXGBE_RX_FCOE, &ring->state))
-		return (PAGE_SIZE < 8192) ? IXGBE_RXBUFFER_4K :
-					    IXGBE_RXBUFFER_3K;
-#endif
+	if (test_bit(__IXGBE_RX_3K_BUFFER, &ring->state))
+		return IXGBE_RXBUFFER_3K;
 	return IXGBE_RXBUFFER_2K;
 }
 
 static inline unsigned int ixgbe_rx_pg_order(struct ixgbe_ring *ring)
 {
-#ifdef IXGBE_FCOE
-	if (test_bit(__IXGBE_RX_FCOE, &ring->state))
-		return (PAGE_SIZE < 8192) ? 1 : 0;
+#if (PAGE_SIZE < 8192)
+	if (test_bit(__IXGBE_RX_3K_BUFFER, &ring->state))
+		return 1;
 #endif
 	return 0;
 }
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 519b6a2b65c1..1b6f1f584044 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1617,6 +1617,7 @@ void ixgbe_alloc_rx_buffers(struct ixgbe_ring *rx_ring, u16 cleaned_count)
 	union ixgbe_adv_rx_desc *rx_desc;
 	struct ixgbe_rx_buffer *bi;
 	u16 i = rx_ring->next_to_use;
+	u16 bufsz;
 
 	/* nothing to do */
 	if (!cleaned_count)
@@ -1626,14 +1627,15 @@ void ixgbe_alloc_rx_buffers(struct ixgbe_ring *rx_ring, u16 cleaned_count)
 	bi = &rx_ring->rx_buffer_info[i];
 	i -= rx_ring->count;
 
+	bufsz = ixgbe_rx_bufsz(rx_ring);
+
 	do {
 		if (!ixgbe_alloc_mapped_page(rx_ring, bi))
 			break;
 
 		/* sync the buffer for use by the device */
 		dma_sync_single_range_for_device(rx_ring->dev, bi->dma,
-						 bi->page_offset,
-						 ixgbe_rx_bufsz(rx_ring),
+						 bi->page_offset, bufsz,
 						 DMA_FROM_DEVICE);
 
 		/*
@@ -2016,9 +2018,9 @@ static bool ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring,
 	struct page *page = rx_buffer->page;
 	unsigned char *va = page_address(page) + rx_buffer->page_offset;
 #if (PAGE_SIZE < 8192)
-	unsigned int truesize = ixgbe_rx_bufsz(rx_ring);
+	unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2;
 #else
-	unsigned int truesize = ALIGN(size, L1_CACHE_BYTES);
+	unsigned int truesize = SKB_DATA_ALIGN(size);
 #endif
 
 	if (unlikely(skb_is_nonlinear(skb)))
@@ -3917,10 +3919,15 @@ static void ixgbe_set_rx_buffer_len(struct ixgbe_adapter *adapter)
 	 */
 	for (i = 0; i < adapter->num_rx_queues; i++) {
 		rx_ring = adapter->rx_ring[i];
+
+		clear_ring_rsc_enabled(rx_ring);
+		clear_bit(__IXGBE_RX_3K_BUFFER, &rx_ring->state);
+
 		if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED)
 			set_ring_rsc_enabled(rx_ring);
-		else
-			clear_ring_rsc_enabled(rx_ring);
+
+		if (test_bit(__IXGBE_RX_FCOE, &rx_ring->state))
+			set_bit(__IXGBE_RX_3K_BUFFER, &rx_ring->state);
 	}
 }
 

^ permalink raw reply related

* [next PATCH 06/11] ixgbe: Use length to determine if descriptor is done
From: Alexander Duyck @ 2017-01-06 16:07 UTC (permalink / raw)
  To: intel-wired-lan, jeffrey.t.kirsher; +Cc: netdev
In-Reply-To: <20170106155448.1501.31298.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This change makes it so that we use the length of the packet instead of the
DD status bit to determine if a new descriptor is ready to be processed.
The obvious advantage is that it cuts down on reads as we don't really even
need the DD bit if going from a 0 to a non-zero value on size is enough to
inform us that the packet has been completed.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 1b6f1f584044..6ce596816954 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1653,8 +1653,8 @@ void ixgbe_alloc_rx_buffers(struct ixgbe_ring *rx_ring, u16 cleaned_count)
 			i -= rx_ring->count;
 		}
 
-		/* clear the status bits for the next_to_use descriptor */
-		rx_desc->wb.upper.status_error = 0;
+		/* clear the length for the next_to_use descriptor */
+		rx_desc->wb.upper.length = 0;
 
 		cleaned_count--;
 	} while (cleaned_count);
@@ -2170,7 +2170,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 
 		rx_desc = IXGBE_RX_DESC(rx_ring, rx_ring->next_to_clean);
 
-		if (!rx_desc->wb.upper.status_error)
+		if (!rx_desc->wb.upper.length)
 			break;
 
 		/* This memory barrier is needed to keep us from reading
@@ -3749,6 +3749,7 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter,
 			     struct ixgbe_ring *ring)
 {
 	struct ixgbe_hw *hw = &adapter->hw;
+	union ixgbe_adv_rx_desc *rx_desc;
 	u64 rdba = ring->dma;
 	u32 rxdctl;
 	u8 reg_idx = ring->reg_idx;
@@ -3783,6 +3784,10 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter,
 		rxdctl |=  0x080420;
 	}
 
+	/* initialize Rx descriptor 0 */
+	rx_desc = IXGBE_RX_DESC(ring, 0);
+	rx_desc->wb.upper.length = 0;
+
 	/* enable receive descriptor ring */
 	rxdctl |= IXGBE_RXDCTL_ENABLE;
 	IXGBE_WRITE_REG(hw, IXGBE_RXDCTL(reg_idx), rxdctl);
@@ -4998,9 +5003,6 @@ static void ixgbe_clean_rx_ring(struct ixgbe_ring *rx_ring)
 	size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count;
 	memset(rx_ring->rx_buffer_info, 0, size);
 
-	/* Zero out the descriptor ring */
-	memset(rx_ring->desc, 0, rx_ring->size);
-
 	rx_ring->next_to_alloc = 0;
 	rx_ring->next_to_clean = 0;
 	rx_ring->next_to_use = 0;

^ permalink raw reply related

* [next PATCH 07/11] ixgbe: Break out Rx buffer page management
From: Alexander Duyck @ 2017-01-06 16:07 UTC (permalink / raw)
  To: intel-wired-lan, jeffrey.t.kirsher; +Cc: netdev
In-Reply-To: <20170106155448.1501.31298.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

We are going to be expanding the number of Rx paths in the driver.  Instead
of duplicating all that code I am pulling it apart into separate functions
so that we don't have so much code duplication.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  230 +++++++++++++------------
 1 file changed, 122 insertions(+), 108 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 6ce596816954..79495cc990c2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1864,7 +1864,6 @@ static void ixgbe_dma_sync_frag(struct ixgbe_ring *rx_ring,
 				     ixgbe_rx_pg_size(rx_ring),
 				     DMA_FROM_DEVICE,
 				     IXGBE_RX_DMA_ATTR);
-		IXGBE_CB(skb)->page_released = false;
 	} else {
 		struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0];
 
@@ -1874,7 +1873,6 @@ static void ixgbe_dma_sync_frag(struct ixgbe_ring *rx_ring,
 					      skb_frag_size(frag),
 					      DMA_FROM_DEVICE);
 	}
-	IXGBE_CB(skb)->dma = 0;
 }
 
 /**
@@ -1945,8 +1943,14 @@ static void ixgbe_reuse_rx_page(struct ixgbe_ring *rx_ring,
 	nta++;
 	rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
 
-	/* transfer page from old buffer to new buffer */
-	*new_buff = *old_buff;
+	/* Transfer page from old buffer to new buffer.
+	 * Move each member individually to avoid possible store
+	 * forwarding stalls and unnecessary copy of skb.
+	 */
+	new_buff->dma		= old_buff->dma;
+	new_buff->page		= old_buff->page;
+	new_buff->page_offset	= old_buff->page_offset;
+	new_buff->pagecnt_bias	= old_buff->pagecnt_bias;
 }
 
 static inline bool ixgbe_page_is_reserved(struct page *page)
@@ -1954,15 +1958,10 @@ static inline bool ixgbe_page_is_reserved(struct page *page)
 	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
 }
 
-static bool ixgbe_can_reuse_rx_page(struct ixgbe_rx_buffer *rx_buffer,
-				    struct page *page,
-				    const unsigned int truesize)
+static bool ixgbe_can_reuse_rx_page(struct ixgbe_rx_buffer *rx_buffer)
 {
-#if (PAGE_SIZE >= 8192)
-	unsigned int last_offset = ixgbe_rx_pg_size(rx_ring) -
-				   ixgbe_rx_bufsz(rx_ring);
-#endif
-	unsigned int pagecnt_bias = rx_buffer->pagecnt_bias--;
+	unsigned int pagecnt_bias = rx_buffer->pagecnt_bias;
+	struct page *page = rx_buffer->page;
 
 	/* avoid re-using remote pages */
 	if (unlikely(ixgbe_page_is_reserved(page)))
@@ -1970,16 +1969,17 @@ static bool ixgbe_can_reuse_rx_page(struct ixgbe_rx_buffer *rx_buffer,
 
 #if (PAGE_SIZE < 8192)
 	/* if we are only owner of page we can reuse it */
-	if (unlikely(page_count(page) != pagecnt_bias))
+	if (unlikely((page_ref_count(page) - pagecnt_bias) > 1))
 		return false;
-
-	/* flip page offset to other buffer */
-	rx_buffer->page_offset ^= truesize;
 #else
-	/* move offset up to the next cache line */
-	rx_buffer->page_offset += truesize;
-
-	if (rx_buffer->page_offset > last_offset)
+	/* The last offset is a bit aggressive in that we assume the
+	 * worst case of FCoE being enabled and using a 3K buffer.
+	 * However this should have minimal impact as the 1K extra is
+	 * still less than one buffer in size.
+	 */
+#define IXGBE_LAST_OFFSET \
+	(SKB_WITH_OVERHEAD(PAGE_SIZE) - IXGBE_RXBUFFER_3K)
+	if (rx_buffer->page_offset > IXGBE_LAST_OFFSET)
 		return false;
 #endif
 
@@ -1987,7 +1987,7 @@ static bool ixgbe_can_reuse_rx_page(struct ixgbe_rx_buffer *rx_buffer,
 	 * the pagecnt_bias and page count so that we fully restock the
 	 * number of references the driver holds.
 	 */
-	if (unlikely(pagecnt_bias == 1)) {
+	if (unlikely(!pagecnt_bias)) {
 		page_ref_add(page, USHRT_MAX);
 		rx_buffer->pagecnt_bias = USHRT_MAX;
 	}
@@ -2010,106 +2010,65 @@ static bool ixgbe_can_reuse_rx_page(struct ixgbe_rx_buffer *rx_buffer,
  * The function will then update the page offset if necessary and return
  * true if the buffer can be reused by the adapter.
  **/
-static bool ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring,
+static void ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring,
 			      struct ixgbe_rx_buffer *rx_buffer,
-			      unsigned int size,
-			      struct sk_buff *skb)
+			      struct sk_buff *skb,
+			      unsigned int size)
 {
-	struct page *page = rx_buffer->page;
-	unsigned char *va = page_address(page) + rx_buffer->page_offset;
 #if (PAGE_SIZE < 8192)
 	unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2;
 #else
 	unsigned int truesize = SKB_DATA_ALIGN(size);
 #endif
-
-	if (unlikely(skb_is_nonlinear(skb)))
-		goto add_tail_frag;
-
-	if (size <= IXGBE_RX_HDR_SIZE) {
-		memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));
-
-		/* page is not reserved, we can reuse buffer as-is */
-		if (likely(!ixgbe_page_is_reserved(page)))
-			return true;
-
-		/* this page cannot be reused so discard it */
-		return false;
-	}
-
-add_tail_frag:
-	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
+	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buffer->page,
 			rx_buffer->page_offset, size, truesize);
-
-	return ixgbe_can_reuse_rx_page(rx_buffer, page, truesize);
+#if (PAGE_SIZE < 8192)
+	rx_buffer->page_offset ^= truesize;
+#else
+	rx_buffer->page_offset += truesize;
+#endif
 }
 
-static struct sk_buff *ixgbe_fetch_rx_buffer(struct ixgbe_ring *rx_ring,
-					     union ixgbe_adv_rx_desc *rx_desc)
+static struct ixgbe_rx_buffer *ixgbe_get_rx_buffer(struct ixgbe_ring *rx_ring,
+						   union ixgbe_adv_rx_desc *rx_desc,
+						   struct sk_buff **skb,
+						   const unsigned int size)
 {
-	unsigned int size = le16_to_cpu(rx_desc->wb.upper.length);
 	struct ixgbe_rx_buffer *rx_buffer;
-	struct sk_buff *skb;
-	struct page *page;
 
 	rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
-	page = rx_buffer->page;
-	prefetchw(page);
-
-	skb = rx_buffer->skb;
-
-	if (likely(!skb)) {
-		void *page_addr = page_address(page) +
-				  rx_buffer->page_offset;
+	prefetchw(rx_buffer->page);
+	*skb = rx_buffer->skb;
 
-		/* prefetch first cache line of first page */
-		prefetch(page_addr);
-#if L1_CACHE_BYTES < 128
-		prefetch(page_addr + L1_CACHE_BYTES);
-#endif
-
-		/* allocate a skb to store the frags */
-		skb = napi_alloc_skb(&rx_ring->q_vector->napi,
-				     IXGBE_RX_HDR_SIZE);
-		if (unlikely(!skb)) {
-			rx_ring->rx_stats.alloc_rx_buff_failed++;
-			return NULL;
-		}
-
-		/*
-		 * we will be copying header into skb->data in
-		 * pskb_may_pull so it is in our interest to prefetch
-		 * it now to avoid a possible cache miss
-		 */
-		prefetchw(skb->data);
-
-		/*
-		 * Delay unmapping of the first packet. It carries the
-		 * header information, HW may still access the header
-		 * after the writeback.  Only unmap it when EOP is
-		 * reached
-		 */
-		if (likely(ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP)))
-			goto dma_sync;
-
-		IXGBE_CB(skb)->dma = rx_buffer->dma;
+	/* Delay unmapping of the first packet. It carries the header
+	 * information, HW may still access the header after the writeback.
+	 * Only unmap it when EOP is reached
+	 */
+	if (!ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP)) {
+		if (!*skb)
+			goto skip_sync;
 	} else {
-		if (ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP))
-			ixgbe_dma_sync_frag(rx_ring, skb);
+		if (*skb)
+			ixgbe_dma_sync_frag(rx_ring, *skb);
+	}
 
-dma_sync:
-		/* we are reusing so sync this buffer for CPU use */
-		dma_sync_single_range_for_cpu(rx_ring->dev,
-					      rx_buffer->dma,
-					      rx_buffer->page_offset,
-					      size,
-					      DMA_FROM_DEVICE);
+	/* we are reusing so sync this buffer for CPU use */
+	dma_sync_single_range_for_cpu(rx_ring->dev,
+				      rx_buffer->dma,
+				      rx_buffer->page_offset,
+				      size,
+				      DMA_FROM_DEVICE);
+skip_sync:
+	rx_buffer->pagecnt_bias--;
 
-		rx_buffer->skb = NULL;
-	}
+	return rx_buffer;
+}
 
-	/* pull page into skb */
-	if (ixgbe_add_rx_frag(rx_ring, rx_buffer, size, skb)) {
+static void ixgbe_put_rx_buffer(struct ixgbe_ring *rx_ring,
+				struct ixgbe_rx_buffer *rx_buffer,
+				struct sk_buff *skb)
+{
+	if (ixgbe_can_reuse_rx_page(rx_buffer)) {
 		/* hand second half of page back to the ring */
 		ixgbe_reuse_rx_page(rx_ring, rx_buffer);
 	} else {
@@ -2123,12 +2082,55 @@ static struct sk_buff *ixgbe_fetch_rx_buffer(struct ixgbe_ring *rx_ring,
 					     DMA_FROM_DEVICE,
 					     IXGBE_RX_DMA_ATTR);
 		}
-		__page_frag_cache_drain(page,
+		__page_frag_cache_drain(rx_buffer->page,
 					rx_buffer->pagecnt_bias);
 	}
 
-	/* clear contents of buffer_info */
+	/* clear contents of rx_buffer */
 	rx_buffer->page = NULL;
+	rx_buffer->skb = NULL;
+}
+
+static struct sk_buff *ixgbe_construct_skb(struct ixgbe_ring *rx_ring,
+					   struct ixgbe_rx_buffer *rx_buffer,
+					   union ixgbe_adv_rx_desc *rx_desc,
+					   unsigned int size)
+{
+	void *va = page_address(rx_buffer->page) + rx_buffer->page_offset;
+#if (PAGE_SIZE < 8192)
+	unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2;
+#else
+	unsigned int truesize = ALIGN(size, L1_CACHE_BYTES);
+#endif
+	struct sk_buff *skb;
+
+	/* prefetch first cache line of first page */
+	prefetch(va);
+#if L1_CACHE_BYTES < 128
+	prefetch(va + L1_CACHE_BYTES);
+#endif
+
+	/* allocate a skb to store the frags */
+	skb = napi_alloc_skb(&rx_ring->q_vector->napi, IXGBE_RX_HDR_SIZE);
+	if (unlikely(!skb))
+		return NULL;
+
+	if (size > IXGBE_RX_HDR_SIZE) {
+		if (!ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP))
+			IXGBE_CB(skb)->dma = rx_buffer->dma;
+
+		skb_add_rx_frag(skb, 0, rx_buffer->page,
+				rx_buffer->page_offset,
+				size, truesize);
+#if (PAGE_SIZE < 8192)
+		rx_buffer->page_offset ^= truesize;
+#else
+		rx_buffer->page_offset += truesize;
+#endif
+	} else {
+		memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));
+		rx_buffer->pagecnt_bias++;
+	}
 
 	return skb;
 }
@@ -2160,7 +2162,9 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 
 	while (likely(total_rx_packets < budget)) {
 		union ixgbe_adv_rx_desc *rx_desc;
+		struct ixgbe_rx_buffer *rx_buffer;
 		struct sk_buff *skb;
+		unsigned int size;
 
 		/* return some buffers to hardware, one at a time is too slow */
 		if (cleaned_count >= IXGBE_RX_BUFFER_WRITE) {
@@ -2169,8 +2173,8 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 		}
 
 		rx_desc = IXGBE_RX_DESC(rx_ring, rx_ring->next_to_clean);
-
-		if (!rx_desc->wb.upper.length)
+		size = le16_to_cpu(rx_desc->wb.upper.length);
+		if (!size)
 			break;
 
 		/* This memory barrier is needed to keep us from reading
@@ -2179,13 +2183,23 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 		 */
 		dma_rmb();
 
+		rx_buffer = ixgbe_get_rx_buffer(rx_ring, rx_desc, &skb, size);
+
 		/* retrieve a buffer from the ring */
-		skb = ixgbe_fetch_rx_buffer(rx_ring, rx_desc);
+		if (skb)
+			ixgbe_add_rx_frag(rx_ring, rx_buffer, skb, size);
+		else
+			skb = ixgbe_construct_skb(rx_ring, rx_buffer,
+						  rx_desc, size);
 
 		/* exit if we failed to retrieve a buffer */
-		if (!skb)
+		if (!skb) {
+			rx_ring->rx_stats.alloc_rx_buff_failed++;
+			rx_buffer->pagecnt_bias++;
 			break;
+		}
 
+		ixgbe_put_rx_buffer(rx_ring, rx_buffer, skb);
 		cleaned_count++;
 
 		/* place incomplete frames back on ring for completion */

^ permalink raw reply related

* [next PATCH 08/11] ixgbe: Add support for padding packet
From: Alexander Duyck @ 2017-01-06 16:07 UTC (permalink / raw)
  To: intel-wired-lan, jeffrey.t.kirsher; +Cc: netdev
In-Reply-To: <20170106155448.1501.31298.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch adds support for providing a buffer with headroom and tailroom
to allow for shared info, NET_SKB_PAD, and NET_IP_ALIGN.  With this
combined with the DMA changes we can start using build_skb to build frames
around an incoming Rx buffer instead of having to memcpy the headers.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h      |   17 ++++++++++
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   43 +++++++++++++++++++++++--
 2 files changed, 56 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 80328e657d6a..3537d07b4807 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -94,6 +94,14 @@
 #define IXGBE_RXBUFFER_4K    4096
 #define IXGBE_MAX_RXBUFFER  16384  /* largest size for a single descriptor */
 
+#define IXGBE_SKB_PAD		(NET_SKB_PAD + NET_IP_ALIGN)
+#if (PAGE_SIZE < 8192)
+#define IXGBE_MAX_FRAME_BUILD_SKB \
+	(SKB_WITH_OVERHEAD(IXGBE_RXBUFFER_2K) - IXGBE_SKB_PAD)
+#else
+#define IGB_MAX_FRAME_BUILD_SKB IXGBE_RXBUFFER_2K
+#endif
+
 /*
  * NOTE: netdev_alloc_skb reserves up to 64 bytes, NET_IP_ALIGN means we
  * reserve 64 more, and skb_shared_info adds an additional 320 bytes more,
@@ -235,6 +243,7 @@ struct ixgbe_rx_queue_stats {
 
 enum ixgbe_ring_state_t {
 	__IXGBE_RX_3K_BUFFER,
+	__IXGBE_RX_BUILD_SKB_ENABLED,
 	__IXGBE_RX_RSC_ENABLED,
 	__IXGBE_RX_CSUM_UDP_ZERO_ERR,
 	__IXGBE_RX_FCOE,
@@ -244,6 +253,9 @@ enum ixgbe_ring_state_t {
 	__IXGBE_HANG_CHECK_ARMED,
 };
 
+#define ring_uses_build_skb(ring) \
+	test_bit(__IXGBE_RX_BUILD_SKB_ENABLED, &(ring)->state)
+
 struct ixgbe_fwd_adapter {
 	unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
 	struct net_device *netdev;
@@ -355,6 +367,10 @@ static inline unsigned int ixgbe_rx_bufsz(struct ixgbe_ring *ring)
 {
 	if (test_bit(__IXGBE_RX_3K_BUFFER, &ring->state))
 		return IXGBE_RXBUFFER_3K;
+#if (PAGE_SIZE < 8192)
+	if (ring_uses_build_skb(ring))
+		return IXGBE_MAX_FRAME_BUILD_SKB;
+#endif
 	return IXGBE_RXBUFFER_2K;
 }
 
@@ -670,6 +686,7 @@ struct ixgbe_adapter {
 #define IXGBE_FLAG2_VLAN_PROMISC		BIT(13)
 #define IXGBE_FLAG2_EEE_CAPABLE			BIT(14)
 #define IXGBE_FLAG2_EEE_ENABLED			BIT(15)
+#define IXGBE_FLAG2_RX_LEGACY			BIT(16)
 
 	/* Tx fast path data */
 	int num_tx_queues;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 79495cc990c2..8529eafb9717 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1565,6 +1565,11 @@ static inline void ixgbe_rx_checksum(struct ixgbe_ring *ring,
 	}
 }
 
+static inline unsigned int ixgbe_rx_offset(struct ixgbe_ring *rx_ring)
+{
+	return ring_uses_build_skb(rx_ring) ? IXGBE_SKB_PAD : 0;
+}
+
 static bool ixgbe_alloc_mapped_page(struct ixgbe_ring *rx_ring,
 				    struct ixgbe_rx_buffer *bi)
 {
@@ -1601,7 +1606,7 @@ static bool ixgbe_alloc_mapped_page(struct ixgbe_ring *rx_ring,
 
 	bi->dma = dma;
 	bi->page = page;
-	bi->page_offset = 0;
+	bi->page_offset = ixgbe_rx_offset(rx_ring);
 	bi->pagecnt_bias = 1;
 
 	return true;
@@ -2018,7 +2023,9 @@ static void ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring,
 #if (PAGE_SIZE < 8192)
 	unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2;
 #else
-	unsigned int truesize = SKB_DATA_ALIGN(size);
+	unsigned int truesize = ring_uses_build_skb(rx_ring) ?
+				SKB_DATA_ALIGN(IXGBE_SKB_PAD + size) :
+				SKB_DATA_ALIGN(size);
 #endif
 	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buffer->page,
 			rx_buffer->page_offset, size, truesize);
@@ -2100,7 +2107,7 @@ static struct sk_buff *ixgbe_construct_skb(struct ixgbe_ring *rx_ring,
 #if (PAGE_SIZE < 8192)
 	unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2;
 #else
-	unsigned int truesize = ALIGN(size, L1_CACHE_BYTES);
+	unsigned int truesize = SKB_DATA_ALIGN(size);
 #endif
 	struct sk_buff *skb;
 
@@ -3462,7 +3469,10 @@ static void ixgbe_configure_srrctl(struct ixgbe_adapter *adapter,
 	srrctl = IXGBE_RX_HDR_SIZE << IXGBE_SRRCTL_BSIZEHDRSIZE_SHIFT;
 
 	/* configure the packet buffer length */
-	srrctl |= ixgbe_rx_bufsz(rx_ring) >> IXGBE_SRRCTL_BSIZEPKT_SHIFT;
+	if (test_bit(__IXGBE_RX_3K_BUFFER, &rx_ring->state))
+		srrctl |= IXGBE_RXBUFFER_3K >> IXGBE_SRRCTL_BSIZEPKT_SHIFT;
+	else
+		srrctl |= IXGBE_RXBUFFER_2K >> IXGBE_SRRCTL_BSIZEPKT_SHIFT;
 
 	/* configure descriptor type */
 	srrctl |= IXGBE_SRRCTL_DESCTYPE_ADV_ONEBUF;
@@ -3796,6 +3806,17 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter,
 		 */
 		rxdctl &= ~0x3FFFFF;
 		rxdctl |=  0x080420;
+#if (PAGE_SIZE < 8192)
+	} else {
+		rxdctl &= ~(IXGBE_RXDCTL_RLPMLMASK |
+			    IXGBE_RXDCTL_RLPML_EN);
+
+		/* Limit the maximum frame size so we don't overrun the skb */
+		if (ring_uses_build_skb(ring) &&
+		    !test_bit(__IXGBE_RX_3K_BUFFER, &ring->state))
+			rxdctl |= IXGBE_MAX_FRAME_BUILD_SKB |
+				  IXGBE_RXDCTL_RLPML_EN;
+#endif
 	}
 
 	/* initialize Rx descriptor 0 */
@@ -3941,12 +3962,26 @@ static void ixgbe_set_rx_buffer_len(struct ixgbe_adapter *adapter)
 
 		clear_ring_rsc_enabled(rx_ring);
 		clear_bit(__IXGBE_RX_3K_BUFFER, &rx_ring->state);
+		clear_bit(__IXGBE_RX_BUILD_SKB_ENABLED, &rx_ring->state);
 
 		if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED)
 			set_ring_rsc_enabled(rx_ring);
 
 		if (test_bit(__IXGBE_RX_FCOE, &rx_ring->state))
 			set_bit(__IXGBE_RX_3K_BUFFER, &rx_ring->state);
+
+		if (adapter->flags2 & IXGBE_FLAG2_RX_LEGACY)
+			continue;
+
+		set_bit(__IXGBE_RX_BUILD_SKB_ENABLED, &rx_ring->state);
+
+#if (PAGE_SIZE < 8192)
+		if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED)
+			set_bit(__IXGBE_RX_3K_BUFFER, &rx_ring->state);
+
+		if (max_frame > (ETH_FRAME_LEN + ETH_FCS_LEN))
+			set_bit(__IXGBE_RX_3K_BUFFER, &rx_ring->state);
+#endif
 	}
 }
 

^ permalink raw reply related

* [next PATCH 09/11] ixgbe: Add private flag to control buffer mode
From: Alexander Duyck @ 2017-01-06 16:07 UTC (permalink / raw)
  To: intel-wired-lan, jeffrey.t.kirsher; +Cc: netdev
In-Reply-To: <20170106155448.1501.31298.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

Since there are potential drawbacks to the new Rx allocation approach I
thought it best to add a "chicken bit" so that we can turn the feature off
if in the event that a problem is found.

It also provides a means of validating the legacy Rx path in the event that
we are forced to fall back.  At some point in the future when we are
convinced we don't need it anymore we might be able to drop the legacy-rx
flag.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c |   47 ++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index aa80ece9ea96..6466d828bc6f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -151,6 +151,13 @@ struct ixgbe_stats {
 };
 #define IXGBE_TEST_LEN sizeof(ixgbe_gstrings_test) / ETH_GSTRING_LEN
 
+static const char ixgbe_priv_flags_strings[][ETH_GSTRING_LEN] = {
+#define IXGBE_PRIV_FLAGS_LEGACY_RX	BIT(0)
+	"legacy-rx",
+};
+
+#define IXGBE_PRIV_FLAGS_STR_LEN ARRAY_SIZE(ixgbe_priv_flags_strings)
+
 /* currently supported speeds for 10G */
 #define ADVRTSD_MSK_10G (SUPPORTED_10000baseT_Full | \
 			 SUPPORTED_10000baseKX4_Full | \
@@ -1002,6 +1009,8 @@ static void ixgbe_get_drvinfo(struct net_device *netdev,
 
 	strlcpy(drvinfo->bus_info, pci_name(adapter->pdev),
 		sizeof(drvinfo->bus_info));
+
+	drvinfo->n_priv_flags = IXGBE_PRIV_FLAGS_STR_LEN;
 }
 
 static void ixgbe_get_ringparam(struct net_device *netdev,
@@ -1141,6 +1150,8 @@ static int ixgbe_get_sset_count(struct net_device *netdev, int sset)
 		return IXGBE_TEST_LEN;
 	case ETH_SS_STATS:
 		return IXGBE_STATS_LEN;
+	case ETH_SS_PRIV_FLAGS:
+		return IXGBE_PRIV_FLAGS_STR_LEN;
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -1305,6 +1316,9 @@ static void ixgbe_get_strings(struct net_device *netdev, u32 stringset,
 		}
 		/* BUG_ON(p - data != IXGBE_STATS_LEN * ETH_GSTRING_LEN); */
 		break;
+	case ETH_SS_PRIV_FLAGS:
+		memcpy(data, ixgbe_priv_flags_strings,
+		       IXGBE_PRIV_FLAGS_STR_LEN * ETH_GSTRING_LEN);
 	}
 }
 
@@ -3386,6 +3400,37 @@ static int ixgbe_set_eee(struct net_device *netdev, struct ethtool_eee *edata)
 	return 0;
 }
 
+static u32 ixgbe_get_priv_flags(struct net_device *netdev)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	u32 priv_flags = 0;
+
+	if (adapter->flags2 & IXGBE_FLAG2_RX_LEGACY)
+		priv_flags |= IXGBE_PRIV_FLAGS_LEGACY_RX;
+
+	return priv_flags;
+}
+
+static int ixgbe_set_priv_flags(struct net_device *netdev, u32 priv_flags)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	unsigned int flags2 = adapter->flags2;
+
+	flags2 &= ~IXGBE_FLAG2_RX_LEGACY;
+	if (priv_flags & IXGBE_PRIV_FLAGS_LEGACY_RX)
+		flags2 |= IXGBE_FLAG2_RX_LEGACY;
+
+	if (flags2 != adapter->flags2) {
+		adapter->flags2 = flags2;
+
+		/* reset interface to repopulate queues */
+		if (netif_running(netdev))
+			ixgbe_reinit_locked(adapter);
+	}
+
+	return 0;
+}
+
 static const struct ethtool_ops ixgbe_ethtool_ops = {
 	.get_settings           = ixgbe_get_settings,
 	.set_settings           = ixgbe_set_settings,
@@ -3422,6 +3467,8 @@ static int ixgbe_set_eee(struct net_device *netdev, struct ethtool_eee *edata)
 	.set_eee		= ixgbe_set_eee,
 	.get_channels		= ixgbe_get_channels,
 	.set_channels		= ixgbe_set_channels,
+	.get_priv_flags		= ixgbe_get_priv_flags,
+	.set_priv_flags		= ixgbe_set_priv_flags,
 	.get_ts_info		= ixgbe_get_ts_info,
 	.get_module_info	= ixgbe_get_module_info,
 	.get_module_eeprom	= ixgbe_get_module_eeprom,

^ permalink raw reply related

* [next PATCH 10/11] ixgbe: Add support for build_skb
From: Alexander Duyck @ 2017-01-06 16:07 UTC (permalink / raw)
  To: intel-wired-lan, jeffrey.t.kirsher; +Cc: netdev
In-Reply-To: <20170106155448.1501.31298.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch adds build_skb support to the Rx path.  There are several
advantages to this change.

1.  It avoids the memcpy and skb->head allocation for small packets which
    improves performance by about 5% in my tests.
2.  It avoids the memcpy, skb->head allocation, and eth_get_headlen
    for larger packets improving performance by about 10% in my tests.
3.  For VXLAN packets it allows the full header to be in skb->data which
    improves the performance by as much as 30% in some of my tests.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   49 ++++++++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 8529eafb9717..dd04b76ae0bd 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1913,7 +1913,7 @@ static bool ixgbe_cleanup_headers(struct ixgbe_ring *rx_ring,
 	}
 
 	/* place header in linear portion of buffer */
-	if (skb_is_nonlinear(skb))
+	if (!skb_headlen(skb))
 		ixgbe_pull_tail(rx_ring, skb);
 
 #ifdef IXGBE_FCOE
@@ -2142,6 +2142,49 @@ static struct sk_buff *ixgbe_construct_skb(struct ixgbe_ring *rx_ring,
 	return skb;
 }
 
+static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
+				       struct ixgbe_rx_buffer *rx_buffer,
+				       union ixgbe_adv_rx_desc *rx_desc,
+				       unsigned int size)
+{
+	void *va = page_address(rx_buffer->page) + rx_buffer->page_offset;
+#if (PAGE_SIZE < 8192)
+	unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2;
+#else
+	unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) +
+				SKB_DATA_ALIGN(IXGBE_SKB_PAD + size);
+#endif
+	struct sk_buff *skb;
+
+	/* prefetch first cache line of first page */
+	prefetch(va);
+#if L1_CACHE_BYTES < 128
+	prefetch(va + L1_CACHE_BYTES);
+#endif
+
+	/* build an skb to around the page buffer */
+	skb = build_skb(va - IXGBE_SKB_PAD, truesize);
+	if (unlikely(!skb))
+		return NULL;
+
+	/* update pointers within the skb to store the data */
+	skb_reserve(skb, IXGBE_SKB_PAD);
+	__skb_put(skb, size);
+
+	/* record DMA address if this is the start of a chain of buffers */
+	if (!ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP))
+		IXGBE_CB(skb)->dma = rx_buffer->dma;
+
+	/* update buffer offset */
+#if (PAGE_SIZE < 8192)
+	rx_buffer->page_offset ^= truesize;
+#else
+	rx_buffer->page_offset += truesize;
+#endif
+
+	return skb;
+}
+
 /**
  * ixgbe_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
  * @q_vector: structure containing interrupt and ring information
@@ -2195,6 +2238,9 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 		/* retrieve a buffer from the ring */
 		if (skb)
 			ixgbe_add_rx_frag(rx_ring, rx_buffer, skb, size);
+		else if (ring_uses_build_skb(rx_ring))
+			skb = ixgbe_build_skb(rx_ring, rx_buffer,
+					      rx_desc, size);
 		else
 			skb = ixgbe_construct_skb(rx_ring, rx_buffer,
 						  rx_desc, size);
@@ -3970,6 +4016,7 @@ static void ixgbe_set_rx_buffer_len(struct ixgbe_adapter *adapter)
 		if (test_bit(__IXGBE_RX_FCOE, &rx_ring->state))
 			set_bit(__IXGBE_RX_3K_BUFFER, &rx_ring->state);
 
+		clear_bit(__IXGBE_RX_BUILD_SKB_ENABLED, &rx_ring->state);
 		if (adapter->flags2 & IXGBE_FLAG2_RX_LEGACY)
 			continue;
 

^ permalink raw reply related

* [next PATCH 11/11] ixgbe: Don't bother clearing buffer memory for descriptor rings
From: Alexander Duyck @ 2017-01-06 16:07 UTC (permalink / raw)
  To: intel-wired-lan, jeffrey.t.kirsher; +Cc: netdev
In-Reply-To: <20170106155448.1501.31298.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch makes it so that we don't need to bother with clearing the
memory out for the descriptor rings.  The general idea is to only free
buffers associated with buffers in use which are located between the
next_to_clean and next_to_use or next_to_alloc values.  Everything outside
of those regions can be safely ignored since they should have no buffers
associated with them.

The advantage to doing things this way is that is should speed up bring-up
and tear-down of the rings.  Specifically we can avoid the 512 or more
cycles required to memset the rings in tear-down.  In the bring-up phase we
then clear the memory as a part of initialization.  The general idea is
that the clearing in initialization can act as a prefetch of sorts for the
buffer info structures so they are in the local CPU when we go to populate
them.  This should help to improve overall time needed to perform a
suspend/resume.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c |   11 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c    |  158 ++++++++++++----------
 2 files changed, 98 insertions(+), 71 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index 6466d828bc6f..8c6f40d4db6f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -1945,7 +1945,16 @@ static u16 ixgbe_clean_test_rings(struct ixgbe_ring *rx_ring,
 
 		/* unmap buffer on Tx side */
 		tx_buffer = &tx_ring->tx_buffer_info[tx_ntc];
-		ixgbe_unmap_and_free_tx_resource(tx_ring, tx_buffer);
+
+		/* Free all the Tx ring sk_buffs */
+		dev_kfree_skb_any(tx_buffer->skb);
+
+		/* unmap skb header data */
+		dma_unmap_single(tx_ring->dev,
+				 dma_unmap_addr(tx_buffer, dma),
+				 dma_unmap_len(tx_buffer, len),
+				 DMA_TO_DEVICE);
+		dma_unmap_len_set(tx_buffer, len, 0);
 
 		/* increment Rx/Tx next to clean counters */
 		rx_ntc++;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index dd04b76ae0bd..6ec0ebf7f174 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -958,28 +958,6 @@ static inline void ixgbe_irq_rearm_queues(struct ixgbe_adapter *adapter,
 	}
 }
 
-void ixgbe_unmap_and_free_tx_resource(struct ixgbe_ring *ring,
-				      struct ixgbe_tx_buffer *tx_buffer)
-{
-	if (tx_buffer->skb) {
-		dev_kfree_skb_any(tx_buffer->skb);
-		if (dma_unmap_len(tx_buffer, len))
-			dma_unmap_single(ring->dev,
-					 dma_unmap_addr(tx_buffer, dma),
-					 dma_unmap_len(tx_buffer, len),
-					 DMA_TO_DEVICE);
-	} else if (dma_unmap_len(tx_buffer, len)) {
-		dma_unmap_page(ring->dev,
-			       dma_unmap_addr(tx_buffer, dma),
-			       dma_unmap_len(tx_buffer, len),
-			       DMA_TO_DEVICE);
-	}
-	tx_buffer->next_to_watch = NULL;
-	tx_buffer->skb = NULL;
-	dma_unmap_len_set(tx_buffer, len, 0);
-	/* tx_buffer must be completely set up in the transmit path */
-}
-
 static void ixgbe_update_xoff_rx_lfc(struct ixgbe_adapter *adapter)
 {
 	struct ixgbe_hw *hw = &adapter->hw;
@@ -1211,7 +1189,6 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
 				 DMA_TO_DEVICE);
 
 		/* clear tx_buffer data */
-		tx_buffer->skb = NULL;
 		dma_unmap_len_set(tx_buffer, len, 0);
 
 		/* unmap remaining buffers */
@@ -3345,6 +3322,10 @@ void ixgbe_configure_tx_ring(struct ixgbe_adapter *adapter,
 
 	clear_bit(__IXGBE_HANG_CHECK_ARMED, &ring->state);
 
+	/* reinitialize tx_buffer_info */
+	memset(ring->tx_buffer_info, 0,
+	       sizeof(struct ixgbe_tx_buffer) * ring->count);
+
 	/* enable queue */
 	IXGBE_WRITE_REG(hw, IXGBE_TXDCTL(reg_idx), txdctl);
 
@@ -3865,6 +3846,10 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter,
 #endif
 	}
 
+	/* initialize rx_buffer_info */
+	memset(ring->rx_buffer_info, 0,
+	       sizeof(struct ixgbe_rx_buffer) * ring->count);
+
 	/* initialize Rx descriptor 0 */
 	rx_desc = IXGBE_RX_DESC(ring, 0);
 	rx_desc->wb.upper.length = 0;
@@ -5049,33 +5034,22 @@ static void ixgbe_fwd_psrtype(struct ixgbe_fwd_adapter *vadapter)
  **/
 static void ixgbe_clean_rx_ring(struct ixgbe_ring *rx_ring)
 {
-	struct device *dev = rx_ring->dev;
-	unsigned long size;
-	u16 i;
-
-	/* ring already cleared, nothing to do */
-	if (!rx_ring->rx_buffer_info)
-		return;
+	u16 i = rx_ring->next_to_clean;
+	struct ixgbe_rx_buffer *rx_buffer = &rx_ring->rx_buffer_info[i];
 
 	/* Free all the Rx ring sk_buffs */
-	for (i = 0; i < rx_ring->count; i++) {
-		struct ixgbe_rx_buffer *rx_buffer = &rx_ring->rx_buffer_info[i];
-
+	while (i != rx_ring->next_to_alloc) {
 		if (rx_buffer->skb) {
 			struct sk_buff *skb = rx_buffer->skb;
 			if (IXGBE_CB(skb)->page_released)
-				dma_unmap_page_attrs(dev,
+				dma_unmap_page_attrs(rx_ring->dev,
 						     IXGBE_CB(skb)->dma,
 						     ixgbe_rx_pg_size(rx_ring),
 						     DMA_FROM_DEVICE,
 						     IXGBE_RX_DMA_ATTR);
 			dev_kfree_skb(skb);
-			rx_buffer->skb = NULL;
 		}
 
-		if (!rx_buffer->page)
-			continue;
-
 		/* Invalidate cache lines that may have been written to by
 		 * device so that we avoid corrupting memory.
 		 */
@@ -5086,19 +5060,21 @@ static void ixgbe_clean_rx_ring(struct ixgbe_ring *rx_ring)
 					      DMA_FROM_DEVICE);
 
 		/* free resources associated with mapping */
-		dma_unmap_page_attrs(dev, rx_buffer->dma,
+		dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma,
 				     ixgbe_rx_pg_size(rx_ring),
 				     DMA_FROM_DEVICE,
 				     IXGBE_RX_DMA_ATTR);
 		__page_frag_cache_drain(rx_buffer->page,
 					rx_buffer->pagecnt_bias);
 
-		rx_buffer->page = NULL;
+		i++;
+		rx_buffer++;
+		if (i == rx_ring->count) {
+			i = 0;
+			rx_buffer = rx_ring->rx_buffer_info;
+		}
 	}
 
-	size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count;
-	memset(rx_ring->rx_buffer_info, 0, size);
-
 	rx_ring->next_to_alloc = 0;
 	rx_ring->next_to_clean = 0;
 	rx_ring->next_to_use = 0;
@@ -5567,28 +5543,57 @@ void ixgbe_reset(struct ixgbe_adapter *adapter)
  **/
 static void ixgbe_clean_tx_ring(struct ixgbe_ring *tx_ring)
 {
-	struct ixgbe_tx_buffer *tx_buffer_info;
-	unsigned long size;
-	u16 i;
+	u16 i = tx_ring->next_to_clean;
+	struct ixgbe_tx_buffer *tx_buffer = &tx_ring->tx_buffer_info[i];
 
-	/* ring already cleared, nothing to do */
-	if (!tx_ring->tx_buffer_info)
-		return;
+	while (i != tx_ring->next_to_use) {
+		union ixgbe_adv_tx_desc *eop_desc, *tx_desc;
 
-	/* Free all the Tx ring sk_buffs */
-	for (i = 0; i < tx_ring->count; i++) {
-		tx_buffer_info = &tx_ring->tx_buffer_info[i];
-		ixgbe_unmap_and_free_tx_resource(tx_ring, tx_buffer_info);
-	}
+		/* Free all the Tx ring sk_buffs */
+		dev_kfree_skb_any(tx_buffer->skb);
 
-	netdev_tx_reset_queue(txring_txq(tx_ring));
+		/* unmap skb header data */
+		dma_unmap_single(tx_ring->dev,
+				 dma_unmap_addr(tx_buffer, dma),
+				 dma_unmap_len(tx_buffer, len),
+				 DMA_TO_DEVICE);
 
-	size = sizeof(struct ixgbe_tx_buffer) * tx_ring->count;
-	memset(tx_ring->tx_buffer_info, 0, size);
+		/* check for eop_desc to determine the end of the packet */
+		eop_desc = tx_buffer->next_to_watch;
+		tx_desc = IXGBE_TX_DESC(tx_ring, i);
 
-	/* Zero out the descriptor ring */
-	memset(tx_ring->desc, 0, tx_ring->size);
+		/* unmap remaining buffers */
+		while (tx_desc != eop_desc) {
+			tx_buffer++;
+			tx_desc++;
+			i++;
+			if (unlikely(i == tx_ring->count)) {
+				i = 0;
+				tx_buffer = tx_ring->tx_buffer_info;
+				tx_desc = IXGBE_TX_DESC(tx_ring, 0);
+			}
+
+			/* unmap any remaining paged data */
+			if (dma_unmap_len(tx_buffer, len))
+				dma_unmap_page(tx_ring->dev,
+					       dma_unmap_addr(tx_buffer, dma),
+					       dma_unmap_len(tx_buffer, len),
+					       DMA_TO_DEVICE);
+		}
 
+		/* move us one more past the eop_desc for start of next pkt */
+		tx_buffer++;
+		i++;
+		if (unlikely(i == tx_ring->count)) {
+			i = 0;
+			tx_buffer = tx_ring->tx_buffer_info;
+		}
+	}
+
+	/* reset BQL for queue */
+	netdev_tx_reset_queue(txring_txq(tx_ring));
+
+	/* reset next_to_use and next_to_clean */
 	tx_ring->next_to_use = 0;
 	tx_ring->next_to_clean = 0;
 }
@@ -6034,9 +6039,9 @@ int ixgbe_setup_tx_resources(struct ixgbe_ring *tx_ring)
 	if (tx_ring->q_vector)
 		ring_node = tx_ring->q_vector->numa_node;
 
-	tx_ring->tx_buffer_info = vzalloc_node(size, ring_node);
+	tx_ring->tx_buffer_info = vmalloc_node(size, ring_node);
 	if (!tx_ring->tx_buffer_info)
-		tx_ring->tx_buffer_info = vzalloc(size);
+		tx_ring->tx_buffer_info = vmalloc(size);
 	if (!tx_ring->tx_buffer_info)
 		goto err;
 
@@ -6116,9 +6121,9 @@ int ixgbe_setup_rx_resources(struct ixgbe_ring *rx_ring)
 	if (rx_ring->q_vector)
 		ring_node = rx_ring->q_vector->numa_node;
 
-	rx_ring->rx_buffer_info = vzalloc_node(size, ring_node);
+	rx_ring->rx_buffer_info = vmalloc_node(size, ring_node);
 	if (!rx_ring->rx_buffer_info)
-		rx_ring->rx_buffer_info = vzalloc(size);
+		rx_ring->rx_buffer_info = vmalloc(size);
 	if (!rx_ring->rx_buffer_info)
 		goto err;
 
@@ -7836,16 +7841,29 @@ static void ixgbe_tx_map(struct ixgbe_ring *tx_ring,
 	dev_err(tx_ring->dev, "TX DMA map failed\n");
 
 	/* clear dma mappings for failed tx_buffer_info map */
-	for (;;) {
+	while (tx_buffer != first) {
+		if (dma_unmap_len(tx_buffer, len))
+			dma_unmap_page(tx_ring->dev,
+				       dma_unmap_addr(tx_buffer, dma),
+				       dma_unmap_len(tx_buffer, len),
+				       DMA_TO_DEVICE);
+		dma_unmap_len_set(tx_buffer, len, 0);
+
+		if (i--)
+			i += tx_ring->count;
 		tx_buffer = &tx_ring->tx_buffer_info[i];
-		ixgbe_unmap_and_free_tx_resource(tx_ring, tx_buffer);
-		if (tx_buffer == first)
-			break;
-		if (i == 0)
-			i = tx_ring->count;
-		i--;
 	}
 
+	if (dma_unmap_len(tx_buffer, len))
+		dma_unmap_single(tx_ring->dev,
+				 dma_unmap_addr(tx_buffer, dma),
+				 dma_unmap_len(tx_buffer, len),
+				 DMA_TO_DEVICE);
+	dma_unmap_len_set(tx_buffer, len, 0);
+
+	dev_kfree_skb_any(first->skb);
+	first->skb = NULL;
+
 	tx_ring->next_to_use = i;
 }
 

^ permalink raw reply related

* Re: [for-next 07/10] IB/mlx5: Use blue flame register allocator in mlx5_ib
From: David Miller @ 2017-01-06 16:11 UTC (permalink / raw)
  To: leonro-VPRAkNaXOzVWk0Htik3J/w
  Cc: eli-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb,
	saeedm-VPRAkNaXOzVWk0Htik3J/w, dledford-H+wXaHxf7aLQT0dZR+AlfA,
	netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	eli-VPRAkNaXOzVWk0Htik3J/w, matanb-VPRAkNaXOzVWk0Htik3J/w
In-Reply-To: <20170106060608.GG15685-U/DQcQFIOTAAJjI8aNfphQ@public.gmane.org>

From: Leon Romanovsky <leonro-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Date: Fri, 6 Jan 2017 08:06:09 +0200

> On Thu, Jan 05, 2017 at 03:07:31PM -0500, David Miller wrote:
>> From: Eli Cohen <eli-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
>> Date: Thu, 5 Jan 2017 14:03:18 -0600
>>
>> > If necessary I can make sure it builds on 32 bits as well.
>>
>> Please do.
> 
> Dave,
> 
> I'm failing to understand the benefits of building mlx5 on 32 bits, and
> see only disadvantages:
>  * It is actual dead code without test coverage.
>  * It misleads reviewers/customers by seeing code for 32 bits.
>  * It adds compilation time for 32 bits platforms and "punishes" them
>    for not relevant for them driver.
> 
> Why do you call removing all that as a "regression"?

We have this thing called "CONFIG_COMPILE_TEST", it has tons of value,
perhaps you've seen it before?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: Fixed link for 10G
From: Andrew Lunn @ 2017-01-06 16:24 UTC (permalink / raw)
  To: Madalin-Cristian Bucur
  Cc: Florian Fainelli, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <AM2PR04MB11228A96940B1E409CEC6E04EC630@AM2PR04MB1122.eurprd04.prod.outlook.com>

On Fri, Jan 06, 2017 at 12:01:06PM +0000, Madalin-Cristian Bucur wrote:
> Hi Florian,
> 
> I'm trying to add a fixed-link property that declares 10G speed
> for a XGMII PHY and I'm encountering some issues as the fixed
> link infrastructure does not seem to support this speed.
> 
> I'm using this device tree snippet (using the legacy format, but it
> should not matter):
> 
>         ethernet@f2000 { /* 10GEC2 */
>                 fixed-link = <0 1 10000 0 0>;
>                 phy-connection-type = "xgmii";
>         };
> 
> and I get this error:
> 
> 	[    0.464238] swphy: unknown speed
> 	[    0.467464] fsl_mac: probe of 1af2000.ethernet failed with error -22
> 
> Looking at the code, fixed_phy_register() seems to check for speeds up
> to 1G and swphy only caters 1G and lower speeds, the swphy_decode_speed()
> returning -EINVAL for 10G, triggering the error printed above in
> swphy_validate_state().
> 
> What would be the proper way to add support for the 10G fixed link speed?

Hi Madalin

I came across the same issue a couple of months ago. But i found a
different way to solve my problem.

Anyway, fixed-link emulates a PHY. It has the common PHY registers,
and sets the register values to indicate the device tree
configuration.

As you have found out, it only emulates 10/100/1000. For 10G, you need
to extend the emulation to include the 10G registers. Assuming the 10G
registers are standardised.

	  Andrew

^ permalink raw reply

* Re: [PATCH 2/3] xen: modify xenstore watch event interface
From: Wei Liu @ 2017-01-06 16:29 UTC (permalink / raw)
  To: Juergen Gross
  Cc: linux-kernel, xen-devel, boris.ostrovsky, konrad.wilk, roger.pau,
	wei.liu2, paul.durrant, netdev
In-Reply-To: <20170106150544.10836-3-jgross@suse.com>

On Fri, Jan 06, 2017 at 04:05:43PM +0100, Juergen Gross wrote:
> Today a Xenstore watch event is delivered via a callback function
> declared as:
> 
> void (*callback)(struct xenbus_watch *,
>                  const char **vec, unsigned int len);
> 
> As all watch events only ever come with two parameters (path and token)
> changing the prototype to:
> 
> void (*callback)(struct xenbus_watch *,
>                  const char *path, const char *token);
> 
> is the natural thing to do.
> 
> Apply this change and adapt all users.
> 
> Cc: konrad.wilk@oracle.com
> Cc: roger.pau@citrix.com
> Cc: wei.liu2@citrix.com
> Cc: paul.durrant@citrix.com
> Cc: netdev@vger.kernel.org
> 
> Signed-off-by: Juergen Gross <jgross@suse.com>

Reviewed-by: Wei Liu <wei.liu2@citrix.com>

^ permalink raw reply

* Re: [PATCH 2/3] xen: modify xenstore watch event interface
From: Roger Pau Monné @ 2017-01-06 16:37 UTC (permalink / raw)
  To: Juergen Gross
  Cc: linux-kernel, xen-devel, boris.ostrovsky, konrad.wilk, wei.liu2,
	paul.durrant, netdev
In-Reply-To: <20170106150544.10836-3-jgross@suse.com>

On Fri, Jan 06, 2017 at 04:05:43PM +0100, Juergen Gross wrote:
> Today a Xenstore watch event is delivered via a callback function
> declared as:
> 
> void (*callback)(struct xenbus_watch *,
>                  const char **vec, unsigned int len);
> 
> As all watch events only ever come with two parameters (path and token)
> changing the prototype to:
> 
> void (*callback)(struct xenbus_watch *,
>                  const char *path, const char *token);
> 
> is the natural thing to do.
> 
> Apply this change and adapt all users.
> 
> Cc: konrad.wilk@oracle.com
> Cc: roger.pau@citrix.com
> Cc: wei.liu2@citrix.com
> Cc: paul.durrant@citrix.com
> Cc: netdev@vger.kernel.org
> 
> Signed-off-by: Juergen Gross <jgross@suse.com>

blkback changes:

Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks, Roger.

^ permalink raw reply

* [PATCH net] dccp: fix option range in dccp_parse_options()
From: Eric Dumazet @ 2017-01-06 16:43 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Andrey Konovalov, Gerrit Renker

From: Eric Dumazet <edumazet@google.com>

dccp_parse_options() improperly parses 12 or 16 bytes in excess,
because it forgets to subtract DCCP header len.

This causes various issues, since these 12/16 bytes are part of the
payload and this might not even be present in skb->head, as
dccp_invalid_packet() only pulled everything but payload.

KASAN complains since we might access uninitialized data.

Strangely enough, net/netfilter/xt_dccp.c got this right.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/options.c |    7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/net/dccp/options.c b/net/dccp/options.c
index 74d29c56c36709fd4e31f0e63a1f8b1aa38a32cd..41bd4bc4026f97b155e12a3a37095653836ce7fc 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -54,10 +54,9 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 	struct dccp_sock *dp = dccp_sk(sk);
 	const struct dccp_hdr *dh = dccp_hdr(skb);
 	const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
-	unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
-	unsigned char *opt_ptr = options;
-	const unsigned char *opt_end = (unsigned char *)dh +
-					(dh->dccph_doff * 4);
+	unsigned char *opt_ptr = (unsigned char *)dh + __dccp_hdr_len(dh);
+	unsigned int optlen = dh->dccph_doff * 4 - __dccp_hdr_len(dh);
+	const unsigned char *opt_end = opt_ptr + optlen;
 	struct dccp_options_received *opt_recv = &dp->dccps_options_received;
 	unsigned char opt, len;
 	unsigned char *uninitialized_var(value);

^ permalink raw reply related

* [PATCH v3] net: stmmac: fix maxmtu assignment to be within valid range
From: Kweh, Hock Leong @ 2017-01-07  0:49 UTC (permalink / raw)
  To: David S. Miller, Joao Pinto, Giuseppe CAVALLARO,
	seraphin.bonnaffe, Jarod Wilson, Andy Shevchenko
  Cc: Alexandre TORGUE, Joachim Eastwood, Niklas Cassel, Johan Hovold,
	pavel, Kweh, Hock Leong, lars.persson, netdev, LKML

From: "Kweh, Hock Leong" <hock.leong.kweh@intel.com>

There is no checking valid value of maxmtu when getting it from device tree.
This resolution added the checking condition to ensure the assignment is
made within a valid range.

Signed-off-by: Kweh, Hock Leong <hock.leong.kweh@intel.com>
---
changelog v3:
* print the warning message only if maxmtu < min_mtu
* add maxmtu = JUMBO_LEN at stmmac_pci.c to follow stmmac_platform.c

changelog v2:
* correction of "devicetree" to "device tree" reported by Andy
* print warning message while maxmtu is not in valid range

 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |    8 +++++++-
 drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c  |    4 ++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 92ac006..ce74ae6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -3345,8 +3345,14 @@ int stmmac_dvr_probe(struct device *device,
 		ndev->max_mtu = JUMBO_LEN;
 	else
 		ndev->max_mtu = SKB_MAX_HEAD(NET_SKB_PAD + NET_IP_ALIGN);
-	if (priv->plat->maxmtu < ndev->max_mtu)
+
+	if ((priv->plat->maxmtu < ndev->max_mtu) &&
+	    (priv->plat->maxmtu >= ndev->min_mtu))
 		ndev->max_mtu = priv->plat->maxmtu;
+	else if (priv->plat->maxmtu < ndev->min_mtu)
+		netdev_warn(priv->dev,
+			    "%s: warning: maxmtu having invalid value (%d)\n",
+			    __func__, priv->plat->maxmtu);
 
 	if (flow_ctrl)
 		priv->flow_ctrl = FLOW_AUTO;	/* RX/TX pause on */
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
index a283177..e539afe 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
@@ -204,6 +204,10 @@ static int stmmac_pci_probe(struct pci_dev *pdev,
 
 	pci_set_master(pdev);
 
+	/* Set the maxmtu to a default of JUMBO_LEN in case the
+	 * parameter is not defined for the device.
+	 */
+	plat->maxmtu = JUMBO_LEN;
 	if (info) {
 		info->pdev = pdev;
 		if (info->setup) {
-- 
1.7.9.5

^ permalink raw reply related

* EMAIL ALERT
From: System Administrator @ 2017-01-06 14:15 UTC (permalink / raw)
  To: Recipients

Recently, we have detect some unusual activity on your account and as a
result, all email users are urged to update their email account within 24 hours of receiving this e-mail, Please CLICK  to confirm that your email account is up to date with the institution requirement.

---
This email has been checked for viruses by Avast antivirus software.
https://www.avast.com/antivirus

^ permalink raw reply

* Re: [PATCH v3] net: stmmac: fix maxmtu assignment to be within valid range
From: Joao Pinto @ 2017-01-06 16:57 UTC (permalink / raw)
  To: Kweh, Hock Leong, David S. Miller, Joao Pinto, Giuseppe CAVALLARO,
	seraphin.bonnaffe, Jarod Wilson, Andy Shevchenko
  Cc: Alexandre TORGUE, Joachim Eastwood, Niklas Cassel, Johan Hovold,
	pavel, lars.persson, netdev, LKML
In-Reply-To: <1483750143-11966-1-git-send-email-hock.leong.kweh@intel.com>


Hi Wilson,

Às 12:49 AM de 1/7/2017, Kweh, Hock Leong escreveu:
> From: "Kweh, Hock Leong" <hock.leong.kweh@intel.com>
> 
> There is no checking valid value of maxmtu when getting it from device tree.
> This resolution added the checking condition to ensure the assignment is
> made within a valid range.
> 
> Signed-off-by: Kweh, Hock Leong <hock.leong.kweh@intel.com>
> ---
> changelog v3:
> * print the warning message only if maxmtu < min_mtu
> * add maxmtu = JUMBO_LEN at stmmac_pci.c to follow stmmac_platform.c
> 
> changelog v2:
> * correction of "devicetree" to "device tree" reported by Andy
> * print warning message while maxmtu is not in valid range
> 
>  drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |    8 +++++++-
>  drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c  |    4 ++++
>  2 files changed, 11 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> index 92ac006..ce74ae6 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> @@ -3345,8 +3345,14 @@ int stmmac_dvr_probe(struct device *device,
>  		ndev->max_mtu = JUMBO_LEN;
>  	else
>  		ndev->max_mtu = SKB_MAX_HEAD(NET_SKB_PAD + NET_IP_ALIGN);
> -	if (priv->plat->maxmtu < ndev->max_mtu)
> +
> +	if ((priv->plat->maxmtu < ndev->max_mtu) &&
> +	    (priv->plat->maxmtu >= ndev->min_mtu))
>  		ndev->max_mtu = priv->plat->maxmtu;
> +	else if (priv->plat->maxmtu < ndev->min_mtu)
> +		netdev_warn(priv->dev,
> +			    "%s: warning: maxmtu having invalid value (%d)\n",
> +			    __func__, priv->plat->maxmtu);
>  
>  	if (flow_ctrl)
>  		priv->flow_ctrl = FLOW_AUTO;	/* RX/TX pause on */
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
> index a283177..e539afe 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
> @@ -204,6 +204,10 @@ static int stmmac_pci_probe(struct pci_dev *pdev,
>  
>  	pci_set_master(pdev);
>  
> +	/* Set the maxmtu to a default of JUMBO_LEN in case the
> +	 * parameter is not defined for the device.
> +	 */
> +	plat->maxmtu = JUMBO_LEN;

I suggest to put this configuration in one of the default config functions.

Tahnks.

>  	if (info) {
>  		info->pdev = pdev;
>  		if (info->setup) {
> 

^ permalink raw reply

* Re: [PATCH v3] net: stmmac: fix maxmtu assignment to be within valid range
From: Andy Shevchenko @ 2017-01-06 17:06 UTC (permalink / raw)
  To: Kweh, Hock Leong
  Cc: David S. Miller, Joao Pinto, Giuseppe CAVALLARO,
	seraphin.bonnaffe, Jarod Wilson, Alexandre TORGUE,
	Joachim Eastwood, Niklas Cassel, Johan Hovold, Pavel Machek,
	lars.persson, netdev, LKML
In-Reply-To: <1483750143-11966-1-git-send-email-hock.leong.kweh@intel.com>

On Sat, Jan 7, 2017 at 2:49 AM, Kweh, Hock Leong
<hock.leong.kweh@intel.com> wrote:
> From: "Kweh, Hock Leong" <hock.leong.kweh@intel.com>
>
> There is no checking valid value of maxmtu when getting it from device tree.
> This resolution added the checking condition to ensure the assignment is
> made within a valid range.

> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> @@ -3345,8 +3345,14 @@ int stmmac_dvr_probe(struct device *device,
>                 ndev->max_mtu = JUMBO_LEN;
>         else
>                 ndev->max_mtu = SKB_MAX_HEAD(NET_SKB_PAD + NET_IP_ALIGN);
> -       if (priv->plat->maxmtu < ndev->max_mtu)

> +

The lines are logically grouped here. No need to split it. Thus,
remove this extra line.

> +       if ((priv->plat->maxmtu < ndev->max_mtu) &&
> +           (priv->plat->maxmtu >= ndev->min_mtu))

>                 ndev->max_mtu = priv->plat->maxmtu;

> +       else if (priv->plat->maxmtu < ndev->min_mtu)

And if it > ndev->max_mtu?..

> +               netdev_warn(priv->dev,
> +                           "%s: warning: maxmtu having invalid value (%d)\n",
> +                           __func__, priv->plat->maxmtu);


> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
> @@ -204,6 +204,10 @@ static int stmmac_pci_probe(struct pci_dev *pdev,
>
>         pci_set_master(pdev);
>
> +       /* Set the maxmtu to a default of JUMBO_LEN in case the
> +        * parameter is not defined for the device.
> +        */
> +       plat->maxmtu = JUMBO_LEN;

Please, use *_default_data() hooks for that.

At some point it might make sense to extract
static int common_default_data() {...}
and call it at the beginning of the rest of *_defautl_data() hooks.

-- 
With Best Regards,
Andy Shevchenko

^ permalink raw reply

* [PATCH net-next 0/2] afs: Implement bulk read
From: David Howells @ 2017-01-06 17:08 UTC (permalink / raw)
  To: netdev; +Cc: dhowells, linux-afs, linux-kernel


This pair of patches implements bulk data reading from an AFS server.

The patches can be found here also:

	http://git.kernel.org/cgit/linux/kernel/git/dhowells/linux-fs.git/log/?h=rxrpc-rewrite

Tagged thusly:

	git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git
	rxrpc-rewrite-20170106

David
---
David Howells (2):
      afs: Make afs_fs_fetch_data() take a list of pages
      afs: Make afs_readpages() fetch data in bulk


 fs/afs/file.c     |  168 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/afs/fsclient.c |  117 ++++++++++++++++++++++++-------------
 fs/afs/internal.h |   21 ++++++-
 fs/afs/vnode.c    |    6 +-
 fs/afs/volume.c   |    1 
 fs/afs/write.c    |   19 +++++-
 6 files changed, 275 insertions(+), 57 deletions(-)

^ permalink raw reply

* [PATCH net-next 1/2] afs: Make afs_fs_fetch_data() take a list of pages
From: David Howells @ 2017-01-06 17:08 UTC (permalink / raw)
  To: netdev; +Cc: dhowells, linux-afs, linux-kernel
In-Reply-To: <148372251997.8578.5117118854142528477.stgit@warthog.procyon.org.uk>

Make afs_fs_fetch_data() take a list of pages for bulk data transfer.  This
will allow afs_readpages() to be made more efficient.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 fs/afs/file.c     |   37 ++++++++++++++---
 fs/afs/fsclient.c |  117 ++++++++++++++++++++++++++++++++++-------------------
 fs/afs/internal.h |   21 +++++++++-
 fs/afs/vnode.c    |    6 +--
 fs/afs/write.c    |   19 +++++++--
 5 files changed, 145 insertions(+), 55 deletions(-)

diff --git a/fs/afs/file.c b/fs/afs/file.c
index 6344aee4ac4b..6c262ceef32d 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -101,6 +101,21 @@ int afs_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+/*
+ * Dispose of a ref to a read record.
+ */
+void afs_put_read(struct afs_read *req)
+{
+	int i;
+
+	if (atomic_dec_and_test(&req->usage)) {
+		for (i = 0; i < req->nr_pages; i++)
+			if (req->pages[i])
+				put_page(req->pages[i]);
+		kfree(req);
+	}
+}
+
 #ifdef CONFIG_AFS_FSCACHE
 /*
  * deal with notification that a page was read from the cache
@@ -126,9 +141,8 @@ int afs_page_filler(void *data, struct page *page)
 {
 	struct inode *inode = page->mapping->host;
 	struct afs_vnode *vnode = AFS_FS_I(inode);
+	struct afs_read *req;
 	struct key *key = data;
-	size_t len;
-	off_t offset;
 	int ret;
 
 	_enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
@@ -164,12 +178,23 @@ int afs_page_filler(void *data, struct page *page)
 		_debug("cache said ENOBUFS");
 	default:
 	go_on:
-		offset = page->index << PAGE_SHIFT;
-		len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE);
+		req = kzalloc(sizeof(struct afs_read) + sizeof(struct page *),
+			      GFP_KERNEL);
+		if (!req)
+			goto enomem;
+
+		atomic_set(&req->usage, 1);
+		req->pos = (loff_t)page->index << PAGE_SHIFT;
+		req->len = min_t(size_t, i_size_read(inode) - req->pos,
+				 PAGE_SIZE);
+		req->nr_pages = 1;
+		req->pages[0] = page;
+		get_page(page);
 
 		/* read the contents of the file from the server into the
 		 * page */
-		ret = afs_vnode_fetch_data(vnode, key, offset, len, page);
+		ret = afs_vnode_fetch_data(vnode, key, req);
+		afs_put_read(req);
 		if (ret < 0) {
 			if (ret == -ENOENT) {
 				_debug("got NOENT from server"
@@ -201,6 +226,8 @@ int afs_page_filler(void *data, struct page *page)
 	_leave(" = 0");
 	return 0;
 
+enomem:
+	ret = -ENOMEM;
 error:
 	SetPageError(page);
 	unlock_page(page);
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 31c616ab9b40..7dc1f6fb3661 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -309,15 +309,19 @@ int afs_fs_fetch_file_status(struct afs_server *server,
 static int afs_deliver_fs_fetch_data(struct afs_call *call)
 {
 	struct afs_vnode *vnode = call->reply;
+	struct afs_read *req = call->reply3;
 	const __be32 *bp;
-	struct page *page;
+	unsigned int size;
 	void *buffer;
 	int ret;
 
-	_enter("{%u}", call->unmarshall);
+	_enter("{%u,%zu/%u;%u/%llu}",
+	       call->unmarshall, call->offset, call->count,
+	       req->remain, req->actual_len);
 
 	switch (call->unmarshall) {
 	case 0:
+		req->actual_len = 0;
 		call->offset = 0;
 		call->unmarshall++;
 		if (call->operation_ID != FSFETCHDATA64) {
@@ -334,10 +338,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 		if (ret < 0)
 			return ret;
 
-		call->count = ntohl(call->tmp);
-		_debug("DATA length MSW: %u", call->count);
-		if (call->count > 0)
-			return -EBADMSG;
+		req->actual_len = ntohl(call->tmp);
+		req->actual_len <<= 32;
 		call->offset = 0;
 		call->unmarshall++;
 
@@ -349,26 +351,52 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 		if (ret < 0)
 			return ret;
 
-		call->count = ntohl(call->tmp);
-		_debug("DATA length: %u", call->count);
-		if (call->count > PAGE_SIZE)
+		req->actual_len |= ntohl(call->tmp);
+		_debug("DATA length: %llu", req->actual_len);
+		/* Check that the server didn't want to send us extra.  We
+		 * might want to just discard instead, but that requires
+		 * cooperation from AF_RXRPC.
+		 */
+		if (req->actual_len > req->len)
 			return -EBADMSG;
-		call->offset = 0;
+
+		req->remain = req->actual_len;
+		call->offset = req->pos & (PAGE_SIZE - 1);
+		req->index = 0;
+		if (req->actual_len == 0)
+			goto no_more_data;
 		call->unmarshall++;
 
+	begin_page:
+		if (req->remain > PAGE_SIZE - call->offset)
+			size = PAGE_SIZE - call->offset;
+		else
+			size = req->remain;
+		call->count = call->offset + size;
+		ASSERTCMP(call->count, <=, PAGE_SIZE);
+		req->remain -= size;
+
 		/* extract the returned data */
 	case 3:
-		_debug("extract data");
-		if (call->count > 0) {
-			page = call->reply3;
-			buffer = kmap(page);
-			ret = afs_extract_data(call, buffer,
-					       call->count, true);
-			kunmap(page);
-			if (ret < 0)
-				return ret;
+		_debug("extract data %u/%llu %zu/%u",
+		       req->remain, req->actual_len, call->offset, call->count);
+
+		buffer = kmap(req->pages[req->index]);
+		ret = afs_extract_data(call, buffer, call->count, true);
+		kunmap(req->pages[req->index]);
+		if (ret < 0)
+			return ret;
+		if (call->offset == PAGE_SIZE) {
+			if (req->page_done)
+				req->page_done(call, req);
+			if (req->remain > 0) {
+				req->index++;
+				call->offset = 0;
+				goto begin_page;
+			}
 		}
 
+	no_more_data:
 		call->offset = 0;
 		call->unmarshall++;
 
@@ -393,17 +421,25 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 	}
 
 	if (call->count < PAGE_SIZE) {
-		_debug("clear");
-		page = call->reply3;
-		buffer = kmap(page);
+		buffer = kmap(req->pages[req->index]);
 		memset(buffer + call->count, 0, PAGE_SIZE - call->count);
-		kunmap(page);
+		kunmap(req->pages[req->index]);
+		if (req->page_done)
+			req->page_done(call, req);
 	}
 
 	_leave(" = 0 [done]");
 	return 0;
 }
 
+static void afs_fetch_data_destructor(struct afs_call *call)
+{
+	struct afs_read *req = call->reply3;
+
+	afs_put_read(req);
+	afs_flat_call_destructor(call);
+}
+
 /*
  * FS.FetchData operation type
  */
@@ -411,14 +447,14 @@ static const struct afs_call_type afs_RXFSFetchData = {
 	.name		= "FS.FetchData",
 	.deliver	= afs_deliver_fs_fetch_data,
 	.abort_to_error	= afs_abort_to_error,
-	.destructor	= afs_flat_call_destructor,
+	.destructor	= afs_fetch_data_destructor,
 };
 
 static const struct afs_call_type afs_RXFSFetchData64 = {
 	.name		= "FS.FetchData64",
 	.deliver	= afs_deliver_fs_fetch_data,
 	.abort_to_error	= afs_abort_to_error,
-	.destructor	= afs_flat_call_destructor,
+	.destructor	= afs_fetch_data_destructor,
 };
 
 /*
@@ -427,8 +463,7 @@ static const struct afs_call_type afs_RXFSFetchData64 = {
 static int afs_fs_fetch_data64(struct afs_server *server,
 			       struct key *key,
 			       struct afs_vnode *vnode,
-			       off_t offset, size_t length,
-			       struct page *buffer,
+			       struct afs_read *req,
 			       const struct afs_wait_mode *wait_mode)
 {
 	struct afs_call *call;
@@ -436,8 +471,6 @@ static int afs_fs_fetch_data64(struct afs_server *server,
 
 	_enter("");
 
-	ASSERTCMP(length, <, ULONG_MAX);
-
 	call = afs_alloc_flat_call(&afs_RXFSFetchData64, 32, (21 + 3 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
@@ -445,7 +478,7 @@ static int afs_fs_fetch_data64(struct afs_server *server,
 	call->key = key;
 	call->reply = vnode;
 	call->reply2 = NULL; /* volsync */
-	call->reply3 = buffer;
+	call->reply3 = req;
 	call->service_id = FS_SERVICE;
 	call->port = htons(AFS_FS_PORT);
 	call->operation_ID = FSFETCHDATA64;
@@ -456,11 +489,12 @@ static int afs_fs_fetch_data64(struct afs_server *server,
 	bp[1] = htonl(vnode->fid.vid);
 	bp[2] = htonl(vnode->fid.vnode);
 	bp[3] = htonl(vnode->fid.unique);
-	bp[4] = htonl(upper_32_bits(offset));
-	bp[5] = htonl((u32) offset);
+	bp[4] = htonl(upper_32_bits(req->pos));
+	bp[5] = htonl(lower_32_bits(req->pos));
 	bp[6] = 0;
-	bp[7] = htonl((u32) length);
+	bp[7] = htonl(lower_32_bits(req->len));
 
+	atomic_inc(&req->usage);
 	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
 }
 
@@ -470,16 +504,16 @@ static int afs_fs_fetch_data64(struct afs_server *server,
 int afs_fs_fetch_data(struct afs_server *server,
 		      struct key *key,
 		      struct afs_vnode *vnode,
-		      off_t offset, size_t length,
-		      struct page *buffer,
+		      struct afs_read *req,
 		      const struct afs_wait_mode *wait_mode)
 {
 	struct afs_call *call;
 	__be32 *bp;
 
-	if (upper_32_bits(offset) || upper_32_bits(offset + length))
-		return afs_fs_fetch_data64(server, key, vnode, offset, length,
-					   buffer, wait_mode);
+	if (upper_32_bits(req->pos) ||
+	    upper_32_bits(req->len) ||
+	    upper_32_bits(req->pos + req->len))
+		return afs_fs_fetch_data64(server, key, vnode, req, wait_mode);
 
 	_enter("");
 
@@ -490,7 +524,7 @@ int afs_fs_fetch_data(struct afs_server *server,
 	call->key = key;
 	call->reply = vnode;
 	call->reply2 = NULL; /* volsync */
-	call->reply3 = buffer;
+	call->reply3 = req;
 	call->service_id = FS_SERVICE;
 	call->port = htons(AFS_FS_PORT);
 	call->operation_ID = FSFETCHDATA;
@@ -501,9 +535,10 @@ int afs_fs_fetch_data(struct afs_server *server,
 	bp[1] = htonl(vnode->fid.vid);
 	bp[2] = htonl(vnode->fid.vnode);
 	bp[3] = htonl(vnode->fid.unique);
-	bp[4] = htonl(offset);
-	bp[5] = htonl(length);
+	bp[4] = htonl(lower_32_bits(req->pos));
+	bp[5] = htonl(lower_32_bits(req->len));
 
+	atomic_inc(&req->usage);
 	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
 }
 
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 535a38d2c1d0..6f7a9638ba1a 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -134,6 +134,22 @@ struct afs_call_type {
 };
 
 /*
+ * Record of an outstanding read operation on a vnode.
+ */
+struct afs_read {
+	loff_t			pos;		/* Where to start reading */
+	loff_t			len;		/* How much to read */
+	loff_t			actual_len;	/* How much we're actually getting */
+	atomic_t		usage;
+	unsigned int		remain;		/* Amount remaining */
+	unsigned int		index;		/* Which page we're reading into */
+	unsigned int		pg_offset;	/* Offset in page we're at */
+	unsigned int		nr_pages;
+	void (*page_done)(struct afs_call *, struct afs_read *);
+	struct page		*pages[];
+};
+
+/*
  * record of an outstanding writeback on a vnode
  */
 struct afs_writeback {
@@ -494,6 +510,7 @@ extern const struct file_operations afs_file_operations;
 extern int afs_open(struct inode *, struct file *);
 extern int afs_release(struct inode *, struct file *);
 extern int afs_page_filler(void *, struct page *);
+extern void afs_put_read(struct afs_read *);
 
 /*
  * flock.c
@@ -513,7 +530,7 @@ extern int afs_fs_fetch_file_status(struct afs_server *, struct key *,
 extern int afs_fs_give_up_callbacks(struct afs_server *,
 				    const struct afs_wait_mode *);
 extern int afs_fs_fetch_data(struct afs_server *, struct key *,
-			     struct afs_vnode *, off_t, size_t, struct page *,
+			     struct afs_vnode *, struct afs_read *,
 			     const struct afs_wait_mode *);
 extern int afs_fs_create(struct afs_server *, struct key *,
 			 struct afs_vnode *, const char *, umode_t,
@@ -699,7 +716,7 @@ extern void afs_vnode_finalise_status_update(struct afs_vnode *,
 extern int afs_vnode_fetch_status(struct afs_vnode *, struct afs_vnode *,
 				  struct key *);
 extern int afs_vnode_fetch_data(struct afs_vnode *, struct key *,
-				off_t, size_t, struct page *);
+				struct afs_read *);
 extern int afs_vnode_create(struct afs_vnode *, struct key *, const char *,
 			    umode_t, struct afs_fid *, struct afs_file_status *,
 			    struct afs_callback *, struct afs_server **);
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index 25cf4c3f4ff7..45aa874f5d32 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -393,7 +393,7 @@ int afs_vnode_fetch_status(struct afs_vnode *vnode,
  * - TODO implement caching
  */
 int afs_vnode_fetch_data(struct afs_vnode *vnode, struct key *key,
-			 off_t offset, size_t length, struct page *page)
+			 struct afs_read *desc)
 {
 	struct afs_server *server;
 	int ret;
@@ -420,8 +420,8 @@ int afs_vnode_fetch_data(struct afs_vnode *vnode, struct key *key,
 
 		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
 
-		ret = afs_fs_fetch_data(server, key, vnode, offset, length,
-					page, &afs_sync_call);
+		ret = afs_fs_fetch_data(server, key, vnode, desc,
+					&afs_sync_call);
 
 	} while (!afs_volume_release_fileserver(vnode, server, ret));
 
diff --git a/fs/afs/write.c b/fs/afs/write.c
index f865c3f05bea..c83c1a0e851f 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -86,19 +86,30 @@ void afs_put_writeback(struct afs_writeback *wb)
 static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
 			 loff_t pos, struct page *page)
 {
+	struct afs_read *req;
 	loff_t i_size;
 	int ret;
-	int len;
 
 	_enter(",,%llu", (unsigned long long)pos);
 
+	req = kzalloc(sizeof(struct afs_read) + sizeof(struct page *),
+		      GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	atomic_set(&req->usage, 1);
+	req->pos = pos;
+	req->nr_pages = 1;
+	req->pages[0] = page;
+
 	i_size = i_size_read(&vnode->vfs_inode);
 	if (pos + PAGE_SIZE > i_size)
-		len = i_size - pos;
+		req->len = i_size - pos;
 	else
-		len = PAGE_SIZE;
+		req->len = PAGE_SIZE;
 
-	ret = afs_vnode_fetch_data(vnode, key, pos, len, page);
+	ret = afs_vnode_fetch_data(vnode, key, req);
+	afs_put_read(req);
 	if (ret < 0) {
 		if (ret == -ENOENT) {
 			_debug("got NOENT from server"

^ permalink raw reply related

* [PATCH net-next 2/2] afs: Make afs_readpages() fetch data in bulk
From: David Howells @ 2017-01-06 17:09 UTC (permalink / raw)
  To: netdev; +Cc: dhowells, linux-afs, linux-kernel
In-Reply-To: <148372251997.8578.5117118854142528477.stgit@warthog.procyon.org.uk>

Make afs_readpages() use afs_vnode_fetch_data()'s new ability to take a
list of pages and do a bulk fetch.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 fs/afs/file.c   |  131 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/afs/volume.c |    1 
 2 files changed, 130 insertions(+), 2 deletions(-)

diff --git a/fs/afs/file.c b/fs/afs/file.c
index 6c262ceef32d..82897a78abc7 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -16,6 +16,7 @@
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
 #include <linux/gfp.h>
+#include <linux/task_io_accounting_ops.h>
 #include "internal.h"
 
 static int afs_readpage(struct file *file, struct page *page);
@@ -262,6 +263,129 @@ static int afs_readpage(struct file *file, struct page *page)
 }
 
 /*
+ * Make pages available as they're filled.
+ */
+static void afs_readpages_page_done(struct afs_call *call, struct afs_read *req)
+{
+	struct afs_vnode *vnode = call->reply;
+	struct page *page = req->pages[req->index];
+
+	req->pages[req->index] = NULL;
+	SetPageUptodate(page);
+
+	/* send the page to the cache */
+#ifdef CONFIG_AFS_FSCACHE
+	if (PageFsCache(page) &&
+	    fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) {
+		fscache_uncache_page(vnode->cache, page);
+		BUG_ON(PageFsCache(page));
+	}
+#endif
+	unlock_page(page);
+	put_page(page);
+}
+
+/*
+ * Read a contiguous set of pages.
+ */
+static int afs_readpages_one(struct file *file, struct address_space *mapping,
+			     struct list_head *pages)
+{
+	struct afs_vnode *vnode = AFS_FS_I(mapping->host);
+	struct afs_read *req;
+	struct list_head *p;
+	struct page *first, *page;
+	struct key *key = file->private_data;
+	pgoff_t index;
+	int ret, n, i;
+
+	/* Count the number of contiguous pages at the front of the list.  Note
+	 * that the list goes prev-wards rather than next-wards.
+	 */
+	first = list_entry(pages->prev, struct page, lru);
+	index = first->index + 1;
+	n = 1;
+	for (p = first->lru.prev; p != pages; p = p->prev) {
+		page = list_entry(p, struct page, lru);
+		if (page->index != index)
+			break;
+		index++;
+		n++;
+	}
+
+	req = kzalloc(sizeof(struct afs_read) + sizeof(struct page *) * n,
+		      GFP_NOFS);
+	if (!req)
+		return -ENOMEM;
+
+	atomic_set(&req->usage, 1);
+	req->page_done = afs_readpages_page_done;
+	req->pos = first->index;
+	req->pos <<= PAGE_SHIFT;
+
+	/* Transfer the pages to the request.  We add them in until one fails
+	 * to add to the LRU and then we stop (as that'll make a hole in the
+	 * contiguous run.
+	 *
+	 * Note that it's possible for the file size to change whilst we're
+	 * doing this, but we rely on the server returning less than we asked
+	 * for if the file shrank.  We also rely on this to deal with a partial
+	 * page at the end of the file.
+	 */
+	do {
+		page = list_entry(pages->prev, struct page, lru);
+		list_del(&page->lru);
+		index = page->index;
+		if (add_to_page_cache_lru(page, mapping, index,
+					  readahead_gfp_mask(mapping))) {
+#ifdef CONFIG_AFS_FSCACHE
+			fscache_uncache_page(vnode->cache, page);
+#endif
+			put_page(page);
+			break;
+		}
+
+		req->pages[req->nr_pages++] = page;
+		req->len += PAGE_SIZE;
+	} while (req->nr_pages < n);
+
+	if (req->nr_pages == 0) {
+		kfree(req);
+		return 0;
+	}
+
+	ret = afs_vnode_fetch_data(vnode, key, req);
+	if (ret < 0)
+		goto error;
+
+	task_io_account_read(PAGE_SIZE * req->nr_pages);
+	afs_put_read(req);
+	return 0;
+
+error:
+	if (ret == -ENOENT) {
+		_debug("got NOENT from server"
+		       " - marking file deleted and stale");
+		set_bit(AFS_VNODE_DELETED, &vnode->flags);
+		ret = -ESTALE;
+	}
+
+	for (i = 0; i < req->nr_pages; i++) {
+		page = req->pages[i];
+		if (page) {
+#ifdef CONFIG_AFS_FSCACHE
+			fscache_uncache_page(vnode->cache, page);
+#endif
+			SetPageError(page);
+			unlock_page(page);
+		}
+	}
+
+	afs_put_read(req);
+	return ret;
+}
+
+/*
  * read a set of pages
  */
 static int afs_readpages(struct file *file, struct address_space *mapping,
@@ -314,8 +438,11 @@ static int afs_readpages(struct file *file, struct address_space *mapping,
 		return ret;
 	}
 
-	/* load the missing pages from the network */
-	ret = read_cache_pages(mapping, pages, afs_page_filler, key);
+	while (!list_empty(pages)) {
+		ret = afs_readpages_one(file, mapping, pages);
+		if (ret < 0)
+			break;
+	}
 
 	_leave(" = %d [netting]", ret);
 	return ret;
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index d142a2449e65..546f9d01710b 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -106,6 +106,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 	volume->cell		= params->cell;
 	volume->vid		= vlocation->vldb.vid[params->type];
 
+	volume->bdi.ra_pages	= VM_MAX_READAHEAD*1024/PAGE_SIZE; 
 	ret = bdi_setup_and_register(&volume->bdi, "afs");
 	if (ret)
 		goto error_bdi;

^ permalink raw reply related

* Re: [PATCH net] dccp: fix option range in dccp_parse_options()
From: Eric Dumazet @ 2017-01-06 17:14 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Andrey Konovalov, Gerrit Renker
In-Reply-To: <1483721009.9712.14.camel@edumazet-glaptop3.roam.corp.google.com>

On Fri, 2017-01-06 at 08:43 -0800, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
> 
> dccp_parse_options() improperly parses 12 or 16 bytes in excess,
> because it forgets to subtract DCCP header len.
> 
> This causes various issues, since these 12/16 bytes are part of the
> payload and this might not even be present in skb->head, as
> dccp_invalid_packet() only pulled everything but payload.
> 
> KASAN complains since we might access uninitialized data.

Scratch that.

Never send a patch before first coffee in the morning ;)

^ permalink raw reply

* Re: [PATCH] net: phy: dp83867: fix irq generation
From: Grygorii Strashko @ 2017-01-06 17:19 UTC (permalink / raw)
  To: Florian Fainelli, netdev, Dan Murphy, Mugunthan V N
  Cc: Sekhar Nori, linux-kernel, linux-omap, linux-arm-kernel
In-Reply-To: <de3c3fb3-1244-76a5-ea5e-8ef1b83f25d4@gmail.com>



On 01/05/2017 04:10 PM, Florian Fainelli wrote:
> On 01/05/2017 12:48 PM, Grygorii Strashko wrote:
>> For proper IRQ generation by DP83867 phy the INT/PWDN pin has to be
>> programmed as an interrupt output instead of a Powerdown input in
>> Configuration Register 3 (CFG3), Address 0x001E, bit 7 INT_OE = 1. The
>> current driver doesn't do this and as result IRQs will not be generated by
>> DP83867 phy even if they are properly configured in DT.
>>
>> Hence, fix IRQ generation by properly configuring CFG3.INT_OE bit and
>> ensure that Link Status Change (LINK_STATUS_CHNG_INT) and Auto-Negotiation
>> Complete (AUTONEG_COMP_INT) interrupt are enabled. After this the DP83867
>> driver will work properly in interrupt enabled mode.
>>
>> Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
>> ---
>>  drivers/net/phy/dp83867.c | 10 ++++++++++
>>  1 file changed, 10 insertions(+)
>>
>> diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c
>> index 1b63924..e84ae08 100644
>> --- a/drivers/net/phy/dp83867.c
>> +++ b/drivers/net/phy/dp83867.c
>> @@ -29,6 +29,7 @@
>>  #define MII_DP83867_MICR	0x12
>>  #define MII_DP83867_ISR		0x13
>>  #define DP83867_CTRL		0x1f
>> +#define DP83867_CFG3		0x1e
>>
>>  /* Extended Registers */
>>  #define DP83867_RGMIICTL	0x0032
>> @@ -98,6 +99,8 @@ static int dp83867_config_intr(struct phy_device *phydev)
>>  		micr_status |=
>>  			(MII_DP83867_MICR_AN_ERR_INT_EN |
>>  			MII_DP83867_MICR_SPEED_CHNG_INT_EN |
>> +			MII_DP83867_MICR_AUTONEG_COMP_INT_EN |
>> +			MII_DP83867_MICR_LINK_STS_CHNG_INT_EN |
>>  			MII_DP83867_MICR_DUP_MODE_CHNG_INT_EN |
>>  			MII_DP83867_MICR_SLEEP_MODE_CHNG_INT_EN);
>>
>> @@ -214,6 +217,13 @@ static int dp83867_config_init(struct phy_device *phydev)
>>  		}
>>  	}
>>
>> +	/* Enable Interrupt output INT_OE in CFG3 register */
>> +	if (phy_interrupt_is_valid(phydev)) {
>> +		val = phy_read(phydev, DP83867_CFG3);
>> +		val |= BIT(7);
>> +		phy_write(phydev, DP83867_CFG3, val);
>> +	}
>
> Don't you need to clear that bit in the case phy_interrupt_is_valid()
> returns false?

Not sure I need to touch it in this case - default value is 0
and Linux will not configure any IRQ.

>
> Other than that:
>
> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
>

-- 
regards,
-grygorii

^ permalink raw reply

* Re: [net-next PATCH v2 5/6] i40e: Add TX and RX support in switchdev mode.
From: Jiri Pirko @ 2017-01-06 17:30 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: alexander.h.duyck, john.r.fastabend, anjali.singhai,
	jakub.kicinski, davem, intel-wired-lan, netdev
In-Reply-To: <586EE461.2000801@intel.com>

Fri, Jan 06, 2017 at 01:27:13AM CET, sridhar.samudrala@intel.com wrote:
>
>
>On 1/5/2017 4:56 AM, Jiri Pirko wrote:
>> Tue, Jan 03, 2017 at 07:07:53PM CET, sridhar.samudrala@intel.com wrote:
>> > In switchdev mode, broadcast filter is not enabled on VFs. The broadcasts and
>> > unknown frames from VFs are received by the PF and passed to corresponding VF
>> > port representator netdev.
>> > A host based switching entity like a linux bridge or OVS redirects these frames
>> > to the right VFs via VFPR netdevs. Any frames sent via VFPR netdevs are sent as
>> > directed transmits to the corresponding VFs. To enable directed transmit, skb
>> > metadata dst is used to pass the VF id and the frame is requeued to call the PFs
>> > transmit routine.
>> > 
>> > Small script to demonstrate inter VF pings in switchdev mode.
>> > PF: enp5s0f0, VFs: enp5s2,enp5s2f1 VFPRs:enp5s0f0-vf0, enp5s0f0-vf1
>> > 
>> > # rmmod i40e; modprobe i40e
>> > # devlink dev eswitch set pci/0000:05:00.0 mode switchdev
>> > # echo 2 > /sys/class/net/enp5s0f0/device/sriov_numvfs
>> > # ip link set enp5s0f0 vf 0 mac 00:11:22:33:44:55
>> > # ip link set enp5s0f0 vf 1 mac 00:11:22:33:44:56
>> > # rmmod i40evf; modprobe i40evf
>> > 
>> > /* Create 2 namespaces and move the VFs to the corresponding ns. */
>> > # ip netns add ns0
>> > # ip link set enp5s2 netns ns0
>> > # ip netns exec ns0 ip addr add 192.168.1.10/24 dev enp5s2
>> > # ip netns exec ns0 ip link set enp5s2 up
>> > # ip netns add ns1
>> > # ip link set enp5s2f1 netns ns1
>> > # ip netns exec ns1 ip addr add 192.168.1.11/24 dev enp5s2f1
>> > # ip netns exec ns1 ip link set enp5s2f1 up
>> > 
>> > /* bring up pf and vfpr netdevs */
>> > # ip link set enp5s0f0 up
>> > # ip link set enp5s0f0-vf0 up
>> > # ip link set enp5s0f0-vf1 up
>> > 
>> > /* Create a linux bridge and add vfpr netdevs to it. */
>> > # ip link add vfpr-br type bridge
>> > # ip link set enp5s0f0-vf0 master vfpr-br
>> > # ip link set enp5s0f0-vf1 master vfpr-br
>> > # ip addr add 192.168.1.1/24 dev vfpr-br
>> > # ip link set vfpr-br up
>> > 
>> > # ip netns exec ns0 ping -c3 192.168.1.11
>> > # ip netns exec ns1 ping -c3 192.168.1.10
>> > 
>> > Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>> [...]
>> 
>> > diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>> > index 352cf7c..b46ddaa 100644
>> > --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>> > +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>> > @@ -1176,16 +1176,37 @@ static bool i40e_alloc_mapped_page(struct i40e_ring *rx_ring,
>> >   * @rx_ring:  rx ring in play
>> >   * @skb: packet to send up
>> >   * @vlan_tag: vlan tag for packet
>> > + * @lpbk: is it a loopback frame?
>> >   **/
>> > static void i40e_receive_skb(struct i40e_ring *rx_ring,
>> > -			     struct sk_buff *skb, u16 vlan_tag)
>> > +			     struct sk_buff *skb, u16 vlan_tag, bool lpbk)
>> > {
>> > 	struct i40e_q_vector *q_vector = rx_ring->q_vector;
>> > +	struct i40e_pf *pf = rx_ring->vsi->back;
>> > +	struct i40e_vf *vf;
>> > +	struct ethhdr *eth;
>> > +	int vf_id;
>> > 
>> > 	if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) &&
>> > 	    (vlan_tag & VLAN_VID_MASK))
>> > 		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);
>> > 
>> > +	if ((pf->eswitch_mode == DEVLINK_ESWITCH_MODE_LEGACY) || !lpbk)
>> > +		goto gro_receive;
>> > +
>> > +	/* If a loopback packet is received from a VF in switchdev mode, pass the
>> > +	 * frame to the corresponding VFPR netdev based on the source MAC in the frame.
>> > +	 */
>> > +	eth = (struct ethhdr *)skb_mac_header(skb);
>> > +	for (vf_id = 0; vf_id < pf->num_alloc_vfs; vf_id++) {
>> > +		vf = &pf->vf[vf_id];
>> > +		if (ether_addr_equal(eth->h_source, vf->default_lan_addr.addr)) {
>> > +			skb->dev = vf->vfpr_netdev;
>> This sucks :( Is't there any identification coming from rx ring that
>> would tell you what vf this is? To match vrpr according to a single MAC
>> address seems a bit awkward. What if there is a macvlan configured
>> on the VF?
>Unfortunately, with the current HW, RX descriptor only indicates if it is a
>loopback packet from a VF,
>but not the specific id of the VF.

Is it a FW limitation? If so, I believe that you should consider to
implement it.


>Multiple macs on VF are not supported with the current patchset.
>At this point we are not making switchdev as the default mode because of
>these limitations.
>
>> 
>> 
>> 
>> > +			break;
>> > +		}
>> > +	}
>> > +
>> > +gro_receive:
>> > 	napi_gro_receive(&q_vector->napi, skb);
>> > }
>> > 
>> [...]
>> 
>> > @@ -2998,3 +3064,19 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
>> > 
>> > 	return i40e_xmit_frame_ring(skb, tx_ring);
>> > }
>> > +
>> > +netdev_tx_t i40e_vfpr_netdev_start_xmit(struct sk_buff *skb,
>> > +				        struct net_device *dev)
>> > +{
>> > +	struct i40e_vfpr_netdev_priv *priv = netdev_priv(dev);
>> > +	struct i40e_vf *vf = priv->vf;
>> > +	struct i40e_pf *pf = vf->pf;
>> > +	struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi];
>> > +
>> > +	skb_dst_drop(skb);
>> > +	dst_hold(&priv->vfpr_dst->dst);
>> > +	skb_dst_set(skb, &priv->vfpr_dst->dst);
>> > +	skb->dev = vsi->netdev;
>> This dst dance seems a bit odd to me. Why don't you just call
>> i40e_xmit_frame_ring with an extra arg holding the needed metadata?
>
>We don't have TX/RX queues associated with VFPR netdevs, so we need to set
>the dev to PF netdev and requeue the skb.

Still, you eventually call a function within same .c file. Using dst
does not look right to me.


>
>> 
>> 
>> > +
>> > +	return dev_queue_xmit(skb);
>> > +}
>> > diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
>> > index e065321..850723f 100644
>> > --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
>> > +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
>> > @@ -366,6 +366,7 @@ struct i40e_ring_container {
>> > 
>> > bool i40e_alloc_rx_buffers(struct i40e_ring *rxr, u16 cleaned_count);
>> > netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev);
>> > +netdev_tx_t i40e_vfpr_netdev_start_xmit(struct sk_buff *skb, struct net_device *netdev);
>> > void i40e_clean_tx_ring(struct i40e_ring *tx_ring);
>> > void i40e_clean_rx_ring(struct i40e_ring *rx_ring);
>> > int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring);
>> > diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h
>> > index edc0abd..c136cc9 100644
>> > --- a/drivers/net/ethernet/intel/i40e/i40e_type.h
>> > +++ b/drivers/net/ethernet/intel/i40e/i40e_type.h
>> > @@ -728,6 +728,9 @@ enum i40e_rx_desc_status_bits {
>> > #define I40E_RXD_QW1_STATUS_TSYNVALID_SHIFT  I40E_RX_DESC_STATUS_TSYNVALID_SHIFT
>> > #define I40E_RXD_QW1_STATUS_TSYNVALID_MASK \
>> > 				    BIT_ULL(I40E_RXD_QW1_STATUS_TSYNVALID_SHIFT)
>> > +#define I40E_RXD_QW1_STATUS_LPBK_SHIFT  I40E_RX_DESC_STATUS_LPBK_SHIFT
>> > +#define I40E_RXD_QW1_STATUS_LPBK_MASK \
>> > +				BIT_ULL(I40E_RXD_QW1_STATUS_LPBK_SHIFT)
>> > 
>> > enum i40e_rx_desc_fltstat_values {
>> > 	I40E_RX_DESC_FLTSTAT_NO_DATA	= 0,
>> > diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
>> > index 0c8687d..f0860ef 100644
>> > --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
>> > +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
>> > @@ -1062,6 +1062,7 @@ static int i40e_vfpr_netdev_stop(struct net_device *dev)
>> > static const struct net_device_ops i40e_vfpr_netdev_ops = {
>> > 	.ndo_open       	= i40e_vfpr_netdev_open,
>> > 	.ndo_stop       	= i40e_vfpr_netdev_stop,
>> > +	.ndo_start_xmit		= i40e_vfpr_netdev_start_xmit,
>> > };
>> > 
>> > /**
>> > @@ -1121,16 +1122,22 @@ int i40e_alloc_vfpr_netdev(struct i40e_vf *vf, u16 vf_num)
>> > 
>> > 	priv = netdev_priv(vfpr_netdev);
>> > 	priv->vf = &(pf->vf[vf_num]);
>> > +	priv->vfpr_dst = metadata_dst_alloc(0, METADATA_HW_PORT_MUX, GFP_KERNEL);
>> I'm missing a patch here. In net-next, metadata_dst_alloc has 2 args.
>
>Somehow that patch didn't make it to netdev. You can find it here on
>intel-wired-lan archives.
>http://lists.osuosl.org/pipermail/intel-wired-lan/Week-of-Mon-20170102/007723.html
>I will CC you when i submit V3.

Thanks.

^ permalink raw reply

* Re: [PATCH net] net: Fix inconsistent rtnl_lock usage on dev_get_stats().
From: Eric Dumazet @ 2017-01-06 17:32 UTC (permalink / raw)
  To: Michael Chan; +Cc: davem, netdev
In-Reply-To: <1483676478-14228-1-git-send-email-michael.chan@broadcom.com>

On Thu, 2017-01-05 at 23:21 -0500, Michael Chan wrote:
> Some callers take rtnl_lock() before calling dev_get_stats() and some
> don't.  Most network drivers expect the ndo_get_stats64() to be called
> under rtnl_lock() to avoid race conditions with device close or ethtool
> reconfigurations.  Fix it so that all callers take rtnl_lock().
> 
> Rename the original dev_get_stats() as __dev_get_stats() and add a new
> dev_get_stats() that takes rtnl_lock() before calling __dev_get_stats().
> Modify all callers that already take rtnl_lock() to call __dev_get_stats().

This makes no sense to me.

RTNL is absolutely not needed to get device stats.

We try to not add RTNL, especially when not required.

Sure, RTNETLINK dumps currently hold RTNL, but we had various attempts
in the past to get rid of this behavior.

If a device driver expects RTNL being locked, it is clearly a bug that
needs a fix anyway.

^ permalink raw reply

* [net-next PATCH] net: reduce cycles spend on ICMP replies that gets rate limited
From: Jesper Dangaard Brouer @ 2017-01-06 17:39 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, Jesper Dangaard Brouer

This patch split the global and per (inet)peer ICMP-reply limiter
code, and moves the global limit check to earlier in the packet
processing path.  Thus, avoid spending cycles on ICMP replies that
gets limited/suppressed anyhow.

The global ICMP rate limiter icmp_global_allow() is a good solution,
it just happens too late in the process.  The kernel goes through
allocating memory and route lookup (return path) for the ICMP message,
before taking the rate limit decision of not sending the ICMP reply.

Details: The kernels global rate limiter for ICMP messages got added
in commit 4cdf507d5452 ("icmp: add a global rate limitation").  It is
a token bucket limiter with a global lock.  It brilliantly avoids
locking congestion by only updating when 20ms (HZ/50) were elapsed. It
can then avoids taking lock when credit is exhausted (when under
pressure) and time constraint for refill is not yet meet.

Use-case: The specific case I experienced this being a bottleneck is,
sending UDP packets to a port with no listener, which obviously result
in kernel replying with ICMP Destination Unreachable (type:3), Port
Unreachable (code:3), which cause the bottleneck.
 After Eric and Paolo optimized the UDP socket code, the kernels PPS
processing capabilities is lower for no-listen ports, than normal UDP
sockets.  This is bad for capacity planning when restarting a service.

UDP no-listen benchmark 8xCPUs using pktgen_sample04_many_flows.sh:
 Baseline: 6.6 Mpps
 Patch:   14.5 Mpps

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 net/ipv4/icmp.c |   87 +++++++++++++++++++++++++++++++++++--------------------
 net/ipv6/icmp.c |   49 +++++++++++++++++++++----------
 2 files changed, 90 insertions(+), 46 deletions(-)

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 0777ea949223..3d7b447c8b72 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -282,6 +282,33 @@ bool icmp_global_allow(void)
 }
 EXPORT_SYMBOL(icmp_global_allow);
 
+static bool icmpv4_mask_allow(struct net *net, int type, int code)
+{
+	if (type > NR_ICMP_TYPES)
+		return true;
+
+	/* Don't limit PMTU discovery. */
+	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+		return true;
+
+	/* Limit if icmp type is enabled in ratemask. */
+	if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
+		return true;
+
+	return false;
+}
+
+static bool icmpv4_global_allow(struct net *net, int type, int code)
+{
+	if (icmpv4_mask_allow(net, type, code))
+		return true;
+
+	if (icmp_global_allow())
+		return true;
+
+	return false;
+}
+
 /*
  *	Send an ICMP frame.
  */
@@ -290,34 +317,22 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
 			       struct flowi4 *fl4, int type, int code)
 {
 	struct dst_entry *dst = &rt->dst;
+	struct inet_peer *peer;
 	bool rc = true;
+	int vif;
 
-	if (type > NR_ICMP_TYPES)
-		goto out;
-
-	/* Don't limit PMTU discovery. */
-	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+	if (icmpv4_mask_allow(net, type, code))
 		goto out;
 
 	/* No rate limit on loopback */
 	if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
 		goto out;
 
-	/* Limit if icmp type is enabled in ratemask. */
-	if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
-		goto out;
-
-	rc = false;
-	if (icmp_global_allow()) {
-		int vif = l3mdev_master_ifindex(dst->dev);
-		struct inet_peer *peer;
-
-		peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
-		rc = inet_peer_xrlim_allow(peer,
-					   net->ipv4.sysctl_icmp_ratelimit);
-		if (peer)
-			inet_putpeer(peer);
-	}
+	vif = l3mdev_master_ifindex(dst->dev);
+	peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
+	rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit);
+	if (peer)
+		inet_putpeer(peer);
 out:
 	return rc;
 }
@@ -396,6 +411,8 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	struct inet_sock *inet;
 	__be32 daddr, saddr;
 	u32 mark = IP4_REPLY_MARK(net, skb->mark);
+	int type = icmp_param->data.icmph.type;
+	int code = icmp_param->data.icmph.code;
 
 	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
 		return;
@@ -405,6 +422,10 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 		return;
 	inet = inet_sk(sk);
 
+	/* global icmp_msgs_per_sec */
+	if (!icmpv4_global_allow(net, type, code))
+		goto out_unlock;
+
 	icmp_param->data.icmph.checksum = 0;
 
 	inet->tos = ip_hdr(skb)->tos;
@@ -433,8 +454,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	rt = ip_route_output_key(net, &fl4);
 	if (IS_ERR(rt))
 		goto out_unlock;
-	if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type,
-			       icmp_param->data.icmph.code))
+	if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
 		icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
 	ip_rt_put(rt);
 out_unlock:
@@ -648,13 +668,17 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 		}
 	}
 
-	icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC);
-	if (!icmp_param)
-		return;
-
 	sk = icmp_xmit_lock(net);
 	if (!sk)
-		goto out_free;
+		goto out;
+
+	/* Check global sysctl_icmp_msgs_per_sec ratelimit */
+	if (!icmpv4_global_allow(net, type, code))
+		goto out_unlock;
+
+	icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC);
+	if (!icmp_param)
+		goto out_unlock;
 
 	/*
 	 *	Construct source address and options.
@@ -682,7 +706,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	mark = IP4_REPLY_MARK(net, skb_in->mark);
 
 	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))
-		goto out_unlock;
+		goto out_free;
 
 
 	/*
@@ -706,8 +730,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
 			       type, code, icmp_param);
 	if (IS_ERR(rt))
-		goto out_unlock;
+		goto out_free;
 
+	/* peer icmp_ratelimit */
 	if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
 		goto ende;
 
@@ -727,10 +752,10 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
 ende:
 	ip_rt_put(rt);
-out_unlock:
-	icmp_xmit_unlock(sk);
 out_free:
 	kfree(icmp_param);
+out_unlock:
+	icmp_xmit_unlock(sk);
 out:;
 }
 EXPORT_SYMBOL(icmp_send);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 3036f665e6c8..b26ae8b5c1ce 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -168,6 +168,30 @@ static bool is_ineligible(const struct sk_buff *skb)
 	return false;
 }
 
+static bool icmpv6_mask_allow(int type)
+{
+	/* Informational messages are not limited. */
+	if (type & ICMPV6_INFOMSG_MASK)
+		return true;
+
+	/* Do not limit pmtu discovery, it would break it. */
+	if (type == ICMPV6_PKT_TOOBIG)
+		return true;
+
+	return false;
+}
+
+static bool icmpv6_global_allow(int type)
+{
+	if (icmpv6_mask_allow(type))
+		return true;
+
+	if (icmp_global_allow())
+		return true;
+
+	return false;
+}
+
 /*
  * Check the ICMP output rate limit
  */
@@ -178,12 +202,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
 	struct dst_entry *dst;
 	bool res = false;
 
-	/* Informational messages are not limited. */
-	if (type & ICMPV6_INFOMSG_MASK)
-		return true;
-
-	/* Do not limit pmtu discovery, it would break it. */
-	if (type == ICMPV6_PKT_TOOBIG)
+	if (icmpv6_mask_allow(type))
 		return true;
 
 	/*
@@ -200,20 +219,16 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
 	} else {
 		struct rt6_info *rt = (struct rt6_info *)dst;
 		int tmo = net->ipv6.sysctl.icmpv6_time;
+		struct inet_peer *peer;
 
 		/* Give more bandwidth to wider prefixes. */
 		if (rt->rt6i_dst.plen < 128)
 			tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
 
-		if (icmp_global_allow()) {
-			struct inet_peer *peer;
-
-			peer = inet_getpeer_v6(net->ipv6.peers,
-					       &fl6->daddr, 1);
-			res = inet_peer_xrlim_allow(peer, tmo);
-			if (peer)
-				inet_putpeer(peer);
-		}
+		peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr, 1);
+		res = inet_peer_xrlim_allow(peer, tmo);
+		if (peer)
+			inet_putpeer(peer);
 	}
 	dst_release(dst);
 	return res;
@@ -493,6 +508,10 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 	sk = icmpv6_xmit_lock(net);
 	if (!sk)
 		return;
+
+	if (!icmpv6_global_allow(type))
+		goto out;
+
 	sk->sk_mark = mark;
 	np = inet6_sk(sk);
 

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox