Netdev List
 help / color / mirror / Atom feed
* [PATCH net 5/5] net: fix races in page->_count manipulation
From: Eric Dumazet @ 2014-10-10  5:24 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Alexander Duyck, Andres Lagar-Cavilla, Greg Thelen,
	Hugh Dickins, David Rientjes, Eric Dumazet
In-Reply-To: <1412918694-22882-1-git-send-email-edumazet@google.com>

This is illegal to use atomic_set(&page->_count, ...) even if we 'own'
the page. Other entities in the kernel need to use get_page_unless_zero()
to get a reference to the page before testing page properties, so we could
loose a refcount increment.

The only case it is valid is when page->_count is 0

Fixes: 540eb7bf0bbed ("net: Update alloc frag to reduce get/put page usage and recycle pages")
Signed-off-by: Eric Dumaze <edumazet@google.com>
---
 net/core/skbuff.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a30d750647e7..829d013745ab 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -360,18 +360,29 @@ refill:
 				goto end;
 		}
 		nc->frag.size = PAGE_SIZE << order;
-recycle:
-		atomic_set(&nc->frag.page->_count, NETDEV_PAGECNT_MAX_BIAS);
+		/* Even if we own the page, we do not use atomic_set().
+		 * This would break get_page_unless_zero() users.
+		 */
+		atomic_add(NETDEV_PAGECNT_MAX_BIAS - 1,
+			   &nc->frag.page->_count);
 		nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS;
 		nc->frag.offset = 0;
 	}
 
 	if (nc->frag.offset + fragsz > nc->frag.size) {
-		/* avoid unnecessary locked operations if possible */
-		if ((atomic_read(&nc->frag.page->_count) == nc->pagecnt_bias) ||
-		    atomic_sub_and_test(nc->pagecnt_bias, &nc->frag.page->_count))
-			goto recycle;
-		goto refill;
+		if (atomic_read(&nc->frag.page->_count) != nc->pagecnt_bias) {
+			if (!atomic_sub_and_test(nc->pagecnt_bias,
+						 &nc->frag.page->_count))
+				goto refill;
+			/* OK, page count is 0, we can safely set it */
+			atomic_set(&nc->frag.page->_count,
+				   NETDEV_PAGECNT_MAX_BIAS);
+		} else {
+			atomic_add(NETDEV_PAGECNT_MAX_BIAS - nc->pagecnt_bias,
+				   &nc->frag.page->_count);
+		}
+		nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS;
+		nc->frag.offset = 0;
 	}
 
 	data = page_address(nc->frag.page) + nc->frag.offset;
-- 
2.1.0.rc2.206.gedb03e5

^ permalink raw reply related

* [PATCH net 4/5] mlx4: fix race accessing page->_count
From: Eric Dumazet @ 2014-10-10  5:24 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Alexander Duyck, Andres Lagar-Cavilla, Greg Thelen,
	Hugh Dickins, David Rientjes, Eric Dumazet
In-Reply-To: <1412918694-22882-1-git-send-email-edumazet@google.com>

This is illegal to use atomic_set(&page->_count, ...) even if we 'own'
the page. Other entities in the kernel need to use get_page_unless_zero()
to get a reference to the page before testing page properties, so we could
loose a refcount increment.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index a33048ee9621..01660c595f5c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -76,10 +76,10 @@ static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
 	page_alloc->dma = dma;
 	page_alloc->page_offset = frag_info->frag_align;
 	/* Not doing get_page() for each frag is a big win
-	 * on asymetric workloads.
+	 * on asymetric workloads. Note we can not use atomic_set().
 	 */
-	atomic_set(&page->_count,
-		   page_alloc->page_size / frag_info->frag_stride);
+	atomic_add(page_alloc->page_size / frag_info->frag_stride - 1,
+		   &page->_count);
 	return 0;
 }
 
-- 
2.1.0.rc2.206.gedb03e5

^ permalink raw reply related

* [PATCH net 3/5] igb: fix race accessing page->_count
From: Eric Dumazet @ 2014-10-10  5:24 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Alexander Duyck, Andres Lagar-Cavilla, Greg Thelen,
	Hugh Dickins, David Rientjes, Eric Dumazet
In-Reply-To: <1412918694-22882-1-git-send-email-edumazet@google.com>

This is illegal to use atomic_set(&page->_count, 2) even if we 'own'
the page. Other entities in the kernel need to use get_page_unless_zero()
to get a reference to the page before testing page properties, so we could
loose a refcount increment.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index d677b5a23b58..fec5212d4337 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1865,12 +1865,10 @@ static bool ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring,
 	/* flip page offset to other buffer */
 	rx_buffer->page_offset ^= truesize;
 
-	/*
-	 * since we are the only owner of the page and we need to
-	 * increment it, just set the value to 2 in order to avoid
-	 * an unecessary locked operation
+	/* Even if we own the page, we are not allowed to use atomic_set()
+	 * This would break get_page_unless_zero() users.
 	 */
-	atomic_set(&page->_count, 2);
+	atomic_inc(&page->_count);
 #else
 	/* move offset up to the next cache line */
 	rx_buffer->page_offset += truesize;
-- 
2.1.0.rc2.206.gedb03e5

^ permalink raw reply related

* [PATCH net 2/5] igb: fix race accessing page->_count
From: Eric Dumazet @ 2014-10-10  5:24 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Alexander Duyck, Andres Lagar-Cavilla, Greg Thelen,
	Hugh Dickins, David Rientjes, Eric Dumazet
In-Reply-To: <1412918694-22882-1-git-send-email-edumazet@google.com>

This is illegal to use atomic_set(&page->_count, 2) even if we 'own'
the page. Other entities in the kernel need to use get_page_unless_zero()
to get a reference to the page before testing page properties, so we could
loose a refcount increment.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index ae59c0b108c5..a21b14495ebd 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -6545,11 +6545,10 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,
 	/* flip page offset to other buffer */
 	rx_buffer->page_offset ^= IGB_RX_BUFSZ;
 
-	/* since we are the only owner of the page and we need to
-	 * increment it, just set the value to 2 in order to avoid
-	 * an unnecessary locked operation
+	/* Even if we own the page, we are not allowed to use atomic_set()
+	 * This would break get_page_unless_zero() users.
 	 */
-	atomic_set(&page->_count, 2);
+	atomic_inc(&page->_count);
 #else
 	/* move offset up to the next cache line */
 	rx_buffer->page_offset += truesize;
-- 
2.1.0.rc2.206.gedb03e5

^ permalink raw reply related

* [PATCH net 1/5] fm10k: fix race accessing page->_count
From: Eric Dumazet @ 2014-10-10  5:24 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Alexander Duyck, Andres Lagar-Cavilla, Greg Thelen,
	Hugh Dickins, David Rientjes, Eric Dumazet
In-Reply-To: <1412918694-22882-1-git-send-email-edumazet@google.com>

This is illegal to use atomic_set(&page->_count, 2) even if we 'own'
the page. Other entities in the kernel need to use get_page_unless_zero()
to get a reference to the page before testing page properties, so we could
loose a refcount increment.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k_main.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
index 6c800a330d66..9d7118a0d67a 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
@@ -219,11 +219,10 @@ static bool fm10k_can_reuse_rx_page(struct fm10k_rx_buffer *rx_buffer,
 	/* flip page offset to other buffer */
 	rx_buffer->page_offset ^= FM10K_RX_BUFSZ;
 
-	/* since we are the only owner of the page and we need to
-	 * increment it, just set the value to 2 in order to avoid
-	 * an unnecessary locked operation
+	/* Even if we own the page, we are not allowed to use atomic_set()
+	 * This would break get_page_unless_zero() users.
 	 */
-	atomic_set(&page->_count, 2);
+	atomic_inc(&page->_count);
 #else
 	/* move offset up to the next cache line */
 	rx_buffer->page_offset += truesize;
-- 
2.1.0.rc2.206.gedb03e5

^ permalink raw reply related

* [PATCH net 0/5] net: fix races accessing page->_count
From: Eric Dumazet @ 2014-10-10  5:24 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Alexander Duyck, Andres Lagar-Cavilla, Greg Thelen,
	Hugh Dickins, David Rientjes, Eric Dumazet

This is illegal to use atomic_set(&page->_count, ...) even if we 'own'
the page. Other entities in the kernel need to use get_page_unless_zero()
to get a reference to the page before testing page properties, so we could
loose a refcount increment.

The only case it is valid is when page->_count is 0, we can use this in
__netdev_alloc_frag()

Note that I never seen crashes caused by these races, the issue was reported
by Andres Lagar-Cavilla and Hugh Dickins.

Eric Dumazet (5):
  fm10k: fix race accessing page->_count
  igb: fix race accessing page->_count
  igb: fix race accessing page->_count
  mlx4: fix race accessing page->_count
  net: fix races in page->_count manipulation

 drivers/net/ethernet/intel/fm10k/fm10k_main.c |  7 +++----
 drivers/net/ethernet/intel/igb/igb_main.c     |  7 +++----
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  8 +++-----
 drivers/net/ethernet/mellanox/mlx4/en_rx.c    |  6 +++---
 net/core/skbuff.c                             | 25 ++++++++++++++++++-------
 5 files changed, 30 insertions(+), 23 deletions(-)

-- 
2.1.0.rc2.206.gedb03e5

^ permalink raw reply

* Re: [PATCH v2] networking: fm10k: Fix build failure
From: Jeff Kirsher @ 2014-10-10  5:22 UTC (permalink / raw)
  To: Pranith Kumar
  Cc: Don, open list:INTEL ETHERNET DR..., Bruce Allan,
	Jesse Brandeburg, open list, Ronciak,
	open list:NETWORKING DRIVERS, Linux NICS, John
In-Reply-To: <1412918346-24763-1-git-send-email-bobby.prani@gmail.com>


[-- Attachment #1.1: Type: text/plain, Size: 1354 bytes --]

On Fri, 2014-10-10 at 01:19 -0400, Pranith Kumar wrote:
> The latest linus git tip (3.18-rc1) fails with the following build failure. Fix
> this by making PTP support explicit for fm10k driver.
> 
> rivers/built-in.o: In function `fm10k_ptp_register':
> (.text+0x12e760): undefined reference to `ptp_clock_registER'
> drivers/built-in.o: In function `fm10k_ptp_unregister':
> (.text+0x12e7dc): undefined reference to `ptp_clock_unregister'
> Makefile:930: recipe for target 'vmlinux' failed
> 
> Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>

Acked-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>

Dave- go ahead and pull this in directly, no need for me to put this
through our internal process.

> ---
>  drivers/net/ethernet/intel/Kconfig | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig
> index 6a6d5ee..6919adb 100644
> --- a/drivers/net/ethernet/intel/Kconfig
> +++ b/drivers/net/ethernet/intel/Kconfig
> @@ -304,6 +304,7 @@ config FM10K
>  	tristate "Intel(R) FM10000 Ethernet Switch Host Interface Support"
>  	default n
>  	depends on PCI_MSI
> +	select PTP_1588_CLOCK
>  	---help---
>  	  This driver supports Intel(R) FM10000 Ethernet Switch Host
>  	  Interface.  For more information on how to identify your adapter,



[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

[-- Attachment #2: Type: text/plain, Size: 430 bytes --]

------------------------------------------------------------------------------
Meet PCI DSS 3.0 Compliance Requirements with EventLog Analyzer
Achieve PCI DSS 3.0 Compliant Status with Out-of-the-box PCI DSS Reports
Are you Audit-Ready for PCI DSS 3.0 Compliance? Download White paper
Comply to PCI DSS 3.0 Requirement 10 and 11.5 with EventLog Analyzer
http://pubads.g.doubleclick.net/gampad/clk?id=154622311&iu=/4140/ostg.clktrk

[-- Attachment #3: Type: text/plain, Size: 257 bytes --]

_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel&#174; Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply

* Re: [PATCH v2] networking: fm10k: Fix build failure
From: David Miller @ 2014-10-10  5:20 UTC (permalink / raw)
  To: bobby.prani
  Cc: linux.nics, e1000-devel, bruce.w.allan, jesse.brandeburg,
	linux-kernel, john.ronciak, netdev
In-Reply-To: <1412918346-24763-1-git-send-email-bobby.prani@gmail.com>

From: Pranith Kumar <bobby.prani@gmail.com>
Date: Fri, 10 Oct 2014 01:19:06 -0400

> The latest linus git tip (3.18-rc1) fails with the following build failure. Fix
> this by making PTP support explicit for fm10k driver.
> 
> rivers/built-in.o: In function `fm10k_ptp_register':
> (.text+0x12e760): undefined reference to `ptp_clock_registER'
> drivers/built-in.o: In function `fm10k_ptp_unregister':
> (.text+0x12e7dc): undefined reference to `ptp_clock_unregister'
> Makefile:930: recipe for target 'vmlinux' failed
> 
> Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>

Applied, thanks.

------------------------------------------------------------------------------
Meet PCI DSS 3.0 Compliance Requirements with EventLog Analyzer
Achieve PCI DSS 3.0 Compliant Status with Out-of-the-box PCI DSS Reports
Are you Audit-Ready for PCI DSS 3.0 Compliance? Download White paper
Comply to PCI DSS 3.0 Requirement 10 and 11.5 with EventLog Analyzer
http://pubads.g.doubleclick.net/gampad/clk?id=154622311&iu=/4140/ostg.clktrk
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel&#174; Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply

* [PATCH v2] networking: fm10k: Fix build failure
From: Pranith Kumar @ 2014-10-10  5:19 UTC (permalink / raw)
  To: Jeff Kirsher, Jesse Brandeburg, Bruce Allan, Carolyn Wyborny,
	Don Skidmore, Greg Rose, Matthew Vick, John Ronciak,
	Mitch Williams, Linux NICS, open list:INTEL ETHERNET DR...,
	open list:NETWORKING DRIVERS, open list

The latest linus git tip (3.18-rc1) fails with the following build failure. Fix
this by making PTP support explicit for fm10k driver.

rivers/built-in.o: In function `fm10k_ptp_register':
(.text+0x12e760): undefined reference to `ptp_clock_registER'
drivers/built-in.o: In function `fm10k_ptp_unregister':
(.text+0x12e7dc): undefined reference to `ptp_clock_unregister'
Makefile:930: recipe for target 'vmlinux' failed

Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
---
 drivers/net/ethernet/intel/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig
index 6a6d5ee..6919adb 100644
--- a/drivers/net/ethernet/intel/Kconfig
+++ b/drivers/net/ethernet/intel/Kconfig
@@ -304,6 +304,7 @@ config FM10K
 	tristate "Intel(R) FM10000 Ethernet Switch Host Interface Support"
 	default n
 	depends on PCI_MSI
+	select PTP_1588_CLOCK
 	---help---
 	  This driver supports Intel(R) FM10000 Ethernet Switch Host
 	  Interface.  For more information on how to identify your adapter,
-- 
1.9.1


------------------------------------------------------------------------------
Meet PCI DSS 3.0 Compliance Requirements with EventLog Analyzer
Achieve PCI DSS 3.0 Compliant Status with Out-of-the-box PCI DSS Reports
Are you Audit-Ready for PCI DSS 3.0 Compliance? Download White paper
Comply to PCI DSS 3.0 Requirement 10 and 11.5 with EventLog Analyzer
http://pubads.g.doubleclick.net/gampad/clk?id=154622311&iu=/4140/ostg.clktrk
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel&#174; Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply related

* Re: [PATCH net-next 0/2] sunvnet: Packet processing in non-interrupt context.
From: Raghuram Kothakota @ 2014-10-10  5:13 UTC (permalink / raw)
  To: David Miller; +Cc: sowmini.varadhan, netdev
In-Reply-To: <20141010.010320.1545286759850518718.davem@davemloft.net>

> 
> Linux's TX path it fully parallelized and multiqueue, so you can have
> as many parallel TX threads of control executing over a specific
> device as you can provide TX queues.

Thanks, I guess we need to explore Tx queues as well.

-Raghuram
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH net-next 0/2] sunvnet: Packet processing in non-interrupt context.
From: David Miller @ 2014-10-10  5:03 UTC (permalink / raw)
  To: Raghuram.Kothakota; +Cc: sowmini.varadhan, netdev
In-Reply-To: <960E9AF5-0319-44C7-9A1C-37EAEE52E78E@oracle.com>

From: Raghuram Kothakota <Raghuram.Kothakota@oracle.com>
Date: Thu, 9 Oct 2014 21:56:45 -0700

> Sorry, I used incorrect terminology in my email. My knowledge of LLTX
> is limited and I am still learning. I was not referring to the LLTX, but  about 
> the implementation of  sunvnet transmit path and receive paths without locks. To me that
> means only one thread of execution exists at a given time and I was
> referring to it as single threadedness,  which limits performance on SPARC CMT
> processors today. Using  methods to increase parallelism will help especially
> when the traffic involves multiple connections, mainly from the point of view
> of using multiple vCPUs to perform the processing where possible. 

Let's not speak in generalities, but rather about specifical
implementations of specific things.

Linux's TX path it fully parallelized and multiqueue, so you can have
as many parallel TX threads of control executing over a specific
device as you can provide TX queues.

^ permalink raw reply

* Re: [PATCH] networking: fm10k: Fix build failure
From: David Miller @ 2014-10-10  5:01 UTC (permalink / raw)
  To: bobby.prani
  Cc: linux.nics, e1000-devel, bruce.w.allan, jesse.brandeburg,
	linux-kernel, john.ronciak, netdev
In-Reply-To: <1412916929-21592-1-git-send-email-bobby.prani@gmail.com>

From: Pranith Kumar <bobby.prani@gmail.com>
Date: Fri, 10 Oct 2014 00:55:29 -0400

> The latest linus git tip (3.18-rc1) fails with the following build failure. Fix
> this by making PTP support explicit for fm10k driver.
> 
> rivers/built-in.o: In function `fm10k_ptp_register':
> (.text+0x12e760): undefined reference to `ptp_clock_registER'
> drivers/built-in.o: In function `fm10k_ptp_unregister':
> (.text+0x12e7dc): undefined reference to `ptp_clock_unregister'
> Makefile:930: recipe for target 'vmlinux' failed
> 
> Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>

Please follow what other drivers do, which is to use "select" on this
Kconfig symbol.

Thanks.

------------------------------------------------------------------------------
Meet PCI DSS 3.0 Compliance Requirements with EventLog Analyzer
Achieve PCI DSS 3.0 Compliant Status with Out-of-the-box PCI DSS Reports
Are you Audit-Ready for PCI DSS 3.0 Compliance? Download White paper
Comply to PCI DSS 3.0 Requirement 10 and 11.5 with EventLog Analyzer
http://pubads.g.doubleclick.net/gampad/clk?id=154622311&iu=/4140/ostg.clktrk
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel&#174; Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply

* Re: [PATCH net-next 0/2] sunvnet: Packet processing in non-interrupt context.
From: Raghuram Kothakota @ 2014-10-10  4:56 UTC (permalink / raw)
  To: David Miller; +Cc: sowmini.varadhan, netdev
In-Reply-To: <20141010.003628.646817757614430018.davem@davemloft.net>


On Oct 9, 2014, at 9:36 PM, David Miller <davem@davemloft.net> wrote:

> From: Raghuram Kothakota <Raghuram.Kothakota@oracle.com>
> Date: Thu, 9 Oct 2014 18:10:24 -0700
> 
>> Lock less Tx and Rx implementation is very nice, but requires to the
>> code path to single threaded to achieve it.
> 
> I would like to know how you believe the Linux LLTX "lockless TX"
> facility works.
> 

Sorry, I used incorrect terminology in my email. My knowledge of LLTX
is limited and I am still learning. I was not referring to the LLTX, but  about 
the implementation of  sunvnet transmit path and receive paths without locks. To me that
means only one thread of execution exists at a given time and I was
referring to it as single threadedness,  which limits performance on SPARC CMT
processors today. Using  methods to increase parallelism will help especially
when the traffic involves multiple connections, mainly from the point of view
of using multiple vCPUs to perform the processing where possible. 

-Raghuram

> It's not truly lockless, it just turns off the generic networking TX
> path per-queue spinlock and puts the full burdon on the driver.  It's
> just moving the locking from one place to another, not eliminating it.

> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH] networking: fm10k: Fix build failure
From: Pranith Kumar @ 2014-10-10  4:55 UTC (permalink / raw)
  To: Jeff Kirsher, Jesse Brandeburg, Bruce Allan, Carolyn Wyborny,
	Don Skidmore, Greg Rose, Matthew Vick, John Ronciak,
	Mitch Williams, Linux NICS, open list:INTEL ETHERNET DR...,
	open list:NETWORKING DRIVERS, open list

The latest linus git tip (3.18-rc1) fails with the following build failure. Fix
this by making PTP support explicit for fm10k driver.

rivers/built-in.o: In function `fm10k_ptp_register':
(.text+0x12e760): undefined reference to `ptp_clock_registER'
drivers/built-in.o: In function `fm10k_ptp_unregister':
(.text+0x12e7dc): undefined reference to `ptp_clock_unregister'
Makefile:930: recipe for target 'vmlinux' failed

Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
---
 drivers/net/ethernet/intel/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig
index 6a6d5ee..281d998 100644
--- a/drivers/net/ethernet/intel/Kconfig
+++ b/drivers/net/ethernet/intel/Kconfig
@@ -303,7 +303,7 @@ config I40EVF
 config FM10K
 	tristate "Intel(R) FM10000 Ethernet Switch Host Interface Support"
 	default n
-	depends on PCI_MSI
+	depends on PCI_MSI && PTP_1588_CLOCK
 	---help---
 	  This driver supports Intel(R) FM10000 Ethernet Switch Host
 	  Interface.  For more information on how to identify your adapter,
-- 
1.9.1


------------------------------------------------------------------------------
Meet PCI DSS 3.0 Compliance Requirements with EventLog Analyzer
Achieve PCI DSS 3.0 Compliant Status with Out-of-the-box PCI DSS Reports
Are you Audit-Ready for PCI DSS 3.0 Compliance? Download White paper
Comply to PCI DSS 3.0 Requirement 10 and 11.5 with EventLog Analyzer
http://pubads.g.doubleclick.net/gampad/clk?id=154622311&iu=/4140/ostg.clktrk
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel&#174; Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply related

* Re: [PATCH] net: fs_enet: error: 'SCCE_ENET_TXF' undeclared
From: David Miller @ 2014-10-10  4:51 UTC (permalink / raw)
  To: christophe.leroy
  Cc: pantelis.antoniou, vbordug, linux-kernel, linuxppc-dev, netdev
In-Reply-To: <20141009145443.74AD61AB275@localhost.localdomain>

From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Thu,  9 Oct 2014 16:54:43 +0200 (CEST)

> [linux-devel:devel-hourly-2014100909 3763/3915] drivers/net/ethernet/freescale/fs_enet/mac-scc.c:119:32: error: 'SCCE_ENET_TXF' undeclared
> 
> Due to patch d43a396 net: fs_enet: Add NAPI TX, it appears that some target
> compilations are broken.
> This is due to the fact that unlike the FEC, the SCC and FCC don't have a TXF
> event (complete Frame transmitted) but only TXB (buffer transmitted).
> 
> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH] flow-dissector: Fix alignment issue in __skb_flow_get_ports
From: David Miller @ 2014-10-10  4:47 UTC (permalink / raw)
  To: alexander.duyck; +Cc: netdev
In-Reply-To: <20141010035840.21428.359.stgit@ahduyck-workstation.home>

From: alexander.duyck@gmail.com
Date: Thu, 09 Oct 2014 21:03:28 -0700

> From: Alexander Duyck <alexander.h.duyck@redhat.com>
> 
> This patch addresses a kernel unaligned access bug seen on a sparc64 system
> with an igb adapter.  Specifically the __skb_flow_get_ports was returning a
> be32 pointer which was then having the value directly returned.
> 
> In order to keep the handling of the ports consistent with how we were
> handling the IPv4 and IPv6 addresses I have instead replaced the assignment
> with a memcpy to the flow key ports value.  This way it should stay a
> memcpy on systems that cannot handle unaligned access, and will likely be
> converted to a 32b assignment on the systems that can support it.
> 
> Reported-by: David S. Miller <davem@davemloft.net>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>

Guess what the compiler will output for the memcpy()....

	*(u32 *)dest = *(u32 *)src;

Using memcpy() is never a valid way to avoid misaligned loads and
stores.

If the types have a given alignment, the compiler can legitimately
expand the memcpy() inline with suitably sized loads and stores.

Please see my other reply in the original thread, we have to use
the hardware when we can to manage this situation, by configuring
it to output two garbage bytes before the packet contents when
DMA'ing into power-of-2 aligned blocks of memory.

Thanks.

^ permalink raw reply

* Re: eth_get_headlen() and unaligned accesses...
From: David Miller @ 2014-10-10  4:43 UTC (permalink / raw)
  To: alexander.h.duyck; +Cc: netdev
In-Reply-To: <54374E09.1020907@redhat.com>

From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Thu, 09 Oct 2014 20:10:01 -0700

> On 10/09/2014 05:12 PM, David Miller wrote:
>> So, we have a bit of a problem, this is on sparc64:
>>
>> [423475.740836] Kernel unaligned access at TPC[81d330]
>> __skb_flow_get_ports+0x70/0xe0
>> [423475.755756] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.17.0+ #2
>> [423475.767854] Call Trace:
>> [423475.772877]  [0000000000433288] kernel_unaligned_trap+0x368/0x5c0
>> [423475.785203]  [000000000042a824] sun4v_do_mna+0x84/0xa0
>> [423475.795624]  [0000000000406cd0] sun4v_mna+0x5c/0x68
>> [423475.805521]  [000000000081d330] __skb_flow_get_ports+0x70/0xe0
>> [423475.817323]  [000000000081d6ac] __skb_flow_dissect+0x1ac/0x460
>> [423475.829128]  [0000000000843c98] eth_get_headlen+0x38/0xa0
>> [423475.840083]  [0000000010064d54] igb_poll+0x8d4/0xf60 [igb]
>> [423475.851184]  [00000000008243c8] net_rx_action+0xa8/0x1c0
>>
>> The chip DMA's to the beginning of a frag page and (unless timestamps
>> are enabled) that's where the ethernet header begins.
>>
>> So any larger than 16-bit access to the IP and later headers will be
>> unaligned.
>>
>> We have various ways we can deal with this based upon the capabilities
>> of the chips involved.  Can we configure the IGB to put 2 "don't care"
>> bytes at the beginning of the packet?
> 
> The problem is the igb part expects to be able to use 2K buffers which
> means it will always try to use the full half of a page.

Let me try to ask this again.

Can you configure the MAC to put two garbage bytes at the head of
the packet data as it feeds it into the DMA fifos on the IGB chip?

This is an essential (again: _essential_) feature for chips that
manage RX buffers as power-of-2 chunks of pages, as it is the only
cheap way to get the IP headers 32-bit aligned in those power-of-2 DMA
buffer blocks.

That would solve the whole problem.

^ permalink raw reply

* Re: [PATCH net-next 0/2] sunvnet: Packet processing in non-interrupt context.
From: David Miller @ 2014-10-10  4:36 UTC (permalink / raw)
  To: Raghuram.Kothakota; +Cc: sowmini.varadhan, netdev
In-Reply-To: <EB9A1CD1-6E1D-409C-89CB-95CC9A5AFC1D@oracle.com>

From: Raghuram Kothakota <Raghuram.Kothakota@oracle.com>
Date: Thu, 9 Oct 2014 18:10:24 -0700

> Lock less Tx and Rx implementation is very nice, but requires to the
> code path to single threaded to achieve it.

I would like to know how you believe the Linux LLTX "lockless TX"
facility works.

It's not truly lockless, it just turns off the generic networking TX
path per-queue spinlock and puts the full burdon on the driver.  It's
just moving the locking from one place to another, not eliminating it.

^ permalink raw reply

* Re: [PATCH][net-next] Documentation: replace __sk_run_filter with __bpf_prog_run
From: Alexei Starovoitov @ 2014-10-10  4:20 UTC (permalink / raw)
  To: roy.qing.li, Daniel Borkmann; +Cc: netdev@vger.kernel.org
In-Reply-To: <1412912214-964-1-git-send-email-roy.qing.li@gmail.com>

On Thu, Oct 9, 2014 at 8:36 PM,  <roy.qing.li@gmail.com> wrote:
> From: Li RongQing <roy.qing.li@gmail.com>
>
> __sk_run_filter has been renamed as __bpf_prog_run, so replace them in comments
>
> Signed-off-by: Li RongQing <roy.qing.li@gmail.com>

Acked-by: Alexei Starovoitov <ast@plumgrid.com>

> ---
>  Documentation/networking/filter.txt |    4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
> index 5ce4d07..c22b797 100644
> --- a/Documentation/networking/filter.txt
> +++ b/Documentation/networking/filter.txt
> @@ -700,11 +700,11 @@ Some core changes of the new internal format:
>      bpf_exit
>
>    If f2 is JITed and the pointer stored to '_f2'. The calls f1 -> f2 -> f3 and
> -  returns will be seamless. Without JIT, __sk_run_filter() interpreter needs to
> +  returns will be seamless. Without JIT, __bpf_prog_run() interpreter needs to
>    be used to call into f2.
>
>    For practical reasons all eBPF programs have only one argument 'ctx' which is
> -  already placed into R1 (e.g. on __sk_run_filter() startup) and the programs
> +  already placed into R1 (e.g. on __bpf_prog_run() startup) and the programs
>    can call kernel functions with up to 5 arguments. Calls with 6 or more arguments
>    are currently not supported, but these restrictions can be lifted if necessary
>    in the future.
> --
> 1.7.10.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH][net-next] net: filter: replace sk_run_filter with __bpf_prog_run in comments
From: Alexei Starovoitov @ 2014-10-10  4:17 UTC (permalink / raw)
  To: roy.qing.li; +Cc: netdev@vger.kernel.org
In-Reply-To: <1412912231-1004-1-git-send-email-roy.qing.li@gmail.com>

On Thu, Oct 9, 2014 at 8:37 PM,  <roy.qing.li@gmail.com> wrote:
> From: Li RongQing <roy.qing.li@gmail.com>
>
> sk_run_filter has been renamed as __bpf_prog_run, so replace them in comments
>
> Signed-off-by: Li RongQing <roy.qing.li@gmail.com>
> ---
>  net/core/filter.c |    8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/net/core/filter.c b/net/core/filter.c
> index fcd3f67..9324601 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -51,9 +51,9 @@
>   *     @skb: buffer to filter
>   *
>   * Run the filter code and then cut skb->data to correct size returned by
> - * sk_run_filter. If pkt_len is 0 we toss packet. If skb->len is smaller
> + * __bpf_prog_run. If pkt_len is 0 we toss packet. If skb->len is smaller
>   * than pkt_len we keep whole skb->data. This is the socket level
> - * wrapper to sk_run_filter. It returns 0 if the packet should
> + * wrapper to __bpf_prog_run. It returns 0 if the packet should
>   * be accepted or -EPERM if the packet should be tossed.

not quite. I think in both of the above lines it's better to replace
sk_run_filter with SK_RUN_FILTER

>   *
>   */
> @@ -567,10 +567,10 @@ err:
>  /* Security:
>   *
>   * A BPF program is able to use 16 cells of memory to store intermediate
> - * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()).
> + * values (check u32 mem[BPF_MEMWORDS] in __bpf_prog_run()).

this just wrong. Better to remove this line.

>   *
>   * As we dont want to clear mem[] array for each packet going through
> - * sk_run_filter(), we check that filter loaded by user never try to read
> + * __bpf_prog_run(), we check that filter loaded by user never try to read

here the change is fine.

^ permalink raw reply

* [PATCH] flow-dissector: Fix alignment issue in __skb_flow_get_ports
From: alexander.duyck @ 2014-10-10  4:03 UTC (permalink / raw)
  To: netdev, davem
In-Reply-To: <20141009.201248.1210454965155680255.davem@davemloft.net>

From: Alexander Duyck <alexander.h.duyck@redhat.com>

This patch addresses a kernel unaligned access bug seen on a sparc64 system
with an igb adapter.  Specifically the __skb_flow_get_ports was returning a
be32 pointer which was then having the value directly returned.

In order to keep the handling of the ports consistent with how we were
handling the IPv4 and IPv6 addresses I have instead replaced the assignment
with a memcpy to the flow key ports value.  This way it should stay a
memcpy on systems that cannot handle unaligned access, and will likely be
converted to a 32b assignment on the systems that can support it.

Reported-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
---
 drivers/net/bonding/bond_main.c |    2 +-
 include/net/flow_keys.h         |   10 ++++++----
 net/core/flow_dissector.c       |   21 +++++++++++++--------
 3 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index c9ac06c..326eb3d 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2993,7 +2993,7 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 		return false;
 	}
 	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34 && proto >= 0)
-		fk->ports = skb_flow_get_ports(skb, noff, proto);
+		skb_flow_get_ports(fk, skb, noff, proto);
 
 	return true;
 }
diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h
index 7ee2df0..66235f4 100644
--- a/include/net/flow_keys.h
+++ b/include/net/flow_keys.h
@@ -33,11 +33,13 @@ static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys
 {
 	return __skb_flow_dissect(skb, flow, NULL, 0, 0, 0);
 }
-__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
-			    void *data, int hlen_proto);
-static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto)
+void __skb_flow_get_ports(struct flow_keys *flow, const struct sk_buff *skb,
+			  int thoff, u8 ip_proto, void *data, int hlen_proto);
+static inline void skb_flow_get_ports(struct flow_keys *flow,
+				      const struct sk_buff *skb, int thoff,
+				      u8 ip_proto)
 {
-	return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0);
+	__skb_flow_get_ports(flow, skb, thoff, ip_proto, NULL, 0);
 }
 u32 flow_hash_from_keys(struct flow_keys *keys);
 unsigned int flow_get_hlen(const unsigned char *data, unsigned int max_len,
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 8560dea..baf8fe3 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -28,6 +28,7 @@ static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *i
 
 /**
  * __skb_flow_get_ports - extract the upper layer ports and return them
+ * @flow: Flow key to place port informatin into
  * @skb: sk_buff to extract the ports from
  * @thoff: transport header offset
  * @ip_proto: protocol for which to get port offset
@@ -37,8 +38,8 @@ static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *i
  * The function will try to retrieve the ports at offset thoff + poff where poff
  * is the protocol port offset returned from proto_ports_offset
  */
-__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
-			    void *data, int hlen)
+void __skb_flow_get_ports(struct flow_keys *flow, const struct sk_buff *skb,
+			  int thoff, u8 ip_proto, void *data, int hlen)
 {
 	int poff = proto_ports_offset(ip_proto);
 
@@ -48,15 +49,19 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
 	}
 
 	if (poff >= 0) {
-		__be32 *ports, _ports;
+		__be32 *ports;
 
 		ports = __skb_header_pointer(skb, thoff + poff,
-					     sizeof(_ports), data, hlen, &_ports);
-		if (ports)
-			return *ports;
+					     sizeof(flow->ports), data, hlen,
+					     &flow->ports);
+		if (ports) {
+			if (&flow->ports != ports)
+				memcpy(&flow->ports, ports, sizeof(flow->ports));
+			return;
+		}
 	}
 
-	return 0;
+	memset(&flow->ports, 0, sizeof(flow->ports));
 }
 EXPORT_SYMBOL(__skb_flow_get_ports);
 
@@ -231,7 +236,7 @@ ipv6:
 
 	flow->n_proto = proto;
 	flow->ip_proto = ip_proto;
-	flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen);
+	__skb_flow_get_ports(flow, skb, nhoff, ip_proto, data, hlen);
 	flow->thoff = (u16) nhoff;
 
 	return true;

^ permalink raw reply related

* RE: [net-next PATCH v1 1/3] net: sched: af_packet support for direct ring access
From: Zhou, Danny @ 2014-10-10  3:49 UTC (permalink / raw)
  To: David Miller
  Cc: willemb@google.com, john.fastabend@gmail.com, dborkman@redhat.com,
	fw@strlen.de, gerlitz.or@gmail.com, hannes@stressinduktion.org,
	netdev@vger.kernel.org, Ronciak, John, amirv@mellanox.com,
	eric.dumazet@gmail.com
In-Reply-To: <20141007.120534.1798634446901746809.davem@davemloft.net>


> -----Original Message-----
> From: David Miller [mailto:davem@davemloft.net]
> Sent: Wednesday, October 08, 2014 12:06 AM
> To: Zhou, Danny
> Cc: willemb@google.com; john.fastabend@gmail.com; dborkman@redhat.com; fw@strlen.de; gerlitz.or@gmail.com;
> hannes@stressinduktion.org; netdev@vger.kernel.org; Ronciak, John; amirv@mellanox.com; eric.dumazet@gmail.com
> Subject: Re: [net-next PATCH v1 1/3] net: sched: af_packet support for direct ring access
> 
> From: "Zhou, Danny" <danny.zhou@intel.com>
> Date: Tue, 7 Oct 2014 15:21:15 +0000
> 
> > Once qpairs split-off is done, the user space driver, as a slave
> > driver, will re-initialize those queues completely in user space by
> > using paddr(in the case of DPDK, vaddr of DPDK used huge pages are
> > translated to paddr) to fill in the packet descriptors.  As of
> > security concern raised in previous discussion, the reason we
> > think(BTW, correct me if I am wrong) af_packet is most suitable is
> > because only user application with root permission is allowed to
> > successfully split-off queue pairs and mmap a small window of PCIe
> > I/O space to user space, so concern regarding "device can DMA
> > from/to any arbitrary physical memory." is not that big. As all user
> > space device drivers based on UIO mechanism has the same concern
> > issue, VFIO adds protection but it is based on IOMMU which is
> > specific to Intel silicons.
> 
> Wait a second.
> 
> If there is no memory protection performed I'm not merging this.
> 
> I thought the user has to associate a fixed pool of memory to the
> queueus, the kernel attaches that memory, and then the user cannot
> modify the addresses _AT_ _ALL_.
> 
> If the user can modify the addresses in the descriptors and make
> the chip crap on random memory, this is a non-starter.
> 
> Sorry.

Fairly enough, we will manage to add memory protection in future versions. 
Several options are under investigation.

^ permalink raw reply

* [PATCH][net-next] net: filter: replace sk_run_filter with __bpf_prog_run in comments
From: roy.qing.li @ 2014-10-10  3:37 UTC (permalink / raw)
  To: netdev

From: Li RongQing <roy.qing.li@gmail.com> 

sk_run_filter has been renamed as __bpf_prog_run, so replace them in comments

Signed-off-by: Li RongQing <roy.qing.li@gmail.com>
---
 net/core/filter.c |    8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index fcd3f67..9324601 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -51,9 +51,9 @@
  *	@skb: buffer to filter
  *
  * Run the filter code and then cut skb->data to correct size returned by
- * sk_run_filter. If pkt_len is 0 we toss packet. If skb->len is smaller
+ * __bpf_prog_run. If pkt_len is 0 we toss packet. If skb->len is smaller
  * than pkt_len we keep whole skb->data. This is the socket level
- * wrapper to sk_run_filter. It returns 0 if the packet should
+ * wrapper to __bpf_prog_run. It returns 0 if the packet should
  * be accepted or -EPERM if the packet should be tossed.
  *
  */
@@ -567,10 +567,10 @@ err:
 /* Security:
  *
  * A BPF program is able to use 16 cells of memory to store intermediate
- * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()).
+ * values (check u32 mem[BPF_MEMWORDS] in __bpf_prog_run()).
  *
  * As we dont want to clear mem[] array for each packet going through
- * sk_run_filter(), we check that filter loaded by user never try to read
+ * __bpf_prog_run(), we check that filter loaded by user never try to read
  * a cell if not previously written, and we check all branches to be sure
  * a malicious user doesn't try to abuse us.
  */
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH][net-next] Documentation: replace __sk_run_filter with __bpf_prog_run
From: roy.qing.li @ 2014-10-10  3:36 UTC (permalink / raw)
  To: netdev

From: Li RongQing <roy.qing.li@gmail.com> 

__sk_run_filter has been renamed as __bpf_prog_run, so replace them in comments

Signed-off-by: Li RongQing <roy.qing.li@gmail.com>
---
 Documentation/networking/filter.txt |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index 5ce4d07..c22b797 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -700,11 +700,11 @@ Some core changes of the new internal format:
     bpf_exit
 
   If f2 is JITed and the pointer stored to '_f2'. The calls f1 -> f2 -> f3 and
-  returns will be seamless. Without JIT, __sk_run_filter() interpreter needs to
+  returns will be seamless. Without JIT, __bpf_prog_run() interpreter needs to
   be used to call into f2.
 
   For practical reasons all eBPF programs have only one argument 'ctx' which is
-  already placed into R1 (e.g. on __sk_run_filter() startup) and the programs
+  already placed into R1 (e.g. on __bpf_prog_run() startup) and the programs
   can call kernel functions with up to 5 arguments. Calls with 6 or more arguments
   are currently not supported, but these restrictions can be lifted if necessary
   in the future.
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH net-next 2/2] macvlan: optimize the receive path
From: Jason Baron @ 2014-10-10  3:13 UTC (permalink / raw)
  To: davem; +Cc: eric.dumazet, stephen, vyasevich, kaber, netdev
In-Reply-To: <cover.1412908929.git.jbaron@akamai.com>

The netif_rx() call on the fast path of macvlan_handle_frame() appears to
be there to ensure that we properly throttle incoming packets. However, it
would appear as though the proper throttling is already in place for all
possible ingress paths, and that the call is redundant. If packets are arriving
from the physical NIC, we've already throttled them by this point. Otherwise,
if they are coming via macvlan_queue_xmit(), it calls either
'dev_forward_skb()', which ends up calling netif_rx_internal(), or else in
the broadcast case, we are throttling via macvlan_broadcast_enqueue().

The test results below are from off the box to an lxc instance running macvlan.
Once the tranactions/sec stop increasing, the cpu idle time has gone to 0.
Results are from a quad core Intel E3-1270 V2@3.50GHz box with bnx2x 10G card.

for i in {10,100,200,300,400,500};
do super_netperf $i -H $ip -t TCP_RR; done
Average of 5 runs.

trans/sec 		 trans/sec
(3.17-rc7-net-next)      (3.17-rc7-net-next + this patch)
----------               ----------
208101                   211534 (+1.6%)
839493                   850162 (+1.3%)
845071                   844053 (-.12%)
816330                   819623 (+.4%)
778700                   789938 (+1.4%)
735984                   754408 (+2.5%)

Signed-off-by: Jason Baron <jbaron@akamai.com>
---
 drivers/net/macvlan.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index c7c58af..29b3bb4 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -407,7 +407,8 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb)
 	const struct macvlan_dev *src;
 	struct net_device *dev;
 	unsigned int len = 0;
-	int ret = NET_RX_DROP;
+	int ret;
+	rx_handler_result_t handle_res;
 
 	port = macvlan_port_get_rcu(skb->dev);
 	if (is_multicast_ether_addr(eth->h_dest)) {
@@ -423,6 +424,7 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb)
 			vlan = src;
 			ret = macvlan_broadcast_one(skb, vlan, eth, 0) ?:
 			      netif_rx(skb);
+			handle_res = RX_HANDLER_CONSUMED;
 			goto out;
 		}
 
@@ -448,17 +450,20 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb)
 	}
 	len = skb->len + ETH_HLEN;
 	skb = skb_share_check(skb, GFP_ATOMIC);
-	if (!skb)
+	if (!skb) {
+		ret = NET_RX_DROP;
+		handle_res = RX_HANDLER_CONSUMED;
 		goto out;
+	}
 
 	skb->dev = dev;
 	skb->pkt_type = PACKET_HOST;
 
-	ret = netif_rx(skb);
-
+	ret = NET_RX_SUCCESS;
+	handle_res = RX_HANDLER_ANOTHER;
 out:
 	macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false);
-	return RX_HANDLER_CONSUMED;
+	return handle_res;
 }
 
 static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
-- 
1.8.2.rc2

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox