Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next v2 1/4] bnxt_en: rename some xdp functions
From: Michael Chan @ 2019-07-08 21:53 UTC (permalink / raw)
  To: davem, gospo; +Cc: netdev, hawk, ast, ilias.apalodimas
In-Reply-To: <1562622784-29918-1-git-send-email-michael.chan@broadcom.com>

From: Andy Gospodarek <gospo@broadcom.com>

Renaming bnxt_xmit_xdp to __bnxt_xmit_xdp to get ready for XDP_REDIRECT
support and reduce confusion/namespace collision.

Signed-off-by: Andy Gospodarek <gospo@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c     | 8 ++++----
 drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h     | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index a6c7baf..21a0431 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -2799,7 +2799,7 @@ static int bnxt_run_loopback(struct bnxt *bp)
 		dev_kfree_skb(skb);
 		return -EIO;
 	}
-	bnxt_xmit_xdp(bp, txr, map, pkt_size, 0);
+	__bnxt_xmit_xdp(bp, txr, map, pkt_size, 0);
 
 	/* Sync BD data before updating doorbell */
 	wmb();
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
index 0184ef6..4bc9595 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
@@ -19,8 +19,8 @@
 #include "bnxt.h"
 #include "bnxt_xdp.h"
 
-void bnxt_xmit_xdp(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
-		   dma_addr_t mapping, u32 len, u16 rx_prod)
+void __bnxt_xmit_xdp(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
+		     dma_addr_t mapping, u32 len, u16 rx_prod)
 {
 	struct bnxt_sw_tx_bd *tx_buf;
 	struct tx_bd *txbd;
@@ -132,8 +132,8 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons,
 		*event = BNXT_TX_EVENT;
 		dma_sync_single_for_device(&pdev->dev, mapping + offset, *len,
 					   bp->rx_dir);
-		bnxt_xmit_xdp(bp, txr, mapping + offset, *len,
-			      NEXT_RX(rxr->rx_prod));
+		__bnxt_xmit_xdp(bp, txr, mapping + offset, *len,
+				NEXT_RX(rxr->rx_prod));
 		bnxt_reuse_rx_data(rxr, cons, page);
 		return true;
 	default:
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h
index 414b748..b36087b 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h
@@ -10,8 +10,8 @@
 #ifndef BNXT_XDP_H
 #define BNXT_XDP_H
 
-void bnxt_xmit_xdp(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
-		   dma_addr_t mapping, u32 len, u16 rx_prod);
+void __bnxt_xmit_xdp(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
+		     dma_addr_t mapping, u32 len, u16 rx_prod);
 void bnxt_tx_int_xdp(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts);
 bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons,
 		 struct page *page, u8 **data_ptr, unsigned int *len,
-- 
2.5.1


^ permalink raw reply related

* [PATCH net-next v2 2/4] bnxt_en: Refactor __bnxt_xmit_xdp().
From: Michael Chan @ 2019-07-08 21:53 UTC (permalink / raw)
  To: davem, gospo; +Cc: netdev, hawk, ast, ilias.apalodimas
In-Reply-To: <1562622784-29918-1-git-send-email-michael.chan@broadcom.com>

__bnxt_xmit_xdp() is used by XDP_TX and ethtool loopback packet transmit.
Refactor it so that it can be re-used by the XDP_REDIRECT logic.
Restructure the TX interrupt handler logic to cleanly separate XDP_TX
logic in preparation for XDP_REDIRECT.

Acked-by: Andy Gospodarek <gospo@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.h         |  1 +
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c |  2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c     | 33 ++++++++++++++++-------
 drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h     |  5 ++--
 4 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 4b3ae92..bf12cfc 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -596,6 +596,7 @@ struct bnxt_sw_tx_bd {
 	DEFINE_DMA_UNMAP_ADDR(mapping);
 	u8			is_gso;
 	u8			is_push;
+	u8			action;
 	union {
 		unsigned short		nr_frags;
 		u16			rx_prod;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 21a0431..a0f3277 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -2799,7 +2799,7 @@ static int bnxt_run_loopback(struct bnxt *bp)
 		dev_kfree_skb(skb);
 		return -EIO;
 	}
-	__bnxt_xmit_xdp(bp, txr, map, pkt_size, 0);
+	bnxt_xmit_bd(bp, txr, map, pkt_size);
 
 	/* Sync BD data before updating doorbell */
 	wmb();
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
index 4bc9595..41e232e 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
@@ -19,8 +19,9 @@
 #include "bnxt.h"
 #include "bnxt_xdp.h"
 
-void __bnxt_xmit_xdp(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
-		     dma_addr_t mapping, u32 len, u16 rx_prod)
+struct bnxt_sw_tx_bd *bnxt_xmit_bd(struct bnxt *bp,
+				   struct bnxt_tx_ring_info *txr,
+				   dma_addr_t mapping, u32 len)
 {
 	struct bnxt_sw_tx_bd *tx_buf;
 	struct tx_bd *txbd;
@@ -29,7 +30,6 @@ void __bnxt_xmit_xdp(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
 
 	prod = txr->tx_prod;
 	tx_buf = &txr->tx_buf_ring[prod];
-	tx_buf->rx_prod = rx_prod;
 
 	txbd = &txr->tx_desc_ring[TX_RING(prod)][TX_IDX(prod)];
 	flags = (len << TX_BD_LEN_SHIFT) | (1 << TX_BD_FLAGS_BD_CNT_SHIFT) |
@@ -40,30 +40,43 @@ void __bnxt_xmit_xdp(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
 
 	prod = NEXT_TX(prod);
 	txr->tx_prod = prod;
+	return tx_buf;
+}
+
+static void __bnxt_xmit_xdp(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
+			    dma_addr_t mapping, u32 len, u16 rx_prod)
+{
+	struct bnxt_sw_tx_bd *tx_buf;
+
+	tx_buf = bnxt_xmit_bd(bp, txr, mapping, len);
+	tx_buf->rx_prod = rx_prod;
+	tx_buf->action = XDP_TX;
 }
 
 void bnxt_tx_int_xdp(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts)
 {
 	struct bnxt_tx_ring_info *txr = bnapi->tx_ring;
 	struct bnxt_rx_ring_info *rxr = bnapi->rx_ring;
+	bool rx_doorbell_needed = false;
 	struct bnxt_sw_tx_bd *tx_buf;
 	u16 tx_cons = txr->tx_cons;
 	u16 last_tx_cons = tx_cons;
-	u16 rx_prod;
 	int i;
 
 	for (i = 0; i < nr_pkts; i++) {
-		last_tx_cons = tx_cons;
+		tx_buf = &txr->tx_buf_ring[tx_cons];
+
+		if (tx_buf->action == XDP_TX) {
+			rx_doorbell_needed = true;
+			last_tx_cons = tx_cons;
+		}
 		tx_cons = NEXT_TX(tx_cons);
 	}
 	txr->tx_cons = tx_cons;
-	if (bnxt_tx_avail(bp, txr) == bp->tx_ring_size) {
-		rx_prod = rxr->rx_prod;
-	} else {
+	if (rx_doorbell_needed) {
 		tx_buf = &txr->tx_buf_ring[last_tx_cons];
-		rx_prod = tx_buf->rx_prod;
+		bnxt_db_write(bp, &rxr->rx_db, tx_buf->rx_prod);
 	}
-	bnxt_db_write(bp, &rxr->rx_db, rx_prod);
 }
 
 /* returns the following:
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h
index b36087b..20e470c 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h
@@ -10,8 +10,9 @@
 #ifndef BNXT_XDP_H
 #define BNXT_XDP_H
 
-void __bnxt_xmit_xdp(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
-		     dma_addr_t mapping, u32 len, u16 rx_prod);
+struct bnxt_sw_tx_bd *bnxt_xmit_bd(struct bnxt *bp,
+				   struct bnxt_tx_ring_info *txr,
+				   dma_addr_t mapping, u32 len);
 void bnxt_tx_int_xdp(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts);
 bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons,
 		 struct page *page, u8 **data_ptr, unsigned int *len,
-- 
2.5.1


^ permalink raw reply related

* [PATCH net-next v2 0/4] bnxt_en: Add XDP_REDIRECT support.
From: Michael Chan @ 2019-07-08 21:53 UTC (permalink / raw)
  To: davem, gospo; +Cc: netdev, hawk, ast, ilias.apalodimas

This patch series adds XDP_REDIRECT support by Andy Gospodarek.

Andy Gospodarek (3):
  bnxt_en: rename some xdp functions
  bnxt_en: optimized XDP_REDIRECT support
  bnxt_en: add page_pool support

Michael Chan (1):
  bnxt_en: Refactor __bnxt_xmit_xdp().

 drivers/net/ethernet/broadcom/Kconfig             |   1 +
 drivers/net/ethernet/broadcom/bnxt/bnxt.c         |  72 ++++++++++-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h         |  17 ++-
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c |   2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c     | 144 +++++++++++++++++++---
 drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h     |   7 +-
 6 files changed, 214 insertions(+), 29 deletions(-)

-- 
2.5.1

^ permalink raw reply

* Re: [EXT] Re: [PATCH net-next v2 4/4] qed*: Add devlink support for configuration attributes.
From: Jakub Kicinski @ 2019-07-08 21:47 UTC (permalink / raw)
  To: Sudarsana Reddy Kalluru
  Cc: davem@davemloft.net, netdev@vger.kernel.org, Michal Kalderon,
	Ariel Elior, Jiri Pirko
In-Reply-To: <MN2PR18MB25280224F5DDDFE8D86B234CD3F60@MN2PR18MB2528.namprd18.prod.outlook.com>

On Mon, 8 Jul 2019 02:31:15 +0000, Sudarsana Reddy Kalluru wrote:
> > > > > +			Type: u8
> > > > > +			Configuration mode: Permanent
> > > > > +
> > > > > +dcbx_mode		[PORT, DRIVER-SPECIFIC]
> > > > > +			Configure DCBX mode for the device.
> > > > > +			Supported dcbx modes are,
> > > > > +			    Disabled(0), IEEE(1), CEE(2) and
> > > > > Dynamic(3)
> > > > > +			Type: u8
> > > > > +			Configuration mode: Permanent  
> > > >
> > > > Why is this a permanent parameter?
> > > >  
> > > This specifies the dcbx_mode to be configured in non-volatile memory.
> > > The value is persistent and is used in the next load of OS or the mfw.  
> > 
> > And it can't be changed at runtime?  
>
> Run time dcbx params are not affected via this interface, it only
> updates config on permanent storage of the port.

IOW it affects the defaults after boot?  It'd be preferable to have 
the DCB uAPI handle "persistent"/default settings if that's the case.

^ permalink raw reply

* Re: [PATCH bpf-next] selftests/bpf: make verifier loop tests arch independent
From: Ilya Leoshkevich @ 2019-07-08 21:44 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Y Song, Stanislav Fomichev, netdev, bpf, David Miller,
	Alexei Starovoitov, Daniel Borkmann
In-Reply-To: <20190708212012.GA9509@mini-arch>

> Am 08.07.2019 um 23:20 schrieb Stanislav Fomichev <sdf@fomichev.me>:
> 
> On 07/08, Ilya Leoshkevich wrote:
>> 
>> 
>>> Am 08.07.2019 um 18:13 schrieb Stanislav Fomichev <sdf@fomichev.me>:
>>> 
>>> On 07/03, Y Song wrote:
>>>> On Wed, Jul 3, 2019 at 1:51 PM Stanislav Fomichev <sdf@google.com> wrote:
>>>>> 
>>>>> Take the first x bytes of pt_regs for scalability tests, there is
>>>>> no real reason we need x86 specific rax.
>>>>> 
>>>>> Signed-off-by: Stanislav Fomichev <sdf@google.com>
>>>>> ---
>>>>> tools/testing/selftests/bpf/progs/loop1.c | 3 ++-
>>>>> tools/testing/selftests/bpf/progs/loop2.c | 3 ++-
>>>>> tools/testing/selftests/bpf/progs/loop3.c | 3 ++-
>>>>> 3 files changed, 6 insertions(+), 3 deletions(-)
>>>>> 
>>>>> diff --git a/tools/testing/selftests/bpf/progs/loop1.c b/tools/testing/selftests/bpf/progs/loop1.c
>>>>> index dea395af9ea9..d530c61d2517 100644
>>>>> --- a/tools/testing/selftests/bpf/progs/loop1.c
>>>>> +++ b/tools/testing/selftests/bpf/progs/loop1.c
>>>>> @@ -14,11 +14,12 @@ SEC("raw_tracepoint/kfree_skb")
>>>>> int nested_loops(volatile struct pt_regs* ctx)
>>>>> {
>>>>>       int i, j, sum = 0, m;
>>>>> +       volatile int *any_reg = (volatile int *)ctx;
>>>>> 
>>>>>       for (j = 0; j < 300; j++)
>>>>>               for (i = 0; i < j; i++) {
>>>>>                       if (j & 1)
>>>>> -                               m = ctx->rax;
>>>>> +                               m = *any_reg;
>>>> 
>>>> I agree. ctx->rax here is only to generate some operations, which
>>>> cannot be optimized away by the compiler. dereferencing a volatile
>>>> pointee may just serve that purpose.
>>>> 
>>>> Comparing the byte code generated with ctx->rax and *any_reg, they are
>>>> slightly different. Using *any_reg is slighly worse, but this should
>>>> be still okay for the test.
>>>> 
>>>>>                       else
>>>>>                               m = j;
>>>>>                       sum += i * m;
>>>>> diff --git a/tools/testing/selftests/bpf/progs/loop2.c b/tools/testing/selftests/bpf/progs/loop2.c
>>>>> index 0637bd8e8bcf..91bb89d901e3 100644
>>>>> --- a/tools/testing/selftests/bpf/progs/loop2.c
>>>>> +++ b/tools/testing/selftests/bpf/progs/loop2.c
>>>>> @@ -14,9 +14,10 @@ SEC("raw_tracepoint/consume_skb")
>>>>> int while_true(volatile struct pt_regs* ctx)
>>>>> {
>>>>>       int i = 0;
>>>>> +       volatile int *any_reg = (volatile int *)ctx;
>>>>> 
>>>>>       while (true) {
>>>>> -               if (ctx->rax & 1)
>>>>> +               if (*any_reg & 1)
>>>>>                       i += 3;
>>>>>               else
>>>>>                       i += 7;
>>>>> diff --git a/tools/testing/selftests/bpf/progs/loop3.c b/tools/testing/selftests/bpf/progs/loop3.c
>>>>> index 30a0f6cba080..3a7f12d7186c 100644
>>>>> --- a/tools/testing/selftests/bpf/progs/loop3.c
>>>>> +++ b/tools/testing/selftests/bpf/progs/loop3.c
>>>>> @@ -14,9 +14,10 @@ SEC("raw_tracepoint/consume_skb")
>>>>> int while_true(volatile struct pt_regs* ctx)
>>>>> {
>>>>>       __u64 i = 0, sum = 0;
>>>>> +       volatile __u64 *any_reg = (volatile __u64 *)ctx;
>>>>>       do {
>>>>>               i++;
>>>>> -               sum += ctx->rax;
>>>>> +               sum += *any_reg;
>>>>>       } while (i < 0x100000000ULL);
>>>>>       return sum;
>>>>> }
>>>>> --
>>>>> 2.22.0.410.gd8fdbe21b5-goog
>>>> 
>>>> Ilya Leoshkevich (iii@linux.ibm.com, cc'ed) has another patch set
>>>> trying to solve this problem by introducing s360 arch register access
>>>> macros. I guess for now that patch set is not needed any more?
>>> Oh, I missed them. Do they fix the tests for other (non-s360) arches as
>>> well? I was trying to fix the issue by not depending on any arch
>>> specific stuff because the test really doesn't care :-)
>> 
>> They are supposed to work for everything that defines PT_REGS_RC in
>> bpf_helpers.h, but I have to admit I tested only x86_64 and s390.
>> 
>> The main source of problems with my approach were mismatching definitions
>> of struct pt_regs for userspace and kernel, and because of that there was
>> some tweaking required for both arches. I will double check how it looks
>> for others (arm, mips, ppc, sparc) tomorrow.
> Thanks, I've tested your patches and they fix my issue as well. So you
> can have my Tested-by if we'd go with your approach.
> 
> One thing I don't understand is: why do you add 'ifdef __KERNEL__' to
> the bpf_helpers.h for x86 case? Who is using bpf_helpers.h with
> __KERNEL__ defined? Is it perf?

That’s samples/bpf. Also, there is a modified copy of it in bcc
(src/cc/export/helpers.h), which also gets built with __KERNEL__.

Best regards,
Ilya

^ permalink raw reply

* Re: [PATCH net-next,v3 00/11] netfilter: add hardware offload infrastructure
From: David Miller @ 2019-07-08 21:39 UTC (permalink / raw)
  To: pablo
  Cc: netdev, thomas.lendacky, f.fainelli, ariel.elior, michael.chan,
	madalin.bucur, yisen.zhuang, salil.mehta, jeffrey.t.kirsher,
	tariqt, saeedm, jiri, idosch, jakub.kicinski, peppe.cavallaro,
	grygorii.strashko, andrew, vivien.didelot, alexandre.torgue,
	joabreu, linux-net-drivers, ogerlitz, Manish.Chopra,
	marcelo.leitner, mkubecek, venkatkumar.duvvuru, maxime.chevallier,
	cphealy, netfilter-devel
In-Reply-To: <20190708160614.2226-1-pablo@netfilter.org>

From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon,  8 Jul 2019 18:06:02 +0200

> This patchset adds support for Netfilter hardware offloads.

Pablo if you can address Jiri's feedback fully and repost this by
Tuesday (PST timezone) I will allow it into this merge window.

Thank you.

^ permalink raw reply

* Re: [PATCH V5 net-next 4/6] dt-bindings: ptp: Introduce MII time stamping devices.
From: Rob Herring @ 2019-07-08 21:38 UTC (permalink / raw)
  To: Richard Cochran
  Cc: netdev, David Miller, devicetree, Andrew Lunn, Florian Fainelli,
	Jacob Keller, Mark Rutland, Miroslav Lichvar, Willem de Bruijn
In-Reply-To: <d786656435c64160d50014beb3d3d9d1aaf6f22d.1559281985.git.richardcochran@gmail.com>

On Thu, May 30, 2019 at 10:56:24PM -0700, Richard Cochran wrote:
> This patch add a new binding that allows non-PHY MII time stamping
> devices to find their buses.  The new documentation covers both the
> generic binding and one upcoming user.
> 
> Signed-off-by: Richard Cochran <richardcochran@gmail.com>
> Reviewed-by: Andrew Lunn <andrew@lunn.ch>
> ---
>  Documentation/devicetree/bindings/ptp/ptp-ines.txt | 35 ++++++++++++++++++
>  .../devicetree/bindings/ptp/timestamper.txt        | 41 ++++++++++++++++++++++
>  2 files changed, 76 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/ptp/ptp-ines.txt
>  create mode 100644 Documentation/devicetree/bindings/ptp/timestamper.txt
> 
> diff --git a/Documentation/devicetree/bindings/ptp/ptp-ines.txt b/Documentation/devicetree/bindings/ptp/ptp-ines.txt
> new file mode 100644
> index 000000000000..4dee9eb89455
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/ptp/ptp-ines.txt
> @@ -0,0 +1,35 @@
> +ZHAW InES PTP time stamping IP core
> +
> +The IP core needs two different kinds of nodes.  The control node
> +lives somewhere in the memory map and specifies the address of the
> +control registers.  There can be up to three port handles placed as
> +attributes of PHY nodes.  These associate a particular MII bus with a
> +port index within the IP core.
> +
> +Required properties of the control node:
> +
> +- compatible:		"ines,ptp-ctrl"

This is an IP block that gets integrated into SoCs? It's not very 
specific given that there could be different versions of the IP block 
and SoC vendors can integrate various versions of the IP block in their 
own unique (i.e. buggy) way.

> +- reg:			physical address and size of the register bank
> +
> +Required format of the port handle within the PHY node:
> +
> +- timestamper:		provides control node reference and
> +			the port channel within the IP core
> +
> +Example:
> +
> +	tstamper: timestamper@60000000 {
> +		compatible = "ines,ptp-ctrl";
> +		reg = <0x60000000 0x80>;
> +	};
> +
> +	ethernet@80000000 {
> +		...
> +		mdio {
> +			...
> +			phy@3 {

ethernet-phy is the correct node name.

> +				...
> +				timestamper = <&tstamper 0>;
> +			};
> +		};
> +	};
> diff --git a/Documentation/devicetree/bindings/ptp/timestamper.txt b/Documentation/devicetree/bindings/ptp/timestamper.txt
> new file mode 100644
> index 000000000000..88ea0bc7d662
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/ptp/timestamper.txt
> @@ -0,0 +1,41 @@
> +Time stamps from MII bus snooping devices
> +
> +This binding supports non-PHY devices that snoop the MII bus and
> +provide time stamps.  In contrast to PHY time stamping drivers (which
> +can simply attach their interface directly to the PHY instance), stand
> +alone MII time stamping drivers use this binding to specify the
> +connection between the snooping device and a given network interface.
> +
> +Non-PHY MII time stamping drivers typically talk to the control
> +interface over another bus like I2C, SPI, UART, or via a memory mapped
> +peripheral.  This controller device is associated with one or more
> +time stamping channels, each of which snoops on a MII bus.
> +
> +The "timestamper" property lives in a phy node and links a time
> +stamping channel from the controller device to that phy's MII bus.
> +
> +Example:
> +
> +	tstamper: timestamper@10000000 {
> +		compatible = "bigcorp,ts-ctrl";

Would be better to use a real example here.

> +	};
> +
> +	ethernet@20000000 {
> +		mdio {
> +			phy@1 {
> +				timestamper = <&tstamper 0>;
> +			};
> +		};
> +	};
> +
> +	ethernet@30000000 {
> +		mdio {
> +			phy@2 {
> +				timestamper = <&tstamper 1>;
> +			};
> +		};
> +	};
> +
> +In this example, time stamps from the MII bus attached to phy@1 will
> +appear on time stamp channel 0 (zero), and those from phy@2 appear on
> +channel 1.
> -- 
> 2.11.0
> 

^ permalink raw reply

* [PATCH v9 net-next 1/5] net: core: page_pool: add user refcnt and reintroduce page_pool_destroy
From: Ivan Khoronzhuk @ 2019-07-08 21:34 UTC (permalink / raw)
  To: grygorii.strashko, hawk, davem
  Cc: ast, linux-kernel, linux-omap, xdp-newbies, ilias.apalodimas,
	netdev, daniel, jakub.kicinski, john.fastabend, Ivan Khoronzhuk,
	Jesper Dangaard Brouer
In-Reply-To: <20190708213432.8525-1-ivan.khoronzhuk@linaro.org>

Jesper recently removed page_pool_destroy() (from driver invocation)
and moved shutdown and free of page_pool into xdp_rxq_info_unreg(),
in-order to handle in-flight packets/pages. This created an asymmetry
in drivers create/destroy pairs.

This patch reintroduce page_pool_destroy and add page_pool user
refcnt. This serves the purpose to simplify drivers error handling as
driver now drivers always calls page_pool_destroy() and don't need to
track if xdp_rxq_info_reg_mem_model() was unsuccessful.

This could be used for a special cases where a single RX-queue (with a
single page_pool) provides packets for two net_device'es, and thus
needs to register the same page_pool twice with two xdp_rxq_info
structures.

This patch is primarily to ease API usage for drivers. The recently
merged netsec driver, actually have a bug in this area, which is
solved by this API change.

This patch is a modified version of Ivan Khoronzhuk's original patch.

Link: https://lore.kernel.org/netdev/20190625175948.24771-2-ivan.khoronzhuk@linaro.org/
Fixes: 5c67bf0ec4d0 ("net: netsec: Use page_pool API")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
---
 .../net/ethernet/mellanox/mlx5/core/en_main.c |  4 +--
 drivers/net/ethernet/socionext/netsec.c       |  8 ++----
 include/net/page_pool.h                       | 25 +++++++++++++++++++
 net/core/page_pool.c                          |  8 ++++++
 net/core/xdp.c                                |  3 +++
 5 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 83194d56434d..10efd69de7ef 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -577,8 +577,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		}
 		err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
 						 MEM_TYPE_PAGE_POOL, rq->page_pool);
-		if (err)
-			page_pool_free(rq->page_pool);
 	}
 	if (err)
 		goto err_free;
@@ -646,6 +644,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 	if (rq->xdp_prog)
 		bpf_prog_put(rq->xdp_prog);
 	xdp_rxq_info_unreg(&rq->xdp_rxq);
+	page_pool_destroy(rq->page_pool);
 	mlx5_wq_destroy(&rq->wq_ctrl);
 
 	return err;
@@ -680,6 +679,7 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
 	}
 
 	xdp_rxq_info_unreg(&rq->xdp_rxq);
+	page_pool_destroy(rq->page_pool);
 	mlx5_wq_destroy(&rq->wq_ctrl);
 }
 
diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c
index 460777449cd9..d7307ab90d74 100644
--- a/drivers/net/ethernet/socionext/netsec.c
+++ b/drivers/net/ethernet/socionext/netsec.c
@@ -1212,15 +1212,11 @@ static void netsec_uninit_pkt_dring(struct netsec_priv *priv, int id)
 		}
 	}
 
-	/* Rx is currently using page_pool
-	 * since the pool is created during netsec_setup_rx_dring(), we need to
-	 * free the pool manually if the registration failed
-	 */
+	/* Rx is currently using page_pool */
 	if (id == NETSEC_RING_RX) {
 		if (xdp_rxq_info_is_reg(&dring->xdp_rxq))
 			xdp_rxq_info_unreg(&dring->xdp_rxq);
-		else
-			page_pool_free(dring->page_pool);
+		page_pool_destroy(dring->page_pool);
 	}
 
 	memset(dring->desc, 0, sizeof(struct netsec_desc) * DESC_NUM);
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index ee9c871d2043..2cbcdbdec254 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -101,6 +101,12 @@ struct page_pool {
 	struct ptr_ring ring;
 
 	atomic_t pages_state_release_cnt;
+
+	/* A page_pool is strictly tied to a single RX-queue being
+	 * protected by NAPI, due to above pp_alloc_cache. This
+	 * refcnt serves purpose is to simplify drivers error handling.
+	 */
+	refcount_t user_cnt;
 };
 
 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
@@ -134,6 +140,15 @@ static inline void page_pool_free(struct page_pool *pool)
 #endif
 }
 
+/* Drivers use this instead of page_pool_free */
+static inline void page_pool_destroy(struct page_pool *pool)
+{
+	if (!pool)
+		return;
+
+	page_pool_free(pool);
+}
+
 /* Never call this directly, use helpers below */
 void __page_pool_put_page(struct page_pool *pool,
 			  struct page *page, bool allow_direct);
@@ -201,4 +216,14 @@ static inline bool is_page_pool_compiled_in(void)
 #endif
 }
 
+static inline void page_pool_get(struct page_pool *pool)
+{
+	refcount_inc(&pool->user_cnt);
+}
+
+static inline bool page_pool_put(struct page_pool *pool)
+{
+	return refcount_dec_and_test(&pool->user_cnt);
+}
+
 #endif /* _NET_PAGE_POOL_H */
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index b366f59885c1..3272dc7a8c81 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -49,6 +49,9 @@ static int page_pool_init(struct page_pool *pool,
 
 	atomic_set(&pool->pages_state_release_cnt, 0);
 
+	/* Driver calling page_pool_create() also call page_pool_destroy() */
+	refcount_set(&pool->user_cnt, 1);
+
 	if (pool->p.flags & PP_FLAG_DMA_MAP)
 		get_device(pool->p.dev);
 
@@ -70,6 +73,7 @@ struct page_pool *page_pool_create(const struct page_pool_params *params)
 		kfree(pool);
 		return ERR_PTR(err);
 	}
+
 	return pool;
 }
 EXPORT_SYMBOL(page_pool_create);
@@ -356,6 +360,10 @@ static void __warn_in_flight(struct page_pool *pool)
 
 void __page_pool_free(struct page_pool *pool)
 {
+	/* Only last user actually free/release resources */
+	if (!page_pool_put(pool))
+		return;
+
 	WARN(pool->alloc.count, "API usage violation");
 	WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
 
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 829377cc83db..d7bf62ffbb5e 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -370,6 +370,9 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
 		goto err;
 	}
 
+	if (type == MEM_TYPE_PAGE_POOL)
+		page_pool_get(xdp_alloc->page_pool);
+
 	mutex_unlock(&mem_id_lock);
 
 	trace_mem_connect(xdp_alloc, xdp_rxq);
-- 
2.17.1


^ permalink raw reply related

* [PATCH v9 net-next 3/5] net: ethernet: ti: davinci_cpdma: allow desc split while down
From: Ivan Khoronzhuk @ 2019-07-08 21:34 UTC (permalink / raw)
  To: grygorii.strashko, hawk, davem
  Cc: ast, linux-kernel, linux-omap, xdp-newbies, ilias.apalodimas,
	netdev, daniel, jakub.kicinski, john.fastabend, Ivan Khoronzhuk
In-Reply-To: <20190708213432.8525-1-ivan.khoronzhuk@linaro.org>

That's possible to set ring params while interfaces are down. When
interface gets up it uses number of descs to fill rx queue and on
later on changes to create rx pools. Usually, this resplit can happen
after phy is up, but it can be needed before this, so allow it to
happen while setting number of rx descs, when interfaces are down.
Also, if no dependency on intf state, move it to cpdma layer, where
it should be.

Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
---
 drivers/net/ethernet/ti/cpsw_ethtool.c  | 17 +++++++++++------
 drivers/net/ethernet/ti/davinci_cpdma.c | 17 ++++++++++++++++-
 drivers/net/ethernet/ti/davinci_cpdma.h |  3 +--
 3 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw_ethtool.c b/drivers/net/ethernet/ti/cpsw_ethtool.c
index f60dc1dfc443..c477e6b620d6 100644
--- a/drivers/net/ethernet/ti/cpsw_ethtool.c
+++ b/drivers/net/ethernet/ti/cpsw_ethtool.c
@@ -650,7 +650,7 @@ int cpsw_set_ringparam(struct net_device *ndev,
 {
 	struct cpsw_priv *priv = netdev_priv(ndev);
 	struct cpsw_common *cpsw = priv->cpsw;
-	int ret;
+	int descs_num, ret;
 
 	/* ignore ering->tx_pending - only rx_pending adjustment is supported */
 
@@ -659,20 +659,25 @@ int cpsw_set_ringparam(struct net_device *ndev,
 	    ering->rx_pending > (cpsw->descs_pool_size - CPSW_MAX_QUEUES))
 		return -EINVAL;
 
-	if (ering->rx_pending == cpdma_get_num_rx_descs(cpsw->dma))
+	descs_num = cpdma_get_num_rx_descs(cpsw->dma);
+	if (ering->rx_pending == descs_num)
 		return 0;
 
 	cpsw_suspend_data_pass(ndev);
 
-	cpdma_set_num_rx_descs(cpsw->dma, ering->rx_pending);
+	ret = cpdma_set_num_rx_descs(cpsw->dma, ering->rx_pending);
+	if (ret) {
+		if (cpsw_resume_data_pass(ndev))
+			goto err;
 
-	if (cpsw->usage_count)
-		cpdma_chan_split_pool(cpsw->dma);
+		return ret;
+	}
 
 	ret = cpsw_resume_data_pass(ndev);
 	if (!ret)
 		return 0;
-
+err:
+	cpdma_set_num_rx_descs(cpsw->dma, descs_num);
 	dev_err(cpsw->dev, "cannot set ring params, closing device\n");
 	dev_close(ndev);
 	return ret;
diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c b/drivers/net/ethernet/ti/davinci_cpdma.c
index 4e693c3aab27..0ca2a1a254de 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c
@@ -1423,8 +1423,23 @@ int cpdma_get_num_tx_descs(struct cpdma_ctlr *ctlr)
 	return ctlr->num_tx_desc;
 }
 
-void cpdma_set_num_rx_descs(struct cpdma_ctlr *ctlr, int num_rx_desc)
+int cpdma_set_num_rx_descs(struct cpdma_ctlr *ctlr, int num_rx_desc)
 {
+	unsigned long flags;
+	int temp, ret;
+
+	spin_lock_irqsave(&ctlr->lock, flags);
+
+	temp = ctlr->num_rx_desc;
 	ctlr->num_rx_desc = num_rx_desc;
 	ctlr->num_tx_desc = ctlr->pool->num_desc - ctlr->num_rx_desc;
+	ret = cpdma_chan_split_pool(ctlr);
+	if (ret) {
+		ctlr->num_rx_desc = temp;
+		ctlr->num_tx_desc = ctlr->pool->num_desc - ctlr->num_rx_desc;
+	}
+
+	spin_unlock_irqrestore(&ctlr->lock, flags);
+
+	return ret;
 }
diff --git a/drivers/net/ethernet/ti/davinci_cpdma.h b/drivers/net/ethernet/ti/davinci_cpdma.h
index 0271a20c2e09..d3cfe234d16a 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.h
+++ b/drivers/net/ethernet/ti/davinci_cpdma.h
@@ -116,8 +116,7 @@ enum cpdma_control {
 int cpdma_control_get(struct cpdma_ctlr *ctlr, int control);
 int cpdma_control_set(struct cpdma_ctlr *ctlr, int control, int value);
 int cpdma_get_num_rx_descs(struct cpdma_ctlr *ctlr);
-void cpdma_set_num_rx_descs(struct cpdma_ctlr *ctlr, int num_rx_desc);
+int cpdma_set_num_rx_descs(struct cpdma_ctlr *ctlr, int num_rx_desc);
 int cpdma_get_num_tx_descs(struct cpdma_ctlr *ctlr);
-int cpdma_chan_split_pool(struct cpdma_ctlr *ctlr);
 
 #endif
-- 
2.17.1


^ permalink raw reply related

* [PATCH v9 net-next 2/5] net: ethernet: ti: davinci_cpdma: add dma mapped submit
From: Ivan Khoronzhuk @ 2019-07-08 21:34 UTC (permalink / raw)
  To: grygorii.strashko, hawk, davem
  Cc: ast, linux-kernel, linux-omap, xdp-newbies, ilias.apalodimas,
	netdev, daniel, jakub.kicinski, john.fastabend, Ivan Khoronzhuk
In-Reply-To: <20190708213432.8525-1-ivan.khoronzhuk@linaro.org>

In case if dma mapped packet needs to be sent, like with XDP
page pool, the "mapped" submit can be used. This patch adds dma
mapped submit based on regular one.

Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
---

v8..v9:
- fix warnings on arm64 caused by typos in type casting

 drivers/net/ethernet/ti/davinci_cpdma.c | 89 ++++++++++++++++++++++---
 drivers/net/ethernet/ti/davinci_cpdma.h |  4 ++
 2 files changed, 83 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c b/drivers/net/ethernet/ti/davinci_cpdma.c
index 5cf1758d425b..4e693c3aab27 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c
@@ -139,6 +139,7 @@ struct submit_info {
 	int directed;
 	void *token;
 	void *data;
+	int flags;
 	int len;
 };
 
@@ -184,6 +185,8 @@ static struct cpdma_control_info controls[] = {
 				 (directed << CPDMA_TO_PORT_SHIFT));	\
 	} while (0)
 
+#define CPDMA_DMA_EXT_MAP		BIT(16)
+
 static void cpdma_desc_pool_destroy(struct cpdma_ctlr *ctlr)
 {
 	struct cpdma_desc_pool *pool = ctlr->pool;
@@ -1015,6 +1018,7 @@ static int cpdma_chan_submit_si(struct submit_info *si)
 	struct cpdma_chan		*chan = si->chan;
 	struct cpdma_ctlr		*ctlr = chan->ctlr;
 	int				len = si->len;
+	int				swlen = len;
 	struct cpdma_desc __iomem	*desc;
 	dma_addr_t			buffer;
 	u32				mode;
@@ -1036,16 +1040,22 @@ static int cpdma_chan_submit_si(struct submit_info *si)
 		chan->stats.runt_transmit_buff++;
 	}
 
-	buffer = dma_map_single(ctlr->dev, si->data, len, chan->dir);
-	ret = dma_mapping_error(ctlr->dev, buffer);
-	if (ret) {
-		cpdma_desc_free(ctlr->pool, desc, 1);
-		return -EINVAL;
-	}
-
 	mode = CPDMA_DESC_OWNER | CPDMA_DESC_SOP | CPDMA_DESC_EOP;
 	cpdma_desc_to_port(chan, mode, si->directed);
 
+	if (si->flags & CPDMA_DMA_EXT_MAP) {
+		buffer = (dma_addr_t)si->data;
+		dma_sync_single_for_device(ctlr->dev, buffer, len, chan->dir);
+		swlen |= CPDMA_DMA_EXT_MAP;
+	} else {
+		buffer = dma_map_single(ctlr->dev, si->data, len, chan->dir);
+		ret = dma_mapping_error(ctlr->dev, buffer);
+		if (ret) {
+			cpdma_desc_free(ctlr->pool, desc, 1);
+			return -EINVAL;
+		}
+	}
+
 	/* Relaxed IO accessors can be used here as there is read barrier
 	 * at the end of write sequence.
 	 */
@@ -1055,7 +1065,7 @@ static int cpdma_chan_submit_si(struct submit_info *si)
 	writel_relaxed(mode | len, &desc->hw_mode);
 	writel_relaxed((uintptr_t)si->token, &desc->sw_token);
 	writel_relaxed(buffer, &desc->sw_buffer);
-	writel_relaxed(len, &desc->sw_len);
+	writel_relaxed(swlen, &desc->sw_len);
 	desc_read(desc, sw_len);
 
 	__cpdma_chan_submit(chan, desc);
@@ -1079,6 +1089,32 @@ int cpdma_chan_idle_submit(struct cpdma_chan *chan, void *token, void *data,
 	si.data = data;
 	si.len = len;
 	si.directed = directed;
+	si.flags = 0;
+
+	spin_lock_irqsave(&chan->lock, flags);
+	if (chan->state == CPDMA_STATE_TEARDOWN) {
+		spin_unlock_irqrestore(&chan->lock, flags);
+		return -EINVAL;
+	}
+
+	ret = cpdma_chan_submit_si(&si);
+	spin_unlock_irqrestore(&chan->lock, flags);
+	return ret;
+}
+
+int cpdma_chan_idle_submit_mapped(struct cpdma_chan *chan, void *token,
+				  dma_addr_t data, int len, int directed)
+{
+	struct submit_info si;
+	unsigned long flags;
+	int ret;
+
+	si.chan = chan;
+	si.token = token;
+	si.data = (void *)data;
+	si.len = len;
+	si.directed = directed;
+	si.flags = CPDMA_DMA_EXT_MAP;
 
 	spin_lock_irqsave(&chan->lock, flags);
 	if (chan->state == CPDMA_STATE_TEARDOWN) {
@@ -1103,6 +1139,32 @@ int cpdma_chan_submit(struct cpdma_chan *chan, void *token, void *data,
 	si.data = data;
 	si.len = len;
 	si.directed = directed;
+	si.flags = 0;
+
+	spin_lock_irqsave(&chan->lock, flags);
+	if (chan->state != CPDMA_STATE_ACTIVE) {
+		spin_unlock_irqrestore(&chan->lock, flags);
+		return -EINVAL;
+	}
+
+	ret = cpdma_chan_submit_si(&si);
+	spin_unlock_irqrestore(&chan->lock, flags);
+	return ret;
+}
+
+int cpdma_chan_submit_mapped(struct cpdma_chan *chan, void *token,
+			     dma_addr_t data, int len, int directed)
+{
+	struct submit_info si;
+	unsigned long flags;
+	int ret;
+
+	si.chan = chan;
+	si.token = token;
+	si.data = (void *)data;
+	si.len = len;
+	si.directed = directed;
+	si.flags = CPDMA_DMA_EXT_MAP;
 
 	spin_lock_irqsave(&chan->lock, flags);
 	if (chan->state != CPDMA_STATE_ACTIVE) {
@@ -1140,10 +1202,17 @@ static void __cpdma_chan_free(struct cpdma_chan *chan,
 	uintptr_t			token;
 
 	token      = desc_read(desc, sw_token);
-	buff_dma   = desc_read(desc, sw_buffer);
 	origlen    = desc_read(desc, sw_len);
 
-	dma_unmap_single(ctlr->dev, buff_dma, origlen, chan->dir);
+	buff_dma   = desc_read(desc, sw_buffer);
+	if (origlen & CPDMA_DMA_EXT_MAP) {
+		origlen &= ~CPDMA_DMA_EXT_MAP;
+		dma_sync_single_for_cpu(ctlr->dev, buff_dma, origlen,
+					chan->dir);
+	} else {
+		dma_unmap_single(ctlr->dev, buff_dma, origlen, chan->dir);
+	}
+
 	cpdma_desc_free(pool, desc, 1);
 	(*chan->handler)((void *)token, outlen, status);
 }
diff --git a/drivers/net/ethernet/ti/davinci_cpdma.h b/drivers/net/ethernet/ti/davinci_cpdma.h
index 9343c8c73c1b..0271a20c2e09 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.h
+++ b/drivers/net/ethernet/ti/davinci_cpdma.h
@@ -77,8 +77,12 @@ int cpdma_chan_stop(struct cpdma_chan *chan);
 
 int cpdma_chan_get_stats(struct cpdma_chan *chan,
 			 struct cpdma_chan_stats *stats);
+int cpdma_chan_submit_mapped(struct cpdma_chan *chan, void *token,
+			     dma_addr_t data, int len, int directed);
 int cpdma_chan_submit(struct cpdma_chan *chan, void *token, void *data,
 		      int len, int directed);
+int cpdma_chan_idle_submit_mapped(struct cpdma_chan *chan, void *token,
+				  dma_addr_t data, int len, int directed);
 int cpdma_chan_idle_submit(struct cpdma_chan *chan, void *token, void *data,
 			   int len, int directed);
 int cpdma_chan_process(struct cpdma_chan *chan, int quota);
-- 
2.17.1


^ permalink raw reply related

* [PATCH v9 net-next 4/5] net: ethernet: ti: cpsw_ethtool: allow res split while down
From: Ivan Khoronzhuk @ 2019-07-08 21:34 UTC (permalink / raw)
  To: grygorii.strashko, hawk, davem
  Cc: ast, linux-kernel, linux-omap, xdp-newbies, ilias.apalodimas,
	netdev, daniel, jakub.kicinski, john.fastabend, Ivan Khoronzhuk
In-Reply-To: <20190708213432.8525-1-ivan.khoronzhuk@linaro.org>

That's possible to set channel num while interfaces are down. When
interface gets up it should resplit budget. This resplit can happen
after phy is up but only if speed is changed, so should be set before
this, for this allow it to happen while changing number of channels,
when interfaces are down.

Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
---
 drivers/net/ethernet/ti/cpsw_ethtool.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw_ethtool.c b/drivers/net/ethernet/ti/cpsw_ethtool.c
index c477e6b620d6..e4d7185fde49 100644
--- a/drivers/net/ethernet/ti/cpsw_ethtool.c
+++ b/drivers/net/ethernet/ti/cpsw_ethtool.c
@@ -620,8 +620,7 @@ int cpsw_set_channels_common(struct net_device *ndev,
 		}
 	}
 
-	if (cpsw->usage_count)
-		cpsw_split_res(cpsw);
+	cpsw_split_res(cpsw);
 
 	ret = cpsw_resume_data_pass(ndev);
 	if (!ret)
-- 
2.17.1


^ permalink raw reply related

* Re: [PATCH net-next 0/4] bnxt_en: Add XDP_REDIRECT support.
From: David Miller @ 2019-07-08 21:34 UTC (permalink / raw)
  To: michael.chan; +Cc: gospo, netdev, hawk, ast
In-Reply-To: <1562398578-26020-1-git-send-email-michael.chan@broadcom.com>

From: Michael Chan <michael.chan@broadcom.com>
Date: Sat,  6 Jul 2019 03:36:14 -0400

> This patch series adds XDP_REDIRECT support by Andy Gospodarek.
> 
> Andy Gospodarek (3):
>   bnxt_en: rename some xdp functions
>   bnxt_en: optimized XDP_REDIRECT support
>   bnxt_en: add page_pool support
> 
> Michael Chan (1):
>   bnxt_en: Refactor __bnxt_xmit_xdp().

Andy et al. please respin this ASAP with the feedback you've received
and I will apply it.

Thank you.

^ permalink raw reply

* [PATCH v9 net-next 5/5] net: ethernet: ti: cpsw: add XDP support
From: Ivan Khoronzhuk @ 2019-07-08 21:34 UTC (permalink / raw)
  To: grygorii.strashko, hawk, davem
  Cc: ast, linux-kernel, linux-omap, xdp-newbies, ilias.apalodimas,
	netdev, daniel, jakub.kicinski, john.fastabend, Ivan Khoronzhuk
In-Reply-To: <20190708213432.8525-1-ivan.khoronzhuk@linaro.org>

Add XDP support based on rx page_pool allocator, one frame per page.
Page pool allocator is used with assumption that only one rx_handler
is running simultaneously. DMA map/unmap is reused from page pool
despite there is no need to map whole page.

Due to specific of cpsw, the same TX/RX handler can be used by 2
network devices, so special fields in buffer are added to identify
an interface the frame is destined to. Thus XDP works for both
interfaces, that allows to test xdp redirect between two interfaces
easily. Also, each rx queue have own page pools, but common for both
netdevs.

XDP prog is common for all channels till appropriate changes are added
in XDP infrastructure. Also, once page_pool recycling becomes part of
skb netstack some simplifications can be added, like removing
page_pool_release_page() before skb receive.

In order to keep rx_dev while redirect, that can be somehow used in
future, do flush in rx_handler, that allows to keep rx dev the same
while redirect. It allows to conform with tracing rx_dev pointed
by Jesper.

Also, there is probability, that XDP generic code can be extended to
support multi ndev drivers like this one, using same rx queue for
several ndevs, based on switchdev for instance or else. In this case,
driver can be modified like exposed here:
https://lkml.org/lkml/2019/7/3/243

Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
---
 drivers/net/ethernet/ti/Kconfig        |   1 +
 drivers/net/ethernet/ti/cpsw.c         | 502 ++++++++++++++++++++++---
 drivers/net/ethernet/ti/cpsw_ethtool.c |  37 +-
 drivers/net/ethernet/ti/cpsw_priv.h    |   7 +
 4 files changed, 488 insertions(+), 59 deletions(-)

diff --git a/drivers/net/ethernet/ti/Kconfig b/drivers/net/ethernet/ti/Kconfig
index a800d3417411..834afca3a019 100644
--- a/drivers/net/ethernet/ti/Kconfig
+++ b/drivers/net/ethernet/ti/Kconfig
@@ -50,6 +50,7 @@ config TI_CPSW
 	depends on ARCH_DAVINCI || ARCH_OMAP2PLUS || COMPILE_TEST
 	select TI_DAVINCI_MDIO
 	select MFD_SYSCON
+	select PAGE_POOL
 	select REGMAP
 	---help---
 	  This driver supports TI's CPSW Ethernet Switch.
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 32b7b3b74a6b..f16aefd8870b 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -31,6 +31,10 @@
 #include <linux/if_vlan.h>
 #include <linux/kmemleak.h>
 #include <linux/sys_soc.h>
+#include <net/page_pool.h>
+#include <linux/bpf.h>
+#include <linux/bpf_trace.h>
+#include <linux/filter.h>
 
 #include <linux/pinctrl/consumer.h>
 #include <net/pkt_cls.h>
@@ -60,6 +64,10 @@ static int descs_pool_size = CPSW_CPDMA_DESCS_POOL_SIZE_DEFAULT;
 module_param(descs_pool_size, int, 0444);
 MODULE_PARM_DESC(descs_pool_size, "Number of CPDMA CPPI descriptors in pool");
 
+/* The buf includes headroom compatible with both skb and xdpf */
+#define CPSW_HEADROOM_NA (max(XDP_PACKET_HEADROOM, NET_SKB_PAD) + NET_IP_ALIGN)
+#define CPSW_HEADROOM  ALIGN(CPSW_HEADROOM_NA, sizeof(long))
+
 #define for_each_slave(priv, func, arg...)				\
 	do {								\
 		struct cpsw_slave *slave;				\
@@ -74,6 +82,11 @@ MODULE_PARM_DESC(descs_pool_size, "Number of CPDMA CPPI descriptors in pool");
 				(func)(slave++, ##arg);			\
 	} while (0)
 
+#define CPSW_XMETA_OFFSET	ALIGN(sizeof(struct xdp_frame), sizeof(long))
+
+#define CPSW_XDP_CONSUMED		1
+#define CPSW_XDP_PASS			0
+
 static int cpsw_ndo_vlan_rx_add_vid(struct net_device *ndev,
 				    __be16 proto, u16 vid);
 
@@ -337,24 +350,58 @@ void cpsw_intr_disable(struct cpsw_common *cpsw)
 	return;
 }
 
+static int cpsw_is_xdpf_handle(void *handle)
+{
+	return (unsigned long)handle & BIT(0);
+}
+
+static void *cpsw_xdpf_to_handle(struct xdp_frame *xdpf)
+{
+	return (void *)((unsigned long)xdpf | BIT(0));
+}
+
+static struct xdp_frame *cpsw_handle_to_xdpf(void *handle)
+{
+	return (struct xdp_frame *)((unsigned long)handle & ~BIT(0));
+}
+
+struct __aligned(sizeof(long)) cpsw_meta_xdp {
+	struct net_device *ndev;
+	int ch;
+};
+
 void cpsw_tx_handler(void *token, int len, int status)
 {
+	struct cpsw_meta_xdp	*xmeta;
+	struct xdp_frame	*xdpf;
+	struct net_device	*ndev;
 	struct netdev_queue	*txq;
-	struct sk_buff		*skb = token;
-	struct net_device	*ndev = skb->dev;
-	struct cpsw_common	*cpsw = ndev_to_cpsw(ndev);
+	struct sk_buff		*skb;
+	int			ch;
+
+	if (cpsw_is_xdpf_handle(token)) {
+		xdpf = cpsw_handle_to_xdpf(token);
+		xmeta = (void *)xdpf + CPSW_XMETA_OFFSET;
+		ndev = xmeta->ndev;
+		ch = xmeta->ch;
+		xdp_return_frame(xdpf);
+	} else {
+		skb = token;
+		ndev = skb->dev;
+		ch = skb_get_queue_mapping(skb);
+		cpts_tx_timestamp(ndev_to_cpsw(ndev)->cpts, skb);
+		dev_kfree_skb_any(skb);
+	}
 
 	/* Check whether the queue is stopped due to stalled tx dma, if the
 	 * queue is stopped then start the queue as we have free desc for tx
 	 */
-	txq = netdev_get_tx_queue(ndev, skb_get_queue_mapping(skb));
+	txq = netdev_get_tx_queue(ndev, ch);
 	if (unlikely(netif_tx_queue_stopped(txq)))
 		netif_tx_wake_queue(txq);
 
-	cpts_tx_timestamp(cpsw->cpts, skb);
 	ndev->stats.tx_packets++;
 	ndev->stats.tx_bytes += len;
-	dev_kfree_skb_any(skb);
 }
 
 static void cpsw_rx_vlan_encap(struct sk_buff *skb)
@@ -400,24 +447,252 @@ static void cpsw_rx_vlan_encap(struct sk_buff *skb)
 	}
 }
 
+static int cpsw_xdp_tx_frame(struct cpsw_priv *priv, struct xdp_frame *xdpf,
+			     struct page *page)
+{
+	struct cpsw_common *cpsw = priv->cpsw;
+	struct cpsw_meta_xdp *xmeta;
+	struct cpdma_chan *txch;
+	dma_addr_t dma;
+	int ret, port;
+
+	xmeta = (void *)xdpf + CPSW_XMETA_OFFSET;
+	xmeta->ndev = priv->ndev;
+	xmeta->ch = 0;
+	txch = cpsw->txv[0].ch;
+
+	port = priv->emac_port + cpsw->data.dual_emac;
+	if (page) {
+		dma = page_pool_get_dma_addr(page);
+		dma += xdpf->headroom + sizeof(struct xdp_frame);
+		ret = cpdma_chan_submit_mapped(txch, cpsw_xdpf_to_handle(xdpf),
+					       dma, xdpf->len, port);
+	} else {
+		if (sizeof(*xmeta) > xdpf->headroom) {
+			xdp_return_frame_rx_napi(xdpf);
+			return -EINVAL;
+		}
+
+		ret = cpdma_chan_submit(txch, cpsw_xdpf_to_handle(xdpf),
+					xdpf->data, xdpf->len, port);
+	}
+
+	if (ret) {
+		priv->ndev->stats.tx_dropped++;
+		xdp_return_frame_rx_napi(xdpf);
+	}
+
+	return ret;
+}
+
+static int cpsw_run_xdp(struct cpsw_priv *priv, int ch, struct xdp_buff *xdp,
+			struct page *page)
+{
+	struct cpsw_common *cpsw = priv->cpsw;
+	struct net_device *ndev = priv->ndev;
+	int ret = CPSW_XDP_CONSUMED;
+	struct xdp_frame *xdpf;
+	struct bpf_prog *prog;
+	u32 act;
+
+	rcu_read_lock();
+
+	prog = READ_ONCE(priv->xdp_prog);
+	if (!prog) {
+		ret = CPSW_XDP_PASS;
+		goto out;
+	}
+
+	act = bpf_prog_run_xdp(prog, xdp);
+	switch (act) {
+	case XDP_PASS:
+		ret = CPSW_XDP_PASS;
+		break;
+	case XDP_TX:
+		xdpf = convert_to_xdp_frame(xdp);
+		if (unlikely(!xdpf))
+			goto drop;
+
+		cpsw_xdp_tx_frame(priv, xdpf, page);
+		break;
+	case XDP_REDIRECT:
+		if (xdp_do_redirect(ndev, xdp, prog))
+			goto drop;
+
+		/*  Have to flush here, per packet, instead of doing it in bulk
+		 *  at the end of the napi handler. The RX devices on this
+		 *  particular hardware is sharing a common queue, so the
+		 *  incoming device might change per packet.
+		 */
+		xdp_do_flush_map();
+		break;
+	default:
+		bpf_warn_invalid_xdp_action(act);
+		/* fall through */
+	case XDP_ABORTED:
+		trace_xdp_exception(ndev, prog, act);
+		/* fall through -- handle aborts by dropping packet */
+	case XDP_DROP:
+		goto drop;
+	}
+out:
+	rcu_read_unlock();
+	return ret;
+drop:
+	rcu_read_unlock();
+	page_pool_recycle_direct(cpsw->page_pool[ch], page);
+	return ret;
+}
+
+static unsigned int cpsw_rxbuf_total_len(unsigned int len)
+{
+	len += CPSW_HEADROOM;
+	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+	return SKB_DATA_ALIGN(len);
+}
+
+static struct page_pool *cpsw_create_page_pool(struct cpsw_common *cpsw,
+					       int size)
+{
+	struct page_pool_params pp_params;
+	struct page_pool *pool;
+
+	pp_params.order = 0;
+	pp_params.flags = PP_FLAG_DMA_MAP;
+	pp_params.pool_size = size;
+	pp_params.nid = NUMA_NO_NODE;
+	pp_params.dma_dir = DMA_BIDIRECTIONAL;
+	pp_params.dev = cpsw->dev;
+
+	pool = page_pool_create(&pp_params);
+	if (IS_ERR(pool))
+		dev_err(cpsw->dev, "cannot create rx page pool\n");
+
+	return pool;
+}
+
+static int cpsw_ndev_create_xdp_rxq(struct cpsw_priv *priv, int ch)
+{
+	struct cpsw_common *cpsw = priv->cpsw;
+	struct xdp_rxq_info *rxq;
+	struct page_pool *pool;
+	int ret;
+
+	pool = cpsw->page_pool[ch];
+	rxq = &priv->xdp_rxq[ch];
+
+	ret = xdp_rxq_info_reg(rxq, priv->ndev, ch);
+	if (ret)
+		return ret;
+
+	ret = xdp_rxq_info_reg_mem_model(rxq, MEM_TYPE_PAGE_POOL, pool);
+	if (ret)
+		xdp_rxq_info_unreg(rxq);
+
+	return ret;
+}
+
+static void cpsw_ndev_destroy_xdp_rxq(struct cpsw_priv *priv, int ch)
+{
+	struct xdp_rxq_info *rxq = &priv->xdp_rxq[ch];
+
+	if (!xdp_rxq_info_is_reg(rxq))
+		return;
+
+	xdp_rxq_info_unreg(rxq);
+}
+
+static int cpsw_create_rx_pool(struct cpsw_common *cpsw, int ch)
+{
+	struct page_pool *pool;
+	int ret = 0, pool_size;
+
+	pool_size = cpdma_chan_get_rx_buf_num(cpsw->rxv[ch].ch);
+	pool = cpsw_create_page_pool(cpsw, pool_size);
+	if (IS_ERR(pool))
+		ret = PTR_ERR(pool);
+	else
+		cpsw->page_pool[ch] = pool;
+
+	return ret;
+}
+
+void cpsw_destroy_xdp_rxqs(struct cpsw_common *cpsw)
+{
+	struct net_device *ndev;
+	int i, ch;
+
+	for (ch = 0; ch < cpsw->rx_ch_num; ch++) {
+		for (i = 0; i < cpsw->data.slaves; i++) {
+			ndev = cpsw->slaves[i].ndev;
+			if (!ndev)
+				continue;
+
+			cpsw_ndev_destroy_xdp_rxq(netdev_priv(ndev), ch);
+		}
+
+		page_pool_destroy(cpsw->page_pool[ch]);
+		cpsw->page_pool[ch] = NULL;
+	}
+}
+
+int cpsw_create_xdp_rxqs(struct cpsw_common *cpsw)
+{
+	struct net_device *ndev;
+	int i, ch, ret;
+
+	for (ch = 0; ch < cpsw->rx_ch_num; ch++) {
+		ret = cpsw_create_rx_pool(cpsw, ch);
+		if (ret)
+			goto err_cleanup;
+
+		/* using same page pool is allowed as no running rx handlers
+		 * simultaneously for both ndevs
+		 */
+		for (i = 0; i < cpsw->data.slaves; i++) {
+			ndev = cpsw->slaves[i].ndev;
+			if (!ndev)
+				continue;
+
+			ret = cpsw_ndev_create_xdp_rxq(netdev_priv(ndev), ch);
+			if (ret)
+				goto err_cleanup;
+		}
+	}
+
+	return 0;
+
+err_cleanup:
+	cpsw_destroy_xdp_rxqs(cpsw);
+
+	return ret;
+}
+
 static void cpsw_rx_handler(void *token, int len, int status)
 {
-	struct cpdma_chan	*ch;
-	struct sk_buff		*skb = token;
-	struct sk_buff		*new_skb;
-	struct net_device	*ndev = skb->dev;
-	int			ret = 0, port;
-	struct cpsw_common	*cpsw = ndev_to_cpsw(ndev);
+	struct page		*new_page, *page = token;
+	void			*pa = page_address(page);
+	struct cpsw_meta_xdp	*xmeta = pa + CPSW_XMETA_OFFSET;
+	struct cpsw_common	*cpsw = ndev_to_cpsw(xmeta->ndev);
+	int			pkt_size = cpsw->rx_packet_max;
+	int			ret = 0, port, ch = xmeta->ch;
+	int			headroom = CPSW_HEADROOM;
+	struct net_device	*ndev = xmeta->ndev;
 	struct cpsw_priv	*priv;
+	struct page_pool	*pool;
+	struct sk_buff		*skb;
+	struct xdp_buff		xdp;
+	dma_addr_t		dma;
 
-	if (cpsw->data.dual_emac) {
+	if (cpsw->data.dual_emac && status >= 0) {
 		port = CPDMA_RX_SOURCE_PORT(status);
-		if (port) {
+		if (port)
 			ndev = cpsw->slaves[--port].ndev;
-			skb->dev = ndev;
-		}
 	}
 
+	priv = netdev_priv(ndev);
+	pool = cpsw->page_pool[ch];
 	if (unlikely(status < 0) || unlikely(!netif_running(ndev))) {
 		/* In dual emac mode check for all interfaces */
 		if (cpsw->data.dual_emac && cpsw->usage_count &&
@@ -426,43 +701,87 @@ static void cpsw_rx_handler(void *token, int len, int status)
 			 * is already down and the other interface is up
 			 * and running, instead of freeing which results
 			 * in reducing of the number of rx descriptor in
-			 * DMA engine, requeue skb back to cpdma.
+			 * DMA engine, requeue page back to cpdma.
 			 */
-			new_skb = skb;
+			new_page = page;
 			goto requeue;
 		}
 
-		/* the interface is going down, skbs are purged */
-		dev_kfree_skb_any(skb);
+		/* the interface is going down, pages are purged */
+		page_pool_recycle_direct(pool, page);
 		return;
 	}
 
-	new_skb = netdev_alloc_skb_ip_align(ndev, cpsw->rx_packet_max);
-	if (new_skb) {
-		skb_copy_queue_mapping(new_skb, skb);
-		skb_put(skb, len);
-		if (status & CPDMA_RX_VLAN_ENCAP)
-			cpsw_rx_vlan_encap(skb);
-		priv = netdev_priv(ndev);
-		if (priv->rx_ts_enabled)
-			cpts_rx_timestamp(cpsw->cpts, skb);
-		skb->protocol = eth_type_trans(skb, ndev);
-		netif_receive_skb(skb);
-		ndev->stats.rx_bytes += len;
-		ndev->stats.rx_packets++;
-		kmemleak_not_leak(new_skb);
-	} else {
+	new_page = page_pool_dev_alloc_pages(pool);
+	if (unlikely(!new_page)) {
+		new_page = page;
 		ndev->stats.rx_dropped++;
-		new_skb = skb;
+		goto requeue;
+	}
+
+	if (priv->xdp_prog) {
+		if (status & CPDMA_RX_VLAN_ENCAP) {
+			xdp.data = pa + CPSW_HEADROOM +
+				   CPSW_RX_VLAN_ENCAP_HDR_SIZE;
+			xdp.data_end = xdp.data + len -
+				       CPSW_RX_VLAN_ENCAP_HDR_SIZE;
+		} else {
+			xdp.data = pa + CPSW_HEADROOM;
+			xdp.data_end = xdp.data + len;
+		}
+
+		xdp_set_data_meta_invalid(&xdp);
+
+		xdp.data_hard_start = pa;
+		xdp.rxq = &priv->xdp_rxq[ch];
+
+		ret = cpsw_run_xdp(priv, ch, &xdp, page);
+		if (ret != CPSW_XDP_PASS)
+			goto requeue;
+
+		/* XDP prog might have changed packet data and boundaries */
+		len = xdp.data_end - xdp.data;
+		headroom = xdp.data - xdp.data_hard_start;
+
+		/* XDP prog can modify vlan tag, so can't use encap header */
+		status &= ~CPDMA_RX_VLAN_ENCAP;
 	}
 
+	/* pass skb to netstack if no XDP prog or returned XDP_PASS */
+	skb = build_skb(pa, cpsw_rxbuf_total_len(pkt_size));
+	if (!skb) {
+		ndev->stats.rx_dropped++;
+		page_pool_recycle_direct(pool, page);
+		goto requeue;
+	}
+
+	skb_reserve(skb, headroom);
+	skb_put(skb, len);
+	skb->dev = ndev;
+	if (status & CPDMA_RX_VLAN_ENCAP)
+		cpsw_rx_vlan_encap(skb);
+	if (priv->rx_ts_enabled)
+		cpts_rx_timestamp(cpsw->cpts, skb);
+	skb->protocol = eth_type_trans(skb, ndev);
+
+	/* unmap page as no netstack skb page recycling */
+	page_pool_release_page(pool, page);
+	netif_receive_skb(skb);
+
+	ndev->stats.rx_bytes += len;
+	ndev->stats.rx_packets++;
+
 requeue:
-	ch = cpsw->rxv[skb_get_queue_mapping(new_skb)].ch;
-	ret = cpdma_chan_submit(ch, new_skb, new_skb->data,
-				skb_tailroom(new_skb), 0);
+	xmeta = page_address(new_page) + CPSW_XMETA_OFFSET;
+	xmeta->ndev = ndev;
+	xmeta->ch = ch;
+
+	dma = page_pool_get_dma_addr(new_page) + CPSW_HEADROOM;
+	ret = cpdma_chan_submit_mapped(cpsw->rxv[ch].ch, new_page, dma,
+				       pkt_size, 0);
 	if (ret < 0) {
 		WARN_ON(ret == -ENOMEM);
-		dev_kfree_skb_any(new_skb);
+		page_pool_recycle_direct(pool, new_page);
 	}
 }
 
@@ -1032,33 +1351,39 @@ static void cpsw_init_host_port(struct cpsw_priv *priv)
 int cpsw_fill_rx_channels(struct cpsw_priv *priv)
 {
 	struct cpsw_common *cpsw = priv->cpsw;
-	struct sk_buff *skb;
+	struct cpsw_meta_xdp *xmeta;
+	struct page_pool *pool;
+	struct page *page;
 	int ch_buf_num;
 	int ch, i, ret;
+	dma_addr_t dma;
 
 	for (ch = 0; ch < cpsw->rx_ch_num; ch++) {
+		pool = cpsw->page_pool[ch];
 		ch_buf_num = cpdma_chan_get_rx_buf_num(cpsw->rxv[ch].ch);
 		for (i = 0; i < ch_buf_num; i++) {
-			skb = __netdev_alloc_skb_ip_align(priv->ndev,
-							  cpsw->rx_packet_max,
-							  GFP_KERNEL);
-			if (!skb) {
-				cpsw_err(priv, ifup, "cannot allocate skb\n");
+			page = page_pool_dev_alloc_pages(pool);
+			if (!page) {
+				cpsw_err(priv, ifup, "allocate rx page err\n");
 				return -ENOMEM;
 			}
 
-			skb_set_queue_mapping(skb, ch);
-			ret = cpdma_chan_idle_submit(cpsw->rxv[ch].ch, skb,
-						     skb->data,
-						     skb_tailroom(skb), 0);
+			xmeta = page_address(page) + CPSW_XMETA_OFFSET;
+			xmeta->ndev = priv->ndev;
+			xmeta->ch = ch;
+
+			dma = page_pool_get_dma_addr(page) + CPSW_HEADROOM;
+			ret = cpdma_chan_idle_submit_mapped(cpsw->rxv[ch].ch,
+							    page, dma,
+							    cpsw->rx_packet_max,
+							    0);
 			if (ret < 0) {
 				cpsw_err(priv, ifup,
-					 "cannot submit skb to channel %d rx, error %d\n",
+					 "cannot submit page to channel %d rx, error %d\n",
 					 ch, ret);
-				kfree_skb(skb);
+				page_pool_recycle_direct(pool, page);
 				return ret;
 			}
-			kmemleak_not_leak(skb);
 		}
 
 		cpsw_info(priv, ifup, "ch %d rx, submitted %d descriptors\n",
@@ -1394,6 +1719,13 @@ static int cpsw_ndo_open(struct net_device *ndev)
 			enable_irq(cpsw->irqs_table[0]);
 		}
 
+		/* create rxqs for both infs in dual mac as they use same pool
+		 * and must be destroyed together when no users.
+		 */
+		ret = cpsw_create_xdp_rxqs(cpsw);
+		if (ret < 0)
+			goto err_cleanup;
+
 		ret = cpsw_fill_rx_channels(priv);
 		if (ret < 0)
 			goto err_cleanup;
@@ -1422,9 +1754,10 @@ static int cpsw_ndo_open(struct net_device *ndev)
 err_cleanup:
 	if (!cpsw->usage_count) {
 		cpdma_ctlr_stop(cpsw->dma);
-		for_each_slave(priv, cpsw_slave_stop, cpsw);
+		cpsw_destroy_xdp_rxqs(cpsw);
 	}
 
+	for_each_slave(priv, cpsw_slave_stop, cpsw);
 	pm_runtime_put_sync(cpsw->dev);
 	netif_carrier_off(priv->ndev);
 	return ret;
@@ -1447,6 +1780,7 @@ static int cpsw_ndo_stop(struct net_device *ndev)
 		cpsw_intr_disable(cpsw);
 		cpdma_ctlr_stop(cpsw->dma);
 		cpsw_ale_stop(cpsw->ale);
+		cpsw_destroy_xdp_rxqs(cpsw);
 	}
 	for_each_slave(priv, cpsw_slave_stop, cpsw);
 
@@ -2004,6 +2338,64 @@ static int cpsw_ndo_setup_tc(struct net_device *ndev, enum tc_setup_type type,
 	}
 }
 
+static int cpsw_xdp_prog_setup(struct cpsw_priv *priv, struct netdev_bpf *bpf)
+{
+	struct bpf_prog *prog = bpf->prog;
+
+	if (!priv->xdpi.prog && !prog)
+		return 0;
+
+	if (!xdp_attachment_flags_ok(&priv->xdpi, bpf))
+		return -EBUSY;
+
+	WRITE_ONCE(priv->xdp_prog, prog);
+
+	xdp_attachment_setup(&priv->xdpi, bpf);
+
+	return 0;
+}
+
+static int cpsw_ndo_bpf(struct net_device *ndev, struct netdev_bpf *bpf)
+{
+	struct cpsw_priv *priv = netdev_priv(ndev);
+
+	switch (bpf->command) {
+	case XDP_SETUP_PROG:
+		return cpsw_xdp_prog_setup(priv, bpf);
+
+	case XDP_QUERY_PROG:
+		return xdp_attachment_query(&priv->xdpi, bpf);
+
+	default:
+		return -EINVAL;
+	}
+}
+
+static int cpsw_ndo_xdp_xmit(struct net_device *ndev, int n,
+			     struct xdp_frame **frames, u32 flags)
+{
+	struct cpsw_priv *priv = netdev_priv(ndev);
+	struct xdp_frame *xdpf;
+	int i, drops = 0;
+
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
+		return -EINVAL;
+
+	for (i = 0; i < n; i++) {
+		xdpf = frames[i];
+		if (xdpf->len < CPSW_MIN_PACKET_SIZE) {
+			xdp_return_frame_rx_napi(xdpf);
+			drops++;
+			continue;
+		}
+
+		if (cpsw_xdp_tx_frame(priv, xdpf, NULL))
+			drops++;
+	}
+
+	return n - drops;
+}
+
 #ifdef CONFIG_NET_POLL_CONTROLLER
 static void cpsw_ndo_poll_controller(struct net_device *ndev)
 {
@@ -2032,6 +2424,8 @@ static const struct net_device_ops cpsw_netdev_ops = {
 	.ndo_vlan_rx_add_vid	= cpsw_ndo_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= cpsw_ndo_vlan_rx_kill_vid,
 	.ndo_setup_tc           = cpsw_ndo_setup_tc,
+	.ndo_bpf		= cpsw_ndo_bpf,
+	.ndo_xdp_xmit		= cpsw_ndo_xdp_xmit,
 };
 
 static void cpsw_get_drvinfo(struct net_device *ndev,
diff --git a/drivers/net/ethernet/ti/cpsw_ethtool.c b/drivers/net/ethernet/ti/cpsw_ethtool.c
index e4d7185fde49..31248a6cc642 100644
--- a/drivers/net/ethernet/ti/cpsw_ethtool.c
+++ b/drivers/net/ethernet/ti/cpsw_ethtool.c
@@ -578,6 +578,18 @@ static int cpsw_update_channels_res(struct cpsw_priv *priv, int ch_num, int rx,
 	return 0;
 }
 
+static void cpsw_fail(struct cpsw_common *cpsw)
+{
+	struct net_device *ndev;
+	int i;
+
+	for (i = 0; i < cpsw->data.slaves; i++) {
+		ndev = cpsw->slaves[i].ndev;
+		if (ndev)
+			dev_close(ndev);
+	}
+}
+
 int cpsw_set_channels_common(struct net_device *ndev,
 			     struct ethtool_channels *chs,
 			     cpdma_handler_fn rx_handler)
@@ -585,7 +597,7 @@ int cpsw_set_channels_common(struct net_device *ndev,
 	struct cpsw_priv *priv = netdev_priv(ndev);
 	struct cpsw_common *cpsw = priv->cpsw;
 	struct net_device *sl_ndev;
-	int i, ret;
+	int i, new_pools, ret;
 
 	ret = cpsw_check_ch_settings(cpsw, chs);
 	if (ret < 0)
@@ -593,6 +605,8 @@ int cpsw_set_channels_common(struct net_device *ndev,
 
 	cpsw_suspend_data_pass(ndev);
 
+	new_pools = (chs->rx_count != cpsw->rx_ch_num) && cpsw->usage_count;
+
 	ret = cpsw_update_channels_res(priv, chs->rx_count, 1, rx_handler);
 	if (ret)
 		goto err;
@@ -622,12 +636,19 @@ int cpsw_set_channels_common(struct net_device *ndev,
 
 	cpsw_split_res(cpsw);
 
+	if (new_pools) {
+		cpsw_destroy_xdp_rxqs(cpsw);
+		ret = cpsw_create_xdp_rxqs(cpsw);
+		if (ret)
+			goto err;
+	}
+
 	ret = cpsw_resume_data_pass(ndev);
 	if (!ret)
 		return 0;
 err:
 	dev_err(priv->dev, "cannot update channels number, closing device\n");
-	dev_close(ndev);
+	cpsw_fail(cpsw);
 	return ret;
 }
 
@@ -647,8 +668,7 @@ void cpsw_get_ringparam(struct net_device *ndev,
 int cpsw_set_ringparam(struct net_device *ndev,
 		       struct ethtool_ringparam *ering)
 {
-	struct cpsw_priv *priv = netdev_priv(ndev);
-	struct cpsw_common *cpsw = priv->cpsw;
+	struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
 	int descs_num, ret;
 
 	/* ignore ering->tx_pending - only rx_pending adjustment is supported */
@@ -672,13 +692,20 @@ int cpsw_set_ringparam(struct net_device *ndev,
 		return ret;
 	}
 
+	if (cpsw->usage_count) {
+		cpsw_destroy_xdp_rxqs(cpsw);
+		ret = cpsw_create_xdp_rxqs(cpsw);
+		if (ret)
+			goto err;
+	}
+
 	ret = cpsw_resume_data_pass(ndev);
 	if (!ret)
 		return 0;
 err:
 	cpdma_set_num_rx_descs(cpsw->dma, descs_num);
 	dev_err(cpsw->dev, "cannot set ring params, closing device\n");
-	dev_close(ndev);
+	cpsw_fail(cpsw);
 	return ret;
 }
 
diff --git a/drivers/net/ethernet/ti/cpsw_priv.h b/drivers/net/ethernet/ti/cpsw_priv.h
index 04795b97ee71..086bf38a4736 100644
--- a/drivers/net/ethernet/ti/cpsw_priv.h
+++ b/drivers/net/ethernet/ti/cpsw_priv.h
@@ -346,6 +346,7 @@ struct cpsw_common {
 	int				rx_ch_num, tx_ch_num;
 	int				speed;
 	int				usage_count;
+	struct page_pool		*page_pool[CPSW_MAX_QUEUES];
 };
 
 struct cpsw_priv {
@@ -360,6 +361,10 @@ struct cpsw_priv {
 	int				shp_cfg_speed;
 	int				tx_ts_enabled;
 	int				rx_ts_enabled;
+	struct bpf_prog			*xdp_prog;
+	struct xdp_rxq_info		xdp_rxq[CPSW_MAX_QUEUES];
+	struct xdp_attachment_info	xdpi;
+
 	u32 emac_port;
 	struct cpsw_common *cpsw;
 };
@@ -391,6 +396,8 @@ int cpsw_fill_rx_channels(struct cpsw_priv *priv);
 void cpsw_intr_enable(struct cpsw_common *cpsw);
 void cpsw_intr_disable(struct cpsw_common *cpsw);
 void cpsw_tx_handler(void *token, int len, int status);
+int cpsw_create_xdp_rxqs(struct cpsw_common *cpsw);
+void cpsw_destroy_xdp_rxqs(struct cpsw_common *cpsw);
 
 /* ethtool */
 u32 cpsw_get_msglevel(struct net_device *ndev);
-- 
2.17.1


^ permalink raw reply related

* [PATCH v9 net-next 0/5] net: ethernet: ti: cpsw: Add XDP support
From: Ivan Khoronzhuk @ 2019-07-08 21:34 UTC (permalink / raw)
  To: grygorii.strashko, hawk, davem
  Cc: ast, linux-kernel, linux-omap, xdp-newbies, ilias.apalodimas,
	netdev, daniel, jakub.kicinski, john.fastabend, Ivan Khoronzhuk

This patchset adds XDP support for TI cpsw driver and base it on
page_pool allocator. It was verified on af_xdp socket drop,
af_xdp l2f, ebpf XDP_DROP, XDP_REDIRECT, XDP_PASS, XDP_TX.

It was verified with following configs enabled:
CONFIG_JIT=y
CONFIG_BPFILTER=y
CONFIG_BPF_SYSCALL=y
CONFIG_XDP_SOCKETS=y
CONFIG_BPF_EVENTS=y
CONFIG_HAVE_EBPF_JIT=y
CONFIG_BPF_JIT=y
CONFIG_CGROUP_BPF=y

Link on previous v7:
https://lkml.org/lkml/2019/7/4/715

Also regular tests with iperf2 were done in order to verify impact on
regular netstack performance, compared with base commit:
https://pastebin.com/JSMT0iZ4

v8..v9:
- fix warnings on arm64 caused by typos in type casting

v7..v8:
- corrected dma calculation based on headroom instead of hard start
- minor comment changes

v6..v7:
- rolled back to v4 solution but with small modification
- picked up patch:
  https://www.spinics.net/lists/netdev/msg583145.html
- added changes related to netsec fix and cpsw


v5..v6:
- do changes that is rx_dev while redirect/flush cycle is kept the same
- dropped net: ethernet: ti: davinci_cpdma: return handler status
- other changes desc in patches

v4..v5:
- added two plreliminary patches:
  net: ethernet: ti: davinci_cpdma: allow desc split while down
  net: ethernet: ti: cpsw_ethtool: allow res split while down
- added xdp alocator refcnt on xdp level, avoiding page pool refcnt
- moved flush status as separate argument for cpdma_chan_process
- reworked cpsw code according to last changes to allocator
- added missed statistic counter

v3..v4:
- added page pool user counter
- use same pool for ndevs in dual mac
- restructured page pool create/destroy according to the last changes in API

v2..v3:
- each rxq and ndev has its own page pool

v1..v2:
- combined xdp_xmit functions
- used page allocation w/o refcnt juggle
- unmapped page for skb netstack
- moved rxq/page pool allocation to open/close pair
- added several preliminary patches:
  net: page_pool: add helper function to retrieve dma addresses
  net: page_pool: add helper function to unmap dma addresses
  net: ethernet: ti: cpsw: use cpsw as drv data
  net: ethernet: ti: cpsw_ethtool: simplify slave loops

Ivan Khoronzhuk (5):
  net: core: page_pool: add user refcnt and reintroduce
    page_pool_destroy
  net: ethernet: ti: davinci_cpdma: add dma mapped submit
  net: ethernet: ti: davinci_cpdma: allow desc split while down
  net: ethernet: ti: cpsw_ethtool: allow res split while down
  net: ethernet: ti: cpsw: add XDP support

 .../net/ethernet/mellanox/mlx5/core/en_main.c |   4 +-
 drivers/net/ethernet/socionext/netsec.c       |   8 +-
 drivers/net/ethernet/ti/Kconfig               |   1 +
 drivers/net/ethernet/ti/cpsw.c                | 502 ++++++++++++++++--
 drivers/net/ethernet/ti/cpsw_ethtool.c        |  57 +-
 drivers/net/ethernet/ti/cpsw_priv.h           |   7 +
 drivers/net/ethernet/ti/davinci_cpdma.c       | 106 +++-
 drivers/net/ethernet/ti/davinci_cpdma.h       |   7 +-
 include/net/page_pool.h                       |  25 +
 net/core/page_pool.c                          |   8 +
 net/core/xdp.c                                |   3 +
 11 files changed, 640 insertions(+), 88 deletions(-)

-- 
2.17.1


^ permalink raw reply

* Re: [PATCH bpf] xdp: fix possible cq entry leak
From: William Tu @ 2019-07-08 21:25 UTC (permalink / raw)
  To: Björn Töpel
  Cc: Ilya Maximets, Netdev, LKML, bpf, Xdp, David S. Miller,
	Björn Töpel, Magnus Karlsson, Jonathan Lemon,
	Jakub Kicinski, Alexei Starovoitov, Daniel Borkmann
In-Reply-To: <CAJ+HfNi2EdLwtq9SfccZBymDMv_cW5+vxB-JLqxyvYS_TG3ScA@mail.gmail.com>

On Thu, Jul 4, 2019 at 11:49 PM Björn Töpel <bjorn.topel@gmail.com> wrote:
>
> On Thu, 4 Jul 2019 at 16:25, Ilya Maximets <i.maximets@samsung.com> wrote:
> >
> > Completion queue address reservation could not be undone.
> > In case of bad 'queue_id' or skb allocation failure, reserved entry
> > will be leaked reducing the total capacity of completion queue.
> >
> > Fix that by moving reservation to the point where failure is not
> > possible. Additionally, 'queue_id' checking moved out from the loop
> > since there is no point to check it there.
> >
>
> Good catch, Ilya! Thanks for the patch!
>
> Acked-by: Björn Töpel <bjorn.topel@intel.com>
>
Thanks
Tested-by: William Tu <u9012063@gmail.com>

^ permalink raw reply

* Re: [PATCH bpf-next] selftests/bpf: make verifier loop tests arch independent
From: Stanislav Fomichev @ 2019-07-08 21:20 UTC (permalink / raw)
  To: Ilya Leoshkevich
  Cc: Y Song, Stanislav Fomichev, netdev, bpf, David Miller,
	Alexei Starovoitov, Daniel Borkmann
In-Reply-To: <99593C98-5DEC-4B18-AE6D-271DD8A8A7F6@linux.ibm.com>

On 07/08, Ilya Leoshkevich wrote:
> 
> 
> > Am 08.07.2019 um 18:13 schrieb Stanislav Fomichev <sdf@fomichev.me>:
> > 
> > On 07/03, Y Song wrote:
> >> On Wed, Jul 3, 2019 at 1:51 PM Stanislav Fomichev <sdf@google.com> wrote:
> >>> 
> >>> Take the first x bytes of pt_regs for scalability tests, there is
> >>> no real reason we need x86 specific rax.
> >>> 
> >>> Signed-off-by: Stanislav Fomichev <sdf@google.com>
> >>> ---
> >>> tools/testing/selftests/bpf/progs/loop1.c | 3 ++-
> >>> tools/testing/selftests/bpf/progs/loop2.c | 3 ++-
> >>> tools/testing/selftests/bpf/progs/loop3.c | 3 ++-
> >>> 3 files changed, 6 insertions(+), 3 deletions(-)
> >>> 
> >>> diff --git a/tools/testing/selftests/bpf/progs/loop1.c b/tools/testing/selftests/bpf/progs/loop1.c
> >>> index dea395af9ea9..d530c61d2517 100644
> >>> --- a/tools/testing/selftests/bpf/progs/loop1.c
> >>> +++ b/tools/testing/selftests/bpf/progs/loop1.c
> >>> @@ -14,11 +14,12 @@ SEC("raw_tracepoint/kfree_skb")
> >>> int nested_loops(volatile struct pt_regs* ctx)
> >>> {
> >>>        int i, j, sum = 0, m;
> >>> +       volatile int *any_reg = (volatile int *)ctx;
> >>> 
> >>>        for (j = 0; j < 300; j++)
> >>>                for (i = 0; i < j; i++) {
> >>>                        if (j & 1)
> >>> -                               m = ctx->rax;
> >>> +                               m = *any_reg;
> >> 
> >> I agree. ctx->rax here is only to generate some operations, which
> >> cannot be optimized away by the compiler. dereferencing a volatile
> >> pointee may just serve that purpose.
> >> 
> >> Comparing the byte code generated with ctx->rax and *any_reg, they are
> >> slightly different. Using *any_reg is slighly worse, but this should
> >> be still okay for the test.
> >> 
> >>>                        else
> >>>                                m = j;
> >>>                        sum += i * m;
> >>> diff --git a/tools/testing/selftests/bpf/progs/loop2.c b/tools/testing/selftests/bpf/progs/loop2.c
> >>> index 0637bd8e8bcf..91bb89d901e3 100644
> >>> --- a/tools/testing/selftests/bpf/progs/loop2.c
> >>> +++ b/tools/testing/selftests/bpf/progs/loop2.c
> >>> @@ -14,9 +14,10 @@ SEC("raw_tracepoint/consume_skb")
> >>> int while_true(volatile struct pt_regs* ctx)
> >>> {
> >>>        int i = 0;
> >>> +       volatile int *any_reg = (volatile int *)ctx;
> >>> 
> >>>        while (true) {
> >>> -               if (ctx->rax & 1)
> >>> +               if (*any_reg & 1)
> >>>                        i += 3;
> >>>                else
> >>>                        i += 7;
> >>> diff --git a/tools/testing/selftests/bpf/progs/loop3.c b/tools/testing/selftests/bpf/progs/loop3.c
> >>> index 30a0f6cba080..3a7f12d7186c 100644
> >>> --- a/tools/testing/selftests/bpf/progs/loop3.c
> >>> +++ b/tools/testing/selftests/bpf/progs/loop3.c
> >>> @@ -14,9 +14,10 @@ SEC("raw_tracepoint/consume_skb")
> >>> int while_true(volatile struct pt_regs* ctx)
> >>> {
> >>>        __u64 i = 0, sum = 0;
> >>> +       volatile __u64 *any_reg = (volatile __u64 *)ctx;
> >>>        do {
> >>>                i++;
> >>> -               sum += ctx->rax;
> >>> +               sum += *any_reg;
> >>>        } while (i < 0x100000000ULL);
> >>>        return sum;
> >>> }
> >>> --
> >>> 2.22.0.410.gd8fdbe21b5-goog
> >> 
> >> Ilya Leoshkevich (iii@linux.ibm.com, cc'ed) has another patch set
> >> trying to solve this problem by introducing s360 arch register access
> >> macros. I guess for now that patch set is not needed any more?
> > Oh, I missed them. Do they fix the tests for other (non-s360) arches as
> > well? I was trying to fix the issue by not depending on any arch
> > specific stuff because the test really doesn't care :-)
> 
> They are supposed to work for everything that defines PT_REGS_RC in
> bpf_helpers.h, but I have to admit I tested only x86_64 and s390.
> 
> The main source of problems with my approach were mismatching definitions
> of struct pt_regs for userspace and kernel, and because of that there was
> some tweaking required for both arches. I will double check how it looks
> for others (arm, mips, ppc, sparc) tomorrow.
Thanks, I've tested your patches and they fix my issue as well. So you
can have my Tested-by if we'd go with your approach.

One thing I don't understand is: why do you add 'ifdef __KERNEL__' to
the bpf_helpers.h for x86 case? Who is using bpf_helpers.h with
__KERNEL__ defined? Is it perf?

> Best regards,
> Ilya

^ permalink raw reply

* Re: [PATCH net-next v3 3/3] net: stmmac: Introducing support for Page Pool
From: David Miller @ 2019-07-08 21:15 UTC (permalink / raw)
  To: Jose.Abreu
  Cc: ilias.apalodimas, linux-kernel, netdev, linux-stm32,
	linux-arm-kernel, Joao.Pinto, peppe.cavallaro, alexandre.torgue,
	brouer, arnd
In-Reply-To: <BN8PR12MB32667BCA58B617432CACE677D3F60@BN8PR12MB3266.namprd12.prod.outlook.com>

From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Mon, 8 Jul 2019 16:08:07 +0000

> From: Ilias Apalodimas <ilias.apalodimas@linaro.org> | Date: Fri, Jul 
> 05, 2019 at 16:24:53
> 
>> Well ideally we'd like to get the change in before the merge window ourselves,
>> since we dont want to remove->re-add the same function in stable kernels. If
>> that doesn't go in i am fine fixing it in the next merge window i guess, since
>> it offers substantial speedups
> 
> I think the series is marked as "Changes Requested" in patchwork. What's 
> the status of this ?

That means I expect a respin based upon feedback or similar.  If Ilias and
you agreed to put this series in as-is, my apologies and just resend the
series with appropriate ACK and Review tags added.

Thanks.

^ permalink raw reply

* Re: [PATCH net-next v5 3/5] devlink: Introduce PCI PF port flavour and port attribute
From: Jakub Kicinski @ 2019-07-08 21:14 UTC (permalink / raw)
  To: Parav Pandit; +Cc: netdev, jiri, saeedm
In-Reply-To: <20190708041549.56601-4-parav@mellanox.com>

On Sun,  7 Jul 2019 23:15:47 -0500, Parav Pandit wrote:
> diff --git a/net/core/devlink.c b/net/core/devlink.c
> index 3e5f8204c36f..88b2cf207cb2 100644
> --- a/net/core/devlink.c
> +++ b/net/core/devlink.c
> @@ -519,6 +519,11 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg,
>  	if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PHYSICAL &&
>  	    devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU &&
>  	    devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA)
>  		return 0;
> +	if (devlink_port->attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_PF) {

Thanks for making the changes!  I'm not sure how this would work, tho.
We return early if flavour is not phys/cpu/dsa, so how can flavour be
pci here?..

> +		if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
> +				attrs->pci_pf.pf))
> +			return -EMSGSIZE;
> +	}
>  	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER,
>  			attrs->physical.port_number))
>  		return -EMSGSIZE;

^ permalink raw reply

* Re: [PATCH v3 1/7] dt-bindings: net: Add bindings for Realtek PHYs
From: Andrew Lunn @ 2019-07-08 21:13 UTC (permalink / raw)
  To: Matthias Kaehlcke
  Cc: David S . Miller, Rob Herring, Mark Rutland, Florian Fainelli,
	Heiner Kallweit, netdev, devicetree, linux-kernel,
	Douglas Anderson
In-Reply-To: <20190708200136.GM250418@google.com>

On Mon, Jul 08, 2019 at 01:01:36PM -0700, Matthias Kaehlcke wrote:
> On Mon, Jul 08, 2019 at 09:46:15PM +0200, Andrew Lunn wrote:
> > On Mon, Jul 08, 2019 at 12:24:53PM -0700, Matthias Kaehlcke wrote:
> > > Add the 'realtek,eee-led-mode-disable' property to disable EEE
> > > LED mode on Realtek PHYs that support it.
> > > 
> > > Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
> > > ---
> > > TODO: adapt PHY core to deal with optional compatible strings
> > 
> > Yes. Does this even work at the moment? I would expect
> > of_mdiobus_child_is_phy() to return false, indicating the device is
> > not actually a PHY.
> 
> Indeed, it currently doesn't work atm. I found that removing the check
> for dev->of_node in of_mdiobus_link_mdiodev() helps, but I imagine
> doing (only) this might have undesired side-effects.

O.K.

Please put RFC in patches like this which don't actually work and so
should not be merged. We don't want David accidentally merging them!

       Andrew

^ permalink raw reply

* Re: [PATCH] tipc: ensure skb->lock is initialised
From: Chris Packham @ 2019-07-08 21:13 UTC (permalink / raw)
  To: Eric Dumazet, jon.maloy@ericsson.com, ying.xue@windriver.com,
	davem@davemloft.net
  Cc: netdev@vger.kernel.org, tipc-discussion@lists.sourceforge.net,
	linux-kernel@vger.kernel.org
In-Reply-To: <361940337b0d4833a5634ddd1e1896a9@svr-chch-ex1.atlnz.lc>

On 9/07/19 8:43 AM, Chris Packham wrote:
> On 8/07/19 8:18 PM, Eric Dumazet wrote:
>>
>>
>> On 7/8/19 12:53 AM, Chris Packham wrote:
>>> tipc_named_node_up() creates a skb list. It passes the list to
>>> tipc_node_xmit() which has some code paths that can call
>>> skb_queue_purge() which relies on the list->lock being initialised.
>>> Ensure tipc_named_node_up() uses skb_queue_head_init() so that the lock
>>> is explicitly initialised.
>>>
>>> Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
>>
>> I would rather change the faulty skb_queue_purge() to __skb_queue_purge()
>>
> 
> Makes sense. I'll look at that for v2.
> 

Actually maybe not. tipc_rcast_xmit(), tipc_node_xmit_skb(), 
tipc_send_group_msg(), __tipc_sendmsg(), __tipc_sendstream(), and 
tipc_sk_timeout() all use skb_queue_head_init(). So my original change 
brings tipc_named_node_up() into line with them.

I think it should be safe for tipc_node_xmit() to use 
__skb_queue_purge() since all the callers seem to have exclusive access 
to the list of skbs. It still seems that the callers should all use 
skb_queue_head_init() for consistency.

^ permalink raw reply

* Re: [PATCH 1/2] forcedeth: add recv cache make nic work steadily
From: Jakub Kicinski @ 2019-07-08 20:52 UTC (permalink / raw)
  To: Zhu Yanjun; +Cc: netdev, davem
In-Reply-To: <1562307568-21549-2-git-send-email-yanjun.zhu@oracle.com>

On Fri,  5 Jul 2019 02:19:27 -0400, Zhu Yanjun wrote:
> A recv cache is added. The size of recv cache is 1000Mb / skb_length.
> When the system memory is not enough, this recv cache can make nic work
> steadily.
> When nic is up, this recv cache and work queue are created. When nic
> is down, this recv cache will be destroyed and delayed workqueue is
> canceled.
> When nic is polled or rx interrupt is triggerred, rx handler will
> get a skb from recv cache. Then the state of recv cache is checked.
> If recv cache is not in filling up state, a work is queued to fill
> up recv cache.
> When skb size is changed, the old recv cache is destroyed and new recv
> cache is created.
> When the system memory is not enough, the allocation of skb failed.
> recv cache will continue allocate skb until the recv cache is filled up.
> When the system memory is not enough, this can make nic work steadily.
> Becase of recv cache, the performance of nic is enhanced.
> 
> CC: Joe Jin <joe.jin@oracle.com>
> CC: Junxiao Bi <junxiao.bi@oracle.com>
> Signed-off-by: Zhu Yanjun <yanjun.zhu@oracle.com>

Could you tell us a little more about the use case and the system
condition?

> diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
> index b327b29..a673005 100644
> --- a/drivers/net/ethernet/nvidia/forcedeth.c
> +++ b/drivers/net/ethernet/nvidia/forcedeth.c
> @@ -674,6 +674,11 @@ struct nv_ethtool_stats {
>  	u64 tx_broadcast;
>  };
>  
> +/* 1000Mb is 125M bytes, 125 * 1024 * 1024 bytes
> + * The length of recv cache is 125M / skb_length
> + */
> +#define RECV_CACHE_LIST_LENGTH		(125 * 1024 * 1024 / np->rx_buf_sz)
> +
>  #define NV_DEV_STATISTICS_V3_COUNT (sizeof(struct nv_ethtool_stats)/sizeof(u64))
>  #define NV_DEV_STATISTICS_V2_COUNT (NV_DEV_STATISTICS_V3_COUNT - 3)
>  #define NV_DEV_STATISTICS_V1_COUNT (NV_DEV_STATISTICS_V2_COUNT - 6)
> @@ -844,8 +849,18 @@ struct fe_priv {
>  	char name_rx[IFNAMSIZ + 3];       /* -rx    */
>  	char name_tx[IFNAMSIZ + 3];       /* -tx    */
>  	char name_other[IFNAMSIZ + 6];    /* -other */
> +
> +	/* This is to schedule work */
> +	struct delayed_work     recv_cache_work;
> +	/* This list is to store skb queue for recv */
> +	struct sk_buff_head recv_list;
> +	unsigned long nv_recv_list_state;
>  };
>  
> +/* This is recv list state to fill up recv cache */
> +enum recv_list_state {
> +	RECV_LIST_ALLOCATE
> +};
>  /*
>   * Maximum number of loops until we assume that a bit in the irq mask
>   * is stuck. Overridable with module param.
> @@ -1804,7 +1819,11 @@ static int nv_alloc_rx(struct net_device *dev)
>  		less_rx = np->last_rx.orig;
>  
>  	while (np->put_rx.orig != less_rx) {
> -		struct sk_buff *skb = netdev_alloc_skb(dev, np->rx_buf_sz + NV_RX_ALLOC_PAD);
> +		struct sk_buff *skb = skb_dequeue(&np->recv_list);
> +
> +		if (!test_bit(RECV_LIST_ALLOCATE, &np->nv_recv_list_state))
> +			schedule_delayed_work(&np->recv_cache_work, 0);

Interesting, this seems to be coming up in multiple places recently..

Could you explain why you have your own RECV_LIST_ALLOCATE bit here?
Workqueue implementation itself uses an atomic bit to avoid scheduling
work mutliple times:

bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
			   struct delayed_work *dwork, unsigned long delay)
{
	struct work_struct *work = &dwork->work;
	bool ret = false;
	unsigned long flags;

	/* read the comment in __queue_work() */
	local_irq_save(flags);

	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
		__queue_delayed_work(cpu, wq, dwork, delay);
		ret = true;
	}

	local_irq_restore(flags);
	return ret;
}
EXPORT_SYMBOL(queue_delayed_work_on);

>  		if (likely(skb)) {
>  			np->put_rx_ctx->skb = skb;
>  			np->put_rx_ctx->dma = dma_map_single(&np->pci_dev->dev,
> @@ -1845,7 +1864,11 @@ static int nv_alloc_rx_optimized(struct net_device *dev)
>  		less_rx = np->last_rx.ex;
>  
>  	while (np->put_rx.ex != less_rx) {
> -		struct sk_buff *skb = netdev_alloc_skb(dev, np->rx_buf_sz + NV_RX_ALLOC_PAD);
> +		struct sk_buff *skb = skb_dequeue(&np->recv_list);
> +
> +		if (!test_bit(RECV_LIST_ALLOCATE, &np->nv_recv_list_state))
> +			schedule_delayed_work(&np->recv_cache_work, 0);

It seems a little heavy to schedule this work on every packet, would it
make sense to add this in nv_napi_poll() instead?

>  		if (likely(skb)) {
>  			np->put_rx_ctx->skb = skb;
>  			np->put_rx_ctx->dma = dma_map_single(&np->pci_dev->dev,
> @@ -1957,6 +1980,40 @@ static void nv_init_tx(struct net_device *dev)
>  	}
>  }
>  
> +static void nv_init_recv_cache(struct net_device *dev)
> +{
> +	struct fe_priv *np = netdev_priv(dev);
> +
> +	skb_queue_head_init(&np->recv_list);
> +	while (skb_queue_len(&np->recv_list) < RECV_CACHE_LIST_LENGTH) {
> +		struct sk_buff *skb = netdev_alloc_skb(dev,
> +				 np->rx_buf_sz + NV_RX_ALLOC_PAD);
> +		/* skb is null. This indicates that memory is not
> +		 * enough.
> +		 */
> +		if (unlikely(!skb)) {
> +			ndelay(3);
> +			continue;

Does this path ever hit?  Seems like doing an ndelay() and retrying
allocation is not the best idea for non-preempt kernel.  

Also perhaps you should consider using __netdev_alloc_skb() and passing
GFP_KERNEL, that way the system has a chance to go into memory reclaim
(I presume this function can sleep).

> +		}
> +
> +		skb_queue_tail(&np->recv_list, skb);
> +	}
> +}
> +
> +static void nv_destroy_recv_cache(struct net_device *dev)
> +{
> +	struct sk_buff *skb;
> +	struct fe_priv *np = netdev_priv(dev);
> +
> +	cancel_delayed_work_sync(&np->recv_cache_work);
> +	WARN_ON(delayed_work_pending(&np->recv_cache_work));
> +
> +	while ((skb = skb_dequeue(&np->recv_list)))
> +		kfree_skb(skb);

skb_queue_purge()

> +	WARN_ON(skb_queue_len(&np->recv_list));
> +}
> +
>  static int nv_init_ring(struct net_device *dev)
>  {
>  	struct fe_priv *np = netdev_priv(dev);
> @@ -3047,6 +3104,8 @@ static int nv_change_mtu(struct net_device *dev, int new_mtu)
>  		nv_drain_rxtx(dev);
>  		/* reinit driver view of the rx queue */
>  		set_bufsize(dev);
> +		nv_destroy_recv_cache(dev);
> +		nv_init_recv_cache(dev);
>  		if (nv_init_ring(dev)) {
>  			if (!np->in_shutdown)
>  				mod_timer(&np->oom_kick, jiffies + OOM_REFILL);
> @@ -4074,6 +4133,32 @@ static void nv_free_irq(struct net_device *dev)
>  	}
>  }
>  
> +static void nv_recv_cache_worker(struct work_struct *work)
> +{
> +	struct fe_priv *np = container_of(work, struct fe_priv,
> +					  recv_cache_work.work);
> +
> +	set_bit(RECV_LIST_ALLOCATE, &np->nv_recv_list_state);
> +	while (skb_queue_len(&np->recv_list) < RECV_CACHE_LIST_LENGTH) {
> +		struct sk_buff *skb = netdev_alloc_skb(np->dev,
> +				np->rx_buf_sz + NV_RX_ALLOC_PAD);
> +		/* skb is null. This indicates that memory is not
> +		 * enough.
> +		 * When the system memory is not enough, the kernel
> +		 * will compact memory or drop caches. At that time,
> +		 * if memory allocation fails, it had better wait some
> +		 * time for memory.
> +		 */
> +		if (unlikely(!skb)) {
> +			ndelay(3);
> +			continue;

Same comments as for the init function.

> +		}
> +
> +		skb_queue_tail(&np->recv_list, skb);
> +	}
> +	clear_bit(RECV_LIST_ALLOCATE, &np->nv_recv_list_state);
> +}
> +
>  static void nv_do_nic_poll(struct timer_list *t)
>  {
>  	struct fe_priv *np = from_timer(np, t, nic_poll);
> @@ -4129,6 +4214,8 @@ static void nv_do_nic_poll(struct timer_list *t)
>  			nv_drain_rxtx(dev);
>  			/* reinit driver view of the rx queue */
>  			set_bufsize(dev);
> +			nv_destroy_recv_cache(dev);
> +			nv_init_recv_cache(dev);
>  			if (nv_init_ring(dev)) {
>  				if (!np->in_shutdown)
>  					mod_timer(&np->oom_kick, jiffies + OOM_REFILL);


^ permalink raw reply

* Re: kernel BUG at lib/lockref.c:LINE!
From: syzbot @ 2019-07-08 20:50 UTC (permalink / raw)
  To: arvid.brodin, darrick.wong, davem, linux-fsdevel, linux-kernel,
	linux-xfs, netdev, sfr, syzkaller-bugs, viro
In-Reply-To: <000000000000b519af058d3091d1@google.com>

syzbot has bisected this bug to:

commit 867c90eeea9d81ad1336881b61a4dcf692fc5d50
Author: Stephen Rothwell <sfr@canb.auug.org.au>
Date:   Mon Jul 8 00:22:38 2019 +0000

     Merge remote-tracking branch 'xfs/for-next'

bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=16c69bc3a00000
start commit:   d58b5ab9 Add linux-next specific files for 20190708
git tree:       linux-next
final crash:    https://syzkaller.appspot.com/x/report.txt?x=15c69bc3a00000
console output: https://syzkaller.appspot.com/x/log.txt?x=11c69bc3a00000
kernel config:  https://syzkaller.appspot.com/x/.config?x=bf9882946ecc11d9
dashboard link: https://syzkaller.appspot.com/bug?extid=f70e9b00f8c7d4187bd0
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=173375c7a00000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=1536f9bfa00000

Reported-by: syzbot+f70e9b00f8c7d4187bd0@syzkaller.appspotmail.com
Fixes: 867c90eeea9d ("Merge remote-tracking branch 'xfs/for-next'")

For information about bisection process see: https://goo.gl/tpsmEJ#bisection

^ permalink raw reply

* Re: [PATCH] tipc: ensure skb->lock is initialised
From: Chris Packham @ 2019-07-08 20:43 UTC (permalink / raw)
  To: Eric Dumazet, jon.maloy@ericsson.com, ying.xue@windriver.com,
	davem@davemloft.net
  Cc: netdev@vger.kernel.org, tipc-discussion@lists.sourceforge.net,
	linux-kernel@vger.kernel.org
In-Reply-To: <2298b9eb-100f-6130-60c4-0e5e2c7b84d1@gmail.com>

On 8/07/19 8:18 PM, Eric Dumazet wrote:
> 
> 
> On 7/8/19 12:53 AM, Chris Packham wrote:
>> tipc_named_node_up() creates a skb list. It passes the list to
>> tipc_node_xmit() which has some code paths that can call
>> skb_queue_purge() which relies on the list->lock being initialised.
>> Ensure tipc_named_node_up() uses skb_queue_head_init() so that the lock
>> is explicitly initialised.
>>
>> Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
> 
> I would rather change the faulty skb_queue_purge() to __skb_queue_purge()
> 

Makes sense. I'll look at that for v2.

^ permalink raw reply

* Re: [PATCH ghak90 V6 02/10] audit: add container id
From: Paul Moore @ 2019-07-08 20:43 UTC (permalink / raw)
  To: Richard Guy Briggs
  Cc: Tycho Andersen, Serge E. Hallyn, containers, linux-api,
	Linux-Audit Mailing List, linux-fsdevel, LKML, netdev,
	netfilter-devel, sgrubb, omosnace, dhowells, simo, Eric Paris,
	ebiederm, nhorman
In-Reply-To: <20190708181237.5poheliito7zpvmc@madcap2.tricolour.ca>

On July 8, 2019 8:12:56 PM Richard Guy Briggs <rgb@redhat.com> wrote:

> On 2019-05-30 19:26, Paul Moore wrote:
>> On Thu, May 30, 2019 at 5:29 PM Tycho Andersen <tycho@tycho.ws> wrote:
>>> On Thu, May 30, 2019 at 03:29:32PM -0400, Paul Moore wrote:
>>>>
>>>>
>>>> [REMINDER: It is an "*audit* container ID" and not a general
>>>> "container ID" ;)  Smiley aside, I'm not kidding about that part.]
>>>
>>> This sort of seems like a distinction without a difference; presumably
>>> audit is going to want to differentiate between everything that people
>>> in userspace call a container. So you'll have to support all this
>>> insanity anyway, even if it's "not a container ID".
>>
>> That's not quite right.  Audit doesn't care about what a container is,
>> or is not, it also doesn't care if the "audit container ID" actually
>> matches the ID used by the container engine in userspace and I think
>> that is a very important line to draw.  Audit is simply given a value
>> which it calls the "audit container ID", it ensures that the value is
>> inherited appropriately (e.g. children inherit their parent's audit
>> container ID), and it uses the value in audit records to provide some
>> additional context for log analysis.  The distinction isn't limited to
>> the value itself, but also to how it is used; it is an "audit
>> container ID" and not a "container ID" because this value is
>> exclusively for use by the audit subsystem.  We are very intentionally
>> not adding a generic container ID to the kernel.  If the kernel does
>> ever grow a general purpose container ID we will be one of the first
>> ones in line to make use of it, but we are not going to be the ones to
>> generically add containers to the kernel.  Enough people already hate
>> audit ;)
>>
>>>> I'm not interested in supporting/merging something that isn't useful;
>>>> if this doesn't work for your use case then we need to figure out what
>>>> would work.  It sounds like nested containers are much more common in
>>>> the lxc world, can you elaborate a bit more on this?
>>>>
>>>>
>>>> As far as the possible solutions you mention above, I'm not sure I
>>>> like the per-userns audit container IDs, I'd much rather just emit the
>>>> necessary tracking information via the audit record stream and let the
>>>> log analysis tools figure it out.  However, the bigger question is how
>>>> to limit (re)setting the audit container ID when you are in a non-init
>>>> userns.  For reasons already mentioned, using capable() is a non
>>>> starter for everything but the initial userns, and using ns_capable()
>>>> is equally poor as it essentially allows any userns the ability to
>>>> munge it's audit container ID (obviously not good).  It appears we
>>>> need a different method for controlling access to the audit container
>>>> ID.
>>>
>>> One option would be to make it a string, and have it be append only.
>>> That should be safe with no checks.
>>>
>>> I know there was a long thread about what type to make this thing. I
>>> think you could accomplish the append-only-ness with a u64 if you had
>>> some rule about only allowing setting lower order bits than those that
>>> are already set. With 4 bits for simplicity:
>>>
>>> 1100         # initial container id
>>> 1100 -> 1011 # not allowed
>>> 1100 -> 1101 # allowed, but now 1101 is set in stone since there are
>>>       # no lower order bits left
>>>
>>> There are probably fancier ways to do it if you actually understand
>>> math :)
>>
>> ;)
>>
>>> Since userns nesting is limited to 32 levels (right now, IIRC), and
>>> you have 64 bits, this might be reasonable. You could just teach
>>> container engines to use the first say N bits for themselves, with a 1
>>> bit for the barrier at the end.
>>
>> I like the creativity, but I worry that at some point these
>> limitations are going to be raised (limits have a funny way of doing
>> that over time) and we will be in trouble.  I say "trouble" because I
>> want to be able to quickly do an audit container ID comparison and
>> we're going to pay a penalty for these larger values (we'll need this
>> when we add multiple auditd support and the requisite record routing).
>>
>> Thinking about this makes me also realize we probably need to think a
>> bit longer about audit container ID conflicts between orchestrators.
>> Right now we just take the value that is given to us by the
>> orchestrator, but if we want to allow multiple container orchestrators
>> to work without some form of cooperation in userspace (I think we have
>> to assume the orchestrators will not talk to each other) we likely
>> need to have some way to block reuse of an audit container ID.  We
>> would either need to prevent the orchestrator from explicitly setting
>> an audit container ID to a currently in use value, or instead generate
>> the audit container ID in the kernel upon an event triggered by the
>> orchestrator (e.g. a write to a /proc file).  I suspect we should
>> start looking at the idr code, I think we will need to make use of it.
>
> To address this, I'd suggest that it is enforced to only allow the
> setting of descendants and to maintain a master list of audit container
> identifiers (with a hash table if necessary later) that includes the
> container owner.
>
> This also allows the orchestrator/engine to inject processes into
> existing containers by checking that the audit container identifier is
> only used again by the same owner.
>
> I have working code for both.

Just a quick note that due to some holiday travel I'm not going to be able to adequately respond to your latest messages on this thread for at least another week, likely a bit more.  I'm only checking mail to put out fires, and the audit container ID work tends to be something that starts them ;)

--
paul moore
www.paul-moore.com





^ permalink raw reply

* Re: [PATCH bpf-next 1/2] bpf, libbpf: add a new API bpf_object__reuse_maps()
From: Anton Protopopov @ 2019-07-08 20:37 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Daniel Borkmann, Alexei Starovoitov, Martin KaFai Lau, Song Liu,
	Yonghong Song, Networking, bpf, open list, Andrii Nakryiko
In-Reply-To: <CAEf4BzaGGVv2z8jB8MnT7=gnn4nG0cp7DGYxfnnnpohOT=ujCA@mail.gmail.com>

пн, 8 июл. 2019 г. в 13:54, Andrii Nakryiko <andrii.nakryiko@gmail.com>:
>
> On Fri, Jul 5, 2019 at 2:53 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >
> > On 07/05/2019 10:44 PM, Anton Protopopov wrote:
> > > Add a new API bpf_object__reuse_maps() which can be used to replace all maps in
> > > an object by maps pinned to a directory provided in the path argument.  Namely,
> > > each map M in the object will be replaced by a map pinned to path/M.name.
> > >
> > > Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
> > > ---
> > >  tools/lib/bpf/libbpf.c   | 34 ++++++++++++++++++++++++++++++++++
> > >  tools/lib/bpf/libbpf.h   |  2 ++
> > >  tools/lib/bpf/libbpf.map |  1 +
> > >  3 files changed, 37 insertions(+)
> > >
> > > diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> > > index 4907997289e9..84c9e8f7bfd3 100644
> > > --- a/tools/lib/bpf/libbpf.c
> > > +++ b/tools/lib/bpf/libbpf.c
> > > @@ -3144,6 +3144,40 @@ int bpf_object__unpin_maps(struct bpf_object *obj, const char *path)
> > >       return 0;
> > >  }
> > >
> > > +int bpf_object__reuse_maps(struct bpf_object *obj, const char *path)
>
> As is, bpf_object__reuse_maps() can be easily implemented by user
> applications, as it's only using public libbpf APIs, so I'm not 100%
> sure we need to add method like that to libbpf.

The bpf_object__reuse_maps() can definitely be implemented by user
applications, however, to use it a user also needs to re-implement the
bpf_prog_load_xattr funciton, so it seemed to me that adding this
functionality to the library is a better way.

>
> > > +{
> > > +     struct bpf_map *map;
> > > +
> > > +     if (!obj)
> > > +             return -ENOENT;
> > > +
> > > +     if (!path)
> > > +             return -EINVAL;
> > > +
> > > +     bpf_object__for_each_map(map, obj) {
> > > +             int len, err;
> > > +             int pinned_map_fd;
> > > +             char buf[PATH_MAX];
> >
> > We'd need to skip the case of bpf_map__is_internal(map) since they are always
> > recreated for the given object.
> >
> > > +             len = snprintf(buf, PATH_MAX, "%s/%s", path, bpf_map__name(map));
> > > +             if (len < 0) {
> > > +                     return -EINVAL;
> > > +             } else if (len >= PATH_MAX) {
> > > +                     return -ENAMETOOLONG;
> > > +             }
> > > +
> > > +             pinned_map_fd = bpf_obj_get(buf);
> > > +             if (pinned_map_fd < 0)
> > > +                     return pinned_map_fd;
> >
> > Should we rather have a new map definition attribute that tells to reuse
> > the map if it's pinned in bpf fs, and if not, we create it and later on
> > pin it? This is what iproute2 is doing and which we're making use of heavily.
>
> I'd like something like that as well. This would play nicely with
> recently added BTF-defined maps as well.
>
> I think it should be not just pin/don't pin flag, but rather pinning
> strategy, to accommodate various typical strategies of handling maps
> that are already pinned. So something like this:
>
> 1. BPF_PIN_NOTHING - default, don't pin;
> 2. BPF_PIN_EXCLUSIVE - pin, but if map is already pinned - fail;
> 3. BPF_PIN_SET - pin; if existing map exists, reset its state to be
> exact state of object's map;
> 4. BPF_PIN_MERGE - pin, if map exists, fill in NULL entries only (this
> is how Cilium is pinning PROG_ARRAY maps, if I understand correctly);
> 5. BPF_PIN_MERGE_OVERWRITE - pin, if map exists, overwrite non-NULL values.
>
> This list is only for illustrative purposes, ideally people that have
> a lot of experience using pinning for real-world use cases would chime
> in on what strategies are useful and make sense.

My case was simply to reuse existing maps when reloading a program.
Does it make sense for you to add only the simplest cases of listed above?

Also, libbpf doesn't use standard naming conventions for pinning maps.
Does it make sense to provide a list of already open maps to the
bpf_prog_load_xattr function as an attribute? In this case a user
can execute his own policy on pinning, but still will have an option
to reuse, reset, and merge maps.

>
> > In bpf_object__reuse_maps() bailing out if bpf_obj_get() fails is perhaps
> > too limiting for a generic API as new version of an object file may contain
> > new maps which are not yet present in bpf fs at that point.
> >
> > > +             err = bpf_map__reuse_fd(map, pinned_map_fd);
> > > +             if (err)
> > > +                     return err;
> > > +     }
> > > +
> > > +     return 0;
> > > +}
> > > +
> > >  int bpf_object__pin_programs(struct bpf_object *obj, const char *path)
> > >  {
> > >       struct bpf_program *prog;
> > > diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
> > > index d639f47e3110..7fe465a1be76 100644
> > > --- a/tools/lib/bpf/libbpf.h
> > > +++ b/tools/lib/bpf/libbpf.h
> > > @@ -82,6 +82,8 @@ int bpf_object__variable_offset(const struct bpf_object *obj, const char *name,
> > >  LIBBPF_API int bpf_object__pin_maps(struct bpf_object *obj, const char *path);
> > >  LIBBPF_API int bpf_object__unpin_maps(struct bpf_object *obj,
> > >                                     const char *path);
> > > +LIBBPF_API int bpf_object__reuse_maps(struct bpf_object *obj,
> > > +                                   const char *path);
> > >  LIBBPF_API int bpf_object__pin_programs(struct bpf_object *obj,
> > >                                       const char *path);
> > >  LIBBPF_API int bpf_object__unpin_programs(struct bpf_object *obj,
> > > diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
> > > index 2c6d835620d2..66a30be6696c 100644
> > > --- a/tools/lib/bpf/libbpf.map
> > > +++ b/tools/lib/bpf/libbpf.map
> > > @@ -172,5 +172,6 @@ LIBBPF_0.0.4 {
> > >               btf_dump__new;
> > >               btf__parse_elf;
> > >               bpf_object__load_xattr;
> > > +             bpf_object__reuse_maps;
> > >               libbpf_num_possible_cpus;
> > >  } LIBBPF_0.0.3;
> > >
> >

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox