Netdev List
 help / color / mirror / Atom feed
* [PATCH 08/10] fsl/fman: define frame description command UPD
From: Yangbo Lu @ 2018-06-04  7:08 UTC (permalink / raw)
  To: netdev, madalin.bucur, Richard Cochran, Rob Herring, Shawn Guo,
	David S . Miller
  Cc: devicetree, linuxppc-dev, linux-arm-kernel, linux-kernel,
	Yangbo Lu
In-Reply-To: <20180604070837.19265-1-yangbo.lu@nxp.com>

Defined frame description command FM_FD_CMD_UPD for
prepended data updating.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
---
 drivers/net/ethernet/freescale/fman/fman.h |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fman/fman.h b/drivers/net/ethernet/freescale/fman/fman.h
index bfa02e0..935c317 100644
--- a/drivers/net/ethernet/freescale/fman/fman.h
+++ b/drivers/net/ethernet/freescale/fman/fman.h
@@ -41,6 +41,7 @@
 /* Frame queue Context Override */
 #define FM_FD_CMD_FCO                   0x80000000
 #define FM_FD_CMD_RPD                   0x40000000  /* Read Prepended Data */
+#define FM_FD_CMD_UPD			0x20000000  /* Update Prepended Data */
 #define FM_FD_CMD_DTC                   0x10000000  /* Do L4 Checksum */
 
 /* TX-Port: Unsupported Format */
-- 
1.7.1

^ permalink raw reply related

* [PATCH 09/10] dpaa_eth: add support for hardware timestamping
From: Yangbo Lu @ 2018-06-04  7:08 UTC (permalink / raw)
  To: netdev, madalin.bucur, Richard Cochran, Rob Herring, Shawn Guo,
	David S . Miller
  Cc: devicetree, linuxppc-dev, linux-arm-kernel, linux-kernel,
	Yangbo Lu
In-Reply-To: <20180604070837.19265-1-yangbo.lu@nxp.com>

This patch is to add hardware timestamping support
for dpaa_eth. On Rx, timestamping is enabled for
all frames. On Tx, we only instruct the hardware
to timestamp the frames marked accordingly by the
stack.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
---
 drivers/net/ethernet/freescale/dpaa/Kconfig    |   12 +++
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c |  119 +++++++++++++++++++++++-
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.h |    5 +
 3 files changed, 133 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa/Kconfig b/drivers/net/ethernet/freescale/dpaa/Kconfig
index a654736..da34138 100644
--- a/drivers/net/ethernet/freescale/dpaa/Kconfig
+++ b/drivers/net/ethernet/freescale/dpaa/Kconfig
@@ -8,3 +8,15 @@ menuconfig FSL_DPAA_ETH
 	  supporting the Freescale QorIQ chips.
 	  Depends on Freescale Buffer Manager and Queue Manager
 	  driver and Frame Manager Driver.
+
+if FSL_DPAA_ETH
+config FSL_DPAA_ETH_TS
+	bool "DPAA hardware timestamping support"
+	select PTP_1588_CLOCK_QORIQ
+	default n
+	help
+	  Enable DPAA hardware timestamping support.
+	  This option is useful for applications to get
+	  hardware time stamps on the Ethernet packets
+	  using the SO_TIMESTAMPING API.
+endif
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
index fd43f98..864cfb0 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
@@ -1168,7 +1168,11 @@ static int dpaa_eth_init_tx_port(struct fman_port *port, struct dpaa_fq *errq,
 	buf_prefix_content.priv_data_size = buf_layout->priv_data_size;
 	buf_prefix_content.pass_prs_result = true;
 	buf_prefix_content.pass_hash_result = true;
+#ifdef CONFIG_FSL_DPAA_ETH_TS
+	buf_prefix_content.pass_time_stamp = true;
+#else
 	buf_prefix_content.pass_time_stamp = false;
+#endif
 	buf_prefix_content.data_align = DPAA_FD_DATA_ALIGNMENT;
 
 	params.specific_params.non_rx_params.err_fqid = errq->fqid;
@@ -1210,7 +1214,11 @@ static int dpaa_eth_init_rx_port(struct fman_port *port, struct dpaa_bp **bps,
 	buf_prefix_content.priv_data_size = buf_layout->priv_data_size;
 	buf_prefix_content.pass_prs_result = true;
 	buf_prefix_content.pass_hash_result = true;
+#ifdef CONFIG_FSL_DPAA_ETH_TS
+	buf_prefix_content.pass_time_stamp = true;
+#else
 	buf_prefix_content.pass_time_stamp = false;
+#endif
 	buf_prefix_content.data_align = DPAA_FD_DATA_ALIGNMENT;
 
 	rx_p = &params.specific_params.rx_params;
@@ -1592,6 +1600,18 @@ static int dpaa_eth_refill_bpools(struct dpaa_priv *priv)
 	return 0;
 }
 
+#ifdef CONFIG_FSL_DPAA_ETH_TS
+static int dpaa_get_tstamp_ns(struct net_device *net_dev, u64 *ns,
+			      struct fman_port *port, const void *data)
+{
+	if (!fman_port_get_tstamp_field(port, data, ns)) {
+		be64_to_cpus(ns);
+		return 0;
+	}
+	return -EINVAL;
+}
+#endif
+
 /* Cleanup function for outgoing frame descriptors that were built on Tx path,
  * either contiguous frames or scatter/gather ones.
  * Skb freeing is not handled here.
@@ -1615,6 +1635,24 @@ static int dpaa_eth_refill_bpools(struct dpaa_priv *priv)
 	skbh = (struct sk_buff **)phys_to_virt(addr);
 	skb = *skbh;
 
+#ifdef CONFIG_FSL_DPAA_ETH_TS
+	if (priv->tx_tstamp &&
+	    skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) {
+		struct skb_shared_hwtstamps shhwtstamps;
+		u64 ns;
+
+		memset(&shhwtstamps, 0, sizeof(shhwtstamps));
+
+		if (!dpaa_get_tstamp_ns(priv->net_dev, &ns,
+					priv->mac_dev->port[TX],
+					(void *)skbh)) {
+			shhwtstamps.hwtstamp = ns_to_ktime(ns);
+			skb_tstamp_tx(skb, &shhwtstamps);
+		} else {
+			dev_warn(dev, "dpaa_get_tstamp_ns failed!\n");
+		}
+	}
+#endif
 	if (unlikely(qm_fd_get_format(fd) == qm_fd_sg)) {
 		nr_frags = skb_shinfo(skb)->nr_frags;
 		dma_unmap_single(dev, addr, qm_fd_get_offset(fd) +
@@ -2086,6 +2124,14 @@ static int dpaa_start_xmit(struct sk_buff *skb, struct net_device *net_dev)
 	if (unlikely(err < 0))
 		goto skb_to_fd_failed;
 
+#ifdef CONFIG_FSL_DPAA_ETH_TS
+	if (priv->tx_tstamp &&
+	    skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) {
+		fd.cmd |= FM_FD_CMD_UPD;
+		skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
+	}
+#endif
+
 	if (likely(dpaa_xmit(priv, percpu_stats, queue_mapping, &fd) == 0))
 		return NETDEV_TX_OK;
 
@@ -2304,6 +2350,23 @@ static enum qman_cb_dqrr_result rx_default_dqrr(struct qman_portal *portal,
 	if (!skb)
 		return qman_cb_dqrr_consume;
 
+#ifdef CONFIG_FSL_DPAA_ETH_TS
+	if (priv->rx_tstamp) {
+		struct skb_shared_hwtstamps *shhwtstamps;
+		u64 ns;
+
+		shhwtstamps = skb_hwtstamps(skb);
+		memset(shhwtstamps, 0, sizeof(*shhwtstamps));
+
+		if (!dpaa_get_tstamp_ns(priv->net_dev, &ns,
+					priv->mac_dev->port[RX],
+					vaddr))
+			shhwtstamps->hwtstamp = ns_to_ktime(ns);
+		else
+			dev_warn(net_dev->dev.parent, "dpaa_get_tstamp_ns failed!\n");
+	}
+#endif
+
 	skb->protocol = eth_type_trans(skb, net_dev);
 
 	if (net_dev->features & NETIF_F_RXHASH && priv->keygen_in_use &&
@@ -2523,11 +2586,61 @@ static int dpaa_eth_stop(struct net_device *net_dev)
 	return err;
 }
 
+#ifdef CONFIG_FSL_DPAA_ETH_TS
+static int dpaa_ts_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+	struct dpaa_priv *priv = netdev_priv(dev);
+	struct hwtstamp_config config;
+
+	if (copy_from_user(&config, rq->ifr_data, sizeof(config)))
+		return -EFAULT;
+
+	switch (config.tx_type) {
+	case HWTSTAMP_TX_OFF:
+		/* Couldn't disable rx/tx timestamping separately.
+		 * Do nothing here.
+		 */
+		priv->tx_tstamp = false;
+		break;
+	case HWTSTAMP_TX_ON:
+		priv->mac_dev->set_tstamp(priv->mac_dev->fman_mac, true);
+		priv->tx_tstamp = true;
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	if (config.rx_filter == HWTSTAMP_FILTER_NONE) {
+		/* Couldn't disable rx/tx timestamping separately.
+		 * Do nothing here.
+		 */
+		priv->rx_tstamp = false;
+	} else {
+		priv->mac_dev->set_tstamp(priv->mac_dev->fman_mac, true);
+		priv->rx_tstamp = true;
+		/* TS is set for all frame types, not only those requested */
+		config.rx_filter = HWTSTAMP_FILTER_ALL;
+	}
+
+	return copy_to_user(rq->ifr_data, &config, sizeof(config)) ?
+			-EFAULT : 0;
+}
+#endif /* CONFIG_FSL_DPAA_ETH_TS */
+
 static int dpaa_ioctl(struct net_device *net_dev, struct ifreq *rq, int cmd)
 {
-	if (!net_dev->phydev)
-		return -EINVAL;
-	return phy_mii_ioctl(net_dev->phydev, rq, cmd);
+	int ret = -EINVAL;
+
+	if (cmd == SIOCGMIIREG) {
+		if (net_dev->phydev)
+			return phy_mii_ioctl(net_dev->phydev, rq, cmd);
+	}
+
+#ifdef CONFIG_FSL_DPAA_ETH_TS
+	if (cmd == SIOCSHWTSTAMP)
+		return dpaa_ts_ioctl(net_dev, rq, cmd);
+#endif
+	return ret;
 }
 
 static const struct net_device_ops dpaa_ops = {
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h
index bd94220..e8214fc 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h
@@ -182,6 +182,11 @@ struct dpaa_priv {
 
 	struct dpaa_buffer_layout buf_layout[2];
 	u16 rx_headroom;
+
+#ifdef CONFIG_FSL_DPAA_ETH_TS
+	bool tx_tstamp; /* Tx timestamping enabled */
+	bool rx_tstamp; /* Rx timestamping enabled */
+#endif
 };
 
 /* from dpaa_ethtool.c */
-- 
1.7.1

^ permalink raw reply related

* [PATCH 10/10] dpaa_eth: add the get_ts_info interface for ethtool
From: Yangbo Lu @ 2018-06-04  7:08 UTC (permalink / raw)
  To: netdev, madalin.bucur, Richard Cochran, Rob Herring, Shawn Guo,
	David S . Miller
  Cc: devicetree, linuxppc-dev, linux-arm-kernel, linux-kernel,
	Yangbo Lu
In-Reply-To: <20180604070837.19265-1-yangbo.lu@nxp.com>

Added the get_ts_info interface for ethtool to check
the timestamping capability.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
---
 drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c |   44 ++++++++++++++++++++
 1 files changed, 44 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
index 2f933b6..a20b434 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
@@ -32,6 +32,9 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/string.h>
+#include <linux/of_platform.h>
+#include <linux/net_tstamp.h>
+#include <linux/fsl/ptp_qoriq.h>
 
 #include "dpaa_eth.h"
 #include "mac.h"
@@ -515,6 +518,46 @@ static int dpaa_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd)
 	return ret;
 }
 
+static int dpaa_get_ts_info(struct net_device *net_dev,
+			    struct ethtool_ts_info *info)
+{
+	struct device *dev = net_dev->dev.parent;
+	struct device_node *mac_node = dev->of_node;
+	struct device_node *fman_node = NULL, *ptp_node = NULL;
+	struct platform_device *ptp_dev = NULL;
+	struct qoriq_ptp *ptp = NULL;
+
+	fman_node = of_get_parent(mac_node);
+	if (fman_node)
+		ptp_node = of_parse_phandle(fman_node, "ptimer-handle", 0);
+
+	if (ptp_node)
+		ptp_dev = of_find_device_by_node(ptp_node);
+
+	if (ptp_dev)
+		ptp = platform_get_drvdata(ptp_dev);
+
+	if (ptp) {
+		info->phc_index = ptp->phc_index;
+#ifdef CONFIG_FSL_DPAA_ETH_TS
+		info->so_timestamping = SOF_TIMESTAMPING_TX_HARDWARE |
+					SOF_TIMESTAMPING_RX_HARDWARE |
+					SOF_TIMESTAMPING_RAW_HARDWARE;
+		info->tx_types = (1 << HWTSTAMP_TX_OFF) |
+				 (1 << HWTSTAMP_TX_ON);
+		info->rx_filters = (1 << HWTSTAMP_FILTER_NONE) |
+				   (1 << HWTSTAMP_FILTER_ALL);
+		return 0;
+#endif
+	} else {
+		info->phc_index = -1;
+	}
+
+	info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE |
+				SOF_TIMESTAMPING_SOFTWARE;
+	return 0;
+}
+
 const struct ethtool_ops dpaa_ethtool_ops = {
 	.get_drvinfo = dpaa_get_drvinfo,
 	.get_msglevel = dpaa_get_msglevel,
@@ -530,4 +573,5 @@ static int dpaa_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd)
 	.set_link_ksettings = dpaa_set_link_ksettings,
 	.get_rxnfc = dpaa_get_rxnfc,
 	.set_rxnfc = dpaa_set_rxnfc,
+	.get_ts_info = dpaa_get_ts_info,
 };
-- 
1.7.1

^ permalink raw reply related

* Re: [PATCH net-next 0/2] cls_flower: Various fixes
From: Roi Dayan @ 2018-06-04  7:35 UTC (permalink / raw)
  To: Jiri Pirko, Cong Wang
  Cc: Paul Blakey, Jiri Pirko, Jamal Hadi Salim, David Miller,
	Linux Kernel Network Developers, Yevgeny Kliteynik, Shahar Klein,
	Mark Bloch, Or Gerlitz
In-Reply-To: <20180603193911.GA3013@nanopsycho>



On 03/06/2018 22:39, Jiri Pirko wrote:
> Sun, Jun 03, 2018 at 08:33:25PM CEST, xiyou.wangcong@gmail.com wrote:
>> On Wed, May 30, 2018 at 1:17 AM, Paul Blakey <paulb@mellanox.com> wrote:
>>> Two of the fixes are for my multiple mask patch
>>>
>>> Paul Blakey (2):
>>>    cls_flower: Fix missing free of rhashtable
>>>    cls_flower: Fix comparing of old filter mask with new filter
>>
>> Both are bug fixes and one-line fixes, so definitely should go
>> to -net tree and -stable tree.
> 
> I agree.
> 

it's because the commit they fix doesn't exists in net yet.

^ permalink raw reply

* Re: [PATCH net v2 0/2] ip[6] tunnels: fix mtu calculations
From: Nicolas Dichtel @ 2018-06-04  7:56 UTC (permalink / raw)
  To: David Miller; +Cc: petrm, idosch, netdev
In-Reply-To: <20180601.135741.863528561800652928.davem@davemloft.net>

Le 01/06/2018 à 19:57, David Miller a écrit :
[snip]
> I think the 0xfff8 value might come from the requirement that ipv6
> fragments need to be a multiple of 8 bytes long.
> 
Oh, thanks for the explanation!

^ permalink raw reply

* Re: [PATCH net-next] netfilter: fix null-ptr-deref in nf_nat_decode_session
From: Pablo Neira Ayuso @ 2018-06-04  8:01 UTC (permalink / raw)
  To: Prashant Bhole
  Cc: Florian Westphal, David S . Miller, Jozsef Kadlecsik, netdev,
	netfilter-devel
In-Reply-To: <3c408f33-9c9c-b16f-3433-8931ea3e4bae@lab.ntt.co.jp>

On Mon, Jun 04, 2018 at 10:10:08AM +0900, Prashant Bhole wrote:
> CC netfilter-devel
> 
> On 5/28/2018 7:52 PM, Florian Westphal wrote:
> > Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp> wrote:
> > > Add null check for nat_hook in nf_nat_decode_session()
> > 
> > Acked-by: Florian Westphal <fw@strlen.de>
> 
> Hi Pablo,
> Just pinging in case this patch was missed.

The original submission was missing Cc: netfilter-devel@vger.kernel.org,
so patchwork didn't catch it:

http://patchwork.ozlabs.org/project/netfilter-devel/list/

Will include this patch in the next batch. Thanks.

^ permalink raw reply

* Re: [PATCH net] sctp: not allow to set rto_min with a value below 200 msecs
From: Dmitry Vyukov @ 2018-06-04  8:34 UTC (permalink / raw)
  To: Xin Long
  Cc: Marcelo Ricardo Leitner, Neal Cardwell, Michael Tuexen,
	Neil Horman, Netdev, linux-sctp, David Miller, David Ahern,
	Eric Dumazet, syzkaller
In-Reply-To: <CADvbK_fbKbH2wm6Xurr+ELVag-LvyQdL+peJd=wp7OL7_zMZTQ@mail.gmail.com>

On Tue, May 29, 2018 at 7:45 PM, Xin Long <lucien.xin@gmail.com> wrote:
> On Wed, May 30, 2018 at 1:06 AM, Marcelo Ricardo Leitner
> <marcelo.leitner@gmail.com> wrote:
>> On Tue, May 29, 2018 at 12:03:46PM -0400, Neal Cardwell wrote:
>>> On Tue, May 29, 2018 at 11:45 AM Marcelo Ricardo Leitner <
>>> marcelo.leitner@gmail.com> wrote:
>>> > - patch2 - fix rtx attack vector
>>> >    - Add the floor value to rto_min to HZ/20 (which fits the values
>>> >      that Michael shared on the other email)
>>>
>>> I would encourage allowing minimum RTO values down to 5ms, if the ACK
>>> policy in the receiver makes this feasible. Our experience is that in
>>> datacenter environments it can be advantageous to allow timer-based loss
>>> recoveries using timeout values as low as 5ms, e.g.:
>>
>> Thanks Neal. On Xin's tests, the hearbeat timer becomes an issue at
>> ~25ms already. Xin, can you share more details on the hw, which CPU
>> was used?

Hi,

Did we reach any decision on this? This continues to produce bug
reports on syzbot.

I am not sure whom you are asking, because Xin is you unless I am
missing something :)
But if you mean syzbot hardware, then it's GCE VMs with modern Intel
CPUs but an important aspect is a heavy-debug config (which you can
take from here https://syzkaller.appspot.com/bug?extid=3dcd59a1f907245f891f)
and systematic bug reporting. So if it's any flaky in your testing, it
will produce dozens of bug emails on syzbot.


> It was on a KVM guest,  "-smp 2,cores=1,threads=1,sockets=2"
> # lscpu
> Architecture:          x86_64
> CPU op-mode(s):        32-bit, 64-bit
> Byte Order:            Little Endian
> CPU(s):                2
> On-line CPU(s) list:   0,1
> Thread(s) per core:    1
> Core(s) per socket:    1
> Socket(s):             2
> NUMA node(s):          1
> Vendor ID:             GenuineIntel
> CPU family:            6
> Model:                 13
> Model name:            QEMU Virtual CPU version 1.5.3
> Stepping:              3
> CPU MHz:               2397.222
> BogoMIPS:              4794.44
> Hypervisor vendor:     KVM
> Virtualization type:   full
> L1d cache:             32K
> L1i cache:             32K
> L2 cache:              4096K
> NUMA node0 CPU(s):     0,1
> Flags:                 fpu de pse tsc msr pae mce cx8 apic sep mtrr
> pge mca cmov pse36 clflush mmx fxsr sse sse2 syscall nx lm rep_good
> nopl cpuid pni cx16 hypervisor lahf_lm abm pti
>
> If we're counting on max_t to fix this CPU stuck. It should not that
> matter if min rto < the value causing that stuck.
>
>>
>> Anyway, what about we add a floor to rto_max too, so that RTO can
>> actually grow into something bigger that don't hog the CPU? Like:
>> rto_min floor = 5ms
>> rto_max floor = 50ms
>>
>>   Marcelo

^ permalink raw reply

* Re: [PATCH 1/3] sh_eth: make sh_eth_soft_swap() work on ARM
From: Geert Uytterhoeven @ 2018-06-04  8:48 UTC (permalink / raw)
  To: Sergei Shtylyov; +Cc: netdev, David S. Miller, Linux-Renesas
In-Reply-To: <c83970ff-337d-6628-6976-e1665c44dd0b@cogentembedded.com>

On Sat, Jun 2, 2018 at 9:37 PM, Sergei Shtylyov
<sergei.shtylyov@cogentembedded.com> wrote:
> Browsing  thru the driver disassembly, I noticed that ARM gcc generated
> no  code  whatsoever for sh_eth_soft_swap() while building a little-endian
> kernel -- apparently __LITTLE_ENDIAN__ was not being #define'd, however
> it got implicitly #define'd when building with the SH gcc (I could only
> find the explicit #define __LITTLE_ENDIAN that was #include'd when building
> a little-endian kernel).  Luckily, the Ether controller  only doing big-
> endian DMA is encountered on the early SH771x SoCs only and all ARM SoCs
> implement EDMR.DE and thus set 'sh_eth_cpu_data::hw_swap'. But anyway, we
> need to fix the #ifdef inside sh_eth_soft_swap() to something that would
> work on all architectures...
>
> Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>

Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: [PATCH 1/3] sh_eth: make sh_eth_soft_swap() work on ARM
From: Geert Uytterhoeven @ 2018-06-04  8:49 UTC (permalink / raw)
  To: Sergei Shtylyov; +Cc: netdev, David S. Miller, Linux-Renesas
In-Reply-To: <CAMuHMdVb2q=S6ucYCpwnVse-COefcuUGT6DO5Ympo+BjiVN9sg@mail.gmail.com>

On Mon, Jun 4, 2018 at 10:48 AM, Geert Uytterhoeven
<geert@linux-m68k.org> wrote:
> On Sat, Jun 2, 2018 at 9:37 PM, Sergei Shtylyov
> <sergei.shtylyov@cogentembedded.com> wrote:
>> Browsing  thru the driver disassembly, I noticed that ARM gcc generated
>> no  code  whatsoever for sh_eth_soft_swap() while building a little-endian
>> kernel -- apparently __LITTLE_ENDIAN__ was not being #define'd, however
>> it got implicitly #define'd when building with the SH gcc (I could only
>> find the explicit #define __LITTLE_ENDIAN that was #include'd when building
>> a little-endian kernel).  Luckily, the Ether controller  only doing big-
>> endian DMA is encountered on the early SH771x SoCs only and all ARM SoCs
>> implement EDMR.DE and thus set 'sh_eth_cpu_data::hw_swap'. But anyway, we
>> need to fix the #ifdef inside sh_eth_soft_swap() to something that would
>> work on all architectures...
>>
>> Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
>
> Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>

Forgot to say: nice catch ;-)

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: [PATCH 2/3] sh_eth: uninline sh_eth_soft_swap()
From: Geert Uytterhoeven @ 2018-06-04  8:50 UTC (permalink / raw)
  To: Sergei Shtylyov; +Cc: netdev, David S. Miller, Linux-Renesas
In-Reply-To: <a1eb21c8-ab44-3022-b239-0b5c33691d3b@cogentembedded.com>

On Sat, Jun 2, 2018 at 9:38 PM, Sergei Shtylyov
<sergei.shtylyov@cogentembedded.com> wrote:
> sh_eth_tsu_soft_swap() is called twice by the driver, remove *inline* and
> move  that function  from the header to the driver itself to let gcc decide
> whether to expand it inline or not...
>
> Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>

Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: [PATCH 3/3] sh_eth: use DIV_ROUND_UP() in sh_eth_soft_swap()
From: Geert Uytterhoeven @ 2018-06-04  8:55 UTC (permalink / raw)
  To: Sergei Shtylyov; +Cc: netdev, David S. Miller, Linux-Renesas
In-Reply-To: <995798f8-69e0-6471-8eca-eefc420dbe21@cogentembedded.com>

On Sat, Jun 2, 2018 at 9:40 PM, Sergei Shtylyov
<sergei.shtylyov@cogentembedded.com> wrote:
> When initializing 'maxp' in sh_eth_soft_swap(), the buffer length needs
> to be rounded  up -- that's just asking for DIV_ROUND_UP()!
>
> Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>

Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: [PATCH 0/3] sh_eth: fix & clean up sh_eth_soft_swap()
From: Geert Uytterhoeven @ 2018-06-04  9:01 UTC (permalink / raw)
  To: Sergei Shtylyov; +Cc: netdev, David S. Miller, Linux-Renesas
In-Reply-To: <9027499a-0e19-7721-a17f-26e86885da3f@cogentembedded.com>

Hi Sergei,

On Sat, Jun 2, 2018 at 9:32 PM, Sergei Shtylyov
<sergei.shtylyov@cogentembedded.com> wrote:
> Here's a set of 3 patches against DaveM's 'net-next.git' repo. First one fixes an
> old buffer endiannes issue (luckily, the ARM SoCs are smart enough to not actually
> care) plus couple clean ups around sh_eth_soft_swap()...
>
> [1/1] sh_eth: make sh_eth_soft_swap() work on ARM
> [2/3] sh_eth: uninline sh_eth_soft_swap()
> [3/3] sh_eth: use DIV_ROUND_UP() in sh_eth_soft_swap()

Does the swapping actually work?
In sh_eth_rx(), it's called before dma_unmap_single(), without calling
dma_sync_single_for_cpu() first.
Shouldn't it be called after the unmap instead?

In addition, why is it passed the dma_addr converted to virt, while the skb
address is available?

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: [PATCH net-next v2 1/3] bpf: implement bpf_get_current_cgroup_id() helper
From: Daniel Borkmann @ 2018-06-04  9:08 UTC (permalink / raw)
  To: Yonghong Song, ast, netdev; +Cc: kernel-team
In-Reply-To: <20180603225943.2370719-2-yhs@fb.com>

On 06/04/2018 12:59 AM, Yonghong Song wrote:
> bpf has been used extensively for tracing. For example, bcc
> contains an almost full set of bpf-based tools to trace kernel
> and user functions/events. Most tracing tools are currently
> either filtered based on pid or system-wide.
> 
> Containers have been used quite extensively in industry and
> cgroup is often used together to provide resource isolation
> and protection. Several processes may run inside the same
> container. It is often desirable to get container-level tracing
> results as well, e.g. syscall count, function count, I/O
> activity, etc.
> 
> This patch implements a new helper, bpf_get_current_cgroup_id(),
> which will return cgroup id based on the cgroup within which
> the current task is running.
> 
> The later patch will provide an example to show that
> userspace can get the same cgroup id so it could
> configure a filter or policy in the bpf program based on
> task cgroup id.
> 
> The helper is currently implemented for tracing. It can
> be added to other program types as well when needed.
> 
> Acked-by: Alexei Starovoitov <ast@kernel.org>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/linux/bpf.h      |  1 +
>  include/uapi/linux/bpf.h |  8 +++++++-
>  kernel/bpf/core.c        |  1 +
>  kernel/bpf/helpers.c     | 15 +++++++++++++++
>  kernel/trace/bpf_trace.c |  2 ++
>  5 files changed, 26 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index bbe2974..995c3b1 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -746,6 +746,7 @@ extern const struct bpf_func_proto bpf_get_stackid_proto;
>  extern const struct bpf_func_proto bpf_get_stack_proto;
>  extern const struct bpf_func_proto bpf_sock_map_update_proto;
>  extern const struct bpf_func_proto bpf_sock_hash_update_proto;
> +extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
>  
>  /* Shared helpers among cBPF and eBPF. */
>  void bpf_user_rnd_init_once(void);
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index f0b6608..18712b0 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2070,6 +2070,11 @@ union bpf_attr {
>   * 		**CONFIG_SOCK_CGROUP_DATA** configuration option.
>   * 	Return
>   * 		The id is returned or 0 in case the id could not be retrieved.
> + *
> + * u64 bpf_get_current_cgroup_id(void)
> + * 	Return
> + * 		A 64-bit integer containing the current cgroup id based
> + * 		on the cgroup within which the current task is running.
>   */
>  #define __BPF_FUNC_MAPPER(FN)		\
>  	FN(unspec),			\
> @@ -2151,7 +2156,8 @@ union bpf_attr {
>  	FN(lwt_seg6_action),		\
>  	FN(rc_repeat),			\
>  	FN(rc_keydown),			\
> -	FN(skb_cgroup_id),
> +	FN(skb_cgroup_id),		\
> +	FN(get_current_cgroup_id),
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 527587d..9f14937 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -1765,6 +1765,7 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
>  const struct bpf_func_proto bpf_get_current_comm_proto __weak;
>  const struct bpf_func_proto bpf_sock_map_update_proto __weak;
>  const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
> +const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
>  
>  const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
>  {
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 3d24e23..73065e2 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -179,3 +179,18 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
>  	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
>  	.arg2_type	= ARG_CONST_SIZE,
>  };
> +
> +#ifdef CONFIG_CGROUPS
> +BPF_CALL_0(bpf_get_current_cgroup_id)
> +{
> +	struct cgroup *cgrp = task_dfl_cgroup(current);
> +
> +	return cgrp->kn->id.id;
> +}
> +
> +const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
> +	.func		= bpf_get_current_cgroup_id,
> +	.gpl_only	= false,
> +	.ret_type	= RET_INTEGER,
> +};
> +#endif

Nit: why not moving this function directly to bpf_trace.c?

> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 752992c..e2ab5b7 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -564,6 +564,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  		return &bpf_get_prandom_u32_proto;
>  	case BPF_FUNC_probe_read_str:
>  		return &bpf_probe_read_str_proto;
> +	case BPF_FUNC_get_current_cgroup_id:
> +		return &bpf_get_current_cgroup_id_proto;

When you have !CONFIG_CGROUPS, then it relies on the weak definition of the
bpf_get_current_cgroup_id_proto, which I would think at latest in fixup_bpf_calls()
bails out with 'kernel subsystem misconfigured func' due to func being NULL.

Can't we just do the #ifdef CONFIG_CGROUPS around BPF_FUNC_get_current_cgroup_id
case instead? Then we bail out normally with 'unknown func' when cgroups are
not configured?

>  	default:
>  		return NULL;
>  	}
> 

^ permalink raw reply

* [PATCH net v3] ipv6: omit traffic class when calculating flow hash
From: Michal Kubecek @ 2018-06-04  9:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, linux-kernel, Nicolas Dichtel, Tom Herbert, David Ahern,
	Ido Schimmel
In-Reply-To: <687af1bc-a1f9-c0d2-dd3a-eeef3a2bf9b4@gmail.com>

Some of the code paths calculating flow hash for IPv6 use flowlabel member
of struct flowi6 which, despite its name, encodes both flow label and
traffic class. If traffic class changes within a TCP connection (as e.g.
ssh does), ECMP route can switch between path. It's also inconsistent with
other code paths where ip6_flowlabel() (returning only flow label) is used
to feed the key.

Use only flow label everywhere, including one place where hash key is set
using ip6_flowinfo().

Fixes: 51ebd3181572 ("ipv6: add support of equal cost multipath (ECMP)")
Fixes: f70ea018da06 ("net: Add functions to get skb->hash based on flow structures")
Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
---
v2: introduce and use an inline helper as suggested by David Ahern
v3: keep the cast out of the helper to make future cleanup easier

 include/net/ipv6.h        | 5 +++++
 net/core/flow_dissector.c | 2 +-
 net/ipv6/route.c          | 4 ++--
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 836f31af1369..a406f2e8680a 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -906,6 +906,11 @@ static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel)
 	return htonl(tclass << IPV6_TCLASS_SHIFT) | flowlabel;
 }
 
+static inline __be32 flowi6_get_flowlabel(const struct flowi6 *fl6)
+{
+	return fl6->flowlabel & IPV6_FLOWLABEL_MASK;
+}
+
 /*
  *	Prototypes exported by ipv6
  */
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index d29f09bc5ff9..0234f8d1f0ac 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1334,7 +1334,7 @@ __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
 	keys->ports.src = fl6->fl6_sport;
 	keys->ports.dst = fl6->fl6_dport;
 	keys->keyid.keyid = fl6->fl6_gre_key;
-	keys->tags.flow_label = (__force u32)fl6->flowlabel;
+	keys->tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
 	keys->basic.ip_proto = fl6->flowi6_proto;
 
 	return flow_hash_from_keys(keys);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f4d61736c41a..4530a82aaa2e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1868,7 +1868,7 @@ static void ip6_multipath_l3_keys(const struct sk_buff *skb,
 	} else {
 		keys->addrs.v6addrs.src = key_iph->saddr;
 		keys->addrs.v6addrs.dst = key_iph->daddr;
-		keys->tags.flow_label = ip6_flowinfo(key_iph);
+		keys->tags.flow_label = ip6_flowlabel(key_iph);
 		keys->basic.ip_proto = key_iph->nexthdr;
 	}
 }
@@ -1889,7 +1889,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
 		} else {
 			hash_keys.addrs.v6addrs.src = fl6->saddr;
 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
-			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
+			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
 		}
 		break;
-- 
2.17.1

^ permalink raw reply related

* Re: [RFC feedback] AF_XDP and non-Intel hardware
From: Mykyta Iziumtsev @ 2018-06-04  9:44 UTC (permalink / raw)
  To: Björn Töpel
  Cc: Netdev, Björn Töpel, Karlsson, Magnus, Zhang, Qi Z,
	Francois Ozog, Ilias Apalodimas, Brian Brooks,
	Jesper Dangaard Brouer, Andy Gospodarek, michael.chan,
	Luke Gorrie
In-Reply-To: <CAJ+HfNjxyeOGCw0Fi70HcG_aq1c_dc6JHF3+87K8G6YqBjj54A@mail.gmail.com>

Hi Björn and Magnus,

Here is a short update on the proposed changes to AF_XDP.

During implementation phase I figured out that 'end-of-page' in RX
descriptor's flags won't work, unfortunately. This is because the
kernel driver can't know if the frame will be last on the page when RX
descriptor is being filled in. Imagine that the driver is processing a
frame on a page, and there is still 1000 bytes of non utilized memory
beyond this frame. If the next frame (which haven't yet arrived) is
less than that -- the NIC can place it on the same page, otherwise the
NIC will skip remaining 1000 bytes and take next page. Of course, the
driver can update RX descriptor retrospectively, or use 'empty' RX
descriptors just to indicate page completion, but it's too complex or
inefficient and as such doesn't look good.

What I could propose instead is to make the 'filling' and 'page
completion' symmetric. That is, UMEM will have 1 queue to pass pages
from application to the kernel driver (fring in your terminology) and
one to indicate complete pages back to the application.

That 'page completion' queue can be added as a third queue to UMEM, or
alternatively TX completions can be decoupled from UMEM to make it an
simple 'memory area' object with two directional queues which work
with RX pages only. TX completions can be handled in a simpler manner
as part of TX queues: just make sure consumer index is advanced only
when the frame is finally sent out. The TX completion processing can
be then done by observing consume index advance from application. That
would  not only be more in line with best NIC ring design practices
but will improve cache locality and eliminate critical section  if TX
queues are assigned to different CPUs.

With best regards,
Mykyta

On 22 May 2018 at 14:09, Björn Töpel <bjorn.topel@gmail.com> wrote:
> 2018-05-22 9:45 GMT+02:00 Mykyta Iziumtsev <mykyta.iziumtsev@linaro.org>:
>> On 21 May 2018 at 20:55, Björn Töpel <bjorn.topel@gmail.com> wrote:
>>> 2018-05-21 14:34 GMT+02:00 Mykyta Iziumtsev <mykyta.iziumtsev@linaro.org>:
>>>> Hi Björn and Magnus,
>>>>
>>>> (This thread is a follow up to private dialogue. The intention is to
>>>> let community know that AF_XDP can be enhanced further to make it
>>>> compatible with wider range of NIC vendors).
>>>>
>>>
>>> Mykyta, thanks for doing the write-up and sending it to the netdev
>>> list! The timing could not be better -- we need to settle on an uapi
>>> that works for all vendors prior enabling it in the kernel.
>>>
>>>> There are two NIC variations which don't fit well with current AF_XDP proposal.
>>>>
>>>> The first variation is represented by some NXP and Cavium NICs. AF_XDP
>>>> expects NIC to put incoming frames into slots defined by UMEM area.
>>>> Here slot size is set in XDP_UMEM_REG xdp_umem.reg.frame_size and
>>>> slots available to NIC are communicated to the kernel via UMEM fill
>>>> queue. While Intel NICs support only one slot size, NXP and Cavium
>>>> support multiple slot sizes to optimize memory usage (e.g. { 128, 512,
>>>> 2048 }, please refer to [1] for rationale). On frame reception the NIC
>>>> pulls a slot from best fit pool based on frame size.
>>>>
>>>> The second variation is represented by e.g. Chelsio T5/T6 and Netcope
>>>> NICs. As shown above, AF_XDP expects NIC to put incoming frames at
>>>> predefined addresses. This is not the case here as the NIC is in
>>>> charge of placing frames in memory based on it's own algorithm. For
>>>> example, Chelsio T5/T6 is expecting to get whole pages from the driver
>>>> and puts incoming frames on the page in a 'tape recorder' fashion.
>>>> Assuming 'base' is page address and flen[N] is an array of frame
>>>> lengths, the frame placement in memory will look like that:
>>>>   base + 0
>>>>   base + frame_len[0]
>>>>   base + frame_len[0] + frame_len[1]
>>>>   base + frame_len[0] + frame_len[1] + frame_len[2]
>>>>   ...
>>>>
>>>> To better support these two NIC variations I suggest to abandon 'frame
>>>> size' structuring in UMEM and stick with 'pages' instead. The NIC
>>>> kernel driver is then responsible for splitting provided pages into
>>>> slots expected by underlying HW (or not splitting at all in case of
>>>> Chelsio/Netcope).
>>>>
>>>
>>> Let's call the first variation "multi-pool" and the second
>>> "type-writer" for simplicity. The type-writer model is very
>>> interesting, and makes a lot of sense when the PCIe bus is a
>>> constraint.
>>>
>>>> On XDP_UMEM_REG the application needs to specify page_size. Then the
>>>> application can pass empty pages to the kernel driver using UMEM
>>>> 'fill' queue by specifying page offset within the UMEM area. xdp_desc
>>>> format needs to be changed as well: frame location will be defined by
>>>> offset from the beginning of UMEM area instead of frame index. As
>>>> payload headroom can vary with AF_XDP we'll need to specify it in
>>>> xdp_desc as well. Beside that it could be worth to consider changing
>>>> payload length to u16 as 64k+ frames aren't very common in networking.
>>>> The resulting xdp_desc would look like that:
>>>>
>>>> struct xdp_desc {
>>>>         __u64 offset;
>>>>         __u16 headroom;
>>>>         __u16 len;
>>>>         __u8 flags;
>>>>         __u8 padding[3];
>>>> };
>>>>
>>>
>>> Let's expand a bit here; Instead of passing indicies to fixed sized
>>> frames to the fill ring, we now pass a memory region. For simplicity,
>>> let's say a page. The fill ring descriptor requires offset and
>>> len. The offset is a global offset from an UMEM perspective, and len
>>> is the size of the region.
>>>
>>
>> I would rather stick with region equal to page (regular or huge page,
>> defined by application). The page size can be extracted from
>> vm_area_struct in XDP_UMEM_REG (preferred) or configured by
>> application.
>>
>
> Ok, thinking more about it I prefer this as well. This means that we
> only need to grow the UMEM fring/cring descriptors to u64, and not
> care about length. As you state below, this makes the validation
> simple.
>
> We might consider exposing a "page size hint", that the user can set.
> For 4G hugepage scenario, it might make sense to have a chunk *less*
> than 4G to avoid HW Rx memory running low when the end of a chunk is
> approaching.
>
> As for THP, I need to think about proper behavior here.
>
>>> On the Rx ring the descriptor, as you wrote, must be changed as well
>>> to your suggestion above. Note, that headroom is still needed, since
>>> XDP can change the size of a packet, so the fixed headroom stated in
>>> UMEM registration is not sufficient.
>>>
>>> This model is obviously more flexible, but then also a bit more
>>> complex. E.g. a fixed-frame NIC (like ixgbe), what should the
>>> behaviour be? Should the fill ring entry be used only for *one* frame,
>>> or chopped up to multiple receive frames? Should it be configurable?
>>> Driver specific?
>>
>> I think driver-specific makes most sense here. In case of fixed-frame
>> NIC the driver shall chop the ring entry into multiple receive frames.
>>
>
> Let's start there, keeping the configuration space small.
>
>>>
>>> Also, validating the entries in the fill queue require more work
>>> (compared to the current index variant). Currently, we only skip
>>> invalid indicies. What should we do when say, you pass a memory window
>>> that is too narrow (say 128B) but correct from a UMEM
>>> perspective. Going this path, we need to add pretty hard constraints
>>> so we don't end up it too complex code -- because then it'll be too
>>> slow.
>>
>> If we stick with pages -- the only possible erroneous input will be
>> 'page out of UMEM boundaries'. The validation will be essentially:
>>
>> if ((offset > umem->size) || (offset & (umem->page_size - 1))
>>     fail
>>
>> The question is what shall be done if validation fails ? Would
>> SEGFAULT be reasonable ? This is more or less equivalent to
>> dereferencing invalid pointer.
>>
>
> The current scheme is simply dropping that kernel skips the invalid
> fill ring entry. SIGSEGV is an interesting idea!
>
>>>
>>>> In current proposal you have a notion of 'frame ownership': 'owned by
>>>> kernel' or 'owned by application'. The ownership is transferred by
>>>> means of enqueueing frame index in UMEM 'fill' queue (from application
>>>> to kernel) or in UMEM 'tx completion' queue (from kernel to
>>>> application). If you decide to adopt 'page' approach this notion needs
>>>> to be changed a bit. This is because in case of packet forwarding one
>>>> and the same page can be used for RX (parts of it enqueued in HW 'free
>>>> lists') and TX (forwarding of previously RXed packets).
>>>>
>>>> I propose to define 'ownership' as a right to manipulate the
>>>> partitioning of the page into frames. Whenever application passes a
>>>> page to the kernel via UMEM 'fill' queue -- the ownership is
>>>> transferred to the kernel. The application can't allocate packets on
>>>> this page until kernel is done with it, but it can change payload of
>>>> RXed packets before forwarding them. The kernel can pass ownership
>>>> back by means of 'end-of-page' in xdp_desc.flags.
>>>>
>>>
>>> I like the end-of-page mechanism.
>>>
>>>> The pages are definitely supposed to be recycled sooner or later. Even
>>>> if it's not part of kernel API and the housekeeping implementation
>>>> resided completely in application I still would like to propose
>>>> possible (hopefully, cost efficient) solution to that. The recycling
>>>> could be achieved by keeping refcount on pages and recycling the page
>>>> only when it's owned by application and refcount reaches 0.
>>>>
>>>> Whenever application transfers page ownership to the kernel the
>>>> refcount shall be initialized to 0. With each incoming RX xdp_desc the
>>>> corresponding page needs to be identified (xdp_desc.offset >>
>>>> PAGE_SHIFT) and refcount incremented. When the packet gets freed the
>>>> refcount shall be decremented. If packet is forwarded in TX xdp_desc
>>>> -- the refcount gets decremented only on TX completion (again,
>>>> tx_completion.desc >> PAGE_SHIFT). For packets originating from the
>>>> application itself the payload buffers needs to be allocated from
>>>> empty page owned by the application and refcount needs to be
>>>> incremented as well.
>>>>
>>>
>>> Strictly speaking, we're not enforcing correctness in the current
>>> solution. If the userspace application passes index 1 mulitple times
>>> to the fill ring, and at the same time send index 1, things will
>>> break. So, with the existing solution the userspace application
>>> *still* needs to track the frames. With this new model, the
>>> tracking/refcounting will be more complex. That might be a concern.
>>>
>>> For the multi-pool NICs I think we can still just have one UMEM, and
>>> let the driver decide where in which pool to place this certain chunk
>>> of memory. Thoughts?
>>
>> Definitely agree with that. This is HW specific and exposing it to the
>> application would only harm portability.
>>
>
> Good stuff, we're on the same page then.
>
>>>
>>> Now, how do we go forward? I think this is very important, and I will
>>> hack a copy-mode version for this new API. I'm a bit worried that the
>>> complexity/configuration space will grow and impact performance, but
>>> let's see.
>>>
>>> To prove that the API works for the NICs you mentioned, we need an
>>> actual zero-copy implementation for them. Do you think Linaro could
>>> work on a zero-copy variant for any of the NICs above?
>>>
>>
>> Linaro will definitely contribute zero-copy implementation for some
>> ARM-based NICs with 'multi-pool' variation.
>
> Very nice!
>
>> Implementation of
>> 'type-writer' variation is up to Chelsio/Netcope, we only try to come
>> up with API which (most probably) will fit them as well.
>>
>
> Let's hope we get an implementation from these vendors as well! :-)
>
>
> Björn
>
>>>
>>> Again thanks for bringing this up!
>>> Björn
>>>
>>>
>>>
>>>> [1] https://en.wikipedia.org/wiki/Internet_Mix
>>>>
>>>> With best regards,
>>>> Mykyta

^ permalink raw reply

* [PATCH] docs: networking: fix minor typos in various documentation files
From: Olivier Gayot @ 2018-06-04 10:07 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Olivier Gayot

This patch fixes some typos/misspelling errors in the
Documentation/networking files.

Signed-off-by: Olivier Gayot <olivier.gayot@sigexec.com>
---
 Documentation/networking/6lowpan.txt             |  4 ++--
 Documentation/networking/gtp.txt                 |  4 ++--
 Documentation/networking/ila.txt                 |  2 +-
 Documentation/networking/ip-sysctl.txt           |  2 +-
 Documentation/networking/ipsec.txt               |  4 ++--
 Documentation/networking/ipvlan.txt              |  4 ++--
 Documentation/networking/kcm.txt                 | 10 +++++-----
 Documentation/networking/nf_conntrack-sysctl.txt |  2 +-
 8 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/Documentation/networking/6lowpan.txt b/Documentation/networking/6lowpan.txt
index a7dc7e939c7a..2e5a939d7e6f 100644
--- a/Documentation/networking/6lowpan.txt
+++ b/Documentation/networking/6lowpan.txt
@@ -24,10 +24,10 @@ enum lowpan_lltypes.
 
 Example to evaluate the private usually you can do:
 
-static inline sturct lowpan_priv_foobar *
+static inline struct lowpan_priv_foobar *
 lowpan_foobar_priv(struct net_device *dev)
 {
-	return (sturct lowpan_priv_foobar *)lowpan_priv(dev)->priv;
+	return (struct lowpan_priv_foobar *)lowpan_priv(dev)->priv;
 }
 
 switch (dev->type) {
diff --git a/Documentation/networking/gtp.txt b/Documentation/networking/gtp.txt
index 0d9c18f05ec6..6966bbec1ecb 100644
--- a/Documentation/networking/gtp.txt
+++ b/Documentation/networking/gtp.txt
@@ -67,7 +67,7 @@ Don't be confused by terminology:  The GTP User Plane goes through
 kernel accelerated path, while the GTP Control Plane goes to
 Userspace :)
 
-The official homepge of the module is at
+The official homepage of the module is at
 https://osmocom.org/projects/linux-kernel-gtp-u/wiki
 
 == Userspace Programs with Linux Kernel GTP-U support ==
@@ -120,7 +120,7 @@ If yo have questions regarding how to use the Kernel GTP module from
 your own software, or want to contribute to the code, please use the
 osmocom-net-grps mailing list for related discussion. The list can be
 reached at osmocom-net-gprs@lists.osmocom.org and the mailman
-interface for managign your subscription is at
+interface for managing your subscription is at
 https://lists.osmocom.org/mailman/listinfo/osmocom-net-gprs
 
 == Issue Tracker ==
diff --git a/Documentation/networking/ila.txt b/Documentation/networking/ila.txt
index 78df879abd26..a17dac9dc915 100644
--- a/Documentation/networking/ila.txt
+++ b/Documentation/networking/ila.txt
@@ -121,7 +121,7 @@ three options to deal with this:
 
 - checksum neutral mapping
 		When an address is translated the difference can be offset
-		elsewhere in a part of the packet that is covered by the
+		elsewhere in a part of the packet that is covered by
 		the checksum. The low order sixteen bits of the identifier
 		are used. This method is preferred since it doesn't require
 		parsing a packet beyond the IP header and in most cases the
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 35ffaa281b26..92a91a025757 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -26,7 +26,7 @@ ip_no_pmtu_disc - INTEGER
 	discarded. Outgoing frames are handled the same as in mode 1,
 	implicitly setting IP_PMTUDISC_DONT on every created socket.
 
-	Mode 3 is a hardend pmtu discover mode. The kernel will only
+	Mode 3 is a hardened pmtu discover mode. The kernel will only
 	accept fragmentation-needed errors if the underlying protocol
 	can verify them besides a plain socket lookup. Current
 	protocols for which pmtu events will be honored are TCP, SCTP
diff --git a/Documentation/networking/ipsec.txt b/Documentation/networking/ipsec.txt
index 8dbc08b7e431..ba794b7e51be 100644
--- a/Documentation/networking/ipsec.txt
+++ b/Documentation/networking/ipsec.txt
@@ -25,8 +25,8 @@ Quote from RFC3173:
    is implementation dependent.
 
 Current IPComp implementation is indeed by the book, while as in practice
-when sending non-compressed packet to the peer(whether or not packet len
-is smaller than the threshold or the compressed len is large than original
+when sending non-compressed packet to the peer (whether or not packet len
+is smaller than the threshold or the compressed len is larger than original
 packet len), the packet is dropped when checking the policy as this packet
 matches the selector but not coming from any XFRM layer, i.e., with no
 security path. Such naked packet will not eventually make it to upper layer.
diff --git a/Documentation/networking/ipvlan.txt b/Documentation/networking/ipvlan.txt
index 812ef003e0a8..27a38e50c287 100644
--- a/Documentation/networking/ipvlan.txt
+++ b/Documentation/networking/ipvlan.txt
@@ -73,11 +73,11 @@ mode to make conn-tracking work.
 	This is the default option. To configure the IPvlan port in this mode,
 user can choose to either add this option on the command-line or don't specify
 anything. This is the traditional mode where slaves can cross-talk among
-themseleves apart from talking through the master device.
+themselves apart from talking through the master device.
 
 5.2 private:
 	If this option is added to the command-line, the port is set in private
-mode. i.e. port wont allow cross communication between slaves.
+mode. i.e. port won't allow cross communication between slaves.
 
 5.3 vepa:
 	If this is added to the command-line, the port is set in VEPA mode.
diff --git a/Documentation/networking/kcm.txt b/Documentation/networking/kcm.txt
index 9a513295b07c..b773a5278ac4 100644
--- a/Documentation/networking/kcm.txt
+++ b/Documentation/networking/kcm.txt
@@ -1,4 +1,4 @@
-Kernel Connection Mulitplexor
+Kernel Connection Multiplexor
 -----------------------------
 
 Kernel Connection Multiplexor (KCM) is a mechanism that provides a message based
@@ -31,7 +31,7 @@ KCM implements an NxM multiplexor in the kernel as diagrammed below:
 KCM sockets
 -----------
 
-The KCM sockets provide the user interface to the muliplexor. All the KCM sockets
+The KCM sockets provide the user interface to the multiplexor. All the KCM sockets
 bound to a multiplexor are considered to have equivalent function, and I/O
 operations in different sockets may be done in parallel without the need for
 synchronization between threads in userspace.
@@ -199,7 +199,7 @@ while. Example use:
 BFP programs for message delineation
 ------------------------------------
 
-BPF programs can be compiled using the BPF LLVM backend. For exmple,
+BPF programs can be compiled using the BPF LLVM backend. For example,
 the BPF program for parsing Thrift is:
 
   #include "bpf.h" /* for __sk_buff */
@@ -222,7 +222,7 @@ messages. The kernel provides necessary assurances that messages are sent
 and received atomically. This relieves much of the burden applications have
 in mapping a message based protocol onto the TCP stream. KCM also make
 application layer messages a unit of work in the kernel for the purposes of
-steerng and scheduling, which in turn allows a simpler networking model in
+steering and scheduling, which in turn allows a simpler networking model in
 multithreaded applications.
 
 Configurations
@@ -272,7 +272,7 @@ on the socket thus waking up the application thread. When the application
 sees the error (which may just be a disconnect) it should unattach the
 socket from KCM and then close it. It is assumed that once an error is
 posted on the TCP socket the data stream is unrecoverable (i.e. an error
-may have occurred in the middle of receiving a messssge).
+may have occurred in the middle of receiving a message).
 
 TCP connection monitoring
 -------------------------
diff --git a/Documentation/networking/nf_conntrack-sysctl.txt b/Documentation/networking/nf_conntrack-sysctl.txt
index 433b6724797a..1669dc2419fd 100644
--- a/Documentation/networking/nf_conntrack-sysctl.txt
+++ b/Documentation/networking/nf_conntrack-sysctl.txt
@@ -156,7 +156,7 @@ nf_conntrack_timestamp - BOOLEAN
 nf_conntrack_udp_timeout - INTEGER (seconds)
 	default 30
 
-nf_conntrack_udp_timeout_stream2 - INTEGER (seconds)
+nf_conntrack_udp_timeout_stream - INTEGER (seconds)
 	default 180
 
 	This extended timeout will be used in case there is an UDP stream
-- 
2.17.1

^ permalink raw reply related

* pull request: bluetooth-next 2018-06-04
From: Johan Hedberg @ 2018-06-04 10:15 UTC (permalink / raw)
  To: davem; +Cc: linux-bluetooth, netdev

[-- Attachment #1: Type: text/plain, Size: 2766 bytes --]

Hi Dave,

Here's one last bluetooth-next pull request for the 4.18 kernel:

 - New USB device IDs for Realtek 8822BE and 8723DE
 - reset/resume fix for Dell Inspiron 5565
 - Fix HCI_UART_INIT_PENDING flag behavior
 - Fix patching behavior for some ATH3012 models
 - A few other minor cleanups & fixes

Please let me know if there are any issues pulling. Thanks.

Johan

---
The following changes since commit 874fcf1de613ad7b3cecf8a9cfe806fe7c2e3c68:

  Merge tag 'mlx5e-updates-2018-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux (2018-05-29 09:45:13 -0400)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git for-upstream

for you to fetch changes up to 1cd2fabf4bdcf95eda6a1bcebc4a0a965509da36:

  Bluetooth: btusb: Add additional device ID for RTL8822BE (2018-05-30 15:45:01 +0200)

----------------------------------------------------------------
Andy Shevchenko (2):
      Bluetooth: Re-use kstrtobool_from_user()
      Bluetooth: btmrvl: Re-use kstrtol_from_user()

Arend van Spriel (1):
      Bluetooth: btmrvl: support sysfs initiated firmware coredump

Artiom Vaskov (1):
      Bluetooth: btusb: Add additional device ID for RTL8822BE

Hans de Goede (4):
      Bluetooth: btusb: Add Dell Inspiron 5565 to btusb_needs_reset_resume_table
      Bluetooth: hci_uart: Restore hci_dev->flush callback on open()
      Bluetooth: hci_serdev: Move serdev_device_close/open into common hci_serdev code
      Bluetooth: hci_serdev: Fix HCI_UART_INIT_PENDING not working

Jian-Hong Pan (1):
      Bluetooth: btusb: Add a new Realtek 8723DE ID 2ff8:b011

Takashi Iwai (1):
      Bluetooth: btusb: Apply QCA Rome patches for some ATH3012 models

Thierry Escande (1):
      Bluetooth: hci_qca: Fix "Sleep inside atomic section" warning

Vaibhav Murkute (1):
      Bluetooth: hci_serdev: Removed unnecessary curly braces

 drivers/bluetooth/btmrvl_debugfs.c | 55 +++-----------------------------------
 drivers/bluetooth/btmrvl_drv.h     |  2 --
 drivers/bluetooth/btmrvl_main.c    |  6 -----
 drivers/bluetooth/btmrvl_sdio.c    | 11 +++++---
 drivers/bluetooth/btusb.c          | 43 ++++++++++++++++++++++++-----
 drivers/bluetooth/hci_bcm.c        | 10 +------
 drivers/bluetooth/hci_ldisc.c      | 22 ++++++++-------
 drivers/bluetooth/hci_ll.c         |  3 ---
 drivers/bluetooth/hci_nokia.c      |  3 ---
 drivers/bluetooth/hci_qca.c        |  2 +-
 drivers/bluetooth/hci_serdev.c     | 32 ++++++++++++++--------
 drivers/bluetooth/hci_uart.h       |  1 +
 net/bluetooth/hci_core.c           | 23 +++++-----------
 net/bluetooth/hci_debugfs.c        | 24 ++++++-----------
 net/bluetooth/smp.c                | 12 +++------
 15 files changed, 102 insertions(+), 147 deletions(-)

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* [PATCH] atmel: use memdup_user to simplify the code
From: YueHaibing @ 2018-06-04 10:32 UTC (permalink / raw)
  To: davem, kvalo; +Cc: netdev, linux-kernel, linux-wireless, YueHaibing

use existing memdup_user() helper function instead of open-coding

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
---
 drivers/net/wireless/atmel/atmel.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/net/wireless/atmel/atmel.c b/drivers/net/wireless/atmel/atmel.c
index d122386..30b479a 100644
--- a/drivers/net/wireless/atmel/atmel.c
+++ b/drivers/net/wireless/atmel/atmel.c
@@ -2657,14 +2657,9 @@ static int atmel_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 			break;
 		}
 
-		if (!(new_firmware = kmalloc(com.len, GFP_KERNEL))) {
-			rc = -ENOMEM;
-			break;
-		}
-
-		if (copy_from_user(new_firmware, com.data, com.len)) {
-			kfree(new_firmware);
-			rc = -EFAULT;
+		new_firmware = memdup_user(com.data, com.len);
+		if (IS_ERR(new_firmware)) {
+			rc = PTR_ERR(new_firmware);
 			break;
 		}
 
-- 
2.7.0

^ permalink raw reply related

* Re: [PATCH bpf-next v3 05/11] bpf: avoid retpoline for lookup/update/delete calls on maps
From: Phil Sutter @ 2018-06-04 11:02 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Daniel Borkmann, alexei.starovoitov, netdev, Jakub Kicinski,
	Jakub Kicinski, Quentin Monnet
In-Reply-To: <20180603190855.53447a7d@redhat.com>

[-- Attachment #1: Type: text/plain, Size: 776 bytes --]

Hi!

On Sun, Jun 03, 2018 at 07:08:55PM +0200, Jesper Dangaard Brouer wrote:
[...]
> Secondly I personally *hate* how the 'ip' does it's short options
> parsing and especially order/precedence ambiguity.  Phil Sutter
> (Fedora/RHEL iproute2 maintainer) have a funny quiz illustrating the
> ambiguity issues.

Hehe, yes. It's a classical case of something smart evolving into a
pain: At first there's only 'ip link', so you allow 'ip l' as a
shortcut. Then someone implements 'ip l2tp' - so what do you do?
Establish a policy of abbreviation having to be unique and break
existing behaviour or accept the mess and head on.

My suggestion would be to not get into the abbreviated subcommands
business at all but instead ship and maintain a bash-completion script.

Cheers, Phil

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: bpf_redirect_map not working after tail call
From: Jesper Dangaard Brouer via iovisor-dev @ 2018-06-04 11:04 UTC (permalink / raw)
  To: iovisor-dev, Daniel Borkmann
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <319a775a-4913-1e58-c0dd-f3d6e3f56d9e-8RLafaVCWuNeoWH0uzbU5w@public.gmane.org>


On Fri, 1 Jun 2018 14:15:58 +0200
Sebastiano Miano via iovisor-dev <iovisor-dev-9jONkmmOlFHEE9lA1F8Ukti2O/JbrIOy@public.gmane.org> wrote:

> Dear all,
> 
> We have noticed that the bpf_redirect_map returns an error when it is
> called after a tail call.
> The xdp_redirect_map program (under sample/bpf) works fine, but if we
> modify it as shown in the following diff, it doesn't work anymore.
> I have debugged it with the xdp_monitor application and the error
> returned is EFAULT.
> Is this a known issue? Am I doing something wrong?

Argh, this is likely an issue/bug due to the check xdp_map_invalid(),
that was introduced in commit 7c3001313396 ("bpf: fix ri->map_owner
pointer on bpf_prog_realloc").

To Daniel, I don't know how to solve this, could you give some advice?



 static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog,
				   unsigned long aux)
 {
	return (unsigned long)xdp_prog->aux != aux;
 }

 static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
			       struct bpf_prog *xdp_prog)
 {
	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
	unsigned long map_owner = ri->map_owner;
	struct bpf_map *map = ri->map;
	u32 index = ri->ifindex;
	void *fwd = NULL;
	int err;

	[...]
	if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) {
		err = -EFAULT;
		map = NULL;
		goto err;
	}
	[...]


> P.S. I have tested the program with the latest bpf-next kernel.
> 
> ------------
> 
> diff --git a/samples/bpf/xdp_redirect_map_kern.c
> b/samples/bpf/xdp_redirect_map_kern.c
> index 740a529..bf1275a 100644
> --- a/samples/bpf/xdp_redirect_map_kern.c
> +++ b/samples/bpf/xdp_redirect_map_kern.c
> @@ -36,6 +36,13 @@ struct bpf_map_def SEC("maps") rxcnt = {
>  	.max_entries = 1,
>  };
> 
> +struct bpf_map_def SEC("maps") prog_table = {
> +	.type 		= BPF_MAP_TYPE_PROG_ARRAY,
> +	.key_size = sizeof(int),
> +	.value_size = sizeof(int),
> +	.max_entries = 32,
> +};
> +
>  static void swap_src_dst_mac(void *data)
>  {
>  	unsigned short *p = data;
> @@ -89,4 +96,15 @@ int xdp_redirect_dummy_prog(struct xdp_md *ctx)
>  	return XDP_PASS;
>  }
> 
> +/* Entry point */
> +SEC("xdp_redirect_entry_point")
> +int xdp_redirect_entry_point_prog(struct xdp_md *ctx)
> +{
> +	//char fmt[] = "xdp_redirect_entry_point\n";
> +	//bpf_trace_printk(fmt, sizeof(fmt));
> +	bpf_tail_call(ctx, &prog_table, 0);
> +	// Tail call failed
> +	return XDP_DROP;
> +}
> +
>  char _license[] SEC("license") = "GPL";
> diff --git a/samples/bpf/xdp_redirect_map_user.c
> b/samples/bpf/xdp_redirect_map_user.c
> index 4445e76..b2d2059 100644
> --- a/samples/bpf/xdp_redirect_map_user.c
> +++ b/samples/bpf/xdp_redirect_map_user.c
> @@ -120,7 +120,13 @@ int main(int argc, char **argv)
>  		return 1;
>  	}
> 
> -	if (bpf_set_link_xdp_fd(ifindex_in, prog_fd[0], xdp_flags) < 0) {
> +	ret = bpf_map_update_elem(map_fd[2], &key, &prog_fd[0], 0);
> +	if (ret) {
> +		perror("bpf_update_elem");
> +		goto out;
> +	}
> +
> +	if (bpf_set_link_xdp_fd(ifindex_in, prog_fd[2], xdp_flags) < 0) {
>  		printf("ERROR: link set xdp fd failed on %d\n", ifindex_in);
>  		return 1;
>  	}
> _______________________________________________
> iovisor-dev mailing list
> iovisor-dev-9jONkmmOlFHEE9lA1F8Ukti2O/JbrIOy@public.gmane.org
> https://lists.iovisor.org/mailman/listinfo/iovisor-dev



-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH 3/6] ravb: remove custom .set_link_ksettings from ethtool ops
From: Vladimir Zapolskiy @ 2018-06-04 11:07 UTC (permalink / raw)
  To: Sergei Shtylyov, David S. Miller; +Cc: netdev, linux-renesas-soc
In-Reply-To: <6029136a-0aac-31f6-ef9e-edb3b7b45cd1@cogentembedded.com>

Hello Sergei,

On 06/03/2018 06:42 PM, Sergei Shtylyov wrote:
> Hello!
> 
>    Sorry for the delay replying, the management keeps me busy... :-(
> 
> On 05/28/2018 12:51 PM, Vladimir Zapolskiy wrote:
> 
>>>> The change replaces a custom implementation of .set_link_ksettings
>>>> callback with a shared phy_ethtool_set_link_ksettings(), this fixes
>>>> sleep in atomic context bug, which is encountered every time when link
>>>> settings are changed by ethtool.
>>>
>>>    Seeing it now...
> 
>    And to say that this is *fixed* by removing the custom method is err...
> simply misleading. The sleep in atomic context is fixed solely by the removal
> of the spinlock grabbing before the phylib call.
> 

As I've already said, "the removal of the spinlock grabbing before the phylib
call" is not a valid fix, but it will introduce another regression.

>>>> Now duplex mode setting is enforced in ravb_adjust_link() only, also
>>>> now TX/RX is disabled when link is put down or modifications to E-MAC
>>>> registers ECMR and GECMR are expected for both cases of checked and
>>>> ignored link status pin state from E-MAC interrupt handler.
>>>>
>>>> Signed-off-by: Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>
>>>> ---
>>>>  drivers/net/ethernet/renesas/ravb_main.c | 58 +++++++++-----------------------
>>>>  1 file changed, 15 insertions(+), 43 deletions(-)
>>>>
>>>> diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c
>>>> index 3d91caa44176..0d811c02ff34 100644
>>>> --- a/drivers/net/ethernet/renesas/ravb_main.c
>>>> +++ b/drivers/net/ethernet/renesas/ravb_main.c
>>>> @@ -980,6 +980,13 @@ static void ravb_adjust_link(struct net_device *ndev)
>>>>  	struct ravb_private *priv = netdev_priv(ndev);
>>>>  	struct phy_device *phydev = ndev->phydev;
>>>>  	bool new_state = false;
>>>> +	unsigned long flags;
>>>> +
>>>> +	spin_lock_irqsave(&priv->lock, flags);
>>>> +
>>>> +	/* Disable TX and RX right over here, if E-MAC change is ignored */
>>>> +	if (priv->no_avb_link)
>>>> +		ravb_rcv_snd_disable(ndev);
>>>>  
>>>>  	if (phydev->link) {
>>>>  		if (phydev->duplex != priv->duplex) {
>>>> @@ -997,18 +1004,21 @@ static void ravb_adjust_link(struct net_device *ndev)
>>>>  			ravb_modify(ndev, ECMR, ECMR_TXF, 0);
>>>>  			new_state = true;
>>>>  			priv->link = phydev->link;
>>>> -			if (priv->no_avb_link)
>>>> -				ravb_rcv_snd_enable(ndev);
>>>>  		}
>>>>  	} else if (priv->link) {
>>>>  		new_state = true;
>>>>  		priv->link = 0;
>>>>  		priv->speed = 0;
>>>>  		priv->duplex = -1;
>>>> -		if (priv->no_avb_link)
>>>> -			ravb_rcv_snd_disable(ndev);
>>>>  	}
>>>>  
>>>> +	/* Enable TX and RX right over here, if E-MAC change is ignored */
>>>> +	if (priv->no_avb_link && phydev->link)
>>>> +		ravb_rcv_snd_enable(ndev);
>>>> +
>>>> +	mmiowb();
>>>> +	spin_unlock_irqrestore(&priv->lock, flags);
>>>> +
>>>
>>>    I like this part. :-)
>>>
>>
>> A weight off my mind :) And I hope that this change will remain the less
>> questionable one, other ones from the series are trivial.
>>
>> Anyway I hope it is understandable that this part of the change can not
>> be simply extracted from the rest one below, otherwise there'll be bugs of
>> another type intorduced.
> 
>    I never said I'd like to apply this part alone, my idea was more like removing
> the spinlock grabbing and the duplex handling down below.
> 

As I've already said, "the removal of the spinlock grabbing" is not a valid fix,
but it will introduce another regression.

Please tell me, a removal of duplex handling change should be done before
a removal of the spinlock grabbing? Or after?

> [...]
>>>> @@ -1096,44 +1106,6 @@ static int ravb_phy_start(struct net_device *ndev)
>>>>  	return 0;
>>>>  }
>>>>  
>>>> -static int ravb_set_link_ksettings(struct net_device *ndev,
>>>> -				   const struct ethtool_link_ksettings *cmd)
>>>> -{
>>>> -	struct ravb_private *priv = netdev_priv(ndev);
>>>> -	unsigned long flags;
>>>> -	int error;
>>>> -
>>>> -	if (!ndev->phydev)
>>>> -		return -ENODEV;
>>>> -
>>>> -	spin_lock_irqsave(&priv->lock, flags);
>>>> -
>>>> -	/* Disable TX and RX */
>>>> -	ravb_rcv_snd_disable(ndev);
>>>> -
>>>> -	error = phy_ethtool_ksettings_set(ndev->phydev, cmd);
>>>> -	if (error)
>>>> -		goto error_exit;
>>>> -
>>>> -	if (cmd->base.duplex == DUPLEX_FULL)
>>>> -		priv->duplex = 1;
>>>> -	else
>>>> -		priv->duplex = 0;
>>>> -
>>>> -	ravb_set_duplex(ndev);
>>>> -
>>>> -error_exit:
>>>> -	mdelay(1);
>>>> -
>>>> -	/* Enable TX and RX */
>>>> -	ravb_rcv_snd_enable(ndev);
>>>> -
>>>> -	mmiowb();
>>>> -	spin_unlock_irqrestore(&priv->lock, flags);
>>>> -
>>>> -	return error;
>>>> -}
>>>> -
>>>
>>>    But this part is clearly lumping it all together... 
>>
>> Please elaborate.
> 
>    My point is still that complete removal of the custom method was somewhat
> premature and completely unnecessary for fixing the issues we have.
> 
>>> [...]
>>>> @@ -1357,7 +1329,7 @@ static const struct ethtool_ops ravb_ethtool_ops = {
>>>>  	.set_ringparam		= ravb_set_ringparam,
>>>>  	.get_ts_info		= ravb_get_ts_info,
>>>>  	.get_link_ksettings	= phy_ethtool_get_link_ksettings,
>>>> -	.set_link_ksettings	= ravb_set_link_ksettings,
>>>> +	.set_link_ksettings	= phy_ethtool_set_link_ksettings,
>>>
>>>    Should have been a part of the final patch in the fix/enhancement chain...
>>
>> Please elaborate.
>>
>> Do you mean that firstly I have to make erroneous ravb_set_link_ksettings()
>> to look similar to phy_ethtool_set_link_ksettings() and then remove it?
> 
>    Yes.
> 

Then this change of "ravb_set_link_ksettings() looks similar to
phy_ethtool_set_link_ksettings()" will be a single commit, and it will be
a fix. Does it sound good?

>> As I see it in the current context (removal of ravb_set_duplex() call and
>> so on), the problem with this approach is that the actual fix change will
>> be done on top of a number of enchancement changes, thus it contradicts to
> 
>    Now I have to ask you to elaborate. I have no idea what you mean. :-(
> 

My statement is based on the next facts:
1. ravb_set_duplex() call in ravb_set_link_ksettings() is unnecessary,
   however its removal is an enchancement,
2. removal of the spinlock grabbing is just a *part* of the proper fix,
   and the complete proper fix includes ravb_set_duplex() call removal,
   adding spinlock grabbing to ravb_adjust_link() and other modifications
   to ravb_adjust_link() from this commit.
3. commits with fixes must precede commits with enchancements in the
   series, because enchancements are not backported.

The question remains the same, what do you ask me to do? 

>    And of course, sometimes the things are broken in a so subtle way, that
> only as pile of "cleanups" fixed them, we had that situation in e.g. the
> R-Car I2C driver -- *none* of AFAIR 9 patches was good as a -stable patch...
> 
>> the accepted development/maintenace model "fixes first", and most probably
>> it won't be possible to backport the real fix, however this sole change can
>> be backported.
> 
>    My idea was to move the [G]ECMR writes to the adjust_link() callback and
> to stop grabbing the spinlock where it *was* grabbed in the same fix patch.
> Then just a single clean up, to start using the new phylib method.
> 

It will be okay iff ravb_set_duplex() call removal is added to the first
part ("fixes" part) of two changes. Please confirm that our understanding
is aligned.

^ permalink raw reply

* [PATCH net] ipmr: fix error path when mr_table_alloc fails
From: Sabrina Dubroca @ 2018-06-04 11:55 UTC (permalink / raw)
  To: netdev
  Cc: Sabrina Dubroca, Eric Dumazet, Nikolay Aleksandrov, Yuval Mintz,
	Ivan Vecera

commit 0bbbf0e7d0e7 ("ipmr, ip6mr: Unite creation of new mr_table")
refactored ipmr_new_table, so that it now returns NULL when
mr_table_alloc fails. Unfortunately, all callers of ipmr_new_table
expect an ERR_PTR. commit 66fb33254f45 ("ipmr: properly check
rhltable_init() return value") followed suit.

This can result in NULL deref, when ipmr_rules_exit calls
ipmr_free_table with NULL net->ipv4.mrt in the
!CONFIG_IP_MROUTE_MULTIPLE_TABLES version.

This patch makes mr_table_alloc return errors, and changes
ip6mr_new_table and its callers to return/expect error pointers as
well. It also removes the version of mr_table_alloc defined under
!CONFIG_IP_MROUTE_COMMON, since it is never used.

Fixes: 0bbbf0e7d0e7 ("ipmr, ip6mr: Unite creation of new mr_table")
Fixes: 66fb33254f45 ("ipmr: properly check rhltable_init() return value")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
---
 include/linux/mroute_base.h | 10 ----------
 net/ipv4/ipmr_base.c        |  8 +++++---
 net/ipv6/ip6mr.c            | 19 +++++++++++++------
 3 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index d617fe45543e..d633f737b3c6 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -307,16 +307,6 @@ static inline void vif_device_init(struct vif_device *v,
 {
 }
 
-static inline void *
-mr_table_alloc(struct net *net, u32 id,
-	       struct mr_table_ops *ops,
-	       void (*expire_func)(struct timer_list *t),
-	       void (*table_set)(struct mr_table *mrt,
-				 struct net *net))
-{
-	return NULL;
-}
-
 static inline void *mr_mfc_find_parent(struct mr_table *mrt,
 				       void *hasharg, int parent)
 {
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 30221701614c..cafb0506c8c9 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -35,17 +35,19 @@ mr_table_alloc(struct net *net, u32 id,
 				 struct net *net))
 {
 	struct mr_table *mrt;
+	int err;
 
 	mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
 	if (!mrt)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 	mrt->id = id;
 	write_pnet(&mrt->net, net);
 
 	mrt->ops = *ops;
-	if (rhltable_init(&mrt->mfc_hash, mrt->ops.rht_params)) {
+	err = rhltable_init(&mrt->mfc_hash, mrt->ops.rht_params);
+	if (err) {
 		kfree(mrt);
-		return NULL;
+		return ERR_PTR(err);
 	}
 	INIT_LIST_HEAD(&mrt->mfc_cache_list);
 	INIT_LIST_HEAD(&mrt->mfc_unres_queue);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 298fd8b6ed17..f9b801bd00f8 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -227,8 +227,8 @@ static int __net_init ip6mr_rules_init(struct net *net)
 	INIT_LIST_HEAD(&net->ipv6.mr6_tables);
 
 	mrt = ip6mr_new_table(net, RT6_TABLE_DFLT);
-	if (!mrt) {
-		err = -ENOMEM;
+	if (IS_ERR(mrt)) {
+		err = PTR_ERR(mrt);
 		goto err1;
 	}
 
@@ -301,8 +301,13 @@ static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
 
 static int __net_init ip6mr_rules_init(struct net *net)
 {
-	net->ipv6.mrt6 = ip6mr_new_table(net, RT6_TABLE_DFLT);
-	return net->ipv6.mrt6 ? 0 : -ENOMEM;
+	struct mr_table *mrt;
+
+	mrt = ip6mr_new_table(net, RT6_TABLE_DFLT);
+	if (IS_ERR(mrt))
+		return PTR_ERR(mrt);
+	net->ipv6.mrt6 = mrt;
+	return 0;
 }
 
 static void __net_exit ip6mr_rules_exit(struct net *net)
@@ -1743,6 +1748,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
 	case MRT6_TABLE:
 	{
+		struct mr_table *mrt;
 		u32 v;
 
 		if (optlen != sizeof(u32))
@@ -1757,8 +1763,9 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 
 		rtnl_lock();
 		ret = 0;
-		if (!ip6mr_new_table(net, v))
-			ret = -ENOMEM;
+		mrt = ip6mr_new_table(net, v);
+		if (IS_ERR(mrt))
+			ret = PTR_ERR(mrt);
 		raw6_sk(sk)->ip6mr_table = v;
 		rtnl_unlock();
 		return ret;
-- 
2.17.1

^ permalink raw reply related

* [PATCH bpf-next 0/5] AF_XDP: bug fixes and descriptor changes
From: Björn Töpel @ 2018-06-04 11:57 UTC (permalink / raw)
  To: bjorn.topel, magnus.karlsson, magnus.karlsson, alexander.h.duyck,
	alexander.duyck, ast, brouer, daniel, netdev, mykyta.iziumtsev
  Cc: Björn Töpel, john.fastabend, willemdebruijn.kernel, mst,
	michael.lundkvist, jesse.brandeburg, anjali.singhai, qi.z.zhang,
	francois.ozog, ilias.apalodimas, brian.brooks, andy, michael.chan

From: Björn Töpel <bjorn.topel@intel.com>

An issue with the current AF_XDP uapi raised by Mykyta Iziumtsev (see
https://www.spinics.net/lists/netdev/msg503664.html) is that it does
not support NICs that have a "type-writer" model in an efficient
way. In this model, a memory window is passed to the hardware and
multiple frames might be filled into that window, instead of just one
that we have in the current fixed frame-size model.

This patch set fixes two bugs in the current implementation and then
changes the uapi so that the type-writer model can be supported
efficiently by a possible future extension of AF_XDP.

These are the uapi changes in this patch:

* Change the "u32 idx" in the descriptors to "u64 addr". The current
  idx based format does NOT work for the type-writer model (as packets
  can start anywhere within a frame) but that a relative address
  pointer (the u64 addr) works well for both models in the prototype
  code we have that supports both models. We increased it from u32 to
  u64 to support umems larger than 4G. We have also removed the u16
  offset when having a "u64 addr" since that information is already
  carried in the least significant bits of the address.

* We want to use "u8 padding[5]" for something useful in the future
  (since we are not allowed to change its name), so we now call it
  just options so it can be extended for various purposes in the
  future. It is an u32 as that it what is left of the 16 byte
  descriptor.

* We changed the name of frame_size in the UMEM_REG setsockopt to
  chunk_size since this naming also makes sense to the type-writer
  model.

With these changes to the uapi, we believe the type-writer model can
be supported without having to resort to a new descriptor format. The
type-writer model could then be supported, from the uapi point of
view, by setting a flag at bind time and providing a new flag bit in
the options field of the descriptor that signals to user space that
all packets have been written in a chunk. Or with a new chunk
completion queue as suggested by Mykyta in his latest feedback mail on
the list.

We based this patch set on bpf-next commit bd3a08aaa9a3 ("bpf:
flowlabel in bpf_fib_lookup should be flowinfo")

The structure of the patch set is as follows:

Patches 1-2: Fixes two bugs in the current implementation.
Patches 3-4: Prepares the uapi for a "type-writer" model and modifies
             the sample application so that it works with the new
	     uapi.
Patch 5: Small performance improvement patch for the sample application.

Cheers: Magnus and Björn

Björn Töpel (4):
  xsk: proper fill queue descriptor validation
  xsk: proper Rx drop statistics update
  xsk: new descriptor addressing scheme
  samples/bpf: adapted to new uapi

Magnus Karlsson (1):
  samples/bpf: minor *_nb_free performance fix

 Documentation/networking/af_xdp.rst | 101 +++++++++++++++++++++---------------
 include/uapi/linux/if_xdp.h         |  12 ++---
 net/xdp/xdp_umem.c                  |  33 ++++++------
 net/xdp/xdp_umem.h                  |  27 +++-------
 net/xdp/xdp_umem_props.h            |   4 +-
 net/xdp/xsk.c                       |  41 ++++++++-------
 net/xdp/xsk_queue.c                 |   2 +-
 net/xdp/xsk_queue.h                 |  63 ++++++++--------------
 samples/bpf/xdpsock_user.c          |  92 +++++++++++++++-----------------
 9 files changed, 172 insertions(+), 203 deletions(-)

-- 
2.14.1

^ permalink raw reply

* [PATCH bpf-next 1/5] xsk: proper fill queue descriptor validation
From: Björn Töpel @ 2018-06-04 11:57 UTC (permalink / raw)
  To: bjorn.topel, magnus.karlsson, magnus.karlsson, alexander.h.duyck,
	alexander.duyck, ast, brouer, daniel, netdev, mykyta.iziumtsev
  Cc: Björn Töpel, john.fastabend, willemdebruijn.kernel, mst,
	michael.lundkvist, jesse.brandeburg, anjali.singhai, qi.z.zhang,
	francois.ozog, ilias.apalodimas, brian.brooks, andy, michael.chan
In-Reply-To: <20180604115715.17895-1-bjorn.topel@gmail.com>

From: Björn Töpel <bjorn.topel@intel.com>

Previously the fill queue descriptor was not copied to kernel space
prior validating it, making it possible for userland to change the
descriptor post-kernel-validation.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 net/xdp/xsk.c       | 11 +++++------
 net/xdp/xsk_queue.h | 32 +++++++++-----------------------
 2 files changed, 14 insertions(+), 29 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index cce0e4f8a536..43554eb56fe6 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -41,20 +41,19 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
 
 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
-	u32 *id, len = xdp->data_end - xdp->data;
+	u32 id, len = xdp->data_end - xdp->data;
 	void *buffer;
-	int err = 0;
+	int err;
 
 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
 		return -EINVAL;
 
-	id = xskq_peek_id(xs->umem->fq);
-	if (!id)
+	if (!xskq_peek_id(xs->umem->fq, &id))
 		return -ENOSPC;
 
-	buffer = xdp_umem_get_data_with_headroom(xs->umem, *id);
+	buffer = xdp_umem_get_data_with_headroom(xs->umem, id);
 	memcpy(buffer, xdp->data, len);
-	err = xskq_produce_batch_desc(xs->rx, *id, len,
+	err = xskq_produce_batch_desc(xs->rx, id, len,
 				      xs->umem->frame_headroom);
 	if (!err)
 		xskq_discard_id(xs->umem->fq);
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index cb8e5be35110..b5924e7aeb2b 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -85,14 +85,15 @@ static inline bool xskq_is_valid_id(struct xsk_queue *q, u32 idx)
 	return true;
 }
 
-static inline u32 *xskq_validate_id(struct xsk_queue *q)
+static inline u32 *xskq_validate_id(struct xsk_queue *q, u32 *id)
 {
 	while (q->cons_tail != q->cons_head) {
 		struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
 		unsigned int idx = q->cons_tail & q->ring_mask;
 
-		if (xskq_is_valid_id(q, ring->desc[idx]))
-			return &ring->desc[idx];
+		*id = READ_ONCE(ring->desc[idx]);
+		if (xskq_is_valid_id(q, *id))
+			return id;
 
 		q->cons_tail++;
 	}
@@ -100,28 +101,22 @@ static inline u32 *xskq_validate_id(struct xsk_queue *q)
 	return NULL;
 }
 
-static inline u32 *xskq_peek_id(struct xsk_queue *q)
+static inline u32 *xskq_peek_id(struct xsk_queue *q, u32 *id)
 {
-	struct xdp_umem_ring *ring;
-
 	if (q->cons_tail == q->cons_head) {
 		WRITE_ONCE(q->ring->consumer, q->cons_tail);
 		q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE);
 
 		/* Order consumer and data */
 		smp_rmb();
-
-		return xskq_validate_id(q);
 	}
 
-	ring = (struct xdp_umem_ring *)q->ring;
-	return &ring->desc[q->cons_tail & q->ring_mask];
+	return xskq_validate_id(q, id);
 }
 
 static inline void xskq_discard_id(struct xsk_queue *q)
 {
 	q->cons_tail++;
-	(void)xskq_validate_id(q);
 }
 
 static inline int xskq_produce_id(struct xsk_queue *q, u32 id)
@@ -174,11 +169,9 @@ static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q,
 		struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
 		unsigned int idx = q->cons_tail & q->ring_mask;
 
-		if (xskq_is_valid_desc(q, &ring->desc[idx])) {
-			if (desc)
-				*desc = ring->desc[idx];
+		*desc = READ_ONCE(ring->desc[idx]);
+		if (xskq_is_valid_desc(q, desc))
 			return desc;
-		}
 
 		q->cons_tail++;
 	}
@@ -189,27 +182,20 @@ static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q,
 static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
 					      struct xdp_desc *desc)
 {
-	struct xdp_rxtx_ring *ring;
-
 	if (q->cons_tail == q->cons_head) {
 		WRITE_ONCE(q->ring->consumer, q->cons_tail);
 		q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE);
 
 		/* Order consumer and data */
 		smp_rmb();
-
-		return xskq_validate_desc(q, desc);
 	}
 
-	ring = (struct xdp_rxtx_ring *)q->ring;
-	*desc = ring->desc[q->cons_tail & q->ring_mask];
-	return desc;
+	return xskq_validate_desc(q, desc);
 }
 
 static inline void xskq_discard_desc(struct xsk_queue *q)
 {
 	q->cons_tail++;
-	(void)xskq_validate_desc(q, NULL);
 }
 
 static inline int xskq_produce_batch_desc(struct xsk_queue *q,
-- 
2.14.1

^ permalink raw reply related

* [PATCH bpf-next 2/5] xsk: proper Rx drop statistics update
From: Björn Töpel @ 2018-06-04 11:57 UTC (permalink / raw)
  To: bjorn.topel, magnus.karlsson, magnus.karlsson, alexander.h.duyck,
	alexander.duyck, ast, brouer, daniel, netdev, mykyta.iziumtsev
  Cc: Björn Töpel, john.fastabend, willemdebruijn.kernel, mst,
	michael.lundkvist, jesse.brandeburg, anjali.singhai, qi.z.zhang,
	francois.ozog, ilias.apalodimas, brian.brooks, andy, michael.chan
In-Reply-To: <20180604115715.17895-1-bjorn.topel@gmail.com>

From: Björn Töpel <bjorn.topel@intel.com>

Previously, rx_dropped could be updated incorrectly, e.g. if the XDP
program redirected the frame to a socket bound to a different queue
than where the XDP program was executing.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 net/xdp/xsk.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 43554eb56fe6..966307ce4b8e 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -48,8 +48,10 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
 		return -EINVAL;
 
-	if (!xskq_peek_id(xs->umem->fq, &id))
+	if (!xskq_peek_id(xs->umem->fq, &id)) {
+		xs->rx_dropped++;
 		return -ENOSPC;
+	}
 
 	buffer = xdp_umem_get_data_with_headroom(xs->umem, id);
 	memcpy(buffer, xdp->data, len);
@@ -57,6 +59,8 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 				      xs->umem->frame_headroom);
 	if (!err)
 		xskq_discard_id(xs->umem->fq);
+	else
+		xs->rx_dropped++;
 
 	return err;
 }
@@ -68,8 +72,6 @@ int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 	err = __xsk_rcv(xs, xdp);
 	if (likely(!err))
 		xdp_return_buff(xdp);
-	else
-		xs->rx_dropped++;
 
 	return err;
 }
@@ -87,8 +89,6 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 	err = __xsk_rcv(xs, xdp);
 	if (!err)
 		xsk_flush(xs);
-	else
-		xs->rx_dropped++;
 
 	return err;
 }
-- 
2.14.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox