Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net] net: dsa: mv88e6xxx: lock mutex when freeing IRQs
From: Vivien Didelot @ 2017-09-26 17:48 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, kernel, David S. Miller, Florian Fainelli,
	Andrew Lunn, Vivien Didelot

mv88e6xxx_g2_irq_free locks the registers mutex, but not
mv88e6xxx_g1_irq_free, which results in a stack trace from
assert_reg_lock when unloading the mv88e6xxx module. Fix this.

Fixes: 3460a5770ce9 ("net: dsa: mv88e6xxx: Mask g1 interrupts and free interrupt")
Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
---
 drivers/net/dsa/mv88e6xxx/chip.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index c6678aa9b4ef..b4359e4e5165 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -338,9 +338,11 @@ static void mv88e6xxx_g1_irq_free(struct mv88e6xxx_chip *chip)
 	int irq, virq;
 	u16 mask;
 
+	mutex_lock(&chip->reg_lock);
 	mv88e6xxx_g1_read(chip, MV88E6XXX_G1_CTL1, &mask);
 	mask |= GENMASK(chip->g1_irq.nirqs, 0);
 	mv88e6xxx_g1_write(chip, MV88E6XXX_G1_CTL1, mask);
+	mutex_unlock(&chip->reg_lock);
 
 	free_irq(chip->irq, chip);
 
-- 
2.14.1

^ permalink raw reply related

* [PATCH net-next] net-next/hinic: Set Rxq irq to specific cpu for NUMA
From: Aviad Krawczyk @ 2017-09-26 17:57 UTC (permalink / raw)
  To: davem; +Cc: linux-kernel, netdev, Aviad Krawczyk

Set Rxq irq to specific cpu for allocating and receiving the skb from
the same node.

Signed-off-by: Aviad Krawczyk <aviad.krawczyk@huawei.com>
---
 drivers/net/ethernet/huawei/hinic/hinic_rx.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_rx.c b/drivers/net/ethernet/huawei/hinic/hinic_rx.c
index 1d4f712..e2e5cdc 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_rx.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_rx.c
@@ -26,6 +26,7 @@
 #include <linux/skbuff.h>
 #include <linux/dma-mapping.h>
 #include <linux/prefetch.h>
+#include <linux/cpumask.h>
 #include <asm/barrier.h>
 
 #include "hinic_common.h"
@@ -171,11 +172,10 @@ static int rx_alloc_pkts(struct hinic_rxq *rxq)
 	struct hinic_sge sge;
 	dma_addr_t dma_addr;
 	struct sk_buff *skb;
-	int i, alloc_more;
 	u16 prod_idx;
+	int i;
 
 	free_wqebbs = hinic_get_rq_free_wqebbs(rxq->rq);
-	alloc_more = 0;
 
 	/* Limit the allocation chunks */
 	if (free_wqebbs > nic_dev->rx_weight)
@@ -185,7 +185,6 @@ static int rx_alloc_pkts(struct hinic_rxq *rxq)
 		skb = rx_alloc_skb(rxq, &dma_addr);
 		if (!skb) {
 			netdev_err(rxq->netdev, "Failed to alloc Rx skb\n");
-			alloc_more = 1;
 			goto skb_out;
 		}
 
@@ -195,7 +194,6 @@ static int rx_alloc_pkts(struct hinic_rxq *rxq)
 					  &prod_idx);
 		if (!rq_wqe) {
 			rx_free_skb(rxq, skb, dma_addr);
-			alloc_more = 1;
 			goto skb_out;
 		}
 
@@ -211,9 +209,7 @@ static int rx_alloc_pkts(struct hinic_rxq *rxq)
 		hinic_rq_update(rxq->rq, prod_idx);
 	}
 
-	if (alloc_more)
-		tasklet_schedule(&rxq->rx_task);
-
+	tasklet_schedule(&rxq->rx_task);
 	return i;
 }
 
@@ -357,7 +353,7 @@ static int rxq_recv(struct hinic_rxq *rxq, int budget)
 	}
 
 	if (pkts)
-		tasklet_schedule(&rxq->rx_task); /* hinic_rx_alloc_pkts */
+		tasklet_schedule(&rxq->rx_task); /* rx_alloc_pkts */
 
 	u64_stats_update_begin(&rxq->rxq_stats.syncp);
 	rxq->rxq_stats.pkts += pkts;
@@ -417,6 +413,8 @@ static int rx_request_irq(struct hinic_rxq *rxq)
 	struct hinic_dev *nic_dev = netdev_priv(rxq->netdev);
 	struct hinic_hwdev *hwdev = nic_dev->hwdev;
 	struct hinic_rq *rq = rxq->rq;
+	struct hinic_qp *qp;
+	struct cpumask mask;
 	int err;
 
 	rx_add_napi(rxq);
@@ -432,7 +430,9 @@ static int rx_request_irq(struct hinic_rxq *rxq)
 		return err;
 	}
 
-	return 0;
+	qp = container_of(rq, struct hinic_qp, rq);
+	cpumask_set_cpu(qp->q_id % num_online_cpus(), &mask);
+	return irq_set_affinity_hint(rq->irq, &mask);
 }
 
 static void rx_free_irq(struct hinic_rxq *rxq)
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH v2 2/2] ip_tunnel: add mpls over gre encapsulation
From: Amine Kherbouche @ 2017-09-26 17:58 UTC (permalink / raw)
  To: Roopa Prabhu; +Cc: netdev@vger.kernel.org, xeb, David Lamparter
In-Reply-To: <CAJieiUhJCgrykvLNEFaABg6MXKR=FhgbK=bCZF0_QrYFaeG2tw@mail.gmail.com>

Hi Roopa,

Thanks for the feedback, I have just one question:

On 09/26/2017 05:15 PM, Roopa Prabhu wrote:
>> +static int ipgre_tunnel_encap_add_mpls_ops(void)
>> > +{
>> > +       int ret = -1;
>> > +
>> > +#if IS_ENABLED(CONFIG_NET_IP_TUNNEL)
>> > +       ret = ip_tunnel_encap_add_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS);
>> > +#endif
>> > +
>> > +       return ret;
>> > +}
>> > +
>> > +static void ipgre_tunnel_encap_del_mpls_ops(void)
>> > +{
>> > +#if IS_ENABLED(CONFIG_NET_IP_TUNNEL)
>> > +       ip_tunnel_encap_del_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS);
>> > +#endif
>> > +}
>> > +
>> >  static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt,
>> >                        struct nlmsghdr *nlh, struct net *net, u32 portid,
>> >                        unsigned int nlm_flags);
>> > @@ -2486,6 +2521,10 @@ static int __init mpls_init(void)
>> >                       0);
>> >         rtnl_register(PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf,
>> >                       mpls_netconf_dump_devconf, 0);
>> > +       err = ipgre_tunnel_encap_add_mpls_ops();
>> > +       if (err)
>> > +               pr_err("Can't add mpls over gre tunnel ops\n");
>> > +
> This will throw an error  if CONFIG_NET_IP_TUNNEL is not enabled.
> Can you pls put the CONFIG_NET_IP_TUNNEL around
> ipgre_tunnel_encap_add_mpls_ops ?

You want the CONFIG_NET_IP_TUNNEL definition around the declaration of 
the function or when it's called ? or both ? (I can adjust the code for 
any case).

> see CONFIG_INET checks in the rest of af_mpls as example.
> same for del_ops
>

Thanks,

Amine

^ permalink raw reply

* Re: [PATCH net] net: dsa: mv88e6xxx: lock mutex when freeing IRQs
From: Vivien Didelot @ 2017-09-26 18:01 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: netdev, linux-kernel, kernel, David S. Miller, Florian Fainelli
In-Reply-To: <20170926180136.GD3325@lunn.ch>

Hi Andrew,

Andrew Lunn <andrew@lunn.ch> writes:

> In general, i tried to keep the mutex out of the interrupt code.  That
> is not totally possible, the IRQ thread handler needs it. But
> otherwise, the IRQ code assumes it is called with the mutex taken.
>
> So i think it is better to hold the mutex in mv88e6xxx_remove() when
> calling mv88e6xxx_g1_irq_free().

I'd prefer that too. Respinning, thanks!

    Vivien

^ permalink raw reply

* Re: [PATCH net] net: dsa: mv88e6xxx: lock mutex when freeing IRQs
From: Andrew Lunn @ 2017-09-26 18:01 UTC (permalink / raw)
  To: Vivien Didelot
  Cc: netdev, linux-kernel, kernel, David S. Miller, Florian Fainelli
In-Reply-To: <20170926174837.15999-1-vivien.didelot@savoirfairelinux.com>

On Tue, Sep 26, 2017 at 01:48:37PM -0400, Vivien Didelot wrote:
> mv88e6xxx_g2_irq_free locks the registers mutex, but not
> mv88e6xxx_g1_irq_free, which results in a stack trace from
> assert_reg_lock when unloading the mv88e6xxx module. Fix this.
> 
> Fixes: 3460a5770ce9 ("net: dsa: mv88e6xxx: Mask g1 interrupts and free interrupt")
> Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
> ---
>  drivers/net/dsa/mv88e6xxx/chip.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
> index c6678aa9b4ef..b4359e4e5165 100644
> --- a/drivers/net/dsa/mv88e6xxx/chip.c
> +++ b/drivers/net/dsa/mv88e6xxx/chip.c
> @@ -338,9 +338,11 @@ static void mv88e6xxx_g1_irq_free(struct mv88e6xxx_chip *chip)
>  	int irq, virq;
>  	u16 mask;
>  
> +	mutex_lock(&chip->reg_lock);
>  	mv88e6xxx_g1_read(chip, MV88E6XXX_G1_CTL1, &mask);
>  	mask |= GENMASK(chip->g1_irq.nirqs, 0);
>  	mv88e6xxx_g1_write(chip, MV88E6XXX_G1_CTL1, mask);
> +	mutex_unlock(&chip->reg_lock);
>  
>  	free_irq(chip->irq, chip);
>  

Hi Vivien

static int mv88e6xxx_probe(struct mdio_device *mdiodev)
{

...

out_g1_irq:
	if (chip->irq > 0) {
		mutex_lock(&chip->reg_lock);
		mv88e6xxx_g1_irq_free(chip);
		mutex_unlock(&chip->reg_lock);
	}

It looks like this will deadlock?

In general, i tried to keep the mutex out of the interrupt code.  That
is not totally possible, the IRQ thread handler needs it. But
otherwise, the IRQ code assumes it is called with the mutex taken.

So i think it is better to hold the mutex in mv88e6xxx_remove() when
calling mv88e6xxx_g1_irq_free().

   Andrew

^ permalink raw reply

* [PATCH net-next] net-next/hinic: Fix a case of Tx Queue is Stopped forever
From: Aviad Krawczyk @ 2017-09-26 18:11 UTC (permalink / raw)
  To: davem; +Cc: linux-kernel, netdev, Aviad Krawczyk

Fix the following scenario:
1. tx_free_poll is running on cpu X
2. xmit function is running on cpu Y and fails to get sq wqe
3. tx_free_poll frees wqes on cpu X and checks the queue is not stopped
4. xmit function stops the queue after failed to get sq wqe
5. The queue is stopped forever

Signed-off-by: Aviad Krawczyk <aviad.krawczyk@huawei.com>
---
 drivers/net/ethernet/huawei/hinic/hinic_tx.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_tx.c b/drivers/net/ethernet/huawei/hinic/hinic_tx.c
index abe3e38..9128858 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_tx.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_tx.c
@@ -212,10 +212,19 @@ netdev_tx_t hinic_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 
 	sq_wqe = hinic_sq_get_wqe(txq->sq, wqe_size, &prod_idx);
 	if (!sq_wqe) {
-		tx_unmap_skb(nic_dev, skb, txq->sges);
-
 		netif_stop_subqueue(netdev, qp->q_id);
 
+		/* Check for the case free_tx_poll is called in another cpu
+		 * and we stopped the subqueue after free_tx_poll check.
+		 */
+		sq_wqe = hinic_sq_get_wqe(txq->sq, wqe_size, &prod_idx);
+		if (sq_wqe) {
+			netif_wake_subqueue(nic_dev->netdev, qp->q_id);
+			goto process_sq_wqe;
+		}
+
+		tx_unmap_skb(nic_dev, skb, txq->sges);
+
 		u64_stats_update_begin(&txq->txq_stats.syncp);
 		txq->txq_stats.tx_busy++;
 		u64_stats_update_end(&txq->txq_stats.syncp);
@@ -223,6 +232,7 @@ netdev_tx_t hinic_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 		goto flush_skbs;
 	}
 
+process_sq_wqe:
 	hinic_sq_prepare_wqe(txq->sq, prod_idx, sq_wqe, txq->sges, nr_sges);
 
 	hinic_sq_write_wqe(txq->sq, prod_idx, sq_wqe, skb, wqe_size);
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH v4 2/3] ipv4: Namespaceify tcp_fastopen_key knob
From: David Miller @ 2017-09-26 18:18 UTC (permalink / raw)
  To: yanhaishuang; +Cc: kuznet, edumazet, weiwan, lucab, netdev, linux-kernel
In-Reply-To: <F9EA5C4F-2306-4BC5-923A-3AB1D8F50FF1@cmss.chinamobile.com>

From: 严海双 <yanhaishuang@cmss.chinamobile.com>
Date: Tue, 26 Sep 2017 09:25:51 +0800

>> On 2017年9月26日, at 上午7:24, David Miller <davem@davemloft.net> wrote:
>> 
>> From: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
>> Date: Fri, 22 Sep 2017 21:48:43 +0800
>> 
>>> @@ -9,13 +9,18 @@
>>> #include <net/inetpeer.h>
>>> #include <net/tcp.h>
>>> 
>>> -struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
>>> -
>>> -static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock);
>>> -
>>> -void tcp_fastopen_init_key_once(bool publish)
>>> +void tcp_fastopen_init_key_once(struct net *net)
>> 
>> Why did you remove the 'publish' logic from this function?
>> 
> 
> I think this logic is not necessary now, in proc_tcp_fastopen_key, I have removed 
> tcp_fastopen_init_key_once(false) where the ‘publish’ is false:
> 
> -		/* Generate a dummy secret but don't publish it. This
> -		 * is needed so we don't regenerate a new key on the
> -		 * first invocation of tcp_fastopen_cookie_gen
> -		 */
> -		tcp_fastopen_init_key_once(false);
> -		tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH);
> +		tcp_fastopen_reset_cipher(net, user_key, TCP_FASTOPEN_KEY_LENGTH);
> 
> It said we don't regenerate a new key on first invocation of tcp_fastopen_cookie_gen, 
> but in tcp_fastopen_cookie_gen，it didn’t  call tcp_fastopen_init_key_once since
> from commit dfea2aa654243 (tcp: Do not call tcp_fastopen_reset_cipher from interrupt context)：
> 
> And in other places where call tcp_fastopen_init_key_once, the ‘publish’ is always true:

Ok, this simplification seems legitimate.

But it is unrelated to this namespacification.  So it should be in a separate patch,
and should be documented well in the commit message using the great explanation you
gave to me above.

Please respin this series, with this patch #2 split up into two changes.

Thank you.

^ permalink raw reply

* Re: [PATCH v2 net-next 0/4] qed: iWARP fixes and enhancements
From: David Miller @ 2017-09-26 18:22 UTC (permalink / raw)
  To: Michal.Kalderon; +Cc: netdev, leon, linux-rdma, dledford, Ariel.Elior
In-Reply-To: <1506244185-2129-1-git-send-email-Michal.Kalderon@cavium.com>

From: Michal Kalderon <Michal.Kalderon@cavium.com>
Date: Sun, 24 Sep 2017 12:09:41 +0300

> This patch series includes several fixes and enhancements
> related to iWARP.
> 
> Patch #1 is actually the last of the initial iWARP submission.
> It has been delayed until now as I wanted to make sure that qedr
> supports iWARP prior to enabling iWARP device detection.
> 
> iWARP changes in RDMA tree have been accepted and targeted at
> kernel 4.15, therefore, all iWARP fixes for this cycle are
> submitted to net-next.
> 
> Changes from v1->v2 
>   - Added "Fixes:" tag to commit message of patch #3
> 
> Signed-off by: Michal.Kalderon@cavium.com
> Signed-off-by: Ariel Elior <Ariel.Elior@cavium.com>

Series applied, thanks Michal.

^ permalink raw reply

* Re: [PATCH] net: bcm63xx_enet: Use setup_timer and mod_timer
From: David Miller @ 2017-09-26 18:33 UTC (permalink / raw)
  To: himanshujha199640
  Cc: f.fainelli, bcm-kernel-feedback-list, arnd, yamada.masahiro,
	netdev, linux-arm-kernel, linux-kernel
In-Reply-To: <1506255084-10943-1-git-send-email-himanshujha199640@gmail.com>

From: Himanshu Jha <himanshujha199640@gmail.com>
Date: Sun, 24 Sep 2017 17:41:24 +0530

> Use setup_timer and mod_timer API instead of structure assignments.
> 
> This is done using Coccinelle and semantic patch used
> for this as follows:
> 
> @@
> expression x,y,z,a,b;
> @@
> 
> -init_timer (&x);
> +setup_timer (&x, y, z);
> +mod_timer (&a, b);
> -x.function = y;
> -x.data = z;
> -x.expires = b;
> -add_timer(&a);
> 
> Signed-off-by: Himanshu Jha <himanshujha199640@gmail.com>

Applied to net-next, thanks.

^ permalink raw reply

* Re: [PATCH] net/tls: move version check after second userspace fetch
From: David Miller @ 2017-09-26 18:37 UTC (permalink / raw)
  To: mengxu.gatech
  Cc: ilyal, aviadye, davejwatson, netdev, linux-kernel, meng.xu,
	sanidhya, taesoo
In-Reply-To: <1506266095-23003-1-git-send-email-mengxu.gatech@gmail.com>

From: Meng Xu <mengxu.gatech@gmail.com>
Date: Sun, 24 Sep 2017 11:14:55 -0400

> @@ -382,6 +376,12 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval,
>  			rc = -EFAULT;
>  			goto err_crypto_info;
>  		}
> +
> +		/* check version */
> +		if (crypto_info->version != TLS_1_2_VERSION) {
> +			rc = -ENOTSUPP;
> +			goto err_crypto_info;
> +		}
>  		break;
>  	}
>  	default:
> -- 
> 2.7.4
> 

Please, again, deduce this into a single userspace copy.  Otherwise we're going to
add this check for every cipher we add new support for.

Copy the on-stack crypto info ("tmp_crypto_info") into 'crypto_info' and then
bring in from userspace any further bytes that need to be copied.

That guarantees that the version will not change, and the existing check can
stay where it is.

^ permalink raw reply

* Re: [PATCH] net: stmmac: Meet alignment requirements for DMA
From: Paul Burton @ 2017-09-26 18:48 UTC (permalink / raw)
  To: David Miller
  Cc: matt.redfearn, netdev, alexandre.torgue, peppe.cavallaro,
	linux-kernel, linux-mips, james.hogan
In-Reply-To: <20170926.103400.1009342814128022820.davem@davemloft.net>

[-- Attachment #1: Type: text/plain, Size: 4599 bytes --]

Hi David,

On Tuesday, 26 September 2017 10:34:00 PDT David Miller wrote:
> From: Matt Redfearn <matt.redfearn@imgtec.com>
> Date: Tue, 26 Sep 2017 14:57:39 +0100
> 
> > Since the MIPS architecture requires software cache management,
> > performing a dma_map_single(TO_DEVICE) will writeback and invalidate
> > the cachelines for the required region. To comply with (our
> > interpretation of) the DMA API documentation, the MIPS implementation
> > expects a cacheline aligned & sized buffer, so that it can writeback a
> > set of complete cachelines. Indeed a recent patch
> > (https://patchwork.linux-mips.org/patch/14624/) causes the MIPS dma
> > mapping code to emit warnings if the buffer it is requested to sync is
> > not cachline aligned. This driver, as is, fails this test and causes
> > thousands of warnings to be emitted.
> 
> For a device write, if the device does what it is told and does not
> access bytes outside of the periphery of the DMA mapping, nothing bad
> can happen.
> 
> When the DMA buffer is mapped, the cpu side cachelines are flushed (even
> ones which are partially part of the requsted mapping, this is _fine_).

This is not fine. Let's presume we writeback+invalidate the cache lines that 
are only partially covered by the DMA mapping at its beginning or end, and 
just invalidate all the lines that are wholly covered by the mapping. So far 
so good.

> The device writes into only the bytes it was given access to, which
> starts at the DMA address.

OK - still fine but *only* if we don't write to anything that happens to be 
part of the cache lines that are only partially covered by the DMA mapping & 
make them dirty. If we do then any writeback of those lines will clobber data 
the device wrote, and any invalidation of them will discard data the CPU 
wrote.

How would you have us ensure that such writes don't occur?

This really does not feel to me like something to leave up to drivers without 
any means of checking or enforcing correctness. The requirement to align DMA 
mappings at cache-line boundaries, as outlined in Documentation/DMA-API.txt, 
allows this to be enforced. If you want to ignore this requirement then there 
is no clear way to verify that we aren't writing to cache lines involved in a 
DMA transfer whilst the transfer is occurring, and no sane way to handle those 
cache lines if we do.

> The unmap and/or sync operation after the DMA transfer needs to do
> nothing on the cpu side since the map operation flushed the cpu side
> caches already.
> 
> When the cpu reads, it will pull the cacheline from main memory and
> see what the device wrote.

This is not true, because:

1) CPUs may speculatively read data. In between the invalidations that we did
   earlier & the point at which the transfer completes, the CPU may have
   speculatively read any arbitrary part of the memory mapped for DMA.

2) If we wrote to any lines that are only partially covered by the DMA mapping
   then of course they're valid & dirty, and an access won't fetch from main
   memory.

We therefore need to perform cache invalidation again at the end of the 
transfer - on MIPS you can grep for cpu_needs_post_dma_flush to find this. 
>From a glance ARM has similar post-DMA invalidation in 
__dma_page_dev_to_cpu(), ARM64 in __dma_unmap_area() etc etc.

At this point what would you have us do with cache lines that are only 
partially covered by the DMA mapping? As above - if we writeback+invalidate we 
risk clobbering data written by the device. If we just invalidate we risk 
discarding data written by the CPU. And this is even ignoring the risk that a 
writeback of one of these lines happens due to eviction during the DMA 
transfer, which we have no control over at all.

> It's not like the device can put "garbage" into the bytes earlier in
> the cacheline it was given partial access to.

Right - but the CPU can "put garbage" into the bytes the device was given 
access to, simply by fetching stale bytes from main memory before the device 
overwrites them.

> I really don't see what the problem is and why MIPS needs special
> handling.  You will have to give specific examples, step by step,
> where things can go wrong before I will be able to consider your
> changes.

MIPS doesn't need special handling - it just needs the driver to obey a 
documented restriction when using the DMA API. One could argue that it ought 
to be you who justifies why you feel networking drivers can ignore the 
documentation, and that if you disagree with the documented API you should aim 
to change it rather than ignore it.

Thanks,
    Paul

[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* [PATCH net-next] liquidio: fix format truncation warning reported by gcc 7.1.1
From: Felix Manlunas @ 2017-09-26 18:48 UTC (permalink / raw)
  To: davem; +Cc: netdev, raghu.vatsavayi, derek.chickles, satananda.burla

From: Satanand Burla <satananda.burla@cavium.com>

gcc 7.1.1 with -Wformat-truncation reports these warnings:

drivers/net/ethernet/cavium/liquidio/lio_core.c: In function `octeon_setup_interrupt':
drivers/net/ethernet/cavium/liquidio/lio_core.c:1003:41: warning: `%u' directive output may be truncated writing between 1 and 10 bytes into a region of size between 0 and 13 [-Wformat-truncation=]
       INTRNAMSIZ, "LiquidIO%u-pf%u-rxtx-%u",
                                         ^~
drivers/net/ethernet/cavium/liquidio/lio_core.c:1003:19: note: directive argument in the range [0, 2147483647]
       INTRNAMSIZ, "LiquidIO%u-pf%u-rxtx-%u",
                   ^~~~~~~~~~~~~~~~~~~~~~~~~
drivers/net/ethernet/cavium/liquidio/lio_core.c:1002:5: note: `snprintf' output between 21 and 43 bytes into a destination of size 32
     snprintf(&queue_irq_names[IRQ_NAME_OFF(i)],
     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
       INTRNAMSIZ, "LiquidIO%u-pf%u-rxtx-%u",
       ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
       oct->octeon_id, oct->pf_num, i);
       ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
drivers/net/ethernet/cavium/liquidio/lio_core.c:1008:41: warning: `%u' directive output may be truncated writing between 1 and 10 bytes into a region of size between 0 and 13 [-Wformat-truncation=]
       INTRNAMSIZ, "LiquidIO%u-vf%u-rxtx-%u",
                                         ^~
drivers/net/ethernet/cavium/liquidio/lio_core.c:1008:19: note: directive argument in the range [0, 2147483647]
       INTRNAMSIZ, "LiquidIO%u-vf%u-rxtx-%u",
                   ^~~~~~~~~~~~~~~~~~~~~~~~~
drivers/net/ethernet/cavium/liquidio/lio_core.c:1007:5: note: `snprintf' output between 21 and 43 bytes into a destination of size 32
     snprintf(&queue_irq_names[IRQ_NAME_OFF(i)],
     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
       INTRNAMSIZ, "LiquidIO%u-vf%u-rxtx-%u",
       ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
       oct->octeon_id, oct->vf_num, i);
       ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Fix them by changing the type of the "i" local variable from int to short.

Signed-off-by: Satanand Burla <satananda.burla@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
---
 drivers/net/ethernet/cavium/liquidio/lio_core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_core.c b/drivers/net/ethernet/cavium/liquidio/lio_core.c
index 23f6b60..55c5d44 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_core.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_core.c
@@ -899,11 +899,12 @@ int octeon_setup_interrupt(struct octeon_device *oct, u32 num_ioqs)
 {
 	struct msix_entry *msix_entries;
 	char *queue_irq_names = NULL;
-	int i, num_interrupts = 0;
+	int num_interrupts = 0;
 	int num_alloc_ioq_vectors;
 	char *aux_irq_name = NULL;
 	int num_ioq_vectors;
 	int irqret, err;
+	short i;
 
 	oct->num_msix_irqs = num_ioqs;
 	if (oct->msix_on) {

^ permalink raw reply related

* [PATCH net v2] net: dsa: mv88e6xxx: lock mutex when freeing IRQs
From: Vivien Didelot @ 2017-09-26 18:57 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, kernel, David S. Miller, Florian Fainelli,
	Andrew Lunn, Vivien Didelot

mv88e6xxx_g2_irq_free locks the registers mutex, but not
mv88e6xxx_g1_irq_free, which results in a stack trace from
assert_reg_lock when unloading the mv88e6xxx module. Fix this.

Fixes: 3460a5770ce9 ("net: dsa: mv88e6xxx: Mask g1 interrupts and free interrupt")
Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
---
 drivers/net/dsa/mv88e6xxx/chip.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index c6678aa9b4ef..e7ff7483d2fb 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -3947,7 +3947,9 @@ static void mv88e6xxx_remove(struct mdio_device *mdiodev)
 	if (chip->irq > 0) {
 		if (chip->info->g2_irqs > 0)
 			mv88e6xxx_g2_irq_free(chip);
+		mutex_lock(&chip->reg_lock);
 		mv88e6xxx_g1_irq_free(chip);
+		mutex_unlock(&chip->reg_lock);
 	}
 }
 
-- 
2.14.1

^ permalink raw reply related

* Re: [PATCH net v2] net: dsa: mv88e6xxx: lock mutex when freeing IRQs
From: Florian Fainelli @ 2017-09-26 19:05 UTC (permalink / raw)
  To: Vivien Didelot, netdev; +Cc: linux-kernel, kernel, David S. Miller, Andrew Lunn
In-Reply-To: <20170926185721.12187-1-vivien.didelot@savoirfairelinux.com>

On 09/26/2017 11:57 AM, Vivien Didelot wrote:
> mv88e6xxx_g2_irq_free locks the registers mutex, but not
> mv88e6xxx_g1_irq_free, which results in a stack trace from
> assert_reg_lock when unloading the mv88e6xxx module. Fix this.
> 
> Fixes: 3460a5770ce9 ("net: dsa: mv88e6xxx: Mask g1 interrupts and free interrupt")
> Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
-- 
Florian

^ permalink raw reply

* Re: [PATCH net-next RFC 3/5] vhost: introduce vhost_add_used_idx()
From: Michael S. Tsirkin @ 2017-09-26 19:13 UTC (permalink / raw)
  To: Jason Wang; +Cc: virtualization, netdev, linux-kernel, kvm
In-Reply-To: <1506067355-5771-4-git-send-email-jasowang@redhat.com>

On Fri, Sep 22, 2017 at 04:02:33PM +0800, Jason Wang wrote:
> This patch introduces a helper which just increase the used idx. This
> will be used in pair with vhost_prefetch_desc_indices() by batching
> code.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/vhost/vhost.c | 33 +++++++++++++++++++++++++++++++++
>  drivers/vhost/vhost.h |  1 +
>  2 files changed, 34 insertions(+)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 8424166d..6532cda 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -2178,6 +2178,39 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
>  }
>  EXPORT_SYMBOL_GPL(vhost_add_used);
>  
> +int vhost_add_used_idx(struct vhost_virtqueue *vq, int n)
> +{
> +	u16 old, new;
> +
> +	old = vq->last_used_idx;
> +	new = (vq->last_used_idx += n);
> +	/* If the driver never bothers to signal in a very long while,
> +	 * used index might wrap around. If that happens, invalidate
> +	 * signalled_used index we stored. TODO: make sure driver
> +	 * signals at least once in 2^16 and remove this.
> +	 */
> +	if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
> +		vq->signalled_used_valid = false;
> +
> +	/* Make sure buffer is written before we update index. */
> +	smp_wmb();
> +	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
> +			   &vq->used->idx)) {
> +		vq_err(vq, "Failed to increment used idx");
> +		return -EFAULT;
> +	}
> +	if (unlikely(vq->log_used)) {
> +		/* Log used index update. */
> +		log_write(vq->log_base,
> +			  vq->log_addr + offsetof(struct vring_used, idx),
> +			  sizeof(vq->used->idx));
> +		if (vq->log_ctx)
> +			eventfd_signal(vq->log_ctx, 1);
> +	}
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(vhost_add_used_idx);
> +
>  static int __vhost_add_used_n(struct vhost_virtqueue *vq,
>  			    struct vring_used_elem *heads,
>  			    unsigned count)
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 16c2cb6..5dd6c05 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -199,6 +199,7 @@ int __vhost_get_vq_desc(struct vhost_virtqueue *vq,
>  void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
>  
>  int vhost_vq_init_access(struct vhost_virtqueue *);
> +int vhost_add_used_idx(struct vhost_virtqueue *vq, int n);
>  int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
>  int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
>  		     unsigned count);

Please change the API to hide the fact that there's an index that needs
to be updated.

> -- 
> 2.7.4

^ permalink raw reply

* Re: [PATCH net-next 2/6] bpf: add meta pointer for direct access
From: Jesper Dangaard Brouer @ 2017-09-26 19:13 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: brouer, davem, alexei.starovoitov, john.fastabend,
	peter.waskiewicz.jr, jakub.kicinski, netdev, Andy Gospodarek
In-Reply-To: <458f9c13ab58abb1a15627906d03c33c42b02a7c.1506297988.git.daniel@iogearbox.net>

On Mon, 25 Sep 2017 02:25:51 +0200
Daniel Borkmann <daniel@iogearbox.net> wrote:

> This work enables generic transfer of metadata from XDP into skb. The
> basic idea is that we can make use of the fact that the resulting skb
> must be linear and already comes with a larger headroom for supporting
> bpf_xdp_adjust_head(), which mangles xdp->data. Here, we base our work
> on a similar principle and introduce a small helper bpf_xdp_adjust_meta()
> for adjusting a new pointer called xdp->data_meta. Thus, the packet has
> a flexible and programmable room for meta data, followed by the actual
> packet data. struct xdp_buff is therefore laid out that we first point
> to data_hard_start, then data_meta directly prepended to data followed
> by data_end marking the end of packet. bpf_xdp_adjust_head() takes into
> account whether we have meta data already prepended and if so, memmove()s
> this along with the given offset provided there's enough room.
> 
> [...] The scratch space at the head
> of the packet can be multiple of 4 byte up to 32 byte large. Drivers not
> yet supporting xdp->data_meta can simply be set up with xdp->data_meta
> as xdp->data + 1 as bpf_xdp_adjust_meta() will detect this and bail out,
> such that the subsequent match against xdp->data for later access is
> guaranteed to fail.

So, xdp->meta_data is placed just before the packet xdp->data starts.

I'm currently implementing a cpumap type, that transfers raw XDP frames
to another CPU, and the SKB is allocated on the remote CPU.  (It
actually works extremely well).  

For transferring info I need, I'm currently using xdp->data_hard_start
(the top/start of the xdp page).  Which should be compatible with your
approach, right?

The info I need:

 struct xdp_pkt {
	void *data;
	u16 len;
	u16 headroom;
	struct net_device *dev_rx;
 };

When I enqueue the xdp packet I do the following:

 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
	struct net_device *dev_rx)
 {
	struct xdp_pkt *xdp_pkt;
	int headroom;

	/* Convert xdp_buff to xdp_pkt */
	headroom = xdp->data - xdp->data_hard_start;
	if (headroom < sizeof(*xdp_pkt))
		return -EOVERFLOW;
	xdp_pkt = xdp->data_hard_start;
	xdp_pkt->data = xdp->data;
	xdp_pkt->len  = xdp->data_end - xdp->data;
	xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);

	/* Info needed when constructing SKB on remote CPU */
	xdp_pkt->dev_rx = dev_rx;

	bq_enqueue(rcpu, xdp_pkt);
	return 0;
 }

On the remote CPU dequeueing the packet, I'm doing the following.  As
you can see I'm still lacking some meta-data, that would be nice to
also transfer.  Could I use your infrastructure for that?

 static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
					  struct xdp_pkt *xdp_pkt)
 {
	unsigned int truesize;
	void *pkt_data_start;
	struct sk_buff *skb;

	/* TODO: rcpu could provide truesize, it's static per RX-ring */
	truesize = 2048;

	// pkt_data_start = xdp_pkt + sizeof(*xdp_pkt);
	pkt_data_start = xdp_pkt->data - xdp_pkt->headroom;

	/* Need to adjust "truesize" for skb_shared_info to get proper
	 * placed, to take into account that xdp_pkt is using part of
	 * headroom
	 */
	skb = build_skb(pkt_data_start, truesize - sizeof(*xdp_pkt));
	if (!skb)
		return NULL;

	skb_reserve(skb, xdp_pkt->headroom);
	__skb_put(skb, xdp_pkt->len);

	// skb_record_rx_queue(skb, rx_ring->queue_index);
	skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx);

	// How much does csum matter? 
 //	skb->ip_summed = CHECKSUM_UNNECESSARY; // Try to fake it...

	// Does setting skb_set_hash()) matter?
 //	__skb_set_hash(skb, 42, true, false); // Say it is software
 //	__skb_set_hash(skb, 42, false, true); // Say it is hardware

	// Do we lack setting rx_queue... it doesn't seem to matter
 //	skb_record_rx_queue(skb, 0);

	return skb;
 }

(I'll send out some patches soonish, hopefully tomorrow... to show in
more details what I'm doing)

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* [PATCH iproute2 1/1] ip: initialize FILE pointer in ip-monitor
From: Roman Mashak @ 2017-09-26 19:13 UTC (permalink / raw)
  To: stephen; +Cc: netdev, Roman Mashak

Since FILE *_fp was not explicitly initialized, all the consequent print_*()
calls were failing.

Signed-off-by: Roman Mashak <mrv@mojatatu.com>
---
 ip/ipmonitor.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ip/ipmonitor.c b/ip/ipmonitor.c
index 3171d47..f4d502a 100644
--- a/ip/ipmonitor.c
+++ b/ip/ipmonitor.c
@@ -284,6 +284,9 @@ int do_ipmonitor(int argc, char **argv)
 	if (lnsid) {
 		groups |= nl_mgrp(RTNLGRP_NSID);
 	}
+
+	new_json_obj(json, stdout);
+
 	if (file) {
 		FILE *fp;
 		int err;
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH net-next RFC 2/5] vhost: introduce helper to prefetch desc index
From: Michael S. Tsirkin @ 2017-09-26 19:19 UTC (permalink / raw)
  To: Jason Wang; +Cc: virtualization, netdev, linux-kernel, kvm
In-Reply-To: <1506067355-5771-3-git-send-email-jasowang@redhat.com>

On Fri, Sep 22, 2017 at 04:02:32PM +0800, Jason Wang wrote:
> This patch introduces vhost_prefetch_desc_indices() which could batch
> descriptor indices fetching and used ring updating. This intends to
> reduce the cache misses of indices fetching and updating and reduce
> cache line bounce when virtqueue is almost full. copy_to_user() was
> used in order to benefit from modern cpus that support fast string
> copy. Batched virtqueue processing will be the first user.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/vhost/vhost.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/vhost/vhost.h |  3 +++
>  2 files changed, 58 insertions(+)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index f87ec75..8424166d 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
>  }
>  EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
>  
> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
> +				struct vring_used_elem *heads,
> +				u16 num, bool used_update)

why do you need to combine used update with prefetch?

> +{
> +	int ret, ret2;
> +	u16 last_avail_idx, last_used_idx, total, copied;
> +	__virtio16 avail_idx;
> +	struct vring_used_elem __user *used;
> +	int i;
> +
> +	if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
> +		vq_err(vq, "Failed to access avail idx at %p\n",
> +		       &vq->avail->idx);
> +		return -EFAULT;
> +	}
> +	last_avail_idx = vq->last_avail_idx & (vq->num - 1);
> +	vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
> +	total = vq->avail_idx - vq->last_avail_idx;
> +	ret = total = min(total, num);
> +
> +	for (i = 0; i < ret; i++) {
> +		ret2 = vhost_get_avail(vq, heads[i].id,
> +				      &vq->avail->ring[last_avail_idx]);
> +		if (unlikely(ret2)) {
> +			vq_err(vq, "Failed to get descriptors\n");
> +			return -EFAULT;
> +		}
> +		last_avail_idx = (last_avail_idx + 1) & (vq->num - 1);
> +	}
> +
> +	if (!used_update)
> +		return ret;
> +
> +	last_used_idx = vq->last_used_idx & (vq->num - 1);
> +	while (total) {
> +		copied = min((u16)(vq->num - last_used_idx), total);
> +		ret2 = vhost_copy_to_user(vq,
> +					  &vq->used->ring[last_used_idx],
> +					  &heads[ret - total],
> +					  copied * sizeof(*used));
> +
> +		if (unlikely(ret2)) {
> +			vq_err(vq, "Failed to update used ring!\n");
> +			return -EFAULT;
> +		}
> +
> +		last_used_idx = 0;
> +		total -= copied;
> +	}
> +
> +	/* Only get avail ring entries after they have been exposed by guest. */
> +	smp_rmb();

Barrier before return is a very confusing API. I guess it's designed to
be used in a specific way to make it necessary - but what is it?


> +	return ret;
> +}
> +EXPORT_SYMBOL(vhost_prefetch_desc_indices);
>  
>  static int __init vhost_init(void)
>  {
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 39ff897..16c2cb6 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -228,6 +228,9 @@ ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
>  ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
>  			     struct iov_iter *from);
>  int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled);
> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
> +				struct vring_used_elem *heads,
> +				u16 num, bool used_update);
>  
>  #define vq_err(vq, fmt, ...) do {                                  \
>  		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
> -- 
> 2.7.4

^ permalink raw reply

* Re: [PATCH net-next RFC 5/5] vhost_net: basic tx virtqueue batched processing
From: Michael S. Tsirkin @ 2017-09-26 19:25 UTC (permalink / raw)
  To: Jason Wang; +Cc: virtualization, netdev, linux-kernel, kvm
In-Reply-To: <1506067355-5771-6-git-send-email-jasowang@redhat.com>

On Fri, Sep 22, 2017 at 04:02:35PM +0800, Jason Wang wrote:
> This patch implements basic batched processing of tx virtqueue by
> prefetching desc indices and updating used ring in a batch. For
> non-zerocopy case, vq->heads were used for storing the prefetched
> indices and updating used ring. It is also a requirement for doing
> more batching on top. For zerocopy case and for simplicity, batched
> processing were simply disabled by only fetching and processing one
> descriptor at a time, this could be optimized in the future.
> 
> XDP_DROP (without touching skb) on tun (with Moongen in guest) with
> zercopy disabled:
> 
> Intel(R) Xeon(R) CPU E5-2650 0 @ 2.00GHz:
> Before: 3.20Mpps
> After:  3.90Mpps (+22%)
> 
> No differences were seen with zerocopy enabled.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

So where is the speedup coming from? I'd guess the ring is
hot in cache, it's faster to access it in one go, then
pass many packets to net stack. Is that right?

Another possibility is better code cache locality.

So how about this patchset is refactored:

1. use existing APIs just first get packets then
   transmit them all then use them all
2. add new APIs and move the loop into vhost core
   for more speedups



> ---
>  drivers/vhost/net.c   | 215 ++++++++++++++++++++++++++++----------------------
>  drivers/vhost/vhost.c |   2 +-
>  2 files changed, 121 insertions(+), 96 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index c89640e..c439892 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -408,27 +408,25 @@ static int vhost_net_enable_vq(struct vhost_net *n,
>  	return vhost_poll_start(poll, sock->file);
>  }
>  
> -static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> -				    struct vhost_virtqueue *vq,
> -				    struct iovec iov[], unsigned int iov_size,
> -				    unsigned int *out_num, unsigned int *in_num)
> +static bool vhost_net_tx_avail(struct vhost_net *net,
> +			       struct vhost_virtqueue *vq)
>  {
>  	unsigned long uninitialized_var(endtime);
> -	int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
> -				  out_num, in_num, NULL, NULL);
>  
> -	if (r == vq->num && vq->busyloop_timeout) {
> -		preempt_disable();
> -		endtime = busy_clock() + vq->busyloop_timeout;
> -		while (vhost_can_busy_poll(vq->dev, endtime) &&
> -		       vhost_vq_avail_empty(vq->dev, vq))
> -			cpu_relax();
> -		preempt_enable();
> -		r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
> -				      out_num, in_num, NULL, NULL);
> -	}
> +	if (!vq->busyloop_timeout)
> +		return false;
>  
> -	return r;
> +	if (!vhost_vq_avail_empty(vq->dev, vq))
> +		return true;
> +
> +	preempt_disable();
> +	endtime = busy_clock() + vq->busyloop_timeout;
> +	while (vhost_can_busy_poll(vq->dev, endtime) &&
> +		vhost_vq_avail_empty(vq->dev, vq))
> +		cpu_relax();
> +	preempt_enable();
> +
> +	return !vhost_vq_avail_empty(vq->dev, vq);
>  }
>  
>  static bool vhost_exceeds_maxpend(struct vhost_net *net)
> @@ -446,8 +444,9 @@ static void handle_tx(struct vhost_net *net)
>  {
>  	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
>  	struct vhost_virtqueue *vq = &nvq->vq;
> +	struct vring_used_elem used, *heads = vq->heads;
>  	unsigned out, in;
> -	int head;
> +	int avails, head;
>  	struct msghdr msg = {
>  		.msg_name = NULL,
>  		.msg_namelen = 0,
> @@ -461,6 +460,7 @@ static void handle_tx(struct vhost_net *net)
>  	struct socket *sock;
>  	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
>  	bool zcopy, zcopy_used;
> +	int i, batched = VHOST_NET_BATCH;
>  
>  	mutex_lock(&vq->mutex);
>  	sock = vq->private_data;
> @@ -475,6 +475,12 @@ static void handle_tx(struct vhost_net *net)
>  	hdr_size = nvq->vhost_hlen;
>  	zcopy = nvq->ubufs;
>  
> +	/* Disable zerocopy batched fetching for simplicity */
> +	if (zcopy) {
> +		heads = &used;
> +		batched = 1;
> +	}
> +
>  	for (;;) {
>  		/* Release DMAs done buffers first */
>  		if (zcopy)
> @@ -486,95 +492,114 @@ static void handle_tx(struct vhost_net *net)
>  		if (unlikely(vhost_exceeds_maxpend(net)))
>  			break;
>  
> -		head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
> -						ARRAY_SIZE(vq->iov),
> -						&out, &in);
> +		avails = vhost_prefetch_desc_indices(vq, heads, batched, !zcopy);
>  		/* On error, stop handling until the next kick. */
> -		if (unlikely(head < 0))
> +		if (unlikely(avails < 0))
>  			break;
> -		/* Nothing new?  Wait for eventfd to tell us they refilled. */
> -		if (head == vq->num) {
> +		/* Nothing new?  Busy poll for a while or wait for
> +		 * eventfd to tell us they refilled. */
> +		if (!avails) {
> +			if (vhost_net_tx_avail(net, vq))
> +				continue;
>  			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
>  				vhost_disable_notify(&net->dev, vq);
>  				continue;
>  			}
>  			break;
>  		}
> -		if (in) {
> -			vq_err(vq, "Unexpected descriptor format for TX: "
> -			       "out %d, int %d\n", out, in);
> -			break;
> -		}
> -		/* Skip header. TODO: support TSO. */
> -		len = iov_length(vq->iov, out);
> -		iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
> -		iov_iter_advance(&msg.msg_iter, hdr_size);
> -		/* Sanity check */
> -		if (!msg_data_left(&msg)) {
> -			vq_err(vq, "Unexpected header len for TX: "
> -			       "%zd expected %zd\n",
> -			       len, hdr_size);
> -			break;
> -		}
> -		len = msg_data_left(&msg);
> -
> -		zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
> -				   && (nvq->upend_idx + 1) % UIO_MAXIOV !=
> -				      nvq->done_idx
> -				   && vhost_net_tx_select_zcopy(net);
> -
> -		/* use msg_control to pass vhost zerocopy ubuf info to skb */
> -		if (zcopy_used) {
> -			struct ubuf_info *ubuf;
> -			ubuf = nvq->ubuf_info + nvq->upend_idx;
> -
> -			vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
> -			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
> -			ubuf->callback = vhost_zerocopy_callback;
> -			ubuf->ctx = nvq->ubufs;
> -			ubuf->desc = nvq->upend_idx;
> -			refcount_set(&ubuf->refcnt, 1);
> -			msg.msg_control = ubuf;
> -			msg.msg_controllen = sizeof(ubuf);
> -			ubufs = nvq->ubufs;
> -			atomic_inc(&ubufs->refcount);
> -			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
> -		} else {
> -			msg.msg_control = NULL;
> -			ubufs = NULL;
> -		}
> +		for (i = 0; i < avails; i++) {
> +			head = __vhost_get_vq_desc(vq, vq->iov,
> +						   ARRAY_SIZE(vq->iov),
> +						   &out, &in, NULL, NULL,
> +					       vhost16_to_cpu(vq, heads[i].id));
> +			if (in) {
> +				vq_err(vq, "Unexpected descriptor format for "
> +					   "TX: out %d, int %d\n", out, in);
> +				goto out;
> +			}
>  
> -		total_len += len;
> -		if (total_len < VHOST_NET_WEIGHT &&
> -		    !vhost_vq_avail_empty(&net->dev, vq) &&
> -		    likely(!vhost_exceeds_maxpend(net))) {
> -			msg.msg_flags |= MSG_MORE;
> -		} else {
> -			msg.msg_flags &= ~MSG_MORE;
> -		}
> +			/* Skip header. TODO: support TSO. */
> +			len = iov_length(vq->iov, out);
> +			iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
> +			iov_iter_advance(&msg.msg_iter, hdr_size);
> +			/* Sanity check */
> +			if (!msg_data_left(&msg)) {
> +				vq_err(vq, "Unexpected header len for TX: "
> +					"%zd expected %zd\n",
> +					len, hdr_size);
> +				goto out;
> +			}
> +			len = msg_data_left(&msg);
>  
> -		/* TODO: Check specific error and bomb out unless ENOBUFS? */
> -		err = sock->ops->sendmsg(sock, &msg, len);
> -		if (unlikely(err < 0)) {
> +			zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
> +				     && (nvq->upend_idx + 1) % UIO_MAXIOV !=
> +					nvq->done_idx
> +				     && vhost_net_tx_select_zcopy(net);
> +
> +			/* use msg_control to pass vhost zerocopy ubuf
> +			 * info to skb
> +			 */
>  			if (zcopy_used) {
> -				vhost_net_ubuf_put(ubufs);
> -				nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
> -					% UIO_MAXIOV;
> +				struct ubuf_info *ubuf;
> +				ubuf = nvq->ubuf_info + nvq->upend_idx;
> +
> +				vq->heads[nvq->upend_idx].id =
> +					cpu_to_vhost32(vq, head);
> +				vq->heads[nvq->upend_idx].len =
> +					VHOST_DMA_IN_PROGRESS;
> +				ubuf->callback = vhost_zerocopy_callback;
> +				ubuf->ctx = nvq->ubufs;
> +				ubuf->desc = nvq->upend_idx;
> +				refcount_set(&ubuf->refcnt, 1);
> +				msg.msg_control = ubuf;
> +				msg.msg_controllen = sizeof(ubuf);
> +				ubufs = nvq->ubufs;
> +				atomic_inc(&ubufs->refcount);
> +				nvq->upend_idx =
> +					(nvq->upend_idx + 1) % UIO_MAXIOV;
> +			} else {
> +				msg.msg_control = NULL;
> +				ubufs = NULL;
> +			}
> +
> +			total_len += len;
> +			if (total_len < VHOST_NET_WEIGHT &&
> +				!vhost_vq_avail_empty(&net->dev, vq) &&
> +				likely(!vhost_exceeds_maxpend(net))) {
> +				msg.msg_flags |= MSG_MORE;
> +			} else {
> +				msg.msg_flags &= ~MSG_MORE;
> +			}
> +
> +			/* TODO: Check specific error and bomb out
> +			 * unless ENOBUFS?
> +			 */
> +			err = sock->ops->sendmsg(sock, &msg, len);
> +			if (unlikely(err < 0)) {
> +				if (zcopy_used) {
> +					vhost_net_ubuf_put(ubufs);
> +					nvq->upend_idx =
> +				   ((unsigned)nvq->upend_idx - 1) % UIO_MAXIOV;
> +				}
> +				vhost_discard_vq_desc(vq, 1);
> +				goto out;
> +			}
> +			if (err != len)
> +				pr_debug("Truncated TX packet: "
> +					" len %d != %zd\n", err, len);
> +			if (!zcopy) {
> +				vhost_add_used_idx(vq, 1);
> +				vhost_signal(&net->dev, vq);
> +			} else if (!zcopy_used) {
> +				vhost_add_used_and_signal(&net->dev,
> +							  vq, head, 0);
> +			} else
> +				vhost_zerocopy_signal_used(net, vq);
> +			vhost_net_tx_packet(net);
> +			if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> +				vhost_poll_queue(&vq->poll);
> +				goto out;
>  			}
> -			vhost_discard_vq_desc(vq, 1);
> -			break;
> -		}
> -		if (err != len)
> -			pr_debug("Truncated TX packet: "
> -				 " len %d != %zd\n", err, len);
> -		if (!zcopy_used)
> -			vhost_add_used_and_signal(&net->dev, vq, head, 0);
> -		else
> -			vhost_zerocopy_signal_used(net, vq);
> -		vhost_net_tx_packet(net);
> -		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> -			vhost_poll_queue(&vq->poll);
> -			break;
>  		}
>  	}
>  out:
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 6532cda..8764df5 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -392,7 +392,7 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
>  		vq->indirect = kmalloc(sizeof *vq->indirect * UIO_MAXIOV,
>  				       GFP_KERNEL);
>  		vq->log = kmalloc(sizeof *vq->log * UIO_MAXIOV, GFP_KERNEL);
> -		vq->heads = kmalloc(sizeof *vq->heads * UIO_MAXIOV, GFP_KERNEL);
> +		vq->heads = kzalloc(sizeof *vq->heads * UIO_MAXIOV, GFP_KERNEL);
>  		if (!vq->indirect || !vq->log || !vq->heads)
>  			goto err_nomem;
>  	}
> -- 
> 2.7.4

^ permalink raw reply

* Re: [PATCH net-next RFC 0/5] batched tx processing in vhost_net
From: Michael S. Tsirkin @ 2017-09-26 19:26 UTC (permalink / raw)
  To: Jason Wang; +Cc: virtualization, netdev, linux-kernel, kvm
In-Reply-To: <1506067355-5771-1-git-send-email-jasowang@redhat.com>

On Fri, Sep 22, 2017 at 04:02:30PM +0800, Jason Wang wrote:
> Hi:
> 
> This series tries to implement basic tx batched processing. This is
> done by prefetching descriptor indices and update used ring in a
> batch. This intends to speed up used ring updating and improve the
> cache utilization. Test shows about ~22% improvement in tx pss.

> Please review.
> 
> Jason Wang (5):
>   vhost: split out ring head fetching logic
>   vhost: introduce helper to prefetch desc index
>   vhost: introduce vhost_add_used_idx()

Please squash these new APIs into where they are used.
This split-up just makes review harder for me as
I can't figure out how the new APIs are used.

>   vhost_net: rename VHOST_RX_BATCH to VHOST_NET_BATCH

This is ok as a separate patch.

>   vhost_net: basic tx virtqueue batched processing
> 
>  drivers/vhost/net.c   | 221 ++++++++++++++++++++++++++++----------------------
>  drivers/vhost/vhost.c | 165 +++++++++++++++++++++++++++++++------
>  drivers/vhost/vhost.h |   9 ++
>  3 files changed, 270 insertions(+), 125 deletions(-)
> 
> -- 
> 2.7.4

^ permalink raw reply

* Re: [PATCH v2 2/2] ip_tunnel: add mpls over gre encapsulation
From: Roopa Prabhu @ 2017-09-26 19:31 UTC (permalink / raw)
  To: Amine Kherbouche; +Cc: netdev@vger.kernel.org, xeb, David Lamparter
In-Reply-To: <9f0675b0-cbda-9053-250f-19f3f837b124@6wind.com>

On Tue, Sep 26, 2017 at 10:58 AM, Amine Kherbouche
<amine.kherbouche@6wind.com> wrote:
> Hi Roopa,
>
> Thanks for the feedback, I have just one question:
>
>
> On 09/26/2017 05:15 PM, Roopa Prabhu wrote:
>>>
>>> +static int ipgre_tunnel_encap_add_mpls_ops(void)
>>> > +{
>>> > +       int ret = -1;
>>> > +
>>> > +#if IS_ENABLED(CONFIG_NET_IP_TUNNEL)
>>> > +       ret = ip_tunnel_encap_add_ops(&mpls_iptun_ops,
>>> > TUNNEL_ENCAP_MPLS);
>>> > +#endif
>>> > +
>>> > +       return ret;
>>> > +}
>>> > +
>>> > +static void ipgre_tunnel_encap_del_mpls_ops(void)
>>> > +{
>>> > +#if IS_ENABLED(CONFIG_NET_IP_TUNNEL)
>>> > +       ip_tunnel_encap_del_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS);
>>> > +#endif
>>> > +}
>>> > +
>>> >  static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt,
>>> >                        struct nlmsghdr *nlh, struct net *net, u32
>>> > portid,
>>> >                        unsigned int nlm_flags);
>>> > @@ -2486,6 +2521,10 @@ static int __init mpls_init(void)
>>> >                       0);
>>> >         rtnl_register(PF_MPLS, RTM_GETNETCONF,
>>> > mpls_netconf_get_devconf,
>>> >                       mpls_netconf_dump_devconf, 0);
>>> > +       err = ipgre_tunnel_encap_add_mpls_ops();
>>> > +       if (err)
>>> > +               pr_err("Can't add mpls over gre tunnel ops\n");
>>> > +
>>
>> This will throw an error  if CONFIG_NET_IP_TUNNEL is not enabled.
>> Can you pls put the CONFIG_NET_IP_TUNNEL around
>> ipgre_tunnel_encap_add_mpls_ops ?
>
>
> You want the CONFIG_NET_IP_TUNNEL definition around the declaration of the
> function or when it's called ? or both ? (I can adjust the code for any
> case).

around the declaration like below....,

#if IS_ENABLED(CONFIG_NET_IP_TUNNEL)
static int ipgre_tunnel_encap_add_mpls_ops(void)
{
      return ip_tunnel_encap_add_ops(&mpls_iptun_ops,
                                                            TUNNEL_ENCAP_MPLS);
}

static void ipgre_tunnel_encap_del_mpls_ops(void)
{
      ip_tunnel_encap_del_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS);
}
#else
static int ipgre_tunnel_encap_add_mpls_ops(void)
{
      return 0
}

static void ipgre_tunnel_encap_del_mpls_ops(void)
{
}
#endif

thanks.

^ permalink raw reply

* Re: [PATCH v4 9/9] kasan: rework Kconfig settings
From: Andrey Ryabinin @ 2017-09-26 19:36 UTC (permalink / raw)
  To: Arnd Bergmann, Masahiro Yamada, Michal Marek, Andrew Morton
  Cc: Mauro Carvalho Chehab, Jiri Pirko, Arend van Spriel, Kalle Valo,
	David S. Miller, Alexander Potapenko, Dmitry Vyukov, Kees Cook,
	Geert Uytterhoeven, Greg Kroah-Hartman, linux-media, linux-kernel,
	netdev, linux-wireless, brcm80211-dev-list.pdl,
	brcm80211-dev-list, kasan-dev, linux-kbuild, Jakub Jelinek,
	Martin Liška
In-Reply-To: <20170922212930.620249-10-arnd@arndb.de>

On 09/23/2017 12:29 AM, Arnd Bergmann wrote:
> We get a lot of very large stack frames using gcc-7.0.1 with the default
> -fsanitize-address-use-after-scope --param asan-stack=1 options, which
> can easily cause an overflow of the kernel stack, e.g.
> 
> drivers/gpu/drm/i915/gvt/handlers.c:2407:1: error: the frame size of 31216 bytes is larger than 2048 bytes
> drivers/net/wireless/ralink/rt2x00/rt2800lib.c:5650:1: error: the frame size of 23632 bytes is larger than 2048 bytes
> drivers/scsi/fnic/fnic_trace.c:451:1: error: the frame size of 5152 bytes is larger than 2048 bytes
> fs/btrfs/relocation.c:1202:1: error: the frame size of 4256 bytes is larger than 2048 bytes
> fs/fscache/stats.c:287:1: error: the frame size of 6552 bytes is larger than 2048 bytes
> lib/atomic64_test.c:250:1: error: the frame size of 12616 bytes is larger than 2048 bytes
> mm/vmscan.c:1367:1: error: the frame size of 5080 bytes is larger than 2048 bytes
> net/wireless/nl80211.c:1905:1: error: the frame size of 4232 bytes is larger than 2048 bytes
> 
> To reduce this risk, -fsanitize-address-use-after-scope is now split
> out into a separate CONFIG_KASAN_EXTRA Kconfig option, leading to stack
> frames that are smaller than 2 kilobytes most of the time on x86_64. An
> earlier version of this patch also prevented combining KASAN_EXTRA with
> KASAN_INLINE, but that is no longer necessary with gcc-7.0.1.
> 
> A lot of warnings with KASAN_EXTRA go away if we disable KMEMCHECK,
> as -fsanitize-address-use-after-scope seems to understand the builtin
> memcpy, but adds checking code around an extern memcpy call. I had to work
> around a circular dependency, as DEBUG_SLAB/SLUB depended on !KMEMCHECK,
> while KASAN did it the other way round. Now we handle both the same way
> and make KASAN and KMEMCHECK mutually exclusive.
> 
> All patches to get the frame size below 2048 bytes with CONFIG_KASAN=y
> and CONFIG_KASAN_EXTRA=n have been submitted along with this patch, so
> we can bring back that default now. KASAN_EXTRA=y still causes lots of
> warnings but now defaults to !COMPILE_TEST to disable it in allmodconfig,
> and it remains disabled in all other defconfigs since it is a new option.
> I arbitrarily raise the warning limit for KASAN_EXTRA to 3072 to reduce
> the noise, but an allmodconfig kernel still has around 50 warnings
> on gcc-7.
> 
> I experimented a bit more with smaller stack frames and have another
> follow-up series that reduces the warning limit for 64-bit architectures
> to 1280 bytes (without CONFIG_KASAN).
> 
> With earlier versions of this patch series, I also had patches to
> address the warnings we get with KASAN and/or KASAN_EXTRA, using a
> "noinline_if_stackbloat" annotation. That annotation now got replaced with
> a gcc-8 bugfix (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81715)
> and a workaround for older compilers, which means that KASAN_EXTRA is
> now just as bad as before and will lead to an instant stack overflow in
> a few extreme cases.
> 
> This reverts parts of commit commit 3f181b4 ("lib/Kconfig.debug: disable
> -Wframe-larger-than warnings with KASAN=y").
> 
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>

Acked-by: Andrey Ryabinin <aryabinin@virtuozzo.com>

^ permalink raw reply

* Re: [PATCH net-next 2/6] bpf: add meta pointer for direct access
From: Daniel Borkmann @ 2017-09-26 19:58 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: davem, alexei.starovoitov, john.fastabend, peter.waskiewicz.jr,
	jakub.kicinski, netdev, Andy Gospodarek
In-Reply-To: <20170926211342.0c8e72b0@redhat.com>

On 09/26/2017 09:13 PM, Jesper Dangaard Brouer wrote:
[...]
> I'm currently implementing a cpumap type, that transfers raw XDP frames
> to another CPU, and the SKB is allocated on the remote CPU.  (It
> actually works extremely well).

Meaning you let all the XDP_PASS packets get processed on a
different CPU, so you can reserve the whole CPU just for
prefiltering, right? Do you have some numbers to share at
this point, just curious when you mention it works extremely
well.

> For transferring info I need, I'm currently using xdp->data_hard_start
> (the top/start of the xdp page).  Which should be compatible with your
> approach, right?

Should be possible, yes. More below.

> The info I need:
>
>   struct xdp_pkt {
> 	void *data;
> 	u16 len;
> 	u16 headroom;
> 	struct net_device *dev_rx;
>   };
>
> When I enqueue the xdp packet I do the following:
>
>   int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
> 	struct net_device *dev_rx)
>   {
> 	struct xdp_pkt *xdp_pkt;
> 	int headroom;
>
> 	/* Convert xdp_buff to xdp_pkt */
> 	headroom = xdp->data - xdp->data_hard_start;
> 	if (headroom < sizeof(*xdp_pkt))
> 		return -EOVERFLOW;
> 	xdp_pkt = xdp->data_hard_start;
> 	xdp_pkt->data = xdp->data;
> 	xdp_pkt->len  = xdp->data_end - xdp->data;
> 	xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
>
> 	/* Info needed when constructing SKB on remote CPU */
> 	xdp_pkt->dev_rx = dev_rx;
>
> 	bq_enqueue(rcpu, xdp_pkt);
> 	return 0;
>   }
>
> On the remote CPU dequeueing the packet, I'm doing the following.  As
> you can see I'm still lacking some meta-data, that would be nice to
> also transfer.  Could I use your infrastructure for that?

There could be multiple options to use it, in case you have a
helper where you look up the CPU in the map and would also store
the meta data, you could use a per-CPU scratch buffer similarly
as we do with struct redirect_info, and move that later e.g.
after program return into xdp->data_hard_start pointer. You
could also reserve that upfront potentially, so it's hidden from
the beginning from the program unless you want the program itself
to fill it out (modulo the pointers). Not all drivers currently
leave room though, I've also seen where xdp->data_hard_start
points directly to xdp->data, so there's 0 headroom available to
use; in such case it could either be treated as a hint and for
those drivers where they just pass the skb up the current CPU or
you would need some other means to move the meta data to the
remote CPU, or potentially just use tail room.

Thanks,
Daniel

^ permalink raw reply

* [PATCH] arp: make arp_hdr_len() return unsigned int
From: Alexey Dobriyan @ 2017-09-26 20:12 UTC (permalink / raw)
  To: davem; +Cc: netdev

Negative ARP header length are not a thing.

Constify arguments while I'm at it.

Space savings:

	add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-3 (-3)
	function                        old     new   delta
	arpt_do_table                  1163    1160      -3

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---

 drivers/net/bonding/bond_main.c |    3 ++-
 include/linux/if_arp.h          |    2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2491,7 +2491,8 @@ int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond,
 	struct slave *curr_active_slave, *curr_arp_slave;
 	unsigned char *arp_ptr;
 	__be32 sip, tip;
-	int alen, is_arp = skb->protocol == __cpu_to_be16(ETH_P_ARP);
+	int is_arp = skb->protocol == __cpu_to_be16(ETH_P_ARP);
+	unsigned int alen;
 
 	if (!slave_do_arp_validate(bond, slave)) {
 		if ((slave_do_arp_validate_only(bond) && is_arp) ||
--- a/include/linux/if_arp.h
+++ b/include/linux/if_arp.h
@@ -31,7 +31,7 @@ static inline struct arphdr *arp_hdr(const struct sk_buff *skb)
 	return (struct arphdr *)skb_network_header(skb);
 }
 
-static inline int arp_hdr_len(struct net_device *dev)
+static inline unsigned int arp_hdr_len(const struct net_device *dev)
 {
 	switch (dev->type) {
 #if IS_ENABLED(CONFIG_FIREWIRE_NET)

^ permalink raw reply

* [RFC PATCH 10/11] IP: early demux can return an error code
From: Paolo Abeni @ 2017-09-26 20:18 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Pablo Neira Ayuso, Florian Westphal,
	Eric Dumazet, Hannes Frederic Sowa
In-Reply-To: <cover.1506114055.git.pabeni@redhat.com>

it will used by later patch to cope with unconnected sockets.
Since early demux can do a route lookup and an ipv4 route
lookup can return an error code this is consistent with the
current ipv4 route infrastructure.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
This patch and the next one did not land on the ML previously
due to PEBKAC, appending now to give the complete picture of
this RFC series.
Side note: currently the early demux lookup for mcast sockets
does not perform source address validation and we need (also)
something like this commit to fix the issue without causing
large performance regressions.
---
 include/net/protocol.h |  4 ++--
 include/net/tcp.h      |  2 +-
 include/net/udp.h      |  2 +-
 net/ipv4/ip_input.c    | 25 +++++++++++++++----------
 net/ipv4/tcp_ipv4.c    |  9 +++++----
 net/ipv4/udp.c         | 11 ++++++-----
 6 files changed, 30 insertions(+), 23 deletions(-)

diff --git a/include/net/protocol.h b/include/net/protocol.h
index 65ba335b0e7e..4fc75f7ae23b 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -39,8 +39,8 @@
 
 /* This is used to register protocols. */
 struct net_protocol {
-	void			(*early_demux)(struct sk_buff *skb);
-	void                    (*early_demux_handler)(struct sk_buff *skb);
+	int			(*early_demux)(struct sk_buff *skb);
+	int			(*early_demux_handler)(struct sk_buff *skb);
 	int			(*handler)(struct sk_buff *skb);
 	void			(*err_handler)(struct sk_buff *skb, u32 info);
 	unsigned int		no_policy:1,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 49a8a46466f3..cf0bb918c52d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -345,7 +345,7 @@ void tcp_v4_err(struct sk_buff *skb, u32);
 
 void tcp_shutdown(struct sock *sk, int how);
 
-void tcp_v4_early_demux(struct sk_buff *skb);
+int tcp_v4_early_demux(struct sk_buff *skb);
 int tcp_v4_rcv(struct sk_buff *skb);
 
 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
diff --git a/include/net/udp.h b/include/net/udp.h
index 12dfbfe2e2d7..6c759c8594e2 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -259,7 +259,7 @@ static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags,
 	return __skb_recv_udp(sk, flags, noblock, &peeked, &off, err);
 }
 
-void udp_v4_early_demux(struct sk_buff *skb);
+int udp_v4_early_demux(struct sk_buff *skb);
 bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst);
 int udp_get_port(struct sock *sk, unsigned short snum,
 		 int (*saddr_cmp)(const struct sock *,
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 5690ef09da28..f172be87674f 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -311,9 +311,10 @@ static inline bool ip_rcv_options(struct sk_buff *skb)
 static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	const struct iphdr *iph = ip_hdr(skb);
-	struct rtable *rt;
+	int (*edemux)(struct sk_buff *skb);
 	struct net_device *dev = skb->dev;
-	void (*edemux)(struct sk_buff *skb);
+	struct rtable *rt;
+	int err;
 
 	/* if ingress device is enslaved to an L3 master device pass the
 	 * skb to its handler for processing
@@ -331,7 +332,9 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 
 		ipprot = rcu_dereference(inet_protos[protocol]);
 		if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) {
-			edemux(skb);
+			err = edemux(skb);
+			if (unlikely(err))
+				goto drop_error;
 			/* must reload iph, skb->head might have changed */
 			iph = ip_hdr(skb);
 		}
@@ -342,13 +345,10 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 	 *	how the packet travels inside Linux networking.
 	 */
 	if (!skb_valid_dst(skb)) {
-		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					       iph->tos, dev);
-		if (unlikely(err)) {
-			if (err == -EXDEV)
-				__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
-			goto drop;
-		}
+		err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+					   iph->tos, dev);
+		if (unlikely(err))
+			goto drop_error;
 	}
 
 	/* Since the sk has no reference to the socket, we must
@@ -407,6 +407,11 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 drop:
 	kfree_skb(skb);
 	return NET_RX_DROP;
+
+drop_error:
+	if (err == -EXDEV)
+		__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
+	goto drop;
 }
 
 /*
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d9416b5162bc..85164d4d3e53 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1503,23 +1503,23 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(tcp_v4_do_rcv);
 
-void tcp_v4_early_demux(struct sk_buff *skb)
+int tcp_v4_early_demux(struct sk_buff *skb)
 {
 	const struct iphdr *iph;
 	const struct tcphdr *th;
 	struct sock *sk;
 
 	if (skb->pkt_type != PACKET_HOST)
-		return;
+		return 0;
 
 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
-		return;
+		return 0;
 
 	iph = ip_hdr(skb);
 	th = tcp_hdr(skb);
 
 	if (th->doff < sizeof(struct tcphdr) / 4)
-		return;
+		return 0;
 
 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
 				       iph->saddr, th->source,
@@ -1538,6 +1538,7 @@ void tcp_v4_early_demux(struct sk_buff *skb)
 				skb_dst_set_noref(skb, dst);
 		}
 	}
+	return 0;
 }
 
 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5cbbd78024dc..b7202a15f360 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2215,7 +2215,7 @@ void udp_set_skb_rx_dst(struct sock *sk, struct sk_buff *skb, u32 cookie)
 }
 EXPORT_SYMBOL_GPL(udp_set_skb_rx_dst);
 
-void udp_v4_early_demux(struct sk_buff *skb)
+int udp_v4_early_demux(struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
 	int dif = skb->dev->ifindex;
@@ -2227,7 +2227,7 @@ void udp_v4_early_demux(struct sk_buff *skb)
 
 	/* validate the packet */
 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
-		return;
+		return 0;
 
 	iph = ip_hdr(skb);
 	uh = udp_hdr(skb);
@@ -2237,14 +2237,14 @@ void udp_v4_early_demux(struct sk_buff *skb)
 		struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 
 		if (!in_dev)
-			return;
+			return 0;
 
 		/* we are supposed to accept bcast packets */
 		if (skb->pkt_type == PACKET_MULTICAST) {
 			ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
 					       iph->protocol);
 			if (!ours)
-				return;
+				return 0;
 		}
 
 		sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
@@ -2256,11 +2256,12 @@ void udp_v4_early_demux(struct sk_buff *skb)
 	}
 
 	if (!sk)
-		return;
+		return 0;
 
 	skb_set_noref_sk(skb, sk);
 	if (udp_use_rx_dst_cache(sk, skb))
 		udp_set_skb_rx_dst(sk, skb, 0);
+	return 0;
 }
 
 int udp_rcv(struct sk_buff *skb)
-- 
2.13.5

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox