Netdev List
 help / color / mirror / Atom feed
* Re: [net-next, PATCH 2/2, v1] net: socionext: add AF_XDP support
From: Ilias Apalodimas @ 2018-09-10 16:21 UTC (permalink / raw)
  To: Toshiaki Makita
  Cc: netdev, jaswinder.singh, ard.biesheuvel, masami.hiramatsu, arnd,
	mykyta.iziumtsev, bjorn.topel, magnus.karlsson
In-Reply-To: <8bfd8219-acea-8b63-b6be-d17a7e3b6e24@lab.ntt.co.jp>

> > @@ -707,6 +731,26 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget)
> >  		if (unlikely(!buf_addr))
> >  			break;
> >  
> > +		if (xdp_prog) {
> > +			xdp_result = netsec_run_xdp(desc, priv, xdp_prog,
> > +						    pkt_len);
> > +			if (xdp_result != NETSEC_XDP_PASS) {
> > +				xdp_flush |= xdp_result & NETSEC_XDP_REDIR;
> > +
> > +				dma_unmap_single_attrs(priv->dev,
> > +						       desc->dma_addr,
> > +						       desc->len, DMA_TO_DEVICE,
> > +						       DMA_ATTR_SKIP_CPU_SYNC);
> > +
> > +				desc->len = desc_len;
> > +				desc->dma_addr = dma_handle;
> > +				desc->addr = buf_addr;
> > +				netsec_rx_fill(priv, idx, 1);
> > +				nsetsec_adv_desc(&dring->tail);
> > +			}
> > +			continue;
> 
> Continue even on XDP_PASS? Is this really correct?
> 
> Also seems there is no handling of adjust_head/tail for XDP_PASS case.
> 
A question on this. Should XDP related frames be allocated using 1 page
per packet?

Thanks

Ilias

^ permalink raw reply

* Re: [PATCH] net: ipv4: Use BUG_ON directly instead of a if condition followed by BUG
From: kbuild test robot @ 2018-09-10 21:20 UTC (permalink / raw)
  To: zhong jiang
  Cc: kbuild-all, davem, edumazet, kuznet, yoshfuji, netdev,
	linux-kernel
In-Reply-To: <1536590282-23899-1-git-send-email-zhongjiang@huawei.com>

[-- Attachment #1: Type: text/plain, Size: 5670 bytes --]

Hi zhong,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on net/master]
[also build test ERROR on v4.19-rc3 next-20180910]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/zhong-jiang/net-ipv4-Use-BUG_ON-directly-instead-of-a-if-condition-followed-by-BUG/20180911-034152
config: mips-rt305x_defconfig (attached as .config)
compiler: mipsel-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=7.2.0 make.cross ARCH=mips 

All errors (new ones prefixed by >>):

   net/ipv4/tcp_input.c: In function 'tcp_collapse':
>> net/ipv4/tcp_input.c:4924:5: error: too many arguments to function 'BUG'
        BUG(skb_copy_bits(skb, offset,
        ^~~
   In file included from include/linux/bug.h:5:0,
                    from include/linux/mmdebug.h:5,
                    from include/linux/mm.h:9,
                    from net/ipv4/tcp_input.c:67:
   arch/mips/include/asm/bug.h:12:31: note: declared here
    static inline void __noreturn BUG(void)
                                  ^~~
   net/ipv4/tcp_input.c: In function 'tcp_urg':
   net/ipv4/tcp_input.c:5318:4: error: too many arguments to function 'BUG'
       BUG(skb_copy_bits(skb, ptr, &tmp, 1));
       ^~~
   In file included from include/linux/bug.h:5:0,
                    from include/linux/mmdebug.h:5,
                    from include/linux/mm.h:9,
                    from net/ipv4/tcp_input.c:67:
   arch/mips/include/asm/bug.h:12:31: note: declared here
    static inline void __noreturn BUG(void)
                                  ^~~

vim +/BUG +4924 net/ipv4/tcp_input.c

  4838	
  4839	/* Collapse contiguous sequence of skbs head..tail with
  4840	 * sequence numbers start..end.
  4841	 *
  4842	 * If tail is NULL, this means until the end of the queue.
  4843	 *
  4844	 * Segments with FIN/SYN are not collapsed (only because this
  4845	 * simplifies code)
  4846	 */
  4847	static void
  4848	tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
  4849		     struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end)
  4850	{
  4851		struct sk_buff *skb = head, *n;
  4852		struct sk_buff_head tmp;
  4853		bool end_of_skbs;
  4854	
  4855		/* First, check that queue is collapsible and find
  4856		 * the point where collapsing can be useful.
  4857		 */
  4858	restart:
  4859		for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
  4860			n = tcp_skb_next(skb, list);
  4861	
  4862			/* No new bits? It is possible on ofo queue. */
  4863			if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
  4864				skb = tcp_collapse_one(sk, skb, list, root);
  4865				if (!skb)
  4866					break;
  4867				goto restart;
  4868			}
  4869	
  4870			/* The first skb to collapse is:
  4871			 * - not SYN/FIN and
  4872			 * - bloated or contains data before "start" or
  4873			 *   overlaps to the next one.
  4874			 */
  4875			if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
  4876			    (tcp_win_from_space(sk, skb->truesize) > skb->len ||
  4877			     before(TCP_SKB_CB(skb)->seq, start))) {
  4878				end_of_skbs = false;
  4879				break;
  4880			}
  4881	
  4882			if (n && n != tail &&
  4883			    TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
  4884				end_of_skbs = false;
  4885				break;
  4886			}
  4887	
  4888			/* Decided to skip this, advance start seq. */
  4889			start = TCP_SKB_CB(skb)->end_seq;
  4890		}
  4891		if (end_of_skbs ||
  4892		    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
  4893			return;
  4894	
  4895		__skb_queue_head_init(&tmp);
  4896	
  4897		while (before(start, end)) {
  4898			int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
  4899			struct sk_buff *nskb;
  4900	
  4901			nskb = alloc_skb(copy, GFP_ATOMIC);
  4902			if (!nskb)
  4903				break;
  4904	
  4905			memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
  4906	#ifdef CONFIG_TLS_DEVICE
  4907			nskb->decrypted = skb->decrypted;
  4908	#endif
  4909			TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
  4910			if (list)
  4911				__skb_queue_before(list, skb, nskb);
  4912			else
  4913				__skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
  4914			skb_set_owner_r(nskb, sk);
  4915	
  4916			/* Copy data, releasing collapsed skbs. */
  4917			while (copy > 0) {
  4918				int offset = start - TCP_SKB_CB(skb)->seq;
  4919				int size = TCP_SKB_CB(skb)->end_seq - start;
  4920	
  4921				BUG_ON(offset < 0);
  4922				if (size > 0) {
  4923					size = min(copy, size);
> 4924					BUG(skb_copy_bits(skb, offset,
  4925							  skb_put(nskb, size), size));
  4926					TCP_SKB_CB(nskb)->end_seq += size;
  4927					copy -= size;
  4928					start += size;
  4929				}
  4930				if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
  4931					skb = tcp_collapse_one(sk, skb, list, root);
  4932					if (!skb ||
  4933					    skb == tail ||
  4934					    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
  4935						goto end;
  4936	#ifdef CONFIG_TLS_DEVICE
  4937					if (skb->decrypted != nskb->decrypted)
  4938						goto end;
  4939	#endif
  4940				}
  4941			}
  4942		}
  4943	end:
  4944		skb_queue_walk_safe(&tmp, skb, n)
  4945			tcp_rbtree_insert(root, skb);
  4946	}
  4947	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 14248 bytes --]

^ permalink raw reply

* Re: [PATCH net-next] net/ipv6: Remove rt6i_prefsrc
From: David Miller @ 2018-09-10 17:02 UTC (permalink / raw)
  To: dsahern; +Cc: netdev, lucien.xin, dsahern
In-Reply-To: <20180910161128.25520-1-dsahern@kernel.org>

From: dsahern@kernel.org
Date: Mon, 10 Sep 2018 09:11:28 -0700

> From: David Ahern <dsahern@gmail.com>
> 
> After the conversion to fib6_info, rt6i_prefsrc has a single user that
> reads the value and otherwise it is only set. The one reader can be
> converted to use rt->from so rt6i_prefsrc can be removed, reducing
> rt6_info by another 20 bytes.
> 
> Signed-off-by: David Ahern <dsahern@gmail.com>

Applied, thanks David.

^ permalink raw reply

* Re: unexpected GRO/veth behavior
From: Eric Dumazet @ 2018-09-10 17:06 UTC (permalink / raw)
  To: Paolo Abeni, Eric Dumazet, netdev; +Cc: Toshiaki Makita
In-Reply-To: <c5be74086876ce96353cb79e6486df321d58d48d.camel@redhat.com>



On 09/10/2018 08:22 AM, Paolo Abeni wrote:
 in this already heavy cost engine.
> 
> Yup, even if I do not see any measurable cost added by the posted code.

Sure, microbenchmarks won't show anything.

Now, if GRO receives one packet every 100 usec, as many hosts in the wild do,
there is an additional cost because of icache being wasted.

^ permalink raw reply

* [PATCH net-next v1] net/tls: Fixed return value when tls_complete_pending_work() fails
From: Vakul Garg @ 2018-09-10 17:23 UTC (permalink / raw)
  To: netdev; +Cc: borisp, aviadye, davejwatson, davem, doronrk, Vakul Garg

In tls_sw_sendmsg() and tls_sw_sendpage(), the variable 'ret' is now
set to the return value of tls_complete_pending_work(). This allows
the proper error code to be returned if tls_complete_pending_work() fails.

Fixes: 3c4d7559159b ("tls: kernel TLS support")
Signed-off-by: Vakul Garg <vakul.garg@nxp.com>
---
 net/tls/tls_sw.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index be4f2e990f9f..adab598bd6db 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -486,7 +486,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
-	int ret = 0;
+	int ret;
 	int required_size;
 	long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
 	bool eor = !(msg->msg_flags & MSG_MORE);
@@ -502,7 +502,8 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 
 	lock_sock(sk);
 
-	if (tls_complete_pending_work(sk, tls_ctx, msg->msg_flags, &timeo))
+	ret = tls_complete_pending_work(sk, tls_ctx, msg->msg_flags, &timeo);
+	if (ret)
 		goto send_end;
 
 	if (unlikely(msg->msg_controllen)) {
@@ -637,7 +638,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
-	int ret = 0;
+	int ret;
 	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 	bool eor;
 	size_t orig_size = size;
@@ -657,7 +658,8 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
 
 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
-	if (tls_complete_pending_work(sk, tls_ctx, flags, &timeo))
+	ret = tls_complete_pending_work(sk, tls_ctx, flags, &timeo);
+	if (ret)
 		goto sendpage_end;
 
 	/* Call the sk_stream functions to manage the sndbuf mem. */
-- 
2.13.6

^ permalink raw reply related

* Re: [PATCH net-next v2] net: sched: cls_flower: dump offload count value
From: David Miller @ 2018-09-10 17:35 UTC (permalink / raw)
  To: vladbu; +Cc: netdev, jakub.kicinski, jhs, xiyou.wangcong, jiri
In-Reply-To: <1536330141-10354-1-git-send-email-vladbu@mellanox.com>

From: Vlad Buslov <vladbu@mellanox.com>
Date: Fri,  7 Sep 2018 17:22:21 +0300

> Change flower in_hw_count type to fixed-size u32 and dump it as
> TCA_FLOWER_IN_HW_COUNT. This change is necessary to properly test shared
> blocks and re-offload functionality.
> 
> Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
> Acked-by: Jiri Pirko <jiri@mellanox.com>

Applied, thank you.

^ permalink raw reply

* Re: [Patch net-next] net_sched: remove redundant qdisc lock classes
From: David Miller @ 2018-09-10 17:44 UTC (permalink / raw)
  To: xiyou.wangcong; +Cc: netdev, jiri, jhs
In-Reply-To: <20180907202914.21331-1-xiyou.wangcong@gmail.com>

From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Fri,  7 Sep 2018 13:29:13 -0700

> We no longer take any spinlock on RX path for ingress qdisc,
> so this lockdep annotation is no longer needed.
> 
> Cc: Jamal Hadi Salim <jhs@mojatatu.com>
> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>

Applied.

^ permalink raw reply

* Re: [Patch net-next] htb: use anonymous union for simplicity
From: David Miller @ 2018-09-10 17:44 UTC (permalink / raw)
  To: xiyou.wangcong; +Cc: netdev, jiri, jhs
In-Reply-To: <20180907202914.21331-2-xiyou.wangcong@gmail.com>

From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Fri,  7 Sep 2018 13:29:14 -0700

> cl->leaf.q is slightly more readable than cl->un.leaf.q.
> 
> Cc: Jamal Hadi Salim <jhs@mojatatu.com>
> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>

Applied.

^ permalink raw reply

* Re: [PATCH net] qmi_wwan: Support dynamic config on Quectel EP06
From: David Miller @ 2018-09-10 17:49 UTC (permalink / raw)
  To: kristian.evensen; +Cc: netdev, bjorn
In-Reply-To: <20180908115048.12667-1-kristian.evensen@gmail.com>

From: Kristian Evensen <kristian.evensen@gmail.com>
Date: Sat,  8 Sep 2018 13:50:48 +0200

> Quectel EP06 (and EM06/EG06) supports dynamic configuration of USB
> interfaces, without the device changing VID/PID or configuration number.
> When the configuration is updated and interfaces are added/removed, the
> interface numbers change. This means that the current code for matching
> EP06 does not work.
> 
> This patch removes the current EP06 interface number match, and replaces
> it with a match on class, subclass and protocol. Unfortunately, matching
> on those three alone is not enough, as the diag interface exports the
> same values as QMI. The other serial interfaces + adb export different
> values and do not match.
> 
> The diag interface only has two endpoints, while the QMI interface has
> three. I have therefore added a check for number of endpoints, and we
> ignore the interface if the number of endpoints equals two.
> 
> Signed-off-by: Kristian Evensen <kristian.evensen@gmail.com>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH net] ipv6: use rt6_info members when dst is set in rt6_fill_node
From: Xin Long @ 2018-09-10 17:55 UTC (permalink / raw)
  To: David Ahern; +Cc: network dev, davem, Roopa Prabhu
In-Reply-To: <b13be408-a734-e959-3299-bdd49a1318e7@cumulusnetworks.com>

On Tue, Sep 11, 2018 at 12:13 AM David Ahern <dsa@cumulusnetworks.com> wrote:
>
> On 9/9/18 12:29 AM, Xin Long wrote:
> >>> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> >>> index 18e00ce..e554922 100644
> >>> --- a/net/ipv6/route.c
> >>> +++ b/net/ipv6/route.c
> >>> @@ -4670,20 +4670,33 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
> >>>                        int iif, int type, u32 portid, u32 seq,
> >>>                        unsigned int flags)
> >>>  {
> >>> -     struct rtmsg *rtm;
> >>> +     struct rt6key *fib6_prefsrc, *fib6_dst, *fib6_src;
> >>> +     struct rt6_info *rt6 = (struct rt6_info *)dst;
> >>> +     u32 *pmetrics, table, fib6_flags;
> >>>       struct nlmsghdr *nlh;
> >>> +     struct rtmsg *rtm;
> >>>       long expires = 0;
> >>> -     u32 *pmetrics;
> >>> -     u32 table;
> >>>
> >>>       nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
> >>>       if (!nlh)
> >>>               return -EMSGSIZE;
> >>>
> >>> +     if (rt6) {
> >>> +             fib6_dst = &rt6->rt6i_dst;
> >>> +             fib6_src = &rt6->rt6i_src;
> >>> +             fib6_flags = rt6->rt6i_flags;
> >>> +             fib6_prefsrc = &rt6->rt6i_prefsrc;
> >>> +     } else {
> >>> +             fib6_dst = &rt->fib6_dst;
> >>> +             fib6_src = &rt->fib6_src;
> >>> +             fib6_flags = rt->fib6_flags;
> >>> +             fib6_prefsrc = &rt->fib6_prefsrc;
> >>> +     }
> >>
> >> Unless I am missing something at the moment, an rt6_info can only have
> >> the same dst, src and prefsrc as the fib6_info on which it is based.
> >> Thus, only the flags is needed above. That simplifies this patch a lot.
> > If dst, src and prefsrc in rt6_info are always the same as these in fib6_info,
> > why do we need them in rt6_info? we could just get it by 'from'.
> >
>
> I just sent a patch removing rt6i_prefsrc. It is set with only 1 reader
> that can be converted.
>
> rt6i_src is checked against the fib6_info to invalidate a dst if the src
> has changed, so a valid rt will always have the same rt6i_src as the
> rt->from.
>
> rt6i_dst is set to the dest address / 128 in cases, so it should be used
> for rt6_info cases above.
So that means, I will use rt6i_dst and rt6i_flags when dst is set?
how about I use rt6i_src there as well? just to make it look clear.
and plus the gw/nh dump fix in rt6_fill_node():
-        if (rt->fib6_nsiblings) {
+        if (rt6) {
+                if (fib6_flags & RTF_GATEWAY)
+                        if (nla_put_in6_addr(skb, RTA_GATEWAY,
+                                             &rt6->rt6i_gateway) < 0)
+                                goto nla_put_failure;
+
+                if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
+                        goto nla_put_failure;
+        } else if (rt->fib6_nsiblings) {
                 struct fib6_info *sibling, *next_sibling;
                 struct nlattr *mp;

looks good to you?

^ permalink raw reply

* Re: [PATCH can-next] can: ucan: remove duplicated include from ucan.c
From: Martin Elshuber @ 2018-09-10 18:10 UTC (permalink / raw)
  To: YueHaibing, Wolfgang Grandegger, Marc Kleine-Budde,
	David S. Miller, Jakob Unterwurzacher, Philipp Tomsich
  Cc: linux-can, netdev, kernel-janitors
In-Reply-To: <1535505945-143347-1-git-send-email-yuehaibing@huawei.com>


[-- Attachment #1.1: Type: text/plain, Size: 811 bytes --]

Thank you for the fix

Reviewed-by: Martin Elshuber <martin.elshuber@theobroma-systems.com>

Am 29.08.18 um 03:25 schrieb YueHaibing:
> Remove duplicated include.
> 
> Signed-off-by: YueHaibing <yuehaibing@huawei.com>
> ---
>  drivers/net/can/usb/ucan.c | 4 ----
>  1 file changed, 4 deletions(-)
> 
> diff --git a/drivers/net/can/usb/ucan.c b/drivers/net/can/usb/ucan.c
> index 0678a38..c6f4b41 100644
> --- a/drivers/net/can/usb/ucan.c
> +++ b/drivers/net/can/usb/ucan.c
> @@ -35,10 +35,6 @@
>  #include <linux/slab.h>
>  #include <linux/usb.h>
>  
> -#include <linux/can.h>
> -#include <linux/can/dev.h>
> -#include <linux/can/error.h>
> -
>  #define UCAN_DRIVER_NAME "ucan"
>  #define UCAN_MAX_RX_URBS 8
>  /* the CAN controller needs a while to enable/disable the bus */
> 



[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 842 bytes --]

^ permalink raw reply

* Re: [net-next, v2, 1/2] net: stmmac: Rework coalesce timer and fix multi-queue races
From: Neil Armstrong @ 2018-09-10 18:15 UTC (permalink / raw)
  To: Jose Abreu, netdev
  Cc: Jerome Brunet, Martin Blumenstingl, David S. Miller, Joao Pinto,
	Giuseppe Cavallaro, Alexandre Torgue
In-Reply-To: <5d7c2397-8ca9-1f9c-c394-f73a12456384@synopsys.com>

Hi Jose,

On 10/09/2018 18:21, Jose Abreu wrote:
> On 10-09-2018 16:49, Neil Armstrong wrote:
>> Hi Jose,
>>
>> On 10/09/2018 16:44, Jose Abreu wrote:
>>> On 10-09-2018 14:46, Neil Armstrong wrote:
>>>> hi Jose,
>>>>
>>>> On 10/09/2018 14:55, Jose Abreu wrote:
>>>>> On 10-09-2018 13:52, Jose Abreu wrote:
>>>>>> Can you please try attached follow-up patch ? 
>>>>> Oh, please apply the whole series otherwise this will not apply
>>>>> cleanly.
>>>> Indeed, it helps!
>>>>
>>>> With the fixups, it fails later, around 15s instead of 3, in RX and TX.
>>> Thanks for testing Neil. What if we keep rearming the timer
>>> whilst there are pending packets ? Something like in the attach.
>>> (applies on top of previous one).
>> It fixes RX, but TX fails after ~13s.
> 
> Ok :(
> 
> Can you please try attached follow-up patch ?

RX is still ok but now TX fails almost immediately...

With 100ms report :

$ iperf3 -c 192.168.1.47 -t 0 -p 5202 -R -i 0.1
Connecting to host 192.168.1.47, port 5202
Reverse mode, remote host 192.168.1.47 is sending
[  4] local 192.168.1.45 port 45900 connected to 192.168.1.47 port 5202
[ ID] Interval           Transfer     Bandwidth
[  4]   0.00-0.10   sec  10.9 MBytes   913 Mbits/sec
[  4]   0.10-0.20   sec  11.0 MBytes   923 Mbits/sec
[  4]   0.20-0.30   sec  6.34 MBytes   532 Mbits/sec
[  4]   0.30-0.40   sec  0.00 Bytes  0.00 bits/sec
[  4]   0.40-0.50   sec  0.00 Bytes  0.00 bits/sec
[  4]   0.50-0.60   sec  0.00 Bytes  0.00 bits/sec
[  4]   0.60-0.70   sec  0.00 Bytes  0.00 bits/sec
[  4]   0.70-0.80   sec  0.00 Bytes  0.00 bits/sec
[  4]   0.80-0.90   sec  0.00 Bytes  0.00 bits/sec
[  4]   0.90-1.00   sec  0.00 Bytes  0.00 bits/sec
[  4]   1.00-1.10   sec  0.00 Bytes  0.00 bits/sec
^C[  4]   1.10-1.10   sec  0.00 Bytes  0.00 bits/sec
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bandwidth
[  4]   0.00-1.10   sec  0.00 Bytes  0.00 bits/sec                  sender
[  4]   0.00-1.10   sec  28.2 MBytes   214 Mbits/sec                  receiver
iperf3: interrupt - the client has terminated

Neil

> 
> I'm so sorry about this back and forth and I appreciate all your
> help .
> 
> Thanks and Best Regards,
> Jose Miguel Abreu
> 
> 
>>
>> Neil
>>
>>> Thanks and Best Regards,
>>> Jose Miguel Abreu
>>>
> 

^ permalink raw reply

* Re: [PATCH net-next v2 2/2] net: stmmac: Fixup the tail addr setting in xmit path
From: Florian Fainelli @ 2018-09-10 18:46 UTC (permalink / raw)
  To: Jose Abreu, netdev
  Cc: David S. Miller, Joao Pinto, Giuseppe Cavallaro, Alexandre Torgue
In-Reply-To: <2b430bedf98176f052e1530004ab623d26d2c71a.1536570319.git.joabreu@synopsys.com>

On 09/10/2018 02:14 AM, Jose Abreu wrote:
> Currently we are always setting the tail address of descriptor list to
> the end of the pre-allocated list.
> 
> According to databook this is not correct. Tail address should point to
> the last available descriptor + 1, which means we have to update the
> tail address every time we call the xmit function.
> 
> This should make no impact in older versions of MAC but in newer
> versions there are some DMA features which allow the IP to fetch
> descriptors in advance and in a non-sequential order so it's critical
> that we set the tail address correctly.


Can you include the appropriate Fixes tag here so this can easily be
backported to relevant stable branches?
-- 
Florian

^ permalink raw reply

* Re: Corrupted sit-tunnelled packets when using skb_gso_segment() on an IFB interface?
From: Eric Dumazet @ 2018-09-10 18:52 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, netdev; +Cc: cake
In-Reply-To: <87in3djq45.fsf@toke.dk>



On 09/10/2018 09:04 AM, Toke Høiland-Jørgensen wrote:
> Hi everyone
> 
> While investigating a bug report on CAKE[0], I've run into the following
> behaviour:
> 
> When running CAKE as an ingress shaper on an IFB interface, if the GSO
> splitting feature is turned on, TCP throughput will drop dramatically on
> 6in4 (sit) tunnels running over the interface in question. Looking at a
> traffic dump, I'm seeing ~15% packet loss on the encapsulated TCP
> stream.
> 
> IPv4 traffic is fine on the same interface, as is native IPv6 traffic.
> And turning off GSO splitting in CAKE makes the packet loss go away. The
> issue only seems to appear on IFB interfaces. So I'm wondering if there
> is some interaction that corrupts packets when they are being split in
> this configuration?
> 
> Steps to reproduce (assuming the box you are running on has IP 10.0.0.2
> on eth0, and has a peer at 10.0.0.1 with a suitably configured sit
> tunnel):
> 
> # modprobe ifb
> # ip link set dev ifb0 up
> # tc qdisc add dev eth0 handle ffff: ingress
> # tc filter add dev eth0 parent ffff: protocol all prio 10 matchall action mirred egress redirect dev ifb0
> # tc qdisc replace dev ifb0 root cake
> # ip link add type sit local 10.0.0.2 remote 10.0.0.1
> # ip link set dev sit1 up
> # netperf -H fe80::a00:1%sit1 -t TCP_MAERTS
> 
> Whereas, in the same setup, this will work fine:
> 
> # netperf -H 10.0.0.1 -t TCP_MAERTS
> 
> As will this:
> 
> # tc qdisc replace dev ifb0 root cake no-split-gso
> # netperf -H fe80::a00:1%sit1 -t TCP_MAERTS
> 
> 
> Does anyone have any ideas? :)
> 

My guess is that skb->mac_len is not properly updated in the segments (compared to the original GSO packet)

^ permalink raw reply

* Fw: [Bug 201071] New: Creating a vxlan in state 'up' does not give proper RTM_NEWLINK message
From: Stephen Hemminger @ 2018-09-10 18:55 UTC (permalink / raw)
  To: Roopa Prabhu; +Cc: netdev



Begin forwarded message:

Date: Mon, 10 Sep 2018 04:04:37 +0000
From: bugzilla-daemon@bugzilla.kernel.org
To: stephen@networkplumber.org
Subject: [Bug 201071] New: Creating a vxlan in state 'up' does not give proper RTM_NEWLINK message


https://bugzilla.kernel.org/show_bug.cgi?id=201071

            Bug ID: 201071
           Summary: Creating a vxlan in state 'up' does not give proper
                    RTM_NEWLINK message
           Product: Networking
           Version: 2.5
    Kernel Version: 4.19-rc1
          Hardware: All
                OS: Linux
              Tree: Mainline
            Status: NEW
          Severity: normal
          Priority: P1
         Component: Other
          Assignee: stephen@networkplumber.org
          Reporter: liam.mcbirnie@boeing.com
        Regression: Yes

If a vxlan is created with state 'up', the RTM_NEWLINK message shows the state
as down, and no other netlink messages are sent.
As a result, processes listening to netlink are never notified that the vxlan
link is up.

eg.
# ip link add test up type vxlan id 8 group 224.224.224.224 dev eth0

Output of ip monitor link
# 4: test: <BROADCAST,MULTICAST> mtu 1450 qdisc noop state DOWN group default
      link/ether ee:cd:97:1a:cf:91 brd ff:ff:ff:ff:ff:ff

Output of ip link show (expected from netlink message)
# 4: test: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1450 qdisc noqueue state
UNKNOWN group default qlen 1000
      link/ether ee:cd:97:1a:cf:91 brd ff:ff:ff:ff:ff:ff

This is a regression introduced by the following patch series.
https://patchwork.ozlabs.org/patch/947181/

-- 
You are receiving this mail because:
You are the assignee for the bug.

^ permalink raw reply

* Fw: [Bug 201063] New: kernel panic on heavy network use
From: Stephen Hemminger @ 2018-09-10 18:56 UTC (permalink / raw)
  To: netdev



Begin forwarded message:

Date: Sun, 09 Sep 2018 13:45:28 +0000
From: bugzilla-daemon@bugzilla.kernel.org
To: stephen@networkplumber.org
Subject: [Bug 201063] New: kernel panic on heavy network use


https://bugzilla.kernel.org/show_bug.cgi?id=201063

            Bug ID: 201063
           Summary: kernel panic on heavy network use
           Product: Networking
           Version: 2.5
    Kernel Version: 4.19rc2
          Hardware: All
                OS: Linux
              Tree: Mainline
            Status: NEW
          Severity: normal
          Priority: P1
         Component: Other
          Assignee: stephen@networkplumber.org
          Reporter: oyvinds@everdot.org
        Regression: No

Created attachment 278379
  --> https://bugzilla.kernel.org/attachment.cgi?id=278379&action=edit  
RIP: native_smp_send_rechedule, what did they mean by this

kernel panics, it seems to happen when there is heavy network traffic going
through that box. no nothing in logs, took picture of screen with kernel panic,
it is attached

-- 
You are receiving this mail because:
You are the assignee for the bug.

^ permalink raw reply

* Re: [PATCH net] ipv6: use rt6_info members when dst is set in rt6_fill_node
From: David Ahern @ 2018-09-10 19:07 UTC (permalink / raw)
  To: Xin Long; +Cc: network dev, davem, Roopa Prabhu
In-Reply-To: <CADvbK_fJCA9PUqzsuFjVYgMtfM1tvtq32=OU90PPykq8CPP=PA@mail.gmail.com>

On 9/10/18 11:55 AM, Xin Long wrote:
> On Tue, Sep 11, 2018 at 12:13 AM David Ahern <dsa@cumulusnetworks.com> wrote:
>>
>> On 9/9/18 12:29 AM, Xin Long wrote:
>>>>> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
>>>>> index 18e00ce..e554922 100644
>>>>> --- a/net/ipv6/route.c
>>>>> +++ b/net/ipv6/route.c
>>>>> @@ -4670,20 +4670,33 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
>>>>>                        int iif, int type, u32 portid, u32 seq,
>>>>>                        unsigned int flags)
>>>>>  {
>>>>> -     struct rtmsg *rtm;
>>>>> +     struct rt6key *fib6_prefsrc, *fib6_dst, *fib6_src;
>>>>> +     struct rt6_info *rt6 = (struct rt6_info *)dst;
>>>>> +     u32 *pmetrics, table, fib6_flags;
>>>>>       struct nlmsghdr *nlh;
>>>>> +     struct rtmsg *rtm;
>>>>>       long expires = 0;
>>>>> -     u32 *pmetrics;
>>>>> -     u32 table;
>>>>>
>>>>>       nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
>>>>>       if (!nlh)
>>>>>               return -EMSGSIZE;
>>>>>
>>>>> +     if (rt6) {
>>>>> +             fib6_dst = &rt6->rt6i_dst;
>>>>> +             fib6_src = &rt6->rt6i_src;
>>>>> +             fib6_flags = rt6->rt6i_flags;
>>>>> +             fib6_prefsrc = &rt6->rt6i_prefsrc;
>>>>> +     } else {
>>>>> +             fib6_dst = &rt->fib6_dst;
>>>>> +             fib6_src = &rt->fib6_src;
>>>>> +             fib6_flags = rt->fib6_flags;
>>>>> +             fib6_prefsrc = &rt->fib6_prefsrc;
>>>>> +     }
>>>>
>>>> Unless I am missing something at the moment, an rt6_info can only have
>>>> the same dst, src and prefsrc as the fib6_info on which it is based.
>>>> Thus, only the flags is needed above. That simplifies this patch a lot.
>>> If dst, src and prefsrc in rt6_info are always the same as these in fib6_info,
>>> why do we need them in rt6_info? we could just get it by 'from'.
>>>
>>
>> I just sent a patch removing rt6i_prefsrc. It is set with only 1 reader
>> that can be converted.
>>
>> rt6i_src is checked against the fib6_info to invalidate a dst if the src
>> has changed, so a valid rt will always have the same rt6i_src as the
>> rt->from.
>>
>> rt6i_dst is set to the dest address / 128 in cases, so it should be used
>> for rt6_info cases above.
> So that means, I will use rt6i_dst and rt6i_flags when dst is set?
> how about I use rt6i_src there as well? just to make it look clear.
> and plus the gw/nh dump fix in rt6_fill_node():
> -        if (rt->fib6_nsiblings) {
> +        if (rt6) {
> +                if (fib6_flags & RTF_GATEWAY)
> +                        if (nla_put_in6_addr(skb, RTA_GATEWAY,
> +                                             &rt6->rt6i_gateway) < 0)
> +                                goto nla_put_failure;
> +
> +                if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
> +                        goto nla_put_failure;
> +        } else if (rt->fib6_nsiblings) {
>                  struct fib6_info *sibling, *next_sibling;
>                  struct nlattr *mp;
> 
> looks good to you?
> 

sure

^ permalink raw reply

* Re: [PATCH iproute2 v2] tc/mqprio: Print extra info on invalid args.
From: Stephen Hemminger @ 2018-09-10 19:15 UTC (permalink / raw)
  To: Caleb Raitto; +Cc: netdev, jhs, xiyou.wangcong, jiri, Caleb Raitto
In-Reply-To: <20180906210117.203461-1-caleb.raitto@gmail.com>

On Thu,  6 Sep 2018 14:01:17 -0700
Caleb Raitto <caleb.raitto@gmail.com> wrote:

> From: Caleb Raitto <caraitto@google.com>
> 
> Print the name of the argument that wasn't understood.
> 
> Signed-off-by: Caleb Raitto <caraitto@google.com>

That is simpler, thanks. Applied

^ permalink raw reply

* Re: [iproute PATCH v2] ip-route: Fix segfault with many nexthops
From: Stephen Hemminger @ 2018-09-10 19:17 UTC (permalink / raw)
  To: Phil Sutter; +Cc: netdev
In-Reply-To: <20180906133151.21753-1-phil@nwl.cc>

On Thu,  6 Sep 2018 15:31:51 +0200
Phil Sutter <phil@nwl.cc> wrote:

> It was possible to crash ip-route by adding an IPv6 route with 37
> nexthop statements. A simple reproducer is:
> 
> | for i in `seq 37`; do
> | 	nhs="nexthop via 1111::$i "$nhs
> | done
> | ip -6 route add 3333::/64 $nhs
> 
> The related code was broken in multiple ways:
> 
> * parse_one_nh() assumed that rta points to 4kB of storage but caller
>   provided just 1kB. Fixed by passing 'len' parameter with the correct
>   value.
> 
> * Error checking of rta_addattr*() calls in parse_one_nh() and called
>   functions was completely absent, so with above fix in place output
>   flood would occur due to parser looping forever.
> 
> While being at it, increase message buffer sizes to 4k. This allows for
> at most 144 nexthops.
> 
> Signed-off-by: Phil Sutter <phil@nwl.cc>

Thanks for fixing this.
Shows where more test cases are needed.

^ permalink raw reply

* Re: [PATCH net-next v2 1/2] net: stmmac: Rework coalesce timer and fix multi-queue races
From: Florian Fainelli @ 2018-09-10 19:22 UTC (permalink / raw)
  To: Jose Abreu, netdev, Tal Gilboa
  Cc: Jerome Brunet, Martin Blumenstingl, David S. Miller, Joao Pinto,
	Giuseppe Cavallaro, Alexandre Torgue
In-Reply-To: <c54230d2dc7e2740c039e0f5e8f306a7544bbe5e.1536570319.git.joabreu@synopsys.com>

On 09/10/2018 02:14 AM, Jose Abreu wrote:
> This follows David Miller advice and tries to fix coalesce timer in
> multi-queue scenarios.
> 
> We are now using per-queue coalesce values and per-queue TX timer.
> 
> Coalesce timer default value was changed to 1ms and the coalesce frames
> to 25.
> 
> Tested in B2B setup between XGMAC2 and GMAC5.

Why not revert the entire feature for this merge window and work on
getting it to work over the next weeks/merge windows?

The idea of using a timer to coalesce TX path when there is not a HW
timer is a good idea and if this is made robust enough, you could even
promote that as being a network stack library/feature that could be used
by other drivers. In fact, this could be a great addition to the net DIM
library (Tal, what do you think?)

Here's a quick drive by review of things that appear wrong in the
current driver (without your patches):

- in stmmac_xmit(), in case we hit the !is_jumbo branch and we fail the
DMA mapping, there is no timer cancellation, don't we want to abort the
whole transmission?

- stmmac_tx_clean() should probably use netif_lock_bh() to guard against
the timer (soft IRQ context) and the NAPI context (also soft IRQ)
running in parallel on two different CPUs. This may not explain all
problems, but these two things are fundamentally exclusive, because the
timer is meant to emulate the interrupt after N packets, while NAPI
executes when such a thing did actually occur

- stmmac_poll() should cancel pending timer(s) if it was able to reclaim
packets, likewise stmmac_tx_timer() should re-enable TX interrupts if it
reclaimed packets, since TX interrupts could have been left disabled
from a prior NAPI run. These could be considered optimizations, since
you could leave the TX timer running all the time, just adjust the
deadline (based on line rate, MTU, IPG, number of fragments and their
respective length), worst case, both NAPI and the timer clean up your TX
ring, so you should always have room to push more packets

> 
> Signed-off-by: Jose Abreu <joabreu@synopsys.com>
> Cc: Jerome Brunet <jbrunet@baylibre.com>
> Cc: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
> Cc: David S. Miller <davem@davemloft.net>
> Cc: Joao Pinto <jpinto@synopsys.com>
> Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
> Cc: Alexandre Torgue <alexandre.torgue@st.com>
> ---
> Jerome,
> 
> Can you please test if this one is okay ?
> 
> Thanks and Best Regards,
> Jose Miguel Abreu
> ---
>  drivers/net/ethernet/stmicro/stmmac/common.h      |   4 +-
>  drivers/net/ethernet/stmicro/stmmac/stmmac.h      |   6 +-
>  drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 207 ++++++++++++++--------
>  3 files changed, 135 insertions(+), 82 deletions(-)
> 
> diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
> index 1854f270ad66..b1b305f8f414 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/common.h
> +++ b/drivers/net/ethernet/stmicro/stmmac/common.h
> @@ -258,10 +258,10 @@ struct stmmac_safety_stats {
>  #define MAX_DMA_RIWT		0xff
>  #define MIN_DMA_RIWT		0x20
>  /* Tx coalesce parameters */
> -#define STMMAC_COAL_TX_TIMER	40000
> +#define STMMAC_COAL_TX_TIMER	1000
>  #define STMMAC_MAX_COAL_TX_TICK	100000
>  #define STMMAC_TX_MAX_FRAMES	256
> -#define STMMAC_TX_FRAMES	64
> +#define STMMAC_TX_FRAMES	25
>  
>  /* Packets types */
>  enum packets_types {
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
> index c0a855b7ab3b..957030cfb833 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
> @@ -48,6 +48,9 @@ struct stmmac_tx_info {
>  
>  /* Frequently used values are kept adjacent for cache effect */
>  struct stmmac_tx_queue {
> +	u32 tx_count_frames;
> +	int tx_timer_active;
> +	struct timer_list txtimer;
>  	u32 queue_index;
>  	struct stmmac_priv *priv_data;
>  	struct dma_extended_desc *dma_etx ____cacheline_aligned_in_smp;
> @@ -59,6 +62,7 @@ struct stmmac_tx_queue {
>  	dma_addr_t dma_tx_phy;
>  	u32 tx_tail_addr;
>  	u32 mss;
> +	struct napi_struct napi ____cacheline_aligned_in_smp;
>  };
>  
>  struct stmmac_rx_queue {
> @@ -109,14 +113,12 @@ struct stmmac_pps_cfg {
>  
>  struct stmmac_priv {
>  	/* Frequently used values are kept adjacent for cache effect */
> -	u32 tx_count_frames;
>  	u32 tx_coal_frames;
>  	u32 tx_coal_timer;
>  
>  	int tx_coalesce;
>  	int hwts_tx_en;
>  	bool tx_path_in_lpi_mode;
> -	struct timer_list txtimer;
>  	bool tso;
>  
>  	unsigned int dma_buf_sz;
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> index 9f458bb16f2a..9809c2b319fe 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> @@ -148,6 +148,7 @@ static void stmmac_verify_args(void)
>  static void stmmac_disable_all_queues(struct stmmac_priv *priv)
>  {
>  	u32 rx_queues_cnt = priv->plat->rx_queues_to_use;
> +	u32 tx_queues_cnt = priv->plat->tx_queues_to_use;
>  	u32 queue;
>  
>  	for (queue = 0; queue < rx_queues_cnt; queue++) {
> @@ -155,6 +156,12 @@ static void stmmac_disable_all_queues(struct stmmac_priv *priv)
>  
>  		napi_disable(&rx_q->napi);
>  	}
> +
> +	for (queue = 0; queue < tx_queues_cnt; queue++) {
> +		struct stmmac_tx_queue *tx_q = &priv->tx_queue[queue];
> +
> +		napi_disable(&tx_q->napi);
> +	}
>  }
>  
>  /**
> @@ -164,6 +171,7 @@ static void stmmac_disable_all_queues(struct stmmac_priv *priv)
>  static void stmmac_enable_all_queues(struct stmmac_priv *priv)
>  {
>  	u32 rx_queues_cnt = priv->plat->rx_queues_to_use;
> +	u32 tx_queues_cnt = priv->plat->tx_queues_to_use;
>  	u32 queue;
>  
>  	for (queue = 0; queue < rx_queues_cnt; queue++) {
> @@ -171,6 +179,12 @@ static void stmmac_enable_all_queues(struct stmmac_priv *priv)
>  
>  		napi_enable(&rx_q->napi);
>  	}
> +
> +	for (queue = 0; queue < tx_queues_cnt; queue++) {
> +		struct stmmac_tx_queue *tx_q = &priv->tx_queue[queue];
> +
> +		napi_enable(&tx_q->napi);
> +	}
>  }
>  
>  /**
> @@ -1843,7 +1857,8 @@ static void stmmac_dma_operation_mode(struct stmmac_priv *priv)
>   * @queue: TX queue index
>   * Description: it reclaims the transmit resources after transmission completes.
>   */
> -static void stmmac_tx_clean(struct stmmac_priv *priv, u32 queue)
> +static int stmmac_tx_clean(struct stmmac_priv *priv, int limit, u32 queue,
> +			   bool *more)
>  {
>  	struct stmmac_tx_queue *tx_q = &priv->tx_queue[queue];
>  	unsigned int bytes_compl = 0, pkts_compl = 0;
> @@ -1851,10 +1866,13 @@ static void stmmac_tx_clean(struct stmmac_priv *priv, u32 queue)
>  
>  	netif_tx_lock(priv->dev);
>  
> +	if (more)
> +		*more = false;
> +
>  	priv->xstats.tx_clean++;
>  
>  	entry = tx_q->dirty_tx;
> -	while (entry != tx_q->cur_tx) {
> +	while ((entry != tx_q->cur_tx) && (pkts_compl < limit)) {
>  		struct sk_buff *skb = tx_q->tx_skbuff[entry];
>  		struct dma_desc *p;
>  		int status;
> @@ -1937,7 +1955,13 @@ static void stmmac_tx_clean(struct stmmac_priv *priv, u32 queue)
>  		stmmac_enable_eee_mode(priv);
>  		mod_timer(&priv->eee_ctrl_timer, STMMAC_LPI_T(eee_timer));
>  	}
> +
> +	if (more && (tx_q->dirty_tx != tx_q->cur_tx))
> +		*more = true;
> +
>  	netif_tx_unlock(priv->dev);
> +
> +	return pkts_compl;
>  }
>  
>  /**
> @@ -2020,6 +2044,34 @@ static bool stmmac_safety_feat_interrupt(struct stmmac_priv *priv)
>  	return false;
>  }
>  
> +static int stmmac_napi_check(struct stmmac_priv *priv, u32 chan)
> +{
> +	int status = stmmac_dma_interrupt_status(priv, priv->ioaddr,
> +						 &priv->xstats, chan);
> +
> +	if ((status & handle_rx) && (chan < priv->plat->rx_queues_to_use)) {
> +		struct stmmac_rx_queue *rx_q = &priv->rx_queue[chan];
> +
> +		if (likely(napi_schedule_prep(&rx_q->napi))) {
> +			stmmac_disable_dma_irq(priv, priv->ioaddr, chan);
> +			__napi_schedule(&rx_q->napi);
> +		}
> +	} else {
> +		status &= ~handle_rx;
> +	}
> +
> +	if ((status & handle_tx) && (chan < priv->plat->tx_queues_to_use)) {
> +		struct stmmac_tx_queue *tx_q = &priv->tx_queue[chan];
> +
> +		if (likely(napi_schedule_prep(&tx_q->napi)))
> +			__napi_schedule(&tx_q->napi);
> +	} else {
> +		status &= ~handle_tx;
> +	}
> +
> +	return status;
> +}
> +
>  /**
>   * stmmac_dma_interrupt - DMA ISR
>   * @priv: driver private structure
> @@ -2034,57 +2086,14 @@ static void stmmac_dma_interrupt(struct stmmac_priv *priv)
>  	u32 channels_to_check = tx_channel_count > rx_channel_count ?
>  				tx_channel_count : rx_channel_count;
>  	u32 chan;
> -	bool poll_scheduled = false;
>  	int status[max_t(u32, MTL_MAX_TX_QUEUES, MTL_MAX_RX_QUEUES)];
>  
>  	/* Make sure we never check beyond our status buffer. */
>  	if (WARN_ON_ONCE(channels_to_check > ARRAY_SIZE(status)))
>  		channels_to_check = ARRAY_SIZE(status);
>  
> -	/* Each DMA channel can be used for rx and tx simultaneously, yet
> -	 * napi_struct is embedded in struct stmmac_rx_queue rather than in a
> -	 * stmmac_channel struct.
> -	 * Because of this, stmmac_poll currently checks (and possibly wakes)
> -	 * all tx queues rather than just a single tx queue.
> -	 */
>  	for (chan = 0; chan < channels_to_check; chan++)
> -		status[chan] = stmmac_dma_interrupt_status(priv, priv->ioaddr,
> -				&priv->xstats, chan);
> -
> -	for (chan = 0; chan < rx_channel_count; chan++) {
> -		if (likely(status[chan] & handle_rx)) {
> -			struct stmmac_rx_queue *rx_q = &priv->rx_queue[chan];
> -
> -			if (likely(napi_schedule_prep(&rx_q->napi))) {
> -				stmmac_disable_dma_irq(priv, priv->ioaddr, chan);
> -				__napi_schedule(&rx_q->napi);
> -				poll_scheduled = true;
> -			}
> -		}
> -	}
> -
> -	/* If we scheduled poll, we already know that tx queues will be checked.
> -	 * If we didn't schedule poll, see if any DMA channel (used by tx) has a
> -	 * completed transmission, if so, call stmmac_poll (once).
> -	 */
> -	if (!poll_scheduled) {
> -		for (chan = 0; chan < tx_channel_count; chan++) {
> -			if (status[chan] & handle_tx) {
> -				/* It doesn't matter what rx queue we choose
> -				 * here. We use 0 since it always exists.
> -				 */
> -				struct stmmac_rx_queue *rx_q =
> -					&priv->rx_queue[0];
> -
> -				if (likely(napi_schedule_prep(&rx_q->napi))) {
> -					stmmac_disable_dma_irq(priv,
> -							priv->ioaddr, chan);
> -					__napi_schedule(&rx_q->napi);
> -				}
> -				break;
> -			}
> -		}
> -	}
> +		status[chan] = stmmac_napi_check(priv, chan);
>  
>  	for (chan = 0; chan < tx_channel_count; chan++) {
>  		if (unlikely(status[chan] & tx_hard_error_bump_tc)) {
> @@ -2241,13 +2250,11 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv)
>   */
>  static void stmmac_tx_timer(struct timer_list *t)
>  {
> -	struct stmmac_priv *priv = from_timer(priv, t, txtimer);
> -	u32 tx_queues_count = priv->plat->tx_queues_to_use;
> -	u32 queue;
> +	struct stmmac_tx_queue *tx_q = from_timer(tx_q, t, txtimer);
>  
> -	/* let's scan all the tx queues */
> -	for (queue = 0; queue < tx_queues_count; queue++)
> -		stmmac_tx_clean(priv, queue);
> +	if (likely(napi_schedule_prep(&tx_q->napi)))
> +		__napi_schedule(&tx_q->napi);
> +	tx_q->tx_timer_active = 0;
>  }
>  
>  /**
> @@ -2260,11 +2267,17 @@ static void stmmac_tx_timer(struct timer_list *t)
>   */
>  static void stmmac_init_tx_coalesce(struct stmmac_priv *priv)
>  {
> +	u32 tx_channel_count = priv->plat->tx_queues_to_use;
> +	u32 chan;
> +
>  	priv->tx_coal_frames = STMMAC_TX_FRAMES;
>  	priv->tx_coal_timer = STMMAC_COAL_TX_TIMER;
> -	timer_setup(&priv->txtimer, stmmac_tx_timer, 0);
> -	priv->txtimer.expires = STMMAC_COAL_TIMER(priv->tx_coal_timer);
> -	add_timer(&priv->txtimer);
> +
> +	for (chan = 0; chan < tx_channel_count; chan++) {
> +		struct stmmac_tx_queue *tx_q = &priv->tx_queue[chan];
> +
> +		timer_setup(&tx_q->txtimer, stmmac_tx_timer, 0);
> +	}
>  }
>  
>  static void stmmac_set_rings_length(struct stmmac_priv *priv)
> @@ -2592,6 +2605,7 @@ static void stmmac_hw_teardown(struct net_device *dev)
>  static int stmmac_open(struct net_device *dev)
>  {
>  	struct stmmac_priv *priv = netdev_priv(dev);
> +	u32 chan;
>  	int ret;
>  
>  	stmmac_check_ether_addr(priv);
> @@ -2688,7 +2702,9 @@ static int stmmac_open(struct net_device *dev)
>  	if (dev->phydev)
>  		phy_stop(dev->phydev);
>  
> -	del_timer_sync(&priv->txtimer);
> +	for (chan = 0; chan < priv->plat->tx_queues_to_use; chan++)
> +		del_timer_sync(&priv->tx_queue[chan].txtimer);
> +
>  	stmmac_hw_teardown(dev);
>  init_error:
>  	free_dma_desc_resources(priv);
> @@ -2708,6 +2724,7 @@ static int stmmac_open(struct net_device *dev)
>  static int stmmac_release(struct net_device *dev)
>  {
>  	struct stmmac_priv *priv = netdev_priv(dev);
> +	u32 chan;
>  
>  	if (priv->eee_enabled)
>  		del_timer_sync(&priv->eee_ctrl_timer);
> @@ -2722,7 +2739,8 @@ static int stmmac_release(struct net_device *dev)
>  
>  	stmmac_disable_all_queues(priv);
>  
> -	del_timer_sync(&priv->txtimer);
> +	for (chan = 0; chan < priv->plat->tx_queues_to_use; chan++)
> +		del_timer_sync(&priv->tx_queue[chan].txtimer);
>  
>  	/* Free the IRQ lines */
>  	free_irq(dev->irq, dev);
> @@ -2936,14 +2954,11 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
>  	priv->xstats.tx_tso_nfrags += nfrags;
>  
>  	/* Manage tx mitigation */
> -	priv->tx_count_frames += nfrags + 1;
> -	if (likely(priv->tx_coal_frames > priv->tx_count_frames)) {
> -		mod_timer(&priv->txtimer,
> -			  STMMAC_COAL_TIMER(priv->tx_coal_timer));
> -	} else {
> -		priv->tx_count_frames = 0;
> +	tx_q->tx_count_frames += nfrags + 1;
> +	if (priv->tx_coal_frames <= tx_q->tx_count_frames) {
>  		stmmac_set_tx_ic(priv, desc);
>  		priv->xstats.tx_set_ic_bit++;
> +		tx_q->tx_count_frames = 0;
>  	}
>  
>  	skb_tx_timestamp(skb);
> @@ -2994,6 +3009,12 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
>  
>  	stmmac_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr, queue);
>  
> +	if (priv->tx_coal_timer && !tx_q->tx_timer_active) {
> +		tx_q->tx_timer_active = 1;
> +		mod_timer(&tx_q->txtimer,
> +				STMMAC_COAL_TIMER(priv->tx_coal_timer));
> +	}
> +
>  	return NETDEV_TX_OK;
>  
>  dma_map_err:
> @@ -3146,14 +3167,11 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
>  	 * This approach takes care about the fragments: desc is the first
>  	 * element in case of no SG.
>  	 */
> -	priv->tx_count_frames += nfrags + 1;
> -	if (likely(priv->tx_coal_frames > priv->tx_count_frames)) {
> -		mod_timer(&priv->txtimer,
> -			  STMMAC_COAL_TIMER(priv->tx_coal_timer));
> -	} else {
> -		priv->tx_count_frames = 0;
> +	tx_q->tx_count_frames += nfrags + 1;
> +	if (priv->tx_coal_frames <= tx_q->tx_count_frames) {
>  		stmmac_set_tx_ic(priv, desc);
>  		priv->xstats.tx_set_ic_bit++;
> +		tx_q->tx_count_frames = 0;
>  	}
>  
>  	skb_tx_timestamp(skb);
> @@ -3199,8 +3217,15 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
>  	netdev_tx_sent_queue(netdev_get_tx_queue(dev, queue), skb->len);
>  
>  	stmmac_enable_dma_transmission(priv, priv->ioaddr);
> +
>  	stmmac_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr, queue);
>  
> +	if (priv->tx_coal_timer && !tx_q->tx_timer_active) {
> +		tx_q->tx_timer_active = 1;
> +		mod_timer(&tx_q->txtimer,
> +				STMMAC_COAL_TIMER(priv->tx_coal_timer));
> +	}
> +
>  	return NETDEV_TX_OK;
>  
>  dma_map_err:
> @@ -3514,27 +3539,41 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
>   *  Description :
>   *  To look at the incoming frames and clear the tx resources.
>   */
> -static int stmmac_poll(struct napi_struct *napi, int budget)
> +static int stmmac_rx_poll(struct napi_struct *napi, int budget)
>  {
>  	struct stmmac_rx_queue *rx_q =
>  		container_of(napi, struct stmmac_rx_queue, napi);
>  	struct stmmac_priv *priv = rx_q->priv_data;
> -	u32 tx_count = priv->plat->tx_queues_to_use;
>  	u32 chan = rx_q->queue_index;
>  	int work_done = 0;
> -	u32 queue;
>  
>  	priv->xstats.napi_poll++;
>  
> -	/* check all the queues */
> -	for (queue = 0; queue < tx_count; queue++)
> -		stmmac_tx_clean(priv, queue);
> -
>  	work_done = stmmac_rx(priv, budget, rx_q->queue_index);
> +	if (work_done < budget && napi_complete_done(napi, work_done))
> +		stmmac_enable_dma_irq(priv, priv->ioaddr, chan);
> +
> +	return work_done;
> +}
> +
> +static int stmmac_tx_poll(struct napi_struct *napi, int budget)
> +{
> +	struct stmmac_tx_queue *tx_q =
> +		container_of(napi, struct stmmac_tx_queue, napi);
> +	struct stmmac_priv *priv = tx_q->priv_data;
> +	u32 chan = tx_q->queue_index;
> +	int work_done = 0;
> +	bool more;
> +
> +	priv->xstats.napi_poll++;
> +
> +	work_done = stmmac_tx_clean(priv, budget, chan, &more);
>  	if (work_done < budget) {
>  		napi_complete_done(napi, work_done);
> -		stmmac_enable_dma_irq(priv, priv->ioaddr, chan);
> +		if (more)
> +			napi_reschedule(napi);
>  	}
> +
>  	return work_done;
>  }
>  
> @@ -4325,10 +4364,17 @@ int stmmac_dvr_probe(struct device *device,
>  	for (queue = 0; queue < priv->plat->rx_queues_to_use; queue++) {
>  		struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue];
>  
> -		netif_napi_add(ndev, &rx_q->napi, stmmac_poll,
> +		netif_napi_add(ndev, &rx_q->napi, stmmac_rx_poll,
>  			       (8 * priv->plat->rx_queues_to_use));
>  	}
>  
> +	for (queue = 0; queue < priv->plat->tx_queues_to_use; queue++) {
> +		struct stmmac_tx_queue *tx_q = &priv->tx_queue[queue];
> +
> +		netif_napi_add(ndev, &tx_q->napi, stmmac_tx_poll,
> +			       (8 * priv->plat->tx_queues_to_use));
> +	}
> +
>  	mutex_init(&priv->lock);
>  
>  	/* If a specific clk_csr value is passed from the platform
> @@ -4377,6 +4423,11 @@ int stmmac_dvr_probe(struct device *device,
>  
>  		netif_napi_del(&rx_q->napi);
>  	}
> +	for (queue = 0; queue < priv->plat->tx_queues_to_use; queue++) {
> +		struct stmmac_tx_queue *tx_q = &priv->tx_queue[queue];
> +
> +		netif_napi_del(&tx_q->napi);
> +	}
>  error_hw_init:
>  	destroy_workqueue(priv->wq);
>  error_wq:
> 


-- 
Florian

^ permalink raw reply

* Re: Corrupted sit-tunnelled packets when using skb_gso_segment() on an IFB interface?
From: Eric Dumazet @ 2018-09-10 19:39 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, netdev; +Cc: cake, Herbert Xu
In-Reply-To: <693f99f0-6876-15bc-2a29-c235e14cc25f@gmail.com>



On 09/10/2018 11:52 AM, Eric Dumazet wrote:
> 
> 
> On 09/10/2018 09:04 AM, Toke Høiland-Jørgensen wrote:
>> Hi everyone
>>
>> While investigating a bug report on CAKE[0], I've run into the following
>> behaviour:
>>
>> When running CAKE as an ingress shaper on an IFB interface, if the GSO
>> splitting feature is turned on, TCP throughput will drop dramatically on
>> 6in4 (sit) tunnels running over the interface in question. Looking at a
>> traffic dump, I'm seeing ~15% packet loss on the encapsulated TCP
>> stream.
>>
>> IPv4 traffic is fine on the same interface, as is native IPv6 traffic.
>> And turning off GSO splitting in CAKE makes the packet loss go away. The
>> issue only seems to appear on IFB interfaces. So I'm wondering if there
>> is some interaction that corrupts packets when they are being split in
>> this configuration?
>>
>> Steps to reproduce (assuming the box you are running on has IP 10.0.0.2
>> on eth0, and has a peer at 10.0.0.1 with a suitably configured sit
>> tunnel):
>>
>> # modprobe ifb
>> # ip link set dev ifb0 up
>> # tc qdisc add dev eth0 handle ffff: ingress
>> # tc filter add dev eth0 parent ffff: protocol all prio 10 matchall action mirred egress redirect dev ifb0
>> # tc qdisc replace dev ifb0 root cake
>> # ip link add type sit local 10.0.0.2 remote 10.0.0.1
>> # ip link set dev sit1 up
>> # netperf -H fe80::a00:1%sit1 -t TCP_MAERTS
>>
>> Whereas, in the same setup, this will work fine:
>>
>> # netperf -H 10.0.0.1 -t TCP_MAERTS
>>
>> As will this:
>>
>> # tc qdisc replace dev ifb0 root cake no-split-gso
>> # netperf -H fe80::a00:1%sit1 -t TCP_MAERTS
>>
>>
>> Does anyone have any ideas? :)
>>
> 
> My guess is that skb->mac_len is not properly updated in the segments (compared to the original GSO packet)

And the skb->mac_len being not properly set is a problem after commit 
f40ae91307c275fc8b17420fa74145e9937c3c0b act_mirred: Fix bogus header when redirecting from VLAN

^ permalink raw reply

* ksoftirqd takes 100% of a core with ixgbe and netconsole (netpoll)
From: Song Liu @ 2018-09-10 20:00 UTC (permalink / raw)
  To: Networking
  Cc: john.r.fastabend@intel.com, alexander.h.duyck@intel.com,
	jeffrey.t.kirsher@intel.com, Kernel Team


We are debugging an issue with netconsole and ixgbe, where ksoftirqd takes 100%
of a core. It happens with both current net and net-next.

To reproduce the issue:

  1. Setup server with ixgbe and netconsole. We bind each queue to a separate
     core via smp_affinity;
  2. Start simple netperf job from client, like:
        ./super_netperf 201 -P 0 -t TCP_RR -p 8888 -H <SERVER> -l 7200 -- -r 300,300 -o -s 1M,1M -S 1M,1M
  3. On server, write to /dev/kmsg in a loop (to send netconsole):
        for x in {1..7200} ; do echo aa >> /dev/kmsg ; sleep 1; done
  4. On server, monitor ksoftirqd in top

Within a few minutes, top will show one ksoftirqd take 100% of the core for many
seconds in a row. 

When the ksoftirqd takes 100% of a core, the driver hits "clean_complete=false"
path below, so this napi stays in polling mode. 

        ixgbe_for_each_ring(ring, q_vector->rx) {
                int cleaned = ixgbe_clean_rx_irq(q_vector, ring,
                                                 per_ring_budget);

                work_done += cleaned;
                if (cleaned >= per_ring_budget)
                        clean_complete = false;
        }

        /* If all work not completed, return budget and keep polling */
        if (!clean_complete)
                return budget;

We didn't see this issue on a 4.6 based kernel.

We are still debugging the issue. But we would like to check whether there is a
known solution for it. Any comments and suggestions are highly appreciated.

Best,
Song

^ permalink raw reply

* [PATCH net-next v3 00/17] WireGuard: Secure Network Tunnel
From: Jason A. Donenfeld @ 2018-09-11  1:08 UTC (permalink / raw)
  To: linux-kernel, netdev, davem, gregkh; +Cc: Jason A. Donenfeld

This patchset is available on git.kernel.org in this branch, where it may be
pulled directly for inclusion into net-next:

  * https://git.kernel.org/pub/scm/linux/kernel/git/zx2c4/linux.git/log/?h=jd/wireguard

WireGuard is a secure network tunnel written especially for Linux, which
has faced around three years of serious development, deployment, and
scrutiny. It delivers excellent performance and is extremely easy to
use and configure. It has been designed with the primary goal of being
both easy to audit by virtue of being small and highly secure from a
cryptography and systems security perspective. WireGuard is used by some
massive companies pushing enormous amounts of traffic, and likely
already today you've consumed bytes that at some point transited through
a WireGuard tunnel. Even as an out-of-tree module, WireGuard has been
integrated into various userspace tools, Linux distributions, mobile
phones, and data centers. There are ports in several languages to
several operating systems, and even commercial hardware and services
sold integrating WireGuard. It is time, therefore, for WireGuard to be
properly integrated into Linux.

Ample information, including documentation, installation instructions,
and project details, is available at:

  * https://www.wireguard.com/
  * https://www.wireguard.com/papers/wireguard.pdf

As it is currently an out-of-tree module, it lives in its own git repo
and has its own mailing list, and every commit for the module is tested
against every stable kernel since 3.10 on a variety of architectures
using an extensive test suite:

  * https://git.zx2c4.com/WireGuard
    https://git.kernel.org/pub/scm/linux/kernel/git/zx2c4/WireGuard.git/
  * https://lists.zx2c4.com/mailman/listinfo/wireguard
  * https://www.wireguard.com/build-status/

The project has been broadly discussed at conferences, and was presented
to the Netdev developers in Seoul last November, where a paper was
released detailing some interesting aspects of the project. Dave asked
me after the talk if I would consider sending in a v1 "sooner rather
than later", hence this patchset. A decision is still waiting from the
Linux Plumbers Conference, but an update on these topics may be presented
in Vancouver in a few months. Prior presentations:

  * https://www.wireguard.com/presentations/
  * https://www.wireguard.com/papers/wireguard-netdev22.pdf

The cryptography in the protocol itself has been formally verified by
several independent academic teams with positive results, and I know of
two additional efforts on their way to further corroborate those
findings. The version 1 protocol is "complete", and so the purpose of
this review is to assess the implementation of the protocol. However, it
still may be of interest to know that the thing you're reviewing uses a
protocol with various nice security properties:

  * https://www.wireguard.com/formal-verification/

This patchset is divided into three segments. The first introduces a very
simple helper for working with the FPU state for the purposes of amortizing
SIMD operations. The second segment is a small collection of cryptographic
primitives, split up into several commits by primitive and by hardware. The
third is WireGuard itself, presented as an unintrusive and self-contained
virtual network driver.

It is intended that this entire patch series enter the kernel through
DaveM's net-next tree. Subsequently, WireGuard patches will go through
DaveM's net-next tree, while Zinc patches will go through Greg KH's tree.

Changes v2->v3:
  - 80 column formatting of Zinc.
  - Remaining relevant checkpatch.pl concerns in WireGuard addressed.
  - More idiomatic ARM assembly for Curve25519.
  - Numerous cleanups.
  - Corrected licensing situation of CRYPTOGAMS-based code (from Andy
    Polyakov).

Enjoy,
Jason

^ permalink raw reply

* [PATCH net-next v3 01/17] asm: simd context helper API
From: Jason A. Donenfeld @ 2018-09-11  1:08 UTC (permalink / raw)
  To: linux-kernel, netdev, davem, gregkh
  Cc: Jason A. Donenfeld, Andy Lutomirski, Thomas Gleixner,
	Samuel Neves, linux-arch
In-Reply-To: <20180911010838.8818-1-Jason@zx2c4.com>

Sometimes it's useful to amortize calls to XSAVE/XRSTOR and the related
FPU/SIMD functions over a number of calls, because FPU restoration is
quite expensive. This adds a simple header for carrying out this pattern:

    simd_context_t simd_context = simd_get();
    while ((item = get_item_from_queue()) != NULL) {
        encrypt_item(item, simd_context);
        simd_context = simd_relax(simd_context);
    }
    simd_put(simd_context);

The relaxation step ensures that we don't trample over preemption, and
the get/put API should be a familiar paradigm in the kernel.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Palmer Dabbelt <palmer@sifive.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: linux-arch@vger.kernel.org
---
 arch/alpha/include/asm/Kbuild      |  5 ++--
 arch/arc/include/asm/Kbuild        |  1 +
 arch/arm/include/asm/simd.h        | 42 ++++++++++++++++++++++++++++++
 arch/arm64/include/asm/simd.h      | 37 +++++++++++++++++++++-----
 arch/c6x/include/asm/Kbuild        |  3 ++-
 arch/h8300/include/asm/Kbuild      |  3 ++-
 arch/hexagon/include/asm/Kbuild    |  1 +
 arch/ia64/include/asm/Kbuild       |  1 +
 arch/m68k/include/asm/Kbuild       |  1 +
 arch/microblaze/include/asm/Kbuild |  1 +
 arch/mips/include/asm/Kbuild       |  1 +
 arch/nds32/include/asm/Kbuild      |  7 ++---
 arch/nios2/include/asm/Kbuild      |  1 +
 arch/openrisc/include/asm/Kbuild   |  7 ++---
 arch/parisc/include/asm/Kbuild     |  1 +
 arch/powerpc/include/asm/Kbuild    |  3 ++-
 arch/riscv/include/asm/Kbuild      |  3 ++-
 arch/s390/include/asm/Kbuild       |  3 ++-
 arch/sh/include/asm/Kbuild         |  1 +
 arch/sparc/include/asm/Kbuild      |  1 +
 arch/um/include/asm/Kbuild         |  3 ++-
 arch/unicore32/include/asm/Kbuild  |  1 +
 arch/x86/include/asm/simd.h        | 30 ++++++++++++++++++++-
 arch/xtensa/include/asm/Kbuild     |  1 +
 include/asm-generic/simd.h         | 15 +++++++++++
 include/linux/simd.h               | 28 ++++++++++++++++++++
 26 files changed, 180 insertions(+), 21 deletions(-)
 create mode 100644 arch/arm/include/asm/simd.h
 create mode 100644 include/linux/simd.h

diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index 0580cb8c84b2..07b2c1025d34 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -2,14 +2,15 @@
 
 
 generic-y += compat.h
+generic-y += current.h
 generic-y += exec.h
 generic-y += export.h
 generic-y += fb.h
 generic-y += irq_work.h
+generic-y += kprobes.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
 generic-y += preempt.h
 generic-y += sections.h
+generic-y += simd.h
 generic-y += trace_clock.h
-generic-y += current.h
-generic-y += kprobes.h
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index feed50ce89fa..a7f4255f1649 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -22,6 +22,7 @@ generic-y += parport.h
 generic-y += pci.h
 generic-y += percpu.h
 generic-y += preempt.h
+generic-y += simd.h
 generic-y += topology.h
 generic-y += trace_clock.h
 generic-y += user.h
diff --git a/arch/arm/include/asm/simd.h b/arch/arm/include/asm/simd.h
new file mode 100644
index 000000000000..bf468993bbef
--- /dev/null
+++ b/arch/arm/include/asm/simd.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/simd.h>
+#ifndef _ASM_SIMD_H
+#define _ASM_SIMD_H
+
+static __must_check inline bool may_use_simd(void)
+{
+	return !in_interrupt();
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#include <asm/neon.h>
+
+static inline simd_context_t simd_get(void)
+{
+	bool have_simd = may_use_simd();
+	if (have_simd)
+		kernel_neon_begin();
+	return have_simd ? HAVE_FULL_SIMD : HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+	if (prior_context != HAVE_NO_SIMD)
+		kernel_neon_end();
+}
+#else
+static inline simd_context_t simd_get(void)
+{
+	return HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+}
+#endif
+
+#endif /* _ASM_SIMD_H */
diff --git a/arch/arm64/include/asm/simd.h b/arch/arm64/include/asm/simd.h
index 6495cc51246f..058c336de38d 100644
--- a/arch/arm64/include/asm/simd.h
+++ b/arch/arm64/include/asm/simd.h
@@ -1,11 +1,10 @@
-/*
- * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+/* SPDX-License-Identifier: GPL-2.0
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
+ * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
  */
 
+#include <linux/simd.h>
 #ifndef __ASM_SIMD_H
 #define __ASM_SIMD_H
 
@@ -16,6 +15,8 @@
 #include <linux/types.h>
 
 #ifdef CONFIG_KERNEL_MODE_NEON
+#include <asm/neon.h>
+#include <asm/simd.h>
 
 DECLARE_PER_CPU(bool, kernel_neon_busy);
 
@@ -40,12 +41,36 @@ static __must_check inline bool may_use_simd(void)
 		!this_cpu_read(kernel_neon_busy);
 }
 
+static inline simd_context_t simd_get(void)
+{
+	bool have_simd = may_use_simd();
+	if (have_simd)
+		kernel_neon_begin();
+	return have_simd ? HAVE_FULL_SIMD : HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+	if (prior_context != HAVE_NO_SIMD)
+		kernel_neon_end();
+}
+
 #else /* ! CONFIG_KERNEL_MODE_NEON */
 
-static __must_check inline bool may_use_simd(void) {
+static __must_check inline bool may_use_simd(void)
+{
 	return false;
 }
 
+static inline simd_context_t simd_get(void)
+{
+	return HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+}
+
 #endif /* ! CONFIG_KERNEL_MODE_NEON */
 
 #endif
diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild
index 33a2c94fed0d..22f3d8333c74 100644
--- a/arch/c6x/include/asm/Kbuild
+++ b/arch/c6x/include/asm/Kbuild
@@ -5,8 +5,8 @@ generic-y += compat.h
 generic-y += current.h
 generic-y += device.h
 generic-y += div64.h
-generic-y += dma.h
 generic-y += dma-mapping.h
+generic-y += dma.h
 generic-y += emergency-restart.h
 generic-y += exec.h
 generic-y += extable.h
@@ -30,6 +30,7 @@ generic-y += pgalloc.h
 generic-y += preempt.h
 generic-y += segment.h
 generic-y += serial.h
+generic-y += simd.h
 generic-y += tlbflush.h
 generic-y += topology.h
 generic-y += trace_clock.h
diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild
index a5d0b2991f47..f5c2f12d593e 100644
--- a/arch/h8300/include/asm/Kbuild
+++ b/arch/h8300/include/asm/Kbuild
@@ -8,8 +8,8 @@ generic-y += current.h
 generic-y += delay.h
 generic-y += device.h
 generic-y += div64.h
-generic-y += dma.h
 generic-y += dma-mapping.h
+generic-y += dma.h
 generic-y += emergency-restart.h
 generic-y += exec.h
 generic-y += extable.h
@@ -39,6 +39,7 @@ generic-y += preempt.h
 generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += serial.h
+generic-y += simd.h
 generic-y += sizes.h
 generic-y += spinlock.h
 generic-y += timex.h
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index dd2fd9c0d292..217d4695fd8a 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -29,6 +29,7 @@ generic-y += rwsem.h
 generic-y += sections.h
 generic-y += segment.h
 generic-y += serial.h
+generic-y += simd.h
 generic-y += sizes.h
 generic-y += topology.h
 generic-y += trace_clock.h
diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild
index 557bbc8ba9f5..41c5ebdf79e5 100644
--- a/arch/ia64/include/asm/Kbuild
+++ b/arch/ia64/include/asm/Kbuild
@@ -4,6 +4,7 @@ generic-y += irq_work.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
 generic-y += preempt.h
+generic-y += simd.h
 generic-y += trace_clock.h
 generic-y += vtime.h
 generic-y += word-at-a-time.h
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild
index a4b8d3331a9e..73898dd1a4d0 100644
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -19,6 +19,7 @@ generic-y += mm-arch-hooks.h
 generic-y += percpu.h
 generic-y += preempt.h
 generic-y += sections.h
+generic-y += simd.h
 generic-y += spinlock.h
 generic-y += topology.h
 generic-y += trace_clock.h
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild
index 569ba9e670c1..7a877eea99d3 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -25,6 +25,7 @@ generic-y += parport.h
 generic-y += percpu.h
 generic-y += preempt.h
 generic-y += serial.h
+generic-y += simd.h
 generic-y += syscalls.h
 generic-y += topology.h
 generic-y += trace_clock.h
diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild
index 58351e48421e..e8868e0fb2c3 100644
--- a/arch/mips/include/asm/Kbuild
+++ b/arch/mips/include/asm/Kbuild
@@ -16,6 +16,7 @@ generic-y += qrwlock.h
 generic-y += qspinlock.h
 generic-y += sections.h
 generic-y += segment.h
+generic-y += simd.h
 generic-y += trace_clock.h
 generic-y += unaligned.h
 generic-y += user.h
diff --git a/arch/nds32/include/asm/Kbuild b/arch/nds32/include/asm/Kbuild
index dbc4e5422550..603c1d020620 100644
--- a/arch/nds32/include/asm/Kbuild
+++ b/arch/nds32/include/asm/Kbuild
@@ -7,14 +7,14 @@ generic-y += bug.h
 generic-y += bugs.h
 generic-y += checksum.h
 generic-y += clkdev.h
-generic-y += cmpxchg.h
 generic-y += cmpxchg-local.h
+generic-y += cmpxchg.h
 generic-y += compat.h
 generic-y += cputime.h
 generic-y += device.h
 generic-y += div64.h
-generic-y += dma.h
 generic-y += dma-mapping.h
+generic-y += dma.h
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
@@ -46,14 +46,15 @@ generic-y += sections.h
 generic-y += segment.h
 generic-y += serial.h
 generic-y += shmbuf.h
+generic-y += simd.h
 generic-y += sizes.h
 generic-y += stat.h
 generic-y += switch_to.h
 generic-y += timex.h
 generic-y += topology.h
 generic-y += trace_clock.h
-generic-y += xor.h
 generic-y += unaligned.h
 generic-y += user.h
 generic-y += vga.h
 generic-y += word-at-a-time.h
+generic-y += xor.h
diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild
index 8fde4fa2c34f..571a9d9ad107 100644
--- a/arch/nios2/include/asm/Kbuild
+++ b/arch/nios2/include/asm/Kbuild
@@ -33,6 +33,7 @@ generic-y += preempt.h
 generic-y += sections.h
 generic-y += segment.h
 generic-y += serial.h
+generic-y += simd.h
 generic-y += spinlock.h
 generic-y += topology.h
 generic-y += trace_clock.h
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index eb87cd8327c8..5e9f2f4c4d39 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -28,12 +28,13 @@ generic-y += module.h
 generic-y += pci.h
 generic-y += percpu.h
 generic-y += preempt.h
-generic-y += qspinlock_types.h
-generic-y += qspinlock.h
-generic-y += qrwlock_types.h
 generic-y += qrwlock.h
+generic-y += qrwlock_types.h
+generic-y += qspinlock.h
+generic-y += qspinlock_types.h
 generic-y += sections.h
 generic-y += segment.h
+generic-y += simd.h
 generic-y += string.h
 generic-y += switch_to.h
 generic-y += topology.h
diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild
index 2013d639e735..97970b4d05ab 100644
--- a/arch/parisc/include/asm/Kbuild
+++ b/arch/parisc/include/asm/Kbuild
@@ -17,6 +17,7 @@ generic-y += percpu.h
 generic-y += preempt.h
 generic-y += seccomp.h
 generic-y += segment.h
+generic-y += simd.h
 generic-y += topology.h
 generic-y += trace_clock.h
 generic-y += user.h
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 3196d227e351..64290f48e733 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -4,7 +4,8 @@ generic-y += irq_regs.h
 generic-y += irq_work.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
+generic-y += msi.h
 generic-y += preempt.h
 generic-y += rwsem.h
+generic-y += simd.h
 generic-y += vtime.h
-generic-y += msi.h
diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
index efdbe311e936..6669b7374c0a 100644
--- a/arch/riscv/include/asm/Kbuild
+++ b/arch/riscv/include/asm/Kbuild
@@ -5,9 +5,9 @@ generic-y += compat.h
 generic-y += cputime.h
 generic-y += device.h
 generic-y += div64.h
-generic-y += dma.h
 generic-y += dma-contiguous.h
 generic-y += dma-mapping.h
+generic-y += dma.h
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
@@ -46,6 +46,7 @@ generic-y += setup.h
 generic-y += shmbuf.h
 generic-y += shmparam.h
 generic-y += signal.h
+generic-y += simd.h
 generic-y += socket.h
 generic-y += sockios.h
 generic-y += stat.h
diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild
index e3239772887a..7a26dc6ce815 100644
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -7,9 +7,9 @@ generated-y += unistd_nr.h
 generic-y += asm-offsets.h
 generic-y += cacheflush.h
 generic-y += device.h
+generic-y += div64.h
 generic-y += dma-contiguous.h
 generic-y += dma-mapping.h
-generic-y += div64.h
 generic-y += emergency-restart.h
 generic-y += export.h
 generic-y += fb.h
@@ -22,6 +22,7 @@ generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
 generic-y += preempt.h
 generic-y += rwsem.h
+generic-y += simd.h
 generic-y += trace_clock.h
 generic-y += unaligned.h
 generic-y += word-at-a-time.h
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild
index 6a5609a55965..8e64ff35a933 100644
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -16,6 +16,7 @@ generic-y += percpu.h
 generic-y += preempt.h
 generic-y += rwsem.h
 generic-y += serial.h
+generic-y += simd.h
 generic-y += sizes.h
 generic-y += trace_clock.h
 generic-y += xor.h
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild
index 410b263ef5c8..72b9e08fb350 100644
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -17,5 +17,6 @@ generic-y += msi.h
 generic-y += preempt.h
 generic-y += rwsem.h
 generic-y += serial.h
+generic-y += simd.h
 generic-y += trace_clock.h
 generic-y += word-at-a-time.h
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index b10dde6cb793..d37288b08dd2 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -16,15 +16,16 @@ generic-y += io.h
 generic-y += irq_regs.h
 generic-y += irq_work.h
 generic-y += kdebug.h
+generic-y += kprobes.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
 generic-y += param.h
 generic-y += pci.h
 generic-y += percpu.h
 generic-y += preempt.h
+generic-y += simd.h
 generic-y += switch_to.h
 generic-y += topology.h
 generic-y += trace_clock.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
-generic-y += kprobes.h
diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild
index bfc7abe77905..98a908720bbd 100644
--- a/arch/unicore32/include/asm/Kbuild
+++ b/arch/unicore32/include/asm/Kbuild
@@ -27,6 +27,7 @@ generic-y += preempt.h
 generic-y += sections.h
 generic-y += segment.h
 generic-y += serial.h
+generic-y += simd.h
 generic-y += sizes.h
 generic-y += syscalls.h
 generic-y += topology.h
diff --git a/arch/x86/include/asm/simd.h b/arch/x86/include/asm/simd.h
index a341c878e977..79411178988a 100644
--- a/arch/x86/include/asm/simd.h
+++ b/arch/x86/include/asm/simd.h
@@ -1,4 +1,11 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/simd.h>
+#ifndef _ASM_SIMD_H
+#define _ASM_SIMD_H
 
 #include <asm/fpu/api.h>
 
@@ -10,3 +17,24 @@ static __must_check inline bool may_use_simd(void)
 {
 	return irq_fpu_usable();
 }
+
+static inline simd_context_t simd_get(void)
+{
+	bool have_simd = false;
+#if !defined(CONFIG_UML)
+	have_simd = may_use_simd();
+	if (have_simd)
+		kernel_fpu_begin();
+#endif
+	return have_simd ? HAVE_FULL_SIMD : HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+#if !defined(CONFIG_UML)
+	if (prior_context != HAVE_NO_SIMD)
+		kernel_fpu_end();
+#endif
+}
+
+#endif /* _ASM_SIMD_H */
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index 82c756431b49..7950f359649d 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -24,6 +24,7 @@ generic-y += percpu.h
 generic-y += preempt.h
 generic-y += rwsem.h
 generic-y += sections.h
+generic-y += simd.h
 generic-y += topology.h
 generic-y += trace_clock.h
 generic-y += word-at-a-time.h
diff --git a/include/asm-generic/simd.h b/include/asm-generic/simd.h
index d0343d58a74a..fad899a5a92d 100644
--- a/include/asm-generic/simd.h
+++ b/include/asm-generic/simd.h
@@ -1,5 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
+#include <linux/simd.h>
+#ifndef _ASM_SIMD_H
+#define _ASM_SIMD_H
+
 #include <linux/hardirq.h>
 
 /*
@@ -13,3 +17,14 @@ static __must_check inline bool may_use_simd(void)
 {
 	return !in_interrupt();
 }
+
+static inline simd_context_t simd_get(void)
+{
+	return HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+}
+
+#endif /* _ASM_SIMD_H */
diff --git a/include/linux/simd.h b/include/linux/simd.h
new file mode 100644
index 000000000000..f62d047188bf
--- /dev/null
+++ b/include/linux/simd.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#ifndef _SIMD_H
+#define _SIMD_H
+
+typedef enum {
+	HAVE_NO_SIMD,
+	HAVE_FULL_SIMD
+} simd_context_t;
+
+#include <linux/sched.h>
+#include <asm/simd.h>
+
+static inline simd_context_t simd_relax(simd_context_t prior_context)
+{
+#ifdef CONFIG_PREEMPT
+	if (prior_context != HAVE_NO_SIMD && need_resched()) {
+		simd_put(prior_context);
+		return simd_get();
+	}
+#endif
+	return prior_context;
+}
+
+#endif /* _SIMD_H */
-- 
2.18.0

^ permalink raw reply related

* [PATCH net-next v3 02/17] zinc: introduce minimal cryptography library
From: Jason A. Donenfeld @ 2018-09-11  1:08 UTC (permalink / raw)
  To: linux-kernel, netdev, davem, gregkh
  Cc: Jason A. Donenfeld, Andy Lutomirski, Samuel Neves,
	Jean-Philippe Aumasson, linux-crypto
In-Reply-To: <20180911010838.8818-1-Jason@zx2c4.com>

Zinc stands for "Zinc Is Neat Crypto" or "Zinc as IN Crypto" or maybe
just "Zx2c4's INsane Cryptolib." It's also short, easy to type, and
plays nicely with the recent trend of naming crypto libraries after
elements. The guiding principle is "don't overdo it". It's less of a
library and more of a directory tree for organizing well-curated direct
implementations of cryptography primitives.

Zinc is a new cryptography API that is much more minimal and lower-level
than the current one. It intends to complement it and provide a basis
upon which the current crypto API might build, as the provider of
software implementations of cryptographic primitives. It is motivated by
three primary observations in crypto API design:

  * Highly composable "cipher modes" and related abstractions from
    90s cryptographers did not turn out to be as terrific an idea as
    hoped, leading to a host of API misuse problems.

  * Most programmers are afraid of crypto code, and so prefer to
    integrate it into libraries in a highly abstracted manner, so as to
    shield themselves from implementation details. Cryptographers, on
    the other hand, prefer simple direct implementations, which they're
    able to verify for high assurance and optimize in accordance with
    their expertise.

  * Overly abstracted and flexible cryptography APIs lead to a host of
    dangerous problems and performance issues. The kernel is usually not
    in the business of coming up with new uses of crypto, but rather of
    implementing various constructions, which means it essentially needs
    a library of primitives, not a highly abstracted enterprise-ready
    pluggable system, with a few particular exceptions.

This last observation has played out over and over again within the
kernel:

  * The perennial move of actual primitives away from crypto/ and into
    lib/, so that users can actually call these functions directly with
    no overhead and without lots of allocations, function pointers,
    string specifier parsing, and general clunkiness. For example:
    sha256, chacha20, siphash, sha1, and so forth live in lib/ rather
    than in crypto/. Zinc intends to stop the cluttering of lib/ and
    introduce these direct primitives into their proper place, lib/zinc/.

  * An abundance of misuse bugs with the present crypto API that have
    been very unpleasant to clean up.

  * A hesitance to even use cryptography, because of the overhead and
    headaches involved in accessing the routines.

Zinc goes in a rather different direction. Rather than providing a
thoroughly designed and abstracted API, Zinc gives you simple functions,
which implement some primitive, or some particular and specific
construction of primitives. It is not dynamic in the least, though one
could imagine implementing a complex dynamic dispatch mechanism (such as
the current crypto API) on top of these basic functions. After all,
dynamic dispatch is usually needed for applications with cipher agility,
such as IPsec, dm-crypt, AF_ALG, and so forth, and the existing crypto
API will continue to play that role. However, Zinc will provide a
non-haphazard way of directly utilizing crypto routines in applications
that have neither the need nor the desire for abstraction and dynamic
dispatch.

It also organizes the implementations in a simple, straight-forward,
and direct manner, making it enjoyable and intuitive to work on.
Rather than moving optimized assembly implementations into arch/, it
keeps them all together in lib/zinc/, making it simple and obvious to
compare and contrast what's happening. This is, notably, exactly what
the lib/raid6/ tree does, and that seems to work out rather well. It's
also the pattern of most successful crypto libraries. The architecture-
specific glue-code is made a part of each translation unit, rather than
being in a separate one, so that generic and architecture-optimized code
are combined at compile-time, and incompatibility branches compiled out by
the optimizer.

All implementations have been extensively tested and fuzzed, and are
selected for their quality, trustworthiness, and performance. Wherever
possible and performant, formally verified implementations are used,
such as those from HACL* [1] and Fiat-Crypto [2]. The routines also take
special care to zero out secrets using memzero_explicit (and future work
is planned to have gcc do this more reliably and performantly with
compiler plugins). The performance of the selected implementations is
state-of-the-art and unrivaled on a broad array of hardware, though of
course we will continue to fine tune these to the hardware demands
needed by kernel contributors. Each implementation also comes with
extensive self-tests and crafted test vectors, pulled from various
places such as Wycheproof [9].

Regularity of function signatures is important, so that users can easily
"guess" the name of the function they want. Though, individual
primitives are oftentimes not trivially interchangeable, having been
designed for different things and requiring different parameters and
semantics, and so the function signatures they provide will directly
reflect the realities of the primitives' usages, rather than hiding it
behind (inevitably leaky) abstractions. Also, in contrast to the current
crypto API, Zinc functions can work on stack buffers, and can be called
with different keys, without requiring allocations or locking.

SIMD is used automatically when available, though some routines may
benefit either from having their SIMD disabled for particular
invocations, or from having the SIMD initialization calls amortized over
several invocations of the function, and so Zinc utilizes function
signatures enabling that in conjunction with the recently introduced
simd_context_t.

More generally, Zinc provides function signatures that allow just what
is required by the various callers. This isn't to say that users of the
functions will be permitted to pollute the function semantics with weird
particular needs, but we are trying very hard not to overdo it, and that
means looking carefully at what's actually necessary, and doing just that,
and not much more than that. Remember: practicality and cleanliness rather
than over-zealous infrastructure.

Zinc also provides an opening for the best implementers in academia to
contribute their time and effort to the kernel, by being sufficiently
simple and inviting. In discussing this commit with some of the best and
brightest over the last few years, there are many who are eager to
devote rare talent and energy to this effort.

Following the merging of this, I expect the primitives that currently
exist in lib/ to work their way into lib/zinc/, after intense
scrutiny of each implementation, potentially replacing them with either
formally-verified implementations, or better studied and faster
state-of-the-art implementations.

Also following the merging of this, I expect the old crypto API
implementations to be ported over to use Zinc for their software-based
implementations.

As Zinc is simply library code, its config options are un-menued, with
the exception of CONFIG_ZINC_DEBUG, which enables various selftests and
BUG_ONs.

[1] https://github.com/project-everest/hacl-star
[2] https://github.com/mit-plv/fiat-crypto
[3] https://cr.yp.to/ecdh.html
[4] https://cr.yp.to/chacha.html
[5] https://cr.yp.to/snuffle/xsalsa-20081128.pdf
[6] https://cr.yp.to/mac.html
[7] https://blake2.net/
[8] https://tools.ietf.org/html/rfc8439
[9] https://github.com/google/wycheproof

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: linux-crypto@vger.kernel.org
---
 MAINTAINERS       |  8 ++++++++
 lib/Kconfig       |  2 ++
 lib/Makefile      |  2 ++
 lib/zinc/Kconfig  | 20 ++++++++++++++++++++
 lib/zinc/Makefile |  8 ++++++++
 lib/zinc/main.c   | 31 +++++++++++++++++++++++++++++++
 6 files changed, 71 insertions(+)
 create mode 100644 lib/zinc/Kconfig
 create mode 100644 lib/zinc/Makefile
 create mode 100644 lib/zinc/main.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 2ef884b883c3..d2092e52320d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16160,6 +16160,14 @@ Q:	https://patchwork.linuxtv.org/project/linux-media/list/
 S:	Maintained
 F:	drivers/media/dvb-frontends/zd1301_demod*
 
+ZINC CRYPTOGRAPHY LIBRARY
+M:	Jason A. Donenfeld <Jason@zx2c4.com>
+M:	Samuel Neves <sneves@dei.uc.pt>
+S:	Maintained
+F:	lib/zinc/
+F:	include/zinc/
+L:	linux-crypto@vger.kernel.org
+
 ZPOOL COMPRESSED PAGE STORAGE API
 M:	Dan Streetman <ddstreet@ieee.org>
 L:	linux-mm@kvack.org
diff --git a/lib/Kconfig b/lib/Kconfig
index a3928d4438b5..3e6848269c66 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -485,6 +485,8 @@ config GLOB_SELFTEST
 	  module load) by a small amount, so you're welcome to play with
 	  it, but you probably don't need it.
 
+source "lib/zinc/Kconfig"
+
 #
 # Netlink attribute parsing support is select'ed if needed
 #
diff --git a/lib/Makefile b/lib/Makefile
index ca3f7ebb900d..3f16e35d2c11 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -214,6 +214,8 @@ obj-$(CONFIG_PERCPU_TEST) += percpu_test.o
 
 obj-$(CONFIG_ASN1) += asn1_decoder.o
 
+obj-$(CONFIG_ZINC) += zinc/
+
 obj-$(CONFIG_FONT_SUPPORT) += fonts/
 
 obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o
diff --git a/lib/zinc/Kconfig b/lib/zinc/Kconfig
new file mode 100644
index 000000000000..aa4f8d449d6b
--- /dev/null
+++ b/lib/zinc/Kconfig
@@ -0,0 +1,20 @@
+config ZINC
+	tristate
+	select CRYPTO_BLKCIPHER
+	select VFP
+	select VFPv3
+	select NEON
+	select KERNEL_MODE_NEON
+
+config ZINC_DEBUG
+	bool "Zinc cryptography library debugging and self-tests"
+	depends on ZINC
+	help
+	  This builds a series of self-tests for the Zinc crypto library, which
+	  help diagnose any cryptographic algorithm implementation issues that
+	  might be at the root cause of potential bugs. It also adds various
+	  debugging traps.
+
+	  Unless you're developing and testing cryptographic routines, or are
+	  especially paranoid about correctness on your hardware, you may say
+	  N here.
diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
new file mode 100644
index 000000000000..dad47573de42
--- /dev/null
+++ b/lib/zinc/Makefile
@@ -0,0 +1,8 @@
+ccflags-y := -O3
+ccflags-y += -Wframe-larger-than=8192
+ccflags-y += -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt'
+ccflags-$(CONFIG_ZINC_DEBUG) += -DDEBUG
+
+zinc-y += main.o
+
+obj-$(CONFIG_ZINC) := zinc.o
diff --git a/lib/zinc/main.c b/lib/zinc/main.c
new file mode 100644
index 000000000000..ceece33ff5a7
--- /dev/null
+++ b/lib/zinc/main.c
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+
+#ifdef DEBUG
+#define selftest(which) do { \
+	if (!which ## _selftest()) \
+		return -ENOTRECOVERABLE; \
+} while (0)
+#else
+#define selftest(which)
+#endif
+
+static int __init mod_init(void)
+{
+	return 0;
+}
+
+static void __exit mod_exit(void)
+{
+}
+
+module_init(mod_init);
+module_exit(mod_exit);
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Zinc cryptography library");
+MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
-- 
2.18.0

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox