Netdev List
 help / color / mirror / Atom feed
* [PATCH v3 25/30] net: add rb_to_skb() and other rb tree helpers
From: Stephen Hemminger @ 2018-09-13 14:58 UTC (permalink / raw)
  To: davem, gregkh; +Cc: netdev, stable, edumazet
In-Reply-To: <20180913145902.17531-1-sthemmin@microsoft.com>

From: Eric Dumazet <edumazet@google.com>

Geeralize private netem_rb_to_skb()

TCP rtx queue will soon be converted to rb-tree,
so we will need skb_rbtree_walk() helpers.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit 18a4c0eab2623cc95be98a1e6af1ad18e7695977)
---
 include/linux/skbuff.h  | 18 ++++++++++++++++++
 net/ipv4/tcp_fastopen.c |  8 +++-----
 net/ipv4/tcp_input.c    | 33 ++++++++++++---------------------
 net/sched/sch_netem.c   | 14 ++++----------
 4 files changed, 37 insertions(+), 36 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 758084b434c8..2837e55df03e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3169,6 +3169,12 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
 
 #define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
 
+#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
+#define skb_rb_first(root) rb_to_skb(rb_first(root))
+#define skb_rb_last(root)  rb_to_skb(rb_last(root))
+#define skb_rb_next(skb)   rb_to_skb(rb_next(&(skb)->rbnode))
+#define skb_rb_prev(skb)   rb_to_skb(rb_prev(&(skb)->rbnode))
+
 #define skb_queue_walk(queue, skb) \
 		for (skb = (queue)->next;					\
 		     skb != (struct sk_buff *)(queue);				\
@@ -3183,6 +3189,18 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
 		for (; skb != (struct sk_buff *)(queue);			\
 		     skb = skb->next)
 
+#define skb_rbtree_walk(skb, root)						\
+		for (skb = skb_rb_first(root); skb != NULL;			\
+		     skb = skb_rb_next(skb))
+
+#define skb_rbtree_walk_from(skb)						\
+		for (; skb != NULL;						\
+		     skb = skb_rb_next(skb))
+
+#define skb_rbtree_walk_from_safe(skb, tmp)					\
+		for (; tmp = skb ? skb_rb_next(skb) : NULL, (skb != NULL);	\
+		     skb = tmp)
+
 #define skb_queue_walk_from_safe(queue, skb, tmp)				\
 		for (tmp = skb->next;						\
 		     skb != (struct sk_buff *)(queue);				\
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index fbbeda647774..0567edb76522 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -458,17 +458,15 @@ bool tcp_fastopen_active_should_disable(struct sock *sk)
 void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct rb_node *p;
-	struct sk_buff *skb;
 	struct dst_entry *dst;
+	struct sk_buff *skb;
 
 	if (!tp->syn_fastopen)
 		return;
 
 	if (!tp->data_segs_in) {
-		p = rb_first(&tp->out_of_order_queue);
-		if (p && !rb_next(p)) {
-			skb = rb_entry(p, struct sk_buff, rbnode);
+		skb = skb_rb_first(&tp->out_of_order_queue);
+		if (skb && !skb_rb_next(skb)) {
 			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
 				tcp_fastopen_active_disable(sk);
 				return;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bdabd748f4bc..991f382afc1b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4372,7 +4372,7 @@ static void tcp_ofo_queue(struct sock *sk)
 
 	p = rb_first(&tp->out_of_order_queue);
 	while (p) {
-		skb = rb_entry(p, struct sk_buff, rbnode);
+		skb = rb_to_skb(p);
 		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
 			break;
 
@@ -4440,7 +4440,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
 static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct rb_node **p, *q, *parent;
+	struct rb_node **p, *parent;
 	struct sk_buff *skb1;
 	u32 seq, end_seq;
 	bool fragstolen;
@@ -4503,7 +4503,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	parent = NULL;
 	while (*p) {
 		parent = *p;
-		skb1 = rb_entry(parent, struct sk_buff, rbnode);
+		skb1 = rb_to_skb(parent);
 		if (before(seq, TCP_SKB_CB(skb1)->seq)) {
 			p = &parent->rb_left;
 			continue;
@@ -4548,9 +4548,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 
 merge_right:
 	/* Remove other segments covered by skb. */
-	while ((q = rb_next(&skb->rbnode)) != NULL) {
-		skb1 = rb_entry(q, struct sk_buff, rbnode);
-
+	while ((skb1 = skb_rb_next(skb)) != NULL) {
 		if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
 			break;
 		if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
@@ -4565,7 +4563,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 		tcp_drop(sk, skb1);
 	}
 	/* If there is no skb after us, we are the last_skb ! */
-	if (!q)
+	if (!skb1)
 		tp->ooo_last_skb = skb;
 
 add_sack:
@@ -4749,7 +4747,7 @@ static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *li
 	if (list)
 		return !skb_queue_is_last(list, skb) ? skb->next : NULL;
 
-	return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode);
+	return skb_rb_next(skb);
 }
 
 static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
@@ -4778,7 +4776,7 @@ static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
 
 	while (*p) {
 		parent = *p;
-		skb1 = rb_entry(parent, struct sk_buff, rbnode);
+		skb1 = rb_to_skb(parent);
 		if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
 			p = &parent->rb_left;
 		else
@@ -4898,19 +4896,12 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 range_truesize, sum_tiny = 0;
 	struct sk_buff *skb, *head;
-	struct rb_node *p;
 	u32 start, end;
 
-	p = rb_first(&tp->out_of_order_queue);
-	skb = rb_entry_safe(p, struct sk_buff, rbnode);
+	skb = skb_rb_first(&tp->out_of_order_queue);
 new_range:
 	if (!skb) {
-		p = rb_last(&tp->out_of_order_queue);
-		/* Note: This is possible p is NULL here. We do not
-		 * use rb_entry_safe(), as ooo_last_skb is valid only
-		 * if rbtree is not empty.
-		 */
-		tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode);
+		tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
 		return;
 	}
 	start = TCP_SKB_CB(skb)->seq;
@@ -4918,7 +4909,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
 	range_truesize = skb->truesize;
 
 	for (head = skb;;) {
-		skb = tcp_skb_next(skb, NULL);
+		skb = skb_rb_next(skb);
 
 		/* Range is terminated when we see a gap or when
 		 * we are at the queue end.
@@ -4974,7 +4965,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
 		prev = rb_prev(node);
 		rb_erase(node, &tp->out_of_order_queue);
 		goal -= rb_to_skb(node)->truesize;
-		tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode));
+		tcp_drop(sk, rb_to_skb(node));
 		if (!prev || goal <= 0) {
 			sk_mem_reclaim(sk);
 			if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
@@ -4984,7 +4975,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
 		}
 		node = prev;
 	} while (node);
-	tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode);
+	tp->ooo_last_skb = rb_to_skb(prev);
 
 	/* Reset SACK state.  A conforming SACK implementation will
 	 * do the same at a timeout based retransmit.  When a connection
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 8c8df75dbead..2a2ab6bfe5d8 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -149,12 +149,6 @@ struct netem_skb_cb {
 	ktime_t		tstamp_save;
 };
 
-
-static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
-{
-	return rb_entry(rb, struct sk_buff, rbnode);
-}
-
 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 {
 	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
@@ -365,7 +359,7 @@ static void tfifo_reset(struct Qdisc *sch)
 	struct rb_node *p;
 
 	while ((p = rb_first(&q->t_root))) {
-		struct sk_buff *skb = netem_rb_to_skb(p);
+		struct sk_buff *skb = rb_to_skb(p);
 
 		rb_erase(p, &q->t_root);
 		rtnl_kfree_skbs(skb, skb);
@@ -382,7 +376,7 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 		struct sk_buff *skb;
 
 		parent = *p;
-		skb = netem_rb_to_skb(parent);
+		skb = rb_to_skb(parent);
 		if (tnext >= netem_skb_cb(skb)->time_to_send)
 			p = &parent->rb_right;
 		else
@@ -538,7 +532,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 				struct sk_buff *t_skb;
 				struct netem_skb_cb *t_last;
 
-				t_skb = netem_rb_to_skb(rb_last(&q->t_root));
+				t_skb = skb_rb_last(&q->t_root);
 				t_last = netem_skb_cb(t_skb);
 				if (!last ||
 				    t_last->time_to_send > last->time_to_send) {
@@ -618,7 +612,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 	if (p) {
 		psched_time_t time_to_send;
 
-		skb = netem_rb_to_skb(p);
+		skb = rb_to_skb(p);
 
 		/* if more time remaining? */
 		time_to_send = netem_skb_cb(skb)->time_to_send;
-- 
2.18.0

^ permalink raw reply related

* [PATCH v3 27/30] ipv4: frags: precedence bug in ip_expire()
From: Stephen Hemminger @ 2018-09-13 14:58 UTC (permalink / raw)
  To: davem, gregkh; +Cc: netdev, stable, edumazet, Dan Carpenter
In-Reply-To: <20180913145902.17531-1-sthemmin@microsoft.com>

From: Dan Carpenter <dan.carpenter@oracle.com>

We accidentally removed the parentheses here, but they are required
because '!' has higher precedence than '&'.

Fixes: fa0f527358bd ("ip: use rb trees for IP frag queue.")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit 70837ffe3085c9a91488b52ca13ac84424da1042)
---
 net/ipv4/ip_fragment.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 0e8f8de77e71..7cb7ed761d8c 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -154,7 +154,7 @@ static void ip_expire(struct timer_list *t)
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
 
-	if (!qp->q.flags & INET_FRAG_FIRST_IN)
+	if (!(qp->q.flags & INET_FRAG_FIRST_IN))
 		goto out;
 
 	/* sk_buff::dev and sk_buff::rbnode are unionized. So we
-- 
2.18.0

^ permalink raw reply related

* Re: [PATCH net-next RFC] virtio_net: ethtool tx napi configuration
From: Tobin C. Harding @ 2018-09-13 10:04 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: netdev, jasowang, mst, f.fainelli, Willem de Bruijn
In-Reply-To: <20180912232911.218610-1-willemdebruijn.kernel@gmail.com>

On Wed, Sep 12, 2018 at 07:29:11PM -0400, Willem de Bruijn wrote:
> From: Willem de Bruijn <willemb@google.com>
> 
> Implement ethtool .set_priv_flags and .get_priv_flags handlers
> and use ethtool private flags to toggle transmit napi:
> 
>   ethtool --set-priv-flags eth0 tx-napi on
>   ethtool --show-priv-flags eth0
> 
> Link: https://patchwork.ozlabs.org/patch/948149/
> Suggested-by: Jason Wang <jasowang@redhat.com>
> Suggested-by: Florian Fainelli <f.fainelli@gmail.com>
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> ---
>  drivers/net/virtio_net.c | 49 ++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 49 insertions(+)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 765920905226..9ca7e0a0f0d9 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -73,6 +73,10 @@ static const unsigned long guest_offloads[] = {
>  	VIRTIO_NET_F_GUEST_UFO
>  };
>  
> +static const char virtnet_ethtool_priv_flags[][ETH_GSTRING_LEN] = {
> +	"tx-napi",
> +};
> +
>  struct virtnet_stat_desc {
>  	char desc[ETH_GSTRING_LEN];
>  	size_t offset;
> @@ -2059,6 +2063,9 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
>  			}
>  		}
>  		break;
> +	case ETH_SS_PRIV_FLAGS:
> +		memcpy(data, virtnet_ethtool_priv_flags,
> +		       sizeof(virtnet_ethtool_priv_flags));
>  	}
>  }
>  
> @@ -2070,6 +2077,9 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
>  	case ETH_SS_STATS:
>  		return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
>  					       VIRTNET_SQ_STATS_LEN);
> +	case ETH_SS_PRIV_FLAGS:
> +		return ARRAY_SIZE(virtnet_ethtool_priv_flags);
> +
>  	default:
>  		return -EOPNOTSUPP;
>  	}
> @@ -2181,6 +2191,43 @@ static int virtnet_get_link_ksettings(struct net_device *dev,
>  	return 0;
>  }
>  
> +static int virtnet_set_priv_flags(struct net_device *dev, u32 priv_flags)
> +{
> +	struct virtnet_info *vi = netdev_priv(dev);
> +	int i, napi_weight;
> +
> +	napi_weight = priv_flags & 0x1 ? NAPI_POLL_WEIGHT : 0;
> +
> +	if (napi_weight ^ vi->sq[0].napi.weight) {
> +		for (i = 0; i < vi->max_queue_pairs; i++) {
> +			struct netdev_queue *txq =
> +				netdev_get_tx_queue(vi->dev, i);
> +
> +			virtnet_napi_tx_disable(&vi->sq[i].napi);
> +			__netif_tx_lock_bh(txq);
> +			vi->sq[i].napi.weight = napi_weight;
> +			if (!napi_weight)
> +				virtqueue_enable_cb(vi->sq[i].vq);
> +			__netif_tx_unlock_bh(txq);
> +			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
> +					       &vi->sq[i].napi);
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static u32 virtnet_get_priv_flags(struct net_device *dev)
> +{
> +	struct virtnet_info *vi = netdev_priv(dev);
> +	int priv_flags = 0;
> +
> +	if (vi->sq[0].napi.weight)
> +		priv_flags |= 0x1;
> +
> +	return priv_flags;
> +}

Why the use of priv_flags here?  Is there some reason that we don't want
to use the more simple

    static u32 virtnet_get_priv_flags(struct net_device *dev)
    {
            struct virtnet_info *vi = netdev_priv(dev);
    
    	    if (vi->sq[0].napi.weight)
       	            return 1;
    
    	    return 0;
    }


thanks,
Tobin.

^ permalink raw reply

* Re: [PATCH 1/1] ipv6: Add sockopt IPV6_MULTICAST_ALL analogue to IP_MULTICAST_ALL
From: David Miller @ 2018-09-13 15:17 UTC (permalink / raw)
  To: nautsch2
  Cc: netdev, linux-kernel, kuznet, yoshfuji, gregkh, ek, tglx, maze,
	shli, kstewart, pombredanne
In-Reply-To: <20180910082715.11506-2-nautsch2@gmail.com>

From: Andre Naujoks <nautsch2@gmail.com>
Date: Mon, 10 Sep 2018 10:27:15 +0200

> The socket option will be enabled by default to ensure current behaviour
> is not changed. This is the same for the IPv4 version.
> 
> A socket bound to in6addr_any and a specific port will receive all traffic
> on that port. Analogue to IP_MULTICAST_ALL, disable this behaviour, if
> one or more multicast groups were joined (using said socket) and only
> pass on multicast traffic from groups, which were explicitly joined via
> this socket.
> 
> Without this option disabled a socket (system even) joined to multiple
> multicast groups is very hard to get right. Filtering by destination
> address has to take place in user space to avoid receiving multicast
> traffic from other multicast groups, which might have traffic on the same
> port.
> 
> The extension of the IP_MULTICAST_ALL socketoption to just apply to ipv6,
> too, is not done to avoid changing the behaviour of current applications.
> 
> Signed-off-by: Andre Naujoks <nautsch2@gmail.com>
> Acked-By: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>

Applied to net-next, thank you.

^ permalink raw reply

* Re: [PATCH net-next 5/5] ebpf: Add sample ebpf program for SOCKET_SG_FILTER
From: Sowmini Varadhan @ 2018-09-13 10:10 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Tushar Dave, ast, daniel, davem, santosh.shilimkar,
	jakub.kicinski, quentin.monnet, jiong.wang, sandipan,
	john.fastabend, kafai, rdna, yhs, netdev, rds-devel
In-Reply-To: <20180913020757.geqe55gnsexv6icp@ast-mbp>

On (09/12/18 19:07), Alexei Starovoitov wrote:
> 
> I didn't know that. The way I understand your statement that
> this new program type, new sg logic, and all the complexity
> are only applicable to RDMA capable hw and RDS.

I dont know if you have been following the RFC series at all 
(and DanielB/JohnF feedback to it) but that is not what the patch 
set is about.

To repeat a summary of the original problem statement:

RDS (hardly a "niche" driver, let's please not get carried away 
with strong assertions based on incomplete understanding), 
is an example of a driver that happens to pass up packets
as both scatterlist and sk_buffs to the ULPs. 

The scatterlist comes from IB, the sk_buffs come from the ethernet
drivers. At the moment, the only way to build firewalls for
this is to convert scatterlist to skb and use  either netfilter
or eBPF on the skb. What Tushar is adding is support to use eBPF
on the scatterlist itself, so that you dont have to do this
inefficient scatterlist->skb conversion.

> In such case, I think, such BPF extensions do not belong
> in the upstream kernel. I don't want BPF to support niche technologies,

> since the maintenance overhead makes it prohibitive long term.

After I sent the message, I noticed that selftests/bpf has
some tests using veth/netns. I think Tushar should be able to write
tests for the rds-tcp path (and thus test the eBPF infrastructure, 
which seems to be what you are worried about)

Does that address your concern?

--Sowmini

^ permalink raw reply

* Re: [PATCH net-next V2] virtio_net: ethtool tx napi configuration
From: Willem de Bruijn @ 2018-09-13 15:20 UTC (permalink / raw)
  To: Jason Wang
  Cc: Michael S. Tsirkin, David Miller, virtualization,
	Network Development, LKML, Willem de Bruijn
In-Reply-To: <20180913053545.18585-1-jasowang@redhat.com>

On Thu, Sep 13, 2018 at 1:40 AM Jason Wang <jasowang@redhat.com> wrote:
>
> Implement ethtool .set_coalesce (-C) and .get_coalesce (-c) handlers.
> Interrupt moderation is currently not supported, so these accept and
> display the default settings of 0 usec and 1 frame.
>
> Toggle tx napi through a bit in tx-frames. So as to not interfere
> with possible future interrupt moderation, value 1 means tx napi while
> value 0 means not.
>
> To properly synchronize with the data path, tx napi is disabled and
> tx lock is held when changing the value of napi weight. And two more
> places that can access tx napi weight:
>
> - speculative tx polling in rx napi, we can leave it as is since it
>   not a must for correctness.
> - skb_xmit_done(), one more check of napi weight is added before
>   trying to enable tx to avoid tx to be disabled forever if napi is
>   disabled after skb_xmit_done() but before the napi
>
> Link: https://patchwork.ozlabs.org/patch/948149/
> Suggested-by: Jason Wang <jasowang@redhat.com>
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> Changes from V1:
> - try to synchronize with datapath to allow changing mode when
>   interface is up.
> - use tx-frames 0 as to disable tx napi while tx-frames 1 to enable tx napi
> ---
>  drivers/net/virtio_net.c | 64 +++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 63 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 765920905226..6e70864f5899 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -66,6 +66,8 @@ DECLARE_EWMA(pkt_len, 0, 64)
>
>  #define VIRTNET_DRIVER_VERSION "1.0.0"
>
> +static const u32 ethtool_coalesce_napi_mask = (1UL << 10);
> +

This is no longer needed

>  static const unsigned long guest_offloads[] = {
>         VIRTIO_NET_F_GUEST_TSO4,
>         VIRTIO_NET_F_GUEST_TSO6,
> @@ -1444,7 +1446,10 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
>
>         virtqueue_napi_complete(napi, sq->vq, 0);
>
> -       if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
> +       /* Check napi.weight to avoid tx stall since it could be set
> +        * to zero by ethtool after skb_xmit_done().
> +        */
> +       if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS || !sq->napi.weight)
>                 netif_tx_wake_queue(txq);

I see. This assumes that the napi handler will always be called on
conversion from napi to no-napi mode.

That is safe to assume because if it isn't called (and will not call
netif_tx_wake_queue) that implies that napi was not scheduled, and
thus the tx interrupt was not suppressed and thus there was no tx
completion work to be scheduled?

>
>         return 0;
> @@ -2181,6 +2186,61 @@ static int virtnet_get_link_ksettings(struct net_device *dev,
>         return 0;
>  }
>
> +static int virtnet_set_coalesce(struct net_device *dev,
> +                               struct ethtool_coalesce *ec)
> +{
> +       struct ethtool_coalesce ec_default = {
> +               .cmd = ETHTOOL_SCOALESCE,
> +               .rx_max_coalesced_frames = 1,
> +       };
> +       struct virtnet_info *vi = netdev_priv(dev);
> +       int i, napi_weight;
> +
> +       if (ec->tx_max_coalesced_frames > 1)
> +               return -EINVAL;
> +
> +       ec_default.tx_max_coalesced_frames = ec->tx_max_coalesced_frames;
> +       napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
> +
> +       /* disallow changes to fields not explicitly tested above */
> +       if (memcmp(ec, &ec_default, sizeof(ec_default)))
> +               return -EINVAL;
> +
> +       if (napi_weight ^ vi->sq[0].napi.weight) {
> +               for (i = 0; i < vi->max_queue_pairs; i++) {
> +                       struct netdev_queue *txq =
> +                              netdev_get_tx_queue(vi->dev, i);
> +
> +                       virtnet_napi_tx_disable(&vi->sq[i].napi);
> +                       __netif_tx_lock_bh(txq);
> +                       vi->sq[i].napi.weight = napi_weight;
> +                       __netif_tx_unlock_bh(txq);
> +                       virtnet_napi_tx_enable(vi, vi->sq[i].vq,
> +                                              &vi->sq[i].napi);
> +               }
> +       }
> +
> +       return 0;
> +}
> +
> +static int virtnet_get_coalesce(struct net_device *dev,
> +                               struct ethtool_coalesce *ec)
> +{
> +       struct ethtool_coalesce ec_default = {
> +               .cmd = ETHTOOL_GCOALESCE,
> +               .rx_max_coalesced_frames = 1,
> +               .tx_max_coalesced_frames = 0,

no need to explicitly initialize to 0 (unless you did this for
documentation purposes, which is fine).

> +       };
> +       struct virtnet_info *vi = netdev_priv(dev);
> +
> +       memcpy(ec, &ec_default, sizeof(ec_default));
> +
> +       if (vi->sq[0].napi.weight)
> +               ec->tx_max_coalesced_frames = 1;
> +
> +       return 0;
> +}
> +
>  static void virtnet_init_settings(struct net_device *dev)
>  {
>         struct virtnet_info *vi = netdev_priv(dev);
> @@ -2219,6 +2279,8 @@ static const struct ethtool_ops virtnet_ethtool_ops = {
>         .get_ts_info = ethtool_op_get_ts_info,
>         .get_link_ksettings = virtnet_get_link_ksettings,
>         .set_link_ksettings = virtnet_set_link_ksettings,
> +       .set_coalesce = virtnet_set_coalesce,
> +       .get_coalesce = virtnet_get_coalesce,
>  };
>
>  static void virtnet_freeze_down(struct virtio_device *vdev)
> --
> 2.17.1
>

^ permalink raw reply

* Re: [PATCH] xen/netfront: don't bug in case of too many frags
From: David Miller @ 2018-09-13 15:22 UTC (permalink / raw)
  To: jgross; +Cc: linux-kernel, netdev, xen-devel, boris.ostrovsky, stable
In-Reply-To: <20180911070448.3958-1-jgross@suse.com>

From: Juergen Gross <jgross@suse.com>
Date: Tue, 11 Sep 2018 09:04:48 +0200

> Commit 57f230ab04d291 ("xen/netfront: raise max number of slots in
> xennet_get_responses()") raised the max number of allowed slots by one.
> This seems to be problematic in some configurations with netback using
> a larger MAX_SKB_FRAGS value (e.g. old Linux kernel with MAX_SKB_FRAGS
> defined as 18 instead of nowadays 17).
> 
> Instead of BUG_ON() in this case just fall back to retransmission.
> 
> Fixes: 57f230ab04d291 ("xen/netfront: raise max number of slots in xennet_get_responses()")
> Cc: stable@vger.kernel.org
> Signed-off-by: Juergen Gross <jgross@suse.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-next v3 02/17] zinc: introduce minimal cryptography library
From: Andy Lutomirski @ 2018-09-13 15:26 UTC (permalink / raw)
  To: Milan Broz
  Cc: Andy Lutomirski, Ard Biesheuvel, Jason A. Donenfeld, LKML, Netdev,
	David Miller, Greg Kroah-Hartman, Samuel Neves,
	Jean-Philippe Aumasson, Linux Crypto Mailing List
In-Reply-To: <c5855aa4-1561-8387-aebb-0a7587ac6d4f@gmail.com>


> On Sep 12, 2018, at 11:39 PM, Milan Broz <gmazyland@gmail.com> wrote:
> 
>> On 13/09/18 01:45, Andy Lutomirski wrote:
>> On Wed, Sep 12, 2018 at 3:56 PM, Ard Biesheuvel
> ... 
>> b) Crypto that is used dynamically.  This includes dm-crypt
>> (aes-xts-plain64, aes-cbc-essiv, etc), all the ALG_IF interfaces, a
>> lot of IPSEC stuff, possibly KCM, and probably many more.  These will
>> get comparatively little benefit from being converted to a zinc-like
>> interface.  For some of these cases, it wouldn't make any sense at all
>> to convert them.  Certainly the ones that do async hardware crypto
>> using DMA engines will never look at all like zinc, even under the
>> hood.
> 
> Please note, that dm-crypt now uses not only block ciphers and modes,
> but also authenticated encryption and hashes (for ESSIV and HMAC
> in authenticated composed modes) and RNG (for random IV).
> We use crypto API, including async variants (I hope correctly :)

Right. And all this is why I don’t think dm-crypt should use zinc, at least not any time soon.

^ permalink raw reply

* Re: [RFC PATCH iproute2-next] man: Add devlink health man page
From: Tobin C. Harding @ 2018-09-13 10:27 UTC (permalink / raw)
  To: Eran Ben Elisha
  Cc: netdev, Jiri Pirko, Andy Gospodarek, Michael Chan, Jakub Kicinski,
	Simon Horman, Alexander Duyck, Andrew Lunn, Florian Fainelli,
	Tal Alon, Ariel Almog
In-Reply-To: <1536826696-9413-2-git-send-email-eranbe@mellanox.com>

On Thu, Sep 13, 2018 at 11:18:16AM +0300, Eran Ben Elisha wrote:
> Add devlink-health man page. Devlink-health tool will control device
> health attributes, sensors, actions and logging.
> 
> Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
> 
> -------------------------------------------------------
> Copy paste man output to here for easier review process of the RFC.
> 
> DEVLINK-HEALTH(8)                                                                                               Linux                                                                                              DEVLINK-HEALTH(8)
> 
> NAME
>        devlink-health - devlink health configuration
> 
> SYNOPSIS
>        devlink [ OPTIONS ] health  { COMMAND | help }
> 
>        OPTIONS := { -V[ersion] | -n[no-nice-names] }
> 
>        devlink health show [ DEV ] [ sensor NAME ]
> 
>        devlink health sensor set DEV name NAME [ action NAME { active | inactive } ]"
> 
>        devlink health action set DEV name NAME period PERIOD count COUNT fail { ignore | down }
> 
>        devlink health action reinit DEV name NAME
> 
>        devlink health help
> 
> DESCRIPTION
>        devlink-health tool allows user to configure the way driver treats unexpected status. The tool allows configuration of the sensors that can trigger health activity. Set for each sensor the follow up operations, such as,
>        reset and dump of info. In addition, set the health activity termination action.
> 
>    devlink health show - Display devlink health sensors and actions attributes
>        DEV - Specifies the devlink device to show.  If this argument is omitted, all devices are listed.
> 
>            Format is:
>              BUS_NAME/BUS_ADDRESS
> 
>        sensor NAME - Specifies the devlink sensor to show.
> 

Perhaps the commands should include the optional arguments so when
reading the description one doesn't have to scroll to the top of the
page all the time

e.g
     devlink health show [ DEV ] [ sensor NAME ] - Display devlink health sensors and actions attributes

>    devlink health sensor set - sets devlink health sensor attributes
>        DEV    Specifies the devlink device to show.

	 		      	      	     	set

>        name NAME
>               Name of the sensor to set.
> 
>        action NAME { active | inactive }
>                   Specify which actions to activate and which to deactivate once a sensor was triggered. actions can be dump, reset, etc.
> 
>    devlink health action set - sets devlink action attributes
>        DEV    Specifies the devlink device to set.
> 
>        name NAME
>               Specifies the devlink action to set.

This is a little unclear to me?

>        period PERIOD
>               The period on which we limit the amount of performed actions, measured in seconds.
> 
>        count COUNT
>               The maximum amount of actions performed in a limit time frame.

Perhaps		    	    	      	      
                The maximum number of actions performed in a limited time frame.

>        fail   { ignore | down }
>                   Specify the behavior once count limit was reached.
> 
>                   ignore - Ignore errors without execution of any action.
> 
>                   down - Driver will remain in nonoperational state.
> 
>    devlink health action reinit - reset devlink action attributes (period, count, fail, etc)
>        DEV    Specifies the devlink device to set.
> 
>        name NAME
>               Specifies the devlink action to set.

Perhaps s/set/reinitialise/g for the above two descriptions.

Hope this helps,
Tobin.

^ permalink raw reply

* Re: [PATCH net-next v3 02/17] zinc: introduce minimal cryptography library
From: Ard Biesheuvel @ 2018-09-13 15:42 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Andrew Lutomirski, LKML, Netdev, David Miller, Greg Kroah-Hartman,
	Samuel Neves, Jean-Philippe Aumasson, Linux Crypto Mailing List
In-Reply-To: <CAHmME9pcODNc2m26DH1cQY6R+grxYW5dvFpNDpz_GiugM3qufw@mail.gmail.com>

On 13 September 2018 at 16:32, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> On Thu, Sep 13, 2018 at 7:41 AM Ard Biesheuvel
> <ard.biesheuvel@linaro.org> wrote:
>> But one of the supposed selling points of this crypto library is that
>> it gives engineers who are frightened of crypto in general and the
>> crypto API in particular simple and easy to use crypto primitives
>> rather than having to jump through the crypto API's hoops.
>
> The goal is for engineers who want to specifically use algorithm X
> from within the kernel in a non-dynamic way to be able to then use
> algorithm X with a simple function call. The goal is not to open it up
> to people who have no idea what they're doing; for that a NaCL-like
> library with functions like "crypto_box_open" or something would fit
> the bill; but that's also not what we're trying to do here. Please
> don't confuse the design goals. The rest of your email is therefore a
> bit of a straw man; cut the rhetoric out.
>
>> A crypto library whose only encryption algorithm is a stream cipher
>> does *not* deliver on that promise, since it is only suitable for
>> cases where IVs are guaranteed not to be reused.
>
> False. We also offer XChaCha20Poly1305, which takes a massive nonce,
> suitable for random generation.
>
> If there became a useful case for AES-PMAC-SIV or even AES-GCM or
> something to that extent, then Zinc would add that as required. But
> we're not going to start adding random ciphers unless they're needed.
>

I'd prefer it if all the accelerated software implementations live in
the same place. But I do strongly prefer arch code to live in
arch/$arch, simply because of the maintenance scope for arch
developers and maintainers.

I think AES-GCM is a useful example here. I really like the SIMD token
abstraction a lot, but I would like to understand how this would work
in Zinc if you have
a) a generic implementation
b) perhaps an arch specific scalar implementation
c) a pure NEON implementation
d) an implementation using AES instructions but not the PMULL instructions
e) an implementation that uses AES and PMULL instructions.

On arch/arm64 we support code patching, on other arches it may be
different. We also support udev loading of modules depending on CPU
capabilities, which means only the implementation you are likely to
use will be loaded, and other modules are disregarded.

I am not asking you to implement this now. But I do want to understand
how these use cases fit into Zinc. And note that this has nothing to
do with the crypto API: this could simply be a synchronous
aes_gcm_encrypt() library function that maps to any of the above at
runtime depending on the platform's capabilities.

>> You yourself were
>> bitten by the clunkiness of the crypto API when attempting to use the
>> SHA26 code, right? So shouldn't we move that into this crypto library
>> as well?
>
> As stated in the initial commit, and in numerous other emails
> stretching back a year, yes, sha256 and other things in lib/ are going
> to be put into Zinc following the initial merge of Zinc. These changes
> will happen incrementally, like everything else that happens in the
> kernel. Sha256, in particular, is probably the first thing I'll port
> post-merge.
>

Excellent.

>> I think it is reasonable for WireGuard to standardize on
>> ChaCha20/Poly1305 only, although I have my concerns about the flag day
>> that will be required if this 'one true cipher' ever does turn out to
>> be compromised (either that, or we will have to go back in time and
>> add some kind of protocol versioning to existing deployments of
>> WireGuard)
>
> Those concerns are not valid and have already been addressed (to you,
> I believe) on this mailing list and elsewhere. WireGuard is versioned,
> hence there's no need to "add" versioning, and it is prepared to roll
> out new cryptography in a subsequent version should there be any
> issues. In other words, your concern is based on a misunderstanding of
> the protocol. If you have issues, however, with the design decisions
> of WireGuard, something that's been heavily discussed with members of
> the linux kernel community, networking community, cryptography
> community, and so forth, for the last 3 years, I invite you to bring
> them up on <wireguard@lists.zx2c4.com>.
>

I'd prefer to have the discussion here, if you don't mind.

IIUC clients and servers need to run the same version of the protocol.
So does it require a flag day to switch the world over to the new
version when the old one is deprecated?

>> And frankly, if the code were as good as the prose, we wouldn't be
>> having this discussion.
>
> Please cut out this rhetoric. That's an obviously unprovable
> statement, but it probably isn't true anyway. I wish you'd stick to
> technical concerns only, rather than what appears to be a desire to
> derail this by any means necessary.
>

I apologize if I hit a nerve there, that was not my intention.

I am not an 'entrenched crypto API guy that is out to get you'. I am a
core ARM developer that takes an interest in crypto, shares your
concern about the usability of the crypto API, and tries to ensure
that what we end up is maintainable and usable for everybody.

>> Zinc adds its own clunky ways to mix arch and
>> generic code, involving GCC -include command line arguments and
>> #ifdefs everywhere. My review comments on this were completely ignored
>> by Jason.
>
> No, they were not ignored. v2 cleaned up the #ifdefs. v4 has already
> cleaned up the makefile stuff and will be even cleaner. Good things
> await, don't worry.
>

You know what? If you're up for it, let's not wait until Plumbers, but
instead, let's collaborate off list to get this into shape.

^ permalink raw reply

* Re: [PATCH net-next v3 02/17] zinc: introduce minimal cryptography library
From: Jason A. Donenfeld @ 2018-09-13 15:45 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: LKML, Netdev, David Miller, Greg Kroah-Hartman, Andrew Lutomirski,
	Samuel Neves, Jean-Philippe Aumasson, Linux Crypto Mailing List
In-Reply-To: <CAKv+Gu9VamLBv5Ji5AtvugUrAYQYaotLFZZGA=Rt18JUDEaAMQ@mail.gmail.com>

On Thu, Sep 13, 2018 at 5:04 PM Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
> > The code still benefits from the review that's gone into OpenSSL. It's
> > not modified in ways that would affect the cryptographic operations
> > being done. It's modified to be suitable for kernel space.
>
> So could we please at least have those changes as a separate patch then?

I'll experiment with a couple ways of trying to communicate with
precision what's been changed from OpenSSL for the next round of
patches.

> > That's interesting. I'll bring this up with AndyP. FWIW, if you think
> > you have a real and compelling claim here, I'd be much more likely to
> > accept a different ChaCha20 implementation than I would be to accept a
> > different Poly1305 implementation. (It's a *lot* harder to screw up
> > ChaCha20 than it is to screw up Poly1305.)
>
> The question is really whether we want different implementations in
> the crypto API and in zinc.

Per earlier in this discussion, I've already authored patches that
replaces the crypto API's implementations with simple calls to Zinc,
so that code isn't duplicated. These will be in v4 and you can comment
on the approach then.

> You are completely missing my point. I am not particularly invested in
> the crypto API, and I share the concerns about its usability. That is
> why I want to make sure that your solution actually results in a net
> improvement for everybody, not just for WireGuard, in a maintainable
> way.

Right, likewise. I've put quite a bit of effort into separating Zinc
into Zinc and not into something part of WireGuard. The motivation for
doing so is a decent amount of call sites all around the kernel that
I'd like to gradually fix up.

^ permalink raw reply

* Re: Regression: kernel 4.14 an later very slow with many ipsec tunnels
From: Wolfgang Walter @ 2018-09-13 15:46 UTC (permalink / raw)
  To: Florian Westphal
  Cc: netdev, Steffen Klassert, David Miller, Linux Kernel Mailing List,
	Linus Torvalds
In-Reply-To: <20180913135844.3ut6fxgx67t6ndtu@breakpoint.cc>

Am Donnerstag, 13. September 2018, 15:58:44 schrieb Florian Westphal:
> Wolfgang Walter <linux@stwm.de> wrote:
> > thanks to the fix from Steffen Klassert I could now run 4.14.69 + his
> > patch
> > and 4.18.7 + his patch without oopsing immediately.
> > 
> > But I found that those kernels perform very bad. They perform so bad that
> > they are unusable for our router with about 3000 ipsec tunnels (tunnel
> > mode network <-> network).
> 
> Can you do a 'perf record -a -g sleep 5' with 4.18 and provide 'perf
> report' result?
> 
> It would be good to see where those cycles are spent.

I'll try that but this isn't that easy as the router image does not contain 
perf. I also have to do that on our production router. I try to do that 
tomorrow evening.

What I can say is that it depends mainly on number of policy rules and SA.

Regards
-- 
Wolfgang Walter
Studentenwerk München
Anstalt des öffentlichen Rechts

^ permalink raw reply

* Re: [PATCH net-next v3 02/17] zinc: introduce minimal cryptography library
From: Jason A. Donenfeld @ 2018-09-13 15:58 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Andrew Lutomirski, LKML, Netdev, David Miller, Greg Kroah-Hartman,
	Samuel Neves, Jean-Philippe Aumasson, Linux Crypto Mailing List
In-Reply-To: <CAKv+Gu9GVbSETjp01tANMwJgA6O9aexhnH+47836KjZg+71q2A@mail.gmail.com>

On Thu, Sep 13, 2018 at 5:43 PM Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
> I'd prefer it if all the accelerated software implementations live in
> the same place. But I do strongly prefer arch code to live in
> arch/$arch

Zinc follows the scheme of the raid6 code, as well as of most other
crypto libraries: code is grouped by cipher, making it easy for people
to work with and understand differing implementations. It also allows
us to trivially link these together at compile time rather than at
link time, which makes cipher selection much more efficient. It's
really much more maintainable this way.

> I think AES-GCM is a useful example here. I really like the SIMD token
> abstraction a lot, but I would like to understand how this would work
> in Zinc if you have
> a) a generic implementation
> b) perhaps an arch specific scalar implementation
> c) a pure NEON implementation
> d) an implementation using AES instructions but not the PMULL instructions
> e) an implementation that uses AES and PMULL instructions.

The same way that Zinc currently chooses between the five different
implementations for, say, x86_64 ChaCha20:

- Generic C scalar
- SSSE3
- AVX2
- AVX512F
- AVX512VL

We make a decision based on CPU capabilities, SIMD context, and input
length, and then choose the right function.

> You know what? If you're up for it, let's not wait until Plumbers, but
> instead, let's collaborate off list to get this into shape.

Sure, sounds good.

Jason

^ permalink raw reply

* Re: [PATCH 1/2] netlink: add NLA_REJECT policy type
From: Michal Kubecek @ 2018-09-13 10:49 UTC (permalink / raw)
  To: Johannes Berg; +Cc: linux-wireless, netdev, Johannes Berg
In-Reply-To: <20180913084603.7979-1-johannes@sipsolutions.net>

On Thu, Sep 13, 2018 at 10:46:02AM +0200, Johannes Berg wrote:
> From: Johannes Berg <johannes.berg@intel.com>
> 
> In some situations some netlink attributes may be used for output
> only (kernel->userspace) or may be reserved for future use. It's
> then helpful to be able to prevent userspace from using them in
> messages sent to the kernel, since they'd otherwise be ignored and
> any future will become impossible if this happens.
> 
> Add NLA_REJECT to the policy which does nothing but reject (with
> EINVAL) validation of any messages containing this attribute.
> Allow for returning a specific extended ACK error message in the
> validation_data pointer.
> 
> While at it fix the indentation of NLA_BITFIELD32 and describe the
> validation_data pointer for it.
> 
> The specific case I have in mind now is a shared nested attribute
> containing request/response data, and it would be pointless and
> potentially confusing to have userspace include response data in
> the messages that actually contain a request.
> 
> Signed-off-by: Johannes Berg <johannes.berg@intel.com>
> ---
> ...
> @@ -251,10 +256,13 @@ int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
>  
>  		if (type > 0 && type <= maxtype) {
>  			if (policy) {
> -				err = validate_nla(nla, maxtype, policy);
> +				err = validate_nla(nla, maxtype, policy,
> +						   extack);
>  				if (err < 0) {
> -					NL_SET_ERR_MSG_ATTR(extack, nla,
> -							    "Attribute failed policy validation");
> +					NL_SET_BAD_ATTR(extack, nla);
> +					if (extack && !extack->_msg)
> +						NL_SET_ERR_MSG(extack,
> +							       "Attribute failed policy validation");
>  					goto errout;
>  				}
>  			}
> -- 

Technically, this would change the outcome when nla_parse() is called
with extack->_msg already set nad validate_nla() fails on something else
than NLA_REJECT; it will preserve the previous message in such case.
But I don't think this is a serious problem.

Reviewed-by: Michal Kubecek <mkubecek@suse.cz>

^ permalink raw reply

* Re: [PATCH v2] neighbour: confirm neigh entries when ARP packet is received
From: David Miller @ 2018-09-13 16:04 UTC (permalink / raw)
  To: vasilykh
  Cc: roopa, adobriyan, edumazet, stephen, jwestfall, w.bumiller,
	anarsoul, keescook, ihrachys, netdev, linux-kernel
In-Reply-To: <20180911180406.31283-1-vasilykh@arista.com>

From: Vasily Khoruzhick <vasilykh@arista.com>
Date: Tue, 11 Sep 2018 11:04:06 -0700

> Update 'confirmed' timestamp when ARP packet is received. It shouldn't
> affect locktime logic and anyway entry can be confirmed by any higher-layer
> protocol. Thus it makes to sense not to confirm it when ARP packet is
> received.
> 
> Fixes: 77d7123342 ("neighbour: update neigh timestamps iff update is
> effective")
> 
> Signed-off-by: Vasily Khoruzhick <vasilykh@arista.com>
> ---
> v2: - update comment to match new code.

Please fix the wording in this commit message, as per Sergei's
feedback.

Also, the Fixes: tag should be all one line (people grep for these
strings in the repository) and with no empty lines between it and the
rest of the tags such as Signed-off-by:

Thanks.

^ permalink raw reply

* Re: [PATCH] knav: qmss: Introduce queue descriptors monitor
From: David Miller @ 2018-09-13 16:07 UTC (permalink / raw)
  To: gomonovych
  Cc: w-kwok2, m-karicheri2, grygorii.strashko, vasyl.gomonovych,
	ssantosh, linux-kernel, netdev, linux-arm-kernel
In-Reply-To: <20180911211549.12092-1-gomonovych@gmail.com>

From: Vasyl Gomonovych <gomonovych@gmail.com>
Date: Tue, 11 Sep 2018 23:15:47 +0200

> Monitor and record available descriptors in knav_qmss queues
> Get amount of available descriptors in free-descriptor queue
> base on event-triggered RX traffic.
> Also monitor free-descriptor queue base on periodic time interval
> in kernel thread.
> To start monitoring available descriptors in queue earlyi,
> module parameters, enable start monitoring in boottime
> 
> This queue descriptor monitor helps debugging starvation issue.
> The monitor should help debug queue under traffic pressure
> and can describe the shape of this pressure when a queue
> faced descriptors starvation.
> Monitor helpful for IP blocks which do not have dedicated
> descriptor starvation interrupt like RapidIO IP.
> 
> Registration and enable file in debugfs hierarchy
> 
> |-/sys/kernel/debug
> |-- knav_qmssm_soc:hwqueue@2a40000
> |   |-- 8710
> |   |   |-- buffer_size
> |   |   |-- enable
> |   |   |-- monitor_stats
> |   |   -- unregister
> 
> ---
> 
> The current implementation is the first iteration
> and require additional work.
> By this patch I would like to know does this could be
> helpful for other components and continue my work in a right way.
> 
> Signed-off-by: Vasyl Gomonovych <vasyl.gomonovych@nokia.com>

This is way over engineered for just keeping some statistics around.

Just have a periodic timer or a workqueue that does the necessary
sampling.

^ permalink raw reply

* Re: KMSAN: uninit-value in pppoe_rcv
From: Guillaume Nault @ 2018-09-13 16:19 UTC (permalink / raw)
  To: Alexander Potapenko
  Cc: Eric Dumazet, syzbot+f5f6080811c849739212, LKML, mostrows,
	Networking, syzkaller-bugs
In-Reply-To: <CAG_fn=WXPsQBWR1GDsprY3aL4oW0v6EGhNB5wrvPJCOMh_U7wQ@mail.gmail.com>

On Thu, Sep 13, 2018 at 04:12:38PM +0200, Alexander Potapenko wrote:
> On Thu, Sep 13, 2018 at 3:57 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> >
> >
> > On 09/12/2018 03:38 AM, Alexander Potapenko wrote:
> > > On Wed, Sep 12, 2018 at 12:24 PM syzbot
> > > <syzbot+f5f6080811c849739212@syzkaller.appspotmail.com> wrote:
> > >>
> > >> Hello,
> > >>
> > >> syzbot found the following crash on:
> > >>
> > >> HEAD commit:    d2d741e5d189 kmsan: add initialization for shmem pages
> > >> git tree:       https://github.com/google/kmsan.git/master
> > >> console output: https://syzkaller.appspot.com/x/log.txt?x=1465fc37800000
> > >> kernel config:  https://syzkaller.appspot.com/x/.config?x=48f9de3384bcd0f
> > >> dashboard link: https://syzkaller.appspot.com/bug?extid=f5f6080811c849739212
> > >> compiler:       clang version 7.0.0 (trunk 329391)
> > >> syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=14d6e607800000
> > >> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=10a15b5b800000
> > >>
> > >> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> > >> Reported-by: syzbot+f5f6080811c849739212@syzkaller.appspotmail.com
> > >>
> > >> IPVS: ftp: loaded support on port[0] = 21
> > >> ==================================================================
> > >> BUG: KMSAN: uninit-value in __get_item drivers/net/ppp/pppoe.c:172 [inline]
> > >> BUG: KMSAN: uninit-value in get_item drivers/net/ppp/pppoe.c:236 [inline]
> > >> BUG: KMSAN: uninit-value in pppoe_rcv+0xcef/0x10e0
> > >> drivers/net/ppp/pppoe.c:450
> > >> CPU: 0 PID: 4543 Comm: syz-executor355 Not tainted 4.16.0+ #87
> > >> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> > >> Google 01/01/2011
> > >> Call Trace:
> > >>   __dump_stack lib/dump_stack.c:17 [inline]
> > >>   dump_stack+0x185/0x1d0 lib/dump_stack.c:53
> > >>   kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067
> > >>   __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:683
> > >>   __get_item drivers/net/ppp/pppoe.c:172 [inline]
> > >>   get_item drivers/net/ppp/pppoe.c:236 [inline]
> > >>   pppoe_rcv+0xcef/0x10e0 drivers/net/ppp/pppoe.c:450
> > >>   __netif_receive_skb_core+0x47df/0x4a90 net/core/dev.c:4562
> > >>   __netif_receive_skb net/core/dev.c:4627 [inline]
> > >>   netif_receive_skb_internal+0x49d/0x630 net/core/dev.c:4701
> > >>   netif_receive_skb+0x230/0x240 net/core/dev.c:4725
> > >>   tun_rx_batched drivers/net/tun.c:1555 [inline]
> > >>   tun_get_user+0x740f/0x7c60 drivers/net/tun.c:1962
> > >>   tun_chr_write_iter+0x1d4/0x330 drivers/net/tun.c:1990
> > >>   call_write_iter include/linux/fs.h:1782 [inline]
> > >>   new_sync_write fs/read_write.c:469 [inline]
> > >>   __vfs_write+0x7fb/0x9f0 fs/read_write.c:482
> > >>   vfs_write+0x463/0x8d0 fs/read_write.c:544
> > >>   SYSC_write+0x172/0x360 fs/read_write.c:589
> > >>   SyS_write+0x55/0x80 fs/read_write.c:581
> > >>   do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
> > >>   entry_SYSCALL_64_after_hwframe+0x3d/0xa2
> > >> RIP: 0033:0x4447c9
> > >> RSP: 002b:00007fff64c8fc28 EFLAGS: 00000297 ORIG_RAX: 0000000000000001
> > >> RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00000000004447c9
> > >> RDX: 000000000000fd87 RSI: 0000000020000600 RDI: 0000000000000004
> > >> RBP: 00000000006cf018 R08: 00007fff64c8fda8 R09: 00007fff00006bda
> > >> R10: 0000000000005fe7 R11: 0000000000000297 R12: 00000000004020d0
> > >> R13: 0000000000402160 R14: 0000000000000000 R15: 0000000000000000
> > >>
> > >> Uninit was created at:
> > >>   kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline]
> > >>   kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:188
> > >>   kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:314
> > >>   kmsan_slab_alloc+0x11/0x20 mm/kmsan/kmsan.c:321
> > >>   slab_post_alloc_hook mm/slab.h:445 [inline]
> > >>   slab_alloc_node mm/slub.c:2737 [inline]
> > >>   __kmalloc_node_track_caller+0xaed/0x11c0 mm/slub.c:4369
> > >>   __kmalloc_reserve net/core/skbuff.c:138 [inline]
> > >>   __alloc_skb+0x2cf/0x9f0 net/core/skbuff.c:206
> > >>   alloc_skb include/linux/skbuff.h:984 [inline]
> > >>   alloc_skb_with_frags+0x1d4/0xb20 net/core/skbuff.c:5234
> > >>   sock_alloc_send_pskb+0xb56/0x1190 net/core/sock.c:2085
> > >>   tun_alloc_skb drivers/net/tun.c:1532 [inline]
> > >>   tun_get_user+0x2242/0x7c60 drivers/net/tun.c:1829
> > >>   tun_chr_write_iter+0x1d4/0x330 drivers/net/tun.c:1990
> > >>   call_write_iter include/linux/fs.h:1782 [inline]
> > >>   new_sync_write fs/read_write.c:469 [inline]
> > >>   __vfs_write+0x7fb/0x9f0 fs/read_write.c:482
> > >>   vfs_write+0x463/0x8d0 fs/read_write.c:544
> > >>   SYSC_write+0x172/0x360 fs/read_write.c:589
> > >>   SyS_write+0x55/0x80 fs/read_write.c:581
> > >>   do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
> > >>   entry_SYSCALL_64_after_hwframe+0x3d/0xa2
> > >> ==================================================================
> > > I did a little digging before sending the bug upstream.
> > > If I add memset(obj, 0xfe, size) to __kmalloc_reserve(), these 0xfe
> > > bytes are visible in __get_item() at the place where KMSAN reports an
> > > error.
> > >
> > > The problem is somehow related to tun_get_user() creating a fragmented
> > > sk_buff - when I change the call to tun_alloc_skb() so that it
> > > allocates a single buffer the bug goes away.
> > >
> >
> > I guess the following patch would fix the issue
> >
> > (I will submit it more formally)
> No, as far as I can see it doesn't.
> Saving sid before __skb_pull() is still a good idea, but in this
> particular case |ph| doesn't change.

Yes, we probably need to save sid.

But I think the problem found by syzbot is related to
eth_hdr(skb)->h_source. PPPoE expects that Ethernet header has already
been parsed and is accessible at skb_mac_header(skb).
But here skb_mac_header(skb) == skb->data, and we may pull only 6 bytes
(sizeof(truct pppoe_hdr)). Therefore eth_hdr(skb)->h_source points past
skb's head length.

Not sure if something needs to be changed in tun.c for properly setting
skb_mac_header. But PPPoE has no reason to consider packets from
non-Ethernet devices anyway.

^ permalink raw reply

* Re: [PATCH net-next V2 00/11] vhost_net TX batching
From: David Miller @ 2018-09-13 16:28 UTC (permalink / raw)
  To: jasowang; +Cc: netdev, linux-kernel, kvm, virtualization, mst
In-Reply-To: <20180912031709.14112-1-jasowang@redhat.com>

From: Jason Wang <jasowang@redhat.com>
Date: Wed, 12 Sep 2018 11:16:58 +0800

> This series tries to batch submitting packets to underlayer socket
> through msg_control during sendmsg(). This is done by:
 ...

Series applied, thanks Jason.

^ permalink raw reply

* Re: [PATCH net-next v3 1/2] net: stmmac: Rework coalesce timer and fix multi-queue races
From: Neil Armstrong @ 2018-09-13 11:24 UTC (permalink / raw)
  To: Jose Abreu, netdev
  Cc: Florian Fainelli, Jerome Brunet, Martin Blumenstingl,
	David S. Miller, Joao Pinto, Giuseppe Cavallaro, Alexandre Torgue
In-Reply-To: <b7da85496a6f536decb54bc9cc8f9a77d3fef978.1536762575.git.joabreu@synopsys.com>

Hi Jose,

On 13/09/2018 10:02, Jose Abreu wrote:
> This follows David Miller advice and tries to fix coalesce timer in
> multi-queue scenarios.
> 
> We are now using per-queue coalesce values and per-queue TX timer.
> 
> Coalesce timer default values was changed to 1ms and the coalesce frames
> to 25.
> 
> Tested in B2B setup between XGMAC2 and GMAC5.

I will run a test today,

Thanks,
Neil

> 
> Signed-off-by: Jose Abreu <joabreu@synopsys.com>
> Cc: Florian Fainelli <f.fainelli@gmail.com>
> Cc: Neil Armstrong <narmstrong@baylibre.com>
> Cc: Jerome Brunet <jbrunet@baylibre.com>
> Cc: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
> Cc: David S. Miller <davem@davemloft.net>
> Cc: Joao Pinto <jpinto@synopsys.com>
> Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
> Cc: Alexandre Torgue <alexandre.torgue@st.com>
> ---
>  drivers/net/ethernet/stmicro/stmmac/common.h      |   4 +-
>  drivers/net/ethernet/stmicro/stmmac/stmmac.h      |  14 +-
>  drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 233 ++++++++++++----------
>  include/linux/stmmac.h                            |   1 +
>  4 files changed, 146 insertions(+), 106 deletions(-)

[...]

^ permalink raw reply

* Re: KMSAN: uninit-value in pppoe_rcv
From: Eric Dumazet @ 2018-09-13 16:35 UTC (permalink / raw)
  To: Alexander Potapenko, Eric Dumazet
  Cc: syzbot+f5f6080811c849739212, LKML, mostrows, Networking,
	syzkaller-bugs
In-Reply-To: <CAG_fn=WXPsQBWR1GDsprY3aL4oW0v6EGhNB5wrvPJCOMh_U7wQ@mail.gmail.com>



On 09/13/2018 07:12 AM, Alexander Potapenko wrote:
> On Thu, Sep 13, 2018 at 3:57 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>
>>
>>
>> On 09/12/2018 03:38 AM, Alexander Potapenko wrote:
>>> On Wed, Sep 12, 2018 at 12:24 PM syzbot
>>> <syzbot+f5f6080811c849739212@syzkaller.appspotmail.com> wrote:
>>>>
>>>> Hello,
>>>>
>>>> syzbot found the following crash on:
>>>>
>>>> HEAD commit:    d2d741e5d189 kmsan: add initialization for shmem pages
>>>> git tree:       https://github.com/google/kmsan.git/master
>>>> console output: https://syzkaller.appspot.com/x/log.txt?x=1465fc37800000
>>>> kernel config:  https://syzkaller.appspot.com/x/.config?x=48f9de3384bcd0f
>>>> dashboard link: https://syzkaller.appspot.com/bug?extid=f5f6080811c849739212
>>>> compiler:       clang version 7.0.0 (trunk 329391)
>>>> syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=14d6e607800000
>>>> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=10a15b5b800000
>>>>
>>>> IMPORTANT: if you fix the bug, please add the following tag to the commit:
>>>> Reported-by: syzbot+f5f6080811c849739212@syzkaller.appspotmail.com
>>>>
>>>> IPVS: ftp: loaded support on port[0] = 21
>>>> ==================================================================
>>>> BUG: KMSAN: uninit-value in __get_item drivers/net/ppp/pppoe.c:172 [inline]
>>>> BUG: KMSAN: uninit-value in get_item drivers/net/ppp/pppoe.c:236 [inline]
>>>> BUG: KMSAN: uninit-value in pppoe_rcv+0xcef/0x10e0
>>>> drivers/net/ppp/pppoe.c:450
>>>> CPU: 0 PID: 4543 Comm: syz-executor355 Not tainted 4.16.0+ #87
>>>> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
>>>> Google 01/01/2011
>>>> Call Trace:
>>>>   __dump_stack lib/dump_stack.c:17 [inline]
>>>>   dump_stack+0x185/0x1d0 lib/dump_stack.c:53
>>>>   kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067
>>>>   __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:683
>>>>   __get_item drivers/net/ppp/pppoe.c:172 [inline]
>>>>   get_item drivers/net/ppp/pppoe.c:236 [inline]
>>>>   pppoe_rcv+0xcef/0x10e0 drivers/net/ppp/pppoe.c:450
>>>>   __netif_receive_skb_core+0x47df/0x4a90 net/core/dev.c:4562
>>>>   __netif_receive_skb net/core/dev.c:4627 [inline]
>>>>   netif_receive_skb_internal+0x49d/0x630 net/core/dev.c:4701
>>>>   netif_receive_skb+0x230/0x240 net/core/dev.c:4725
>>>>   tun_rx_batched drivers/net/tun.c:1555 [inline]
>>>>   tun_get_user+0x740f/0x7c60 drivers/net/tun.c:1962
>>>>   tun_chr_write_iter+0x1d4/0x330 drivers/net/tun.c:1990
>>>>   call_write_iter include/linux/fs.h:1782 [inline]
>>>>   new_sync_write fs/read_write.c:469 [inline]
>>>>   __vfs_write+0x7fb/0x9f0 fs/read_write.c:482
>>>>   vfs_write+0x463/0x8d0 fs/read_write.c:544
>>>>   SYSC_write+0x172/0x360 fs/read_write.c:589
>>>>   SyS_write+0x55/0x80 fs/read_write.c:581
>>>>   do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
>>>>   entry_SYSCALL_64_after_hwframe+0x3d/0xa2
>>>> RIP: 0033:0x4447c9
>>>> RSP: 002b:00007fff64c8fc28 EFLAGS: 00000297 ORIG_RAX: 0000000000000001
>>>> RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00000000004447c9
>>>> RDX: 000000000000fd87 RSI: 0000000020000600 RDI: 0000000000000004
>>>> RBP: 00000000006cf018 R08: 00007fff64c8fda8 R09: 00007fff00006bda
>>>> R10: 0000000000005fe7 R11: 0000000000000297 R12: 00000000004020d0
>>>> R13: 0000000000402160 R14: 0000000000000000 R15: 0000000000000000
>>>>
>>>> Uninit was created at:
>>>>   kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline]
>>>>   kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:188
>>>>   kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:314
>>>>   kmsan_slab_alloc+0x11/0x20 mm/kmsan/kmsan.c:321
>>>>   slab_post_alloc_hook mm/slab.h:445 [inline]
>>>>   slab_alloc_node mm/slub.c:2737 [inline]
>>>>   __kmalloc_node_track_caller+0xaed/0x11c0 mm/slub.c:4369
>>>>   __kmalloc_reserve net/core/skbuff.c:138 [inline]
>>>>   __alloc_skb+0x2cf/0x9f0 net/core/skbuff.c:206
>>>>   alloc_skb include/linux/skbuff.h:984 [inline]
>>>>   alloc_skb_with_frags+0x1d4/0xb20 net/core/skbuff.c:5234
>>>>   sock_alloc_send_pskb+0xb56/0x1190 net/core/sock.c:2085
>>>>   tun_alloc_skb drivers/net/tun.c:1532 [inline]
>>>>   tun_get_user+0x2242/0x7c60 drivers/net/tun.c:1829
>>>>   tun_chr_write_iter+0x1d4/0x330 drivers/net/tun.c:1990
>>>>   call_write_iter include/linux/fs.h:1782 [inline]
>>>>   new_sync_write fs/read_write.c:469 [inline]
>>>>   __vfs_write+0x7fb/0x9f0 fs/read_write.c:482
>>>>   vfs_write+0x463/0x8d0 fs/read_write.c:544
>>>>   SYSC_write+0x172/0x360 fs/read_write.c:589
>>>>   SyS_write+0x55/0x80 fs/read_write.c:581
>>>>   do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
>>>>   entry_SYSCALL_64_after_hwframe+0x3d/0xa2
>>>> ==================================================================
>>> I did a little digging before sending the bug upstream.
>>> If I add memset(obj, 0xfe, size) to __kmalloc_reserve(), these 0xfe
>>> bytes are visible in __get_item() at the place where KMSAN reports an
>>> error.
>>>
>>> The problem is somehow related to tun_get_user() creating a fragmented
>>> sk_buff - when I change the call to tun_alloc_skb() so that it
>>> allocates a single buffer the bug goes away.
>>>
>>
>> I guess the following patch would fix the issue
>>
>> (I will submit it more formally)
> No, as far as I can see it doesn't.
> Saving sid before __skb_pull() is still a good idea, but in this
> particular case |ph| doesn't change.
>> diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
>> index ce61231e96ea5fe27f512fbd0d80d4609997e508..333e967ed968ea3ff2dda25289f7f657263db2b9 100644
>> --- a/drivers/net/ppp/pppoe.c
>> +++ b/drivers/net/ppp/pppoe.c
>> @@ -423,6 +423,7 @@ static int pppoe_rcv(struct sk_buff *skb, struct net_device *dev,
>>         struct pppoe_hdr *ph;
>>         struct pppox_sock *po;
>>         struct pppoe_net *pn;
>> +       __be16 sid;
>>         int len;
>>
>>         skb = skb_share_check(skb, GFP_ATOMIC);
>> @@ -434,6 +435,7 @@ static int pppoe_rcv(struct sk_buff *skb, struct net_device *dev,
>>
>>         ph = pppoe_hdr(skb);
>>         len = ntohs(ph->length);

Then ph->length needs to be better validated.

>> +       sid = ph->sid;

I'll take a look, thanks.

^ permalink raw reply

* Re: Regression: kernel 4.14 an later very slow with many ipsec tunnels
From: Florian Westphal @ 2018-09-13 16:38 UTC (permalink / raw)
  To: Wolfgang Walter
  Cc: Florian Westphal, netdev, Steffen Klassert, David Miller,
	Linux Kernel Mailing List, Linus Torvalds
In-Reply-To: <3448099.9yk84El3Sa@stwm.de>

Wolfgang Walter <linux@stwm.de> wrote:
> I'll try that but this isn't that easy as the router image does not contain 
> perf. I also have to do that on our production router. I try to do that 
> tomorrow evening.

No need if its too difficult.

> What I can say is that it depends mainly on number of policy rules and SA.

Thats already a good hint, I guess we're hitting long hash chains in
xfrm_policy_lookup_bytype().

^ permalink raw reply

* Regression: kernel 4.14 an later very slow with many ipsec tunnels
From: Wolfgang Walter @ 2018-09-13 11:30 UTC (permalink / raw)
  To: netdev
  Cc: Florian Westphal, Steffen Klassert, David Miller,
	Linux Kernel Mailing List, Linus Torvalds

Hello,

thanks to the fix from Steffen Klassert I could now run 4.14.69 + his patch 
and 4.18.7 + his patch without oopsing immediately.

But I found that those kernels perform very bad. They perform so bad that they 
are unusable for our router with about 3000 ipsec tunnels (tunnel mode network 
<-> network).

With 4.9. (and all other kernels I used in the last 10 years with much less 
potent hardware) I never had an comparable performance issue with networking.

4.18.7 is better then 4.14.69 but still remains unusuable for us.

Even with very little traffic all 8 cores are working 100% in ksoftirqd. As 
soon as there is real traffic network gets rather unusuable.

Latency of packets goes from between 0.1ms to 1ms up to 100ms to 500ms (4.14) 
or between 15ms to 90ms (4.18).

Throughput also suffers a lot.

I have a simple test I run after every upgrade. This test basically copies 
with scp large files to 60 different remote locations (involving ipsec), 
limited to 1GBit/s combined, and in paralled I ping from different networks 
over this router to machines in other networks of this router (no ipsec-
tunnels involved).

With 4.9 and earlier copying needs about 2 minutes and the pings all remain 
under 2ms roundtrip. 

With 4.14 copying these files needs more than one our. The roundtrip time of 
the ping is > 1 second.

With 4.18 this is much better, copying needs around 6 minutes and ping 
roundtrip is between 30ms and 180ms. But even that is much worse then 4.9.

I think this dramatic loss in performance is due to the removal of the flow 
cache. I propose to revert that for 4.14. I also propose to revert it for the 
next longterm kernel if no other solution is found bringing back 4.9 
performance (at least about the same order of magnitude).

Maybe it should generally be reverted until a solution to the problem exists.

Regards,
-- 
Wolfgang Walter
Studentenwerk München
Anstalt des öffentlichen Rechts

^ permalink raw reply

* [PATCH net-next] net: dsa: b53: Do not fail when IRQ are not initialized
From: Florian Fainelli @ 2018-09-13 16:50 UTC (permalink / raw)
  To: netdev
  Cc: Florian Fainelli, Andrew Lunn, Vivien Didelot, David S. Miller,
	open list

When the Device Tree is not providing the per-port interrupts, do not fail
during b53_srab_irq_enable() but instead bail out gracefully. The SRAB driver
is used on the BCM5301X (Northstar) platforms which do not yet have the SRAB
interrupts wired up.

Fixes: 16994374a6fc ("net: dsa: b53: Make SRAB driver manage port interrupts")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/dsa/b53/b53_srab.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/net/dsa/b53/b53_srab.c b/drivers/net/dsa/b53/b53_srab.c
index 321052732799..90f514252987 100644
--- a/drivers/net/dsa/b53/b53_srab.c
+++ b/drivers/net/dsa/b53/b53_srab.c
@@ -415,7 +415,13 @@ static int b53_srab_irq_enable(struct b53_device *dev, int port)
 {
 	struct b53_srab_priv *priv = dev->priv;
 	struct b53_srab_port_priv *p = &priv->port_intrs[port];
-	int ret;
+	int ret = 0;
+
+	/* Interrupt is optional and was not specified, do not make
+	 * this fatal
+	 */
+	if (p->irq == -ENXIO)
+		return ret;
 
 	ret = request_threaded_irq(p->irq, b53_srab_port_isr,
 				   b53_srab_port_thread, 0,
-- 
2.17.1

^ permalink raw reply related

* [PATCH] socket: fix struct ifreq size in compat ioctl
From: Johannes Berg @ 2018-09-13 11:49 UTC (permalink / raw)
  To: netdev; +Cc: Robert O'Callahan, Al Viro, Johannes Berg

From: Johannes Berg <johannes.berg@intel.com>

As reported by Reobert O'Callahan, since Viro's commit to kill
dev_ifsioc() we attempt to copy too much data in compat mode,
which may lead to EFAULT when the 32-bit version of struct ifreq
sits at/near the end of a page boundary, and the next page isn't
mapped.

Fix this by passing whether or not we're doing a compat call and
copying the appropriate size in/out, as we did before. This works
because only the embedded "struct ifmap" has different size, and
this is only used in SIOCGIFMAP/SIOCSIFMAP which has a different
handler. All other parts of the union are naturally compatible.

Fixes: bf4405737f9f ("kill dev_ifsioc()")
Reported-by: Robert O'Callahan <robert@ocallahan.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/socket.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/net/socket.c b/net/socket.c
index e6945e318f02..cef0725a2aaf 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -941,7 +941,8 @@ void dlci_ioctl_set(int (*hook) (unsigned int, void __user *))
 EXPORT_SYMBOL(dlci_ioctl_set);
 
 static long sock_do_ioctl(struct net *net, struct socket *sock,
-				 unsigned int cmd, unsigned long arg)
+			  unsigned int cmd, unsigned long arg,
+			  bool compat)
 {
 	int err;
 	void __user *argp = (void __user *)arg;
@@ -967,11 +968,15 @@ static long sock_do_ioctl(struct net *net, struct socket *sock,
 	} else {
 		struct ifreq ifr;
 		bool need_copyout;
-		if (copy_from_user(&ifr, argp, sizeof(struct ifreq)))
+		if (copy_from_user(&ifr, argp,
+				   compat ? sizeof(struct compat_ifreq) :
+					    sizeof(struct ifreq)))
 			return -EFAULT;
 		err = dev_ioctl(net, cmd, &ifr, &need_copyout);
 		if (!err && need_copyout)
-			if (copy_to_user(argp, &ifr, sizeof(struct ifreq)))
+			if (copy_to_user(argp, &ifr,
+					 compat ? sizeof(struct compat_ifreq) :
+						  sizeof(struct ifreq)))
 				return -EFAULT;
 	}
 	return err;
@@ -1070,7 +1075,7 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 			err = open_related_ns(&net->ns, get_net_ns);
 			break;
 		default:
-			err = sock_do_ioctl(net, sock, cmd, arg);
+			err = sock_do_ioctl(net, sock, cmd, arg, false);
 			break;
 		}
 	return err;
@@ -2750,7 +2755,7 @@ static int do_siocgstamp(struct net *net, struct socket *sock,
 	int err;
 
 	set_fs(KERNEL_DS);
-	err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv);
+	err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv, true);
 	set_fs(old_fs);
 	if (!err)
 		err = compat_put_timeval(&ktv, up);
@@ -2766,7 +2771,7 @@ static int do_siocgstampns(struct net *net, struct socket *sock,
 	int err;
 
 	set_fs(KERNEL_DS);
-	err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts);
+	err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts, true);
 	set_fs(old_fs);
 	if (!err)
 		err = compat_put_timespec(&kts, up);
@@ -3072,7 +3077,7 @@ static int routing_ioctl(struct net *net, struct socket *sock,
 	}
 
 	set_fs(KERNEL_DS);
-	ret = sock_do_ioctl(net, sock, cmd, (unsigned long) r);
+	ret = sock_do_ioctl(net, sock, cmd, (unsigned long) r, true);
 	set_fs(old_fs);
 
 out:
@@ -3185,7 +3190,7 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
 	case SIOCBONDSETHWADDR:
 	case SIOCBONDCHANGEACTIVE:
 	case SIOCGIFNAME:
-		return sock_do_ioctl(net, sock, cmd, arg);
+		return sock_do_ioctl(net, sock, cmd, arg, true);
 	}
 
 	return -ENOIOCTLCMD;
-- 
2.14.4

^ permalink raw reply related

* Re: [RFC PATCH iproute2-next] man: Add devlink health man page
From: Eran Ben Elisha @ 2018-09-13 11:58 UTC (permalink / raw)
  To: Tobin C. Harding
  Cc: netdev, Jiri Pirko, Andy Gospodarek, Michael Chan, Jakub Kicinski,
	Simon Horman, Alexander Duyck, Andrew Lunn, Florian Fainelli,
	Tal Alon, Ariel Almog
In-Reply-To: <20180913102732.GG11198@eros>



On 9/13/2018 1:27 PM, Tobin C. Harding wrote:
> On Thu, Sep 13, 2018 at 11:18:16AM +0300, Eran Ben Elisha wrote:
>> Add devlink-health man page. Devlink-health tool will control device
>> health attributes, sensors, actions and logging.
>>
>> Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
>>
>> -------------------------------------------------------
>> Copy paste man output to here for easier review process of the RFC.
>>
>> DEVLINK-HEALTH(8)                                                                                               Linux                                                                                              DEVLINK-HEALTH(8)
>>
>> NAME
>>         devlink-health - devlink health configuration
>>
>> SYNOPSIS
>>         devlink [ OPTIONS ] health  { COMMAND | help }
>>
>>         OPTIONS := { -V[ersion] | -n[no-nice-names] }
>>
>>         devlink health show [ DEV ] [ sensor NAME ]
>>
>>         devlink health sensor set DEV name NAME [ action NAME { active | inactive } ]"
>>
>>         devlink health action set DEV name NAME period PERIOD count COUNT fail { ignore | down }
>>
>>         devlink health action reinit DEV name NAME
>>
>>         devlink health help
>>
>> DESCRIPTION
>>         devlink-health tool allows user to configure the way driver treats unexpected status. The tool allows configuration of the sensors that can trigger health activity. Set for each sensor the follow up operations, such as,
>>         reset and dump of info. In addition, set the health activity termination action.
>>
>>     devlink health show - Display devlink health sensors and actions attributes
>>         DEV - Specifies the devlink device to show.  If this argument is omitted, all devices are listed.
>>
>>             Format is:
>>               BUS_NAME/BUS_ADDRESS
>>
>>         sensor NAME - Specifies the devlink sensor to show.
>>
> 
> Perhaps the commands should include the optional arguments so when
> reading the description one doesn't have to scroll to the top of the
> page all the time
> 
> e.g
>       devlink health show [ DEV ] [ sensor NAME ] - Display devlink health sensors and actions attributes
> 

I followed the scheme presented in all other devlink man pages.
see devlink-region, devlink-port, etc.

 From my perspective, I am fine with adding it to devlink-health, need 
ack from the devlink maintainer to see if he likes it...

>>     devlink health sensor set - sets devlink health sensor attributes
>>         DEV    Specifies the devlink device to show.
> 
> 	 		      	      	     	set
> 
>>         name NAME
>>                Name of the sensor to set.
>>
>>         action NAME { active | inactive }
>>                    Specify which actions to activate and which to deactivate once a sensor was triggered. actions can be dump, reset, etc.
>>
>>     devlink health action set - sets devlink action attributes
>>         DEV    Specifies the devlink device to set.
>>
>>         name NAME
>>                Specifies the devlink action to set.
> 
> This is a little unclear to me?

what is not clear? the term 'action' or the naming? can you elaborate?

> 
>>         period PERIOD
>>                The period on which we limit the amount of performed actions, measured in seconds.
>>
>>         count COUNT
>>                The maximum amount of actions performed in a limit time frame.
> 
> Perhaps		    	    	      	
>                  The maximum number of actions performed in a limited time frame.

ack

> 
>>         fail   { ignore | down }
>>                    Specify the behavior once count limit was reached.
>>
>>                    ignore - Ignore errors without execution of any action.
>>
>>                    down - Driver will remain in nonoperational state.
>>
>>     devlink health action reinit - reset devlink action attributes (period, count, fail, etc)
>>         DEV    Specifies the devlink device to set.
>>
>>         name NAME
>>                Specifies the devlink action to set.
> 
> Perhaps s/set/reinitialise/g for the above two descriptions.

ack

> 
> Hope this helps,
> Tobin.

thanks

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox