Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH bpf-next v4 6/8] samples/bpf: add use of need_wakeup flag in xdpsock
From: Jonathan Lemon @ 2019-08-14 14:53 UTC (permalink / raw)
  To: Magnus Karlsson
  Cc: bjorn.topel, ast, daniel, netdev, brouer, maximmi, bpf,
	bruce.richardson, ciara.loftus, jakub.kicinski, xiaolong.ye,
	qi.z.zhang, sridhar.samudrala, kevin.laatz, ilias.apalodimas,
	kiran.patil, axboe, maciej.fijalkowski, maciejromanfijalkowski,
	intel-wired-lan
In-Reply-To: <1565767643-4908-7-git-send-email-magnus.karlsson@intel.com>



On 14 Aug 2019, at 0:27, Magnus Karlsson wrote:

> This commit adds using the need_wakeup flag to the xdpsock sample
> application. It is turned on by default as we think it is a feature
> that seems to always produce a performance benefit, if the application
> has been written taking advantage of it. It can be turned off in the
> sample app by using the '-m' command line option.
>
> The txpush and l2fwd sub applications have also been updated to
> support poll() with multiple sockets.
>
> Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>

Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>


> ---
>  samples/bpf/xdpsock_user.c | 192 
> ++++++++++++++++++++++++++++-----------------
>  1 file changed, 120 insertions(+), 72 deletions(-)
>
> diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
> index 93eaaf7..da84c76 100644
> --- a/samples/bpf/xdpsock_user.c
> +++ b/samples/bpf/xdpsock_user.c
> @@ -67,8 +67,10 @@ static int opt_ifindex;
>  static int opt_queue;
>  static int opt_poll;
>  static int opt_interval = 1;
> -static u32 opt_xdp_bind_flags;
> +static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP;
>  static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
> +static int opt_timeout = 1000;
> +static bool opt_need_wakeup = true;
>  static __u32 prog_id;
>
>  struct xsk_umem_info {
> @@ -352,6 +354,7 @@ static struct option long_options[] = {
>  	{"zero-copy", no_argument, 0, 'z'},
>  	{"copy", no_argument, 0, 'c'},
>  	{"frame-size", required_argument, 0, 'f'},
> +	{"no-need-wakeup", no_argument, 0, 'm'},
>  	{0, 0, 0, 0}
>  };
>
> @@ -372,6 +375,7 @@ static void usage(const char *prog)
>  		"  -z, --zero-copy      Force zero-copy mode.\n"
>  		"  -c, --copy           Force copy mode.\n"
>  		"  -f, --frame-size=n   Set the frame size (must be a power of two, 
> default is %d).\n"
> +		"  -m, --no-need-wakeup Turn off use of driver need wakeup flag.\n"
>  		"\n";
>  	fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE);
>  	exit(EXIT_FAILURE);
> @@ -384,8 +388,9 @@ static void parse_command_line(int argc, char 
> **argv)
>  	opterr = 0;
>
>  	for (;;) {
> -		c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:", long_options,
> -				&option_index);
> +
> +		c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:m",
> +				long_options, &option_index);
>  		if (c == -1)
>  			break;
>
> @@ -429,6 +434,9 @@ static void parse_command_line(int argc, char 
> **argv)
>  			break;
>  		case 'f':
>  			opt_xsk_frame_size = atoi(optarg);
> +		case 'm':
> +			opt_need_wakeup = false;
> +			opt_xdp_bind_flags &= ~XDP_USE_NEED_WAKEUP;
>  			break;
>  		default:
>  			usage(basename(argv[0]));
> @@ -459,7 +467,8 @@ static void kick_tx(struct xsk_socket_info *xsk)
>  	exit_with_error(errno);
>  }
>
> -static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk)
> +static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk,
> +				     struct pollfd *fds)
>  {
>  	u32 idx_cq = 0, idx_fq = 0;
>  	unsigned int rcvd;
> @@ -468,7 +477,9 @@ static inline void complete_tx_l2fwd(struct 
> xsk_socket_info *xsk)
>  	if (!xsk->outstanding_tx)
>  		return;
>
> -	kick_tx(xsk);
> +	if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx))
> +		kick_tx(xsk);
> +
>  	ndescs = (xsk->outstanding_tx > BATCH_SIZE) ? BATCH_SIZE :
>  		xsk->outstanding_tx;
>
> @@ -482,6 +493,8 @@ static inline void complete_tx_l2fwd(struct 
> xsk_socket_info *xsk)
>  		while (ret != rcvd) {
>  			if (ret < 0)
>  				exit_with_error(-ret);
> +			if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
> +				ret = poll(fds, num_socks, opt_timeout);
>  			ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd,
>  						     &idx_fq);
>  		}
> @@ -505,7 +518,8 @@ static inline void complete_tx_only(struct 
> xsk_socket_info *xsk)
>  	if (!xsk->outstanding_tx)
>  		return;
>
> -	kick_tx(xsk);
> +	if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx))
> +		kick_tx(xsk);
>
>  	rcvd = xsk_ring_cons__peek(&xsk->umem->cq, BATCH_SIZE, &idx);
>  	if (rcvd > 0) {
> @@ -515,20 +529,25 @@ static inline void complete_tx_only(struct 
> xsk_socket_info *xsk)
>  	}
>  }
>
> -static void rx_drop(struct xsk_socket_info *xsk)
> +static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds)
>  {
>  	unsigned int rcvd, i;
>  	u32 idx_rx = 0, idx_fq = 0;
>  	int ret;
>
>  	rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx);
> -	if (!rcvd)
> +	if (!rcvd) {
> +		if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
> +			ret = poll(fds, num_socks, opt_timeout);
>  		return;
> +	}
>
>  	ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
>  	while (ret != rcvd) {
>  		if (ret < 0)
>  			exit_with_error(-ret);
> +		if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
> +			ret = poll(fds, num_socks, opt_timeout);
>  		ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
>  	}

I'll just point out that it seems a little odd that if one ring
needs a wakeup, all the rings get polled again.  However, I suppose
that it does amortize costs of a kernel call.
-- 
Jonathan


> @@ -549,42 +568,65 @@ static void rx_drop(struct xsk_socket_info *xsk)
>  static void rx_drop_all(void)
>  {
>  	struct pollfd fds[MAX_SOCKS + 1];
> -	int i, ret, timeout, nfds = 1;
> +	int i, ret;
>
>  	memset(fds, 0, sizeof(fds));
>
>  	for (i = 0; i < num_socks; i++) {
>  		fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
>  		fds[i].events = POLLIN;
> -		timeout = 1000; /* 1sn */
>  	}
>
>  	for (;;) {
>  		if (opt_poll) {
> -			ret = poll(fds, nfds, timeout);
> +			ret = poll(fds, num_socks, opt_timeout);
>  			if (ret <= 0)
>  				continue;
>  		}
>
>  		for (i = 0; i < num_socks; i++)
> -			rx_drop(xsks[i]);
> +			rx_drop(xsks[i], fds);
> +	}
> +}
> +
> +static void tx_only(struct xsk_socket_info *xsk, u32 frame_nb)
> +{
> +	u32 idx;
> +
> +	if (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) == 
> BATCH_SIZE) {
> +		unsigned int i;
> +
> +		for (i = 0; i < BATCH_SIZE; i++) {
> +			xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr	=
> +				(frame_nb + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT;
> +			xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len =
> +				sizeof(pkt_data) - 1;
> +		}
> +
> +		xsk_ring_prod__submit(&xsk->tx, BATCH_SIZE);
> +		xsk->outstanding_tx += BATCH_SIZE;
> +		frame_nb += BATCH_SIZE;
> +		frame_nb %= NUM_FRAMES;
>  	}
> +
> +	complete_tx_only(xsk);
>  }
>
> -static void tx_only(struct xsk_socket_info *xsk)
> +static void tx_only_all(void)
>  {
> -	int timeout, ret, nfds = 1;
> -	struct pollfd fds[nfds + 1];
> -	u32 idx, frame_nb = 0;
> +	struct pollfd fds[MAX_SOCKS];
> +	u32 frame_nb[MAX_SOCKS] = {};
> +	int i, ret;
>
>  	memset(fds, 0, sizeof(fds));
> -	fds[0].fd = xsk_socket__fd(xsk->xsk);
> -	fds[0].events = POLLOUT;
> -	timeout = 1000; /* 1sn */
> +	for (i = 0; i < num_socks; i++) {
> +		fds[0].fd = xsk_socket__fd(xsks[i]->xsk);
> +		fds[0].events = POLLOUT;
> +	}
>
>  	for (;;) {
>  		if (opt_poll) {
> -			ret = poll(fds, nfds, timeout);
> +			ret = poll(fds, num_socks, opt_timeout);
>  			if (ret <= 0)
>  				continue;
>
> @@ -592,69 +634,75 @@ static void tx_only(struct xsk_socket_info *xsk)
>  				continue;
>  		}
>
> -		if (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) ==
> -		    BATCH_SIZE) {
> -			unsigned int i;
> -
> -			for (i = 0; i < BATCH_SIZE; i++) {
> -				xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr
> -					= (frame_nb + i) * opt_xsk_frame_size;
> -				xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len =
> -					sizeof(pkt_data) - 1;
> -			}
> -
> -			xsk_ring_prod__submit(&xsk->tx, BATCH_SIZE);
> -			xsk->outstanding_tx += BATCH_SIZE;
> -			frame_nb += BATCH_SIZE;
> -			frame_nb %= NUM_FRAMES;
> -		}
> -
> -		complete_tx_only(xsk);
> +		for (i = 0; i < num_socks; i++)
> +			tx_only(xsks[i], frame_nb[i]);
>  	}
>  }
>
> -static void l2fwd(struct xsk_socket_info *xsk)
> +static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds)
>  {
> -	for (;;) {
> -		unsigned int rcvd, i;
> -		u32 idx_rx = 0, idx_tx = 0;
> -		int ret;
> +	unsigned int rcvd, i;
> +	u32 idx_rx = 0, idx_tx = 0;
> +	int ret;
>
> -		for (;;) {
> -			complete_tx_l2fwd(xsk);
> +	complete_tx_l2fwd(xsk, fds);
>
> -			rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE,
> -						   &idx_rx);
> -			if (rcvd > 0)
> -				break;
> -		}
> +	rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx);
> +	if (!rcvd) {
> +		if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
> +			ret = poll(fds, num_socks, opt_timeout);
> +		return;
> +	}
>
> +	ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
> +	while (ret != rcvd) {
> +		if (ret < 0)
> +			exit_with_error(-ret);
> +		if (xsk_ring_prod__needs_wakeup(&xsk->tx))
> +			kick_tx(xsk);
>  		ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
> -		while (ret != rcvd) {
> -			if (ret < 0)
> -				exit_with_error(-ret);
> -			ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
> -		}
> +	}
> +
> +	for (i = 0; i < rcvd; i++) {
> +		u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr;
> +		u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len;
> +		char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
> +
> +		swap_mac_addresses(pkt);
>
> -		for (i = 0; i < rcvd; i++) {
> -			u64 addr = xsk_ring_cons__rx_desc(&xsk->rx,
> -							  idx_rx)->addr;
> -			u32 len = xsk_ring_cons__rx_desc(&xsk->rx,
> -							 idx_rx++)->len;
> -			char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
> +		hex_dump(pkt, len, addr);
> +		xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = addr;
> +		xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len;
> +	}
>
> -			swap_mac_addresses(pkt);
> +	xsk_ring_prod__submit(&xsk->tx, rcvd);
> +	xsk_ring_cons__release(&xsk->rx, rcvd);
>
> -			hex_dump(pkt, len, addr);
> -			xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = addr;
> -			xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len;
> -		}
> +	xsk->rx_npkts += rcvd;
> +	xsk->outstanding_tx += rcvd;
> +}
> +
> +static void l2fwd_all(void)
> +{
> +	struct pollfd fds[MAX_SOCKS];
> +	int i, ret;
> +
> +	memset(fds, 0, sizeof(fds));
> +
> +	for (i = 0; i < num_socks; i++) {
> +		fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
> +		fds[i].events = POLLOUT | POLLIN;
> +	}
>
> -		xsk_ring_prod__submit(&xsk->tx, rcvd);
> -		xsk_ring_cons__release(&xsk->rx, rcvd);
> +	for (;;) {
> +		if (opt_poll) {
> +			ret = poll(fds, num_socks, opt_timeout);
> +			if (ret <= 0)
> +				continue;
> +		}
>
> -		xsk->rx_npkts += rcvd;
> -		xsk->outstanding_tx += rcvd;
> +		for (i = 0; i < num_socks; i++)
> +			l2fwd(xsks[i], fds);
>  	}
>  }
>
> @@ -705,9 +753,9 @@ int main(int argc, char **argv)
>  	if (opt_bench == BENCH_RXDROP)
>  		rx_drop_all();
>  	else if (opt_bench == BENCH_TXONLY)
> -		tx_only(xsks[0]);
> +		tx_only_all();
>  	else
> -		l2fwd(xsks[0]);
> +		l2fwd_all();
>
>  	return 0;
>  }
> -- 
> 2.7.4

^ permalink raw reply

* Re: [PATCH bpf-next v4 7/8] net/mlx5e: Move the SW XSK code from NAPI poll to a separate function
From: Jonathan Lemon @ 2019-08-14 14:53 UTC (permalink / raw)
  To: Magnus Karlsson
  Cc: bjorn.topel, ast, daniel, netdev, brouer, maximmi, bpf,
	bruce.richardson, ciara.loftus, jakub.kicinski, xiaolong.ye,
	qi.z.zhang, sridhar.samudrala, kevin.laatz, ilias.apalodimas,
	kiran.patil, axboe, maciej.fijalkowski, maciejromanfijalkowski,
	intel-wired-lan
In-Reply-To: <1565767643-4908-8-git-send-email-magnus.karlsson@intel.com>



On 14 Aug 2019, at 0:27, Magnus Karlsson wrote:

> From: Maxim Mikityanskiy <maximmi@mellanox.com>
>
> Two XSK tasks are performed during NAPI polling, that are not bound to
> hardware interrupts: TXing packets and polling for frames in the Fill
> Ring. They are special in a way that the hardware doesn't know about
> these tasks, so it doesn't trigger interrupts if there is still some
> work to be done, it's our driver's responsibility to ensure NAPI will be
> rescheduled if needed.
>
> Create a new function to handle these tasks and move the corresponding
> code from mlx5e_napi_poll to the new function to improve modularity and
> prepare for the changes in the following patch.
>
> Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
> Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
> Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>

Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>

^ permalink raw reply

* [net-next  1/1] tipc: clean up skb list lock handling on send path
From: Jon Maloy @ 2019-08-14 14:55 UTC (permalink / raw)
  To: davem, netdev
  Cc: tung.q.nguyen, hoang.h.le, jon.maloy, lxin, shuali, ying.xue,
	edumazet, tipc-discussion

The policy for handling the skb list locks on the send and receive paths
is simple.

- On the send path we never need to grab the lock on the 'xmitq' list
  when the destination is an exernal node.

- On the receive path we always need to grab the lock on the 'inputq'
  list, irrespective of source node.

However, when transmitting node local messages those will eventually
end up on the receive path of a local socket, meaning that the argument
'xmitq' in tipc_node_xmit() will become the 'ínputq' argument in  the
function tipc_sk_rcv(). This has been handled by always initializing
the spinlock of the 'xmitq' list at message creation, just in case it
may end up on the receive path later, and despite knowing that the lock
in most cases never will be used.

This approach is inaccurate and confusing, and has also concealed the
fact that the stated 'no lock grabbing' policy for the send path is
violated in some cases.

We now clean up this by never initializing the lock at message creation,
instead doing this at the moment we find that the message actually will
enter the receive path. At the same time we fix the four locations
where we incorrectly access the spinlock on the send/error path.

This patch also reverts commit d12cffe9329f ("tipc: ensure head->lock
is initialised") which has now become redundant.

CC: Eric Dumazet <edumazet@google.com>
Reported-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
---
 net/tipc/bcast.c      | 10 +++++-----
 net/tipc/link.c       |  4 ++--
 net/tipc/name_distr.c |  2 +-
 net/tipc/node.c       |  7 ++++---
 net/tipc/socket.c     | 14 +++++++-------
 5 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 34f3e56..6ef1abd 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -185,7 +185,7 @@ static void tipc_bcbase_xmit(struct net *net, struct sk_buff_head *xmitq)
 	}
 
 	/* We have to transmit across all bearers */
-	skb_queue_head_init(&_xmitq);
+	__skb_queue_head_init(&_xmitq);
 	for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) {
 		if (!bb->dests[bearer_id])
 			continue;
@@ -256,7 +256,7 @@ static int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts,
 	struct sk_buff_head xmitq;
 	int rc = 0;
 
-	skb_queue_head_init(&xmitq);
+	__skb_queue_head_init(&xmitq);
 	tipc_bcast_lock(net);
 	if (tipc_link_bc_peers(l))
 		rc = tipc_link_xmit(l, pkts, &xmitq);
@@ -286,7 +286,7 @@ static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts,
 	u32 dnode, selector;
 
 	selector = msg_link_selector(buf_msg(skb_peek(pkts)));
-	skb_queue_head_init(&_pkts);
+	__skb_queue_head_init(&_pkts);
 
 	list_for_each_entry_safe(dst, tmp, &dests->list, list) {
 		dnode = dst->node;
@@ -344,7 +344,7 @@ static int tipc_mcast_send_sync(struct net *net, struct sk_buff *skb,
 	msg_set_size(_hdr, MCAST_H_SIZE);
 	msg_set_is_rcast(_hdr, !msg_is_rcast(hdr));
 
-	skb_queue_head_init(&tmpq);
+	__skb_queue_head_init(&tmpq);
 	__skb_queue_tail(&tmpq, _skb);
 	if (method->rcast)
 		tipc_bcast_xmit(net, &tmpq, cong_link_cnt);
@@ -378,7 +378,7 @@ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts,
 	int rc = 0;
 
 	skb_queue_head_init(&inputq);
-	skb_queue_head_init(&localq);
+	__skb_queue_head_init(&localq);
 
 	/* Clone packets before they are consumed by next call */
 	if (dests->local && !tipc_msg_reassemble(pkts, &localq)) {
diff --git a/net/tipc/link.c b/net/tipc/link.c
index dd3155b..ba057a9 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -959,7 +959,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
 		pr_warn("Too large msg, purging xmit list %d %d %d %d %d!\n",
 			skb_queue_len(list), msg_user(hdr),
 			msg_type(hdr), msg_size(hdr), mtu);
-		skb_queue_purge(list);
+		__skb_queue_purge(list);
 		return -EMSGSIZE;
 	}
 
@@ -988,7 +988,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
 		if (likely(skb_queue_len(transmq) < maxwin)) {
 			_skb = skb_clone(skb, GFP_ATOMIC);
 			if (!_skb) {
-				skb_queue_purge(list);
+				__skb_queue_purge(list);
 				return -ENOBUFS;
 			}
 			__skb_dequeue(list);
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 44abc8e..61219f0 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -190,7 +190,7 @@ void tipc_named_node_up(struct net *net, u32 dnode)
 	struct name_table *nt = tipc_name_table(net);
 	struct sk_buff_head head;
 
-	skb_queue_head_init(&head);
+	__skb_queue_head_init(&head);
 
 	read_lock_bh(&nt->cluster_scope_lock);
 	named_distribute(net, &head, dnode, &nt->cluster_scope);
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 1bdcf0f..c8f6177 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1444,13 +1444,14 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
 
 	if (in_own_node(net, dnode)) {
 		tipc_loopback_trace(net, list);
+		spin_lock_init(&list->lock);
 		tipc_sk_rcv(net, list);
 		return 0;
 	}
 
 	n = tipc_node_find(net, dnode);
 	if (unlikely(!n)) {
-		skb_queue_purge(list);
+		__skb_queue_purge(list);
 		return -EHOSTUNREACH;
 	}
 
@@ -1459,7 +1460,7 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
 	if (unlikely(bearer_id == INVALID_BEARER_ID)) {
 		tipc_node_read_unlock(n);
 		tipc_node_put(n);
-		skb_queue_purge(list);
+		__skb_queue_purge(list);
 		return -EHOSTUNREACH;
 	}
 
@@ -1491,7 +1492,7 @@ int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode,
 {
 	struct sk_buff_head head;
 
-	skb_queue_head_init(&head);
+	__skb_queue_head_init(&head);
 	__skb_queue_tail(&head, skb);
 	tipc_node_xmit(net, &head, dnode, selector);
 	return 0;
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 83ae41d..3b9f8cc 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -809,7 +809,7 @@ static int tipc_sendmcast(struct  socket *sock, struct tipc_name_seq *seq,
 	msg_set_nameupper(hdr, seq->upper);
 
 	/* Build message as chain of buffers */
-	skb_queue_head_init(&pkts);
+	__skb_queue_head_init(&pkts);
 	rc = tipc_msg_build(hdr, msg, 0, dlen, mtu, &pkts);
 
 	/* Send message if build was successful */
@@ -853,7 +853,7 @@ static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk,
 	msg_set_grp_bc_seqno(hdr, bc_snd_nxt);
 
 	/* Build message as chain of buffers */
-	skb_queue_head_init(&pkts);
+	__skb_queue_head_init(&pkts);
 	mtu = tipc_node_get_mtu(net, dnode, tsk->portid);
 	rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts);
 	if (unlikely(rc != dlen))
@@ -1058,7 +1058,7 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m,
 	msg_set_grp_bc_ack_req(hdr, ack);
 
 	/* Build message as chain of buffers */
-	skb_queue_head_init(&pkts);
+	__skb_queue_head_init(&pkts);
 	rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts);
 	if (unlikely(rc != dlen))
 		return rc;
@@ -1387,7 +1387,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
 	if (unlikely(rc))
 		return rc;
 
-	skb_queue_head_init(&pkts);
+	__skb_queue_head_init(&pkts);
 	mtu = tipc_node_get_mtu(net, dnode, tsk->portid);
 	rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts);
 	if (unlikely(rc != dlen))
@@ -1445,7 +1445,7 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen)
 	int send, sent = 0;
 	int rc = 0;
 
-	skb_queue_head_init(&pkts);
+	__skb_queue_head_init(&pkts);
 
 	if (unlikely(dlen > INT_MAX))
 		return -EMSGSIZE;
@@ -1805,7 +1805,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m,
 
 	/* Send group flow control advertisement when applicable */
 	if (tsk->group && msg_in_group(hdr) && !grp_evt) {
-		skb_queue_head_init(&xmitq);
+		__skb_queue_head_init(&xmitq);
 		tipc_group_update_rcv_win(tsk->group, tsk_blocks(hlen + dlen),
 					  msg_orignode(hdr), msg_origport(hdr),
 					  &xmitq);
@@ -2674,7 +2674,7 @@ static void tipc_sk_timeout(struct timer_list *t)
 	struct sk_buff_head list;
 	int rc = 0;
 
-	skb_queue_head_init(&list);
+	__skb_queue_head_init(&list);
 	bh_lock_sock(sk);
 
 	/* Try again later if socket is busy */
-- 
2.1.4


^ permalink raw reply related

* Re: [PATCH v2 0/2] Simplify mtty driver and mdev core
From: Alex Williamson @ 2019-08-14 14:57 UTC (permalink / raw)
  To: Parav Pandit
  Cc: Cornelia Huck, Kirti Wankhede, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, cjia@nvidia.com, Jiri Pirko,
	netdev@vger.kernel.org
In-Reply-To: <AM0PR05MB48666CCDFE985A25F42A0259D1AD0@AM0PR05MB4866.eurprd05.prod.outlook.com>

On Wed, 14 Aug 2019 13:45:49 +0000
Parav Pandit <parav@mellanox.com> wrote:

> > -----Original Message-----
> > From: Cornelia Huck <cohuck@redhat.com>
> > Sent: Wednesday, August 14, 2019 6:39 PM
> > To: Parav Pandit <parav@mellanox.com>
> > Cc: Alex Williamson <alex.williamson@redhat.com>; Kirti Wankhede
> > <kwankhede@nvidia.com>; kvm@vger.kernel.org; linux-
> > kernel@vger.kernel.org; cjia@nvidia.com; Jiri Pirko <jiri@mellanox.com>;
> > netdev@vger.kernel.org
> > Subject: Re: [PATCH v2 0/2] Simplify mtty driver and mdev core
> > 
> > On Wed, 14 Aug 2019 12:27:01 +0000
> > Parav Pandit <parav@mellanox.com> wrote:
> >   
> > > + Jiri, + netdev
> > > To get perspective on the ndo->phys_port_name for the representor netdev  
> > of mdev.  
> > >
> > > Hi Cornelia,
> > >  
> > > > -----Original Message-----
> > > > From: Cornelia Huck <cohuck@redhat.com>
> > > > Sent: Wednesday, August 14, 2019 1:32 PM
> > > > To: Parav Pandit <parav@mellanox.com>
> > > > Cc: Alex Williamson <alex.williamson@redhat.com>; Kirti Wankhede
> > > > <kwankhede@nvidia.com>; kvm@vger.kernel.org; linux-
> > > > kernel@vger.kernel.org; cjia@nvidia.com
> > > > Subject: Re: [PATCH v2 0/2] Simplify mtty driver and mdev core
> > > >
> > > > On Wed, 14 Aug 2019 05:54:36 +0000
> > > > Parav Pandit <parav@mellanox.com> wrote:
> > > >  
> > > > > > > I get that part. I prefer to remove the UUID itself from the
> > > > > > > structure and therefore removing this API makes lot more sense?  
> > > > > >
> > > > > > Mdev and support tools around mdev are based on UUIDs because
> > > > > > it's  
> > > > defined  
> > > > > > in the documentation.  
> > > > > When we introduce newer device naming scheme, it will update the  
> > > > documentation also.  
> > > > > May be that is the time to move to .rst format too.  
> > > >
> > > > You are aware that there are existing tools that expect a uuid
> > > > naming scheme, right?
> > > >  
> > > Yes, Alex mentioned too.
> > > The good tool that I am aware of is [1], which is 4 months old. Not sure if it is  
> > part of any distros yet.  
> > >
> > > README also says, that it is in 'early in development. So we have scope to  
> > improve it for non UUID names, but lets discuss that more below.
> > 
> > The up-to-date reference for mdevctl is
> > https://github.com/mdevctl/mdevctl. There is currently an effort to get this
> > packaged in Fedora.
> >   
> Awesome.
> 
> > >  
> > > > >  
> > > > > > I don't think it's as simple as saying "voila, UUID dependencies
> > > > > > are removed, users are free to use arbitrary strings".  We'd
> > > > > > need to create some kind of naming policy, what characters are
> > > > > > allows so that we can potentially expand the creation parameters
> > > > > > as has been proposed a couple times, how do we deal with
> > > > > > collisions and races, and why should we make such a change when
> > > > > > a UUID is a perfectly reasonable devices name.  Thanks,
> > > > > >  
> > > > > Sure, we should define a policy on device naming to be more relaxed.
> > > > > We have enough examples in-kernel.
> > > > > Few that I am aware of are netdev (vxlan, macvlan, ipvlan, lot
> > > > > more), rdma  
> > > > etc which has arbitrary device names and ID based device names.  
> > > > >
> > > > > Collisions and race is already taken care today in the mdev core.
> > > > > Same  
> > > > unique device names continue.
> > > >
> > > > I'm still completely missing a rationale _why_ uuids are supposedly
> > > > bad/restricting/etc.  
> > > There is nothing bad about uuid based naming.
> > > Its just too long name to derive phys_port_name of a netdev.
> > > In details below.
> > >
> > > For a given mdev of networking type, we would like to have
> > > (a) representor netdevice [2]
> > > (b) associated devlink port [3]
> > >
> > > Currently these representor netdevice exist only for the PCIe SR-IOV VFs.
> > > It is further getting extended for mdev without SR-IOV.
> > >
> > > Each of the devlink port is attached to representor netdevice [4].
> > >
> > > This netdevice phys_port_name should be a unique derived from some  
> > property of mdev.  
> > > Udev/systemd uses phys_port_name to derive unique representor netdev  
> > name.  
> > > This netdev name is further use by orchestration and switching software in  
> > user space.  
> > > One such distro supported switching software is ovs [4], which relies on the  
> > persistent device name of the representor netdevice.
> > 
> > Ok, let me rephrase this to check that I understand this correctly. I'm not sure
> > about some of the terms you use here (even after looking at the linked
> > doc/code), but that's probably still ok.
> > 
> > We want to derive an unique (and probably persistent?) netdev name so that
> > userspace can refer to a representor netdevice. Makes sense.
> > For generating that name, udev uses the phys_port_name (which represents
> > the devlink port, IIUC). Also makes sense.
> >   
> You understood it correctly.
> 
> > >
> > > phys_port_name has limitation to be only 15 characters long.
> > > UUID doesn't fit in phys_port_name.  
> > 
> > Understood. But why do we need to derive the phys_port_name from the mdev
> > device name? This netdevice use case seems to be just one use case for using
> > mdev devices? If this is a specialized mdev type for this setup, why not just
> > expose a shorter identifier via an extra attribute?
> >   
> Representor netdev, represents mdev's switch port (like PCI SRIOV VF's switch port).
> So user must be able to relate this two objects in similar manner as SRIOV VFs.
> Phys_port_name is derived from the PCI PF and VF numbering scheme.
> Similarly mdev's such port should be derived from mdev's id/name/attribute.
> 
> > > Longer UUID names are creating snow ball effect, not just in networking stack  
> > but many user space tools too.
> > 
> > This snowball effect mainly comes from the device name -> phys_port_name
> > setup, IIUC.
> >   
> Right.
> 
> > > (as opposed to recently introduced mdevctl, are they more mdev tools
> > > which has dependency on UUID name?)  
> > 
> > I am aware that people have written scripts etc. to manage their mdevs.
> > Given that the mdev infrastructure has been around for quite some time, I'd
> > say the chance of some of those scripts relying on uuid names is non-zero.
> >   
> Ok. but those scripts have never managed networking devices.
> So those scripts won't break because they will always create mdev devices using UUID.
> When they use these new networking devices, they need more things than their scripts.
> So user space upgrade for such mixed mode case is reasonable.

Tools like mdevctl are agnostic of the type of mdev device they're
managing, it shouldn't matter than they've never managed a networking
mdev previously, it follows the standards of mdev management.

> > >
> > > Instead of mdev subsystem creating such effect, one option we are  
> > considering is to have shorter mdev names.  
> > > (Similar to netdev, rdma, nvme devices).
> > > Such as mdev1, mdev2000 etc.

Note that these are kernel generated names, as are the other examples.
In the case of mdev, the user is providing the UUID, which becomes the
device name.  When a user writes to the create attribute, there needs
to be determinism that the user can identify the device they created vs
another that may have been created concurrently.  I don't see that we
can put users in the path of managing device instance numbers.

> > > Second option I was considering is to have an optional alias for UUID based  
> > mdev.  
> > > This name alias is given at time of mdev creation.
> > > Devlink port's phys_port_name is derived out of this shorter mdev name  
> > alias. 
> > > This way, mdev remains to be UUID based with optional extension.
> > > However, I prefer first option to relax mdev naming scheme.  
> > 
> > Actually, I think that second option makes much more sense, as you avoid
> > potentially breaking existing tooling.  
> Let's first understand of what exactly will break with existing tool
> if they see non_uuid based device.

Do we really want a mixed namespace of device names, some UUID, some...
something else?  That seems like a mess.

> Existing tooling continue to work with UUID devices.
> Do you have example of what can break if they see non_uuid based
> device name? I think you are clear, but to be sure, UUID based
> creation will continue to be there. Optionally mdev will be created
> with alpha-numeric string, if we don't it as additional attribute.

I'm not onboard with a UUID being just one of the possible naming
strings via which we can create mdev devices.  I think that becomes
untenable for userspace.  I don't think a sufficient argument has been
made against the alias approach, which seems to keep the UUID as a
canonical name, providing a consistent namespace, augmented with user
or kernel provided short alias.  Thanks,

Alex

> > >  
> > > > We want to uniquely identify a device, across different types of
> > > > vendor drivers. An uuid is a unique identifier and even a
> > > > well-defined one. Tools (e.g. mdevctl) are relying on it for
> > > > mdev devices  
> > today.  
> > > >
> > > > What is the problem you're trying to solve?  
> > > Unique device naming is still achieved without UUID scheme by
> > > various  
> > subsystems in kernel using alpha-numeric string.  
> > > Having such string based continue to provide unique names.
> > >
> > > I hope I described the problem and two solutions above.
> > >
> > > [1] https://github.com/awilliam/mdevctl
> > > [2]
> > > https://elixir.bootlin.com/linux/v5.3-rc4/source/drivers/net/ethernet/
> > > mellanox/mlx5/core/en_rep.c [3]
> > > http://man7.org/linux/man-pages/man8/devlink-port.8.html
> > > [4]
> > > https://elixir.bootlin.com/linux/v5.3-rc4/source/net/core/devlink.c#L6
> > > 921
> > > [5] https://www.openvswitch.org/
> > >  
> 


^ permalink raw reply

* Re: [PATCH bpf-next v4 3/8] i40e: add support for AF_XDP need_wakeup feature
From: Magnus Karlsson @ 2019-08-14 14:59 UTC (permalink / raw)
  To: Jonathan Lemon
  Cc: Magnus Karlsson, Björn Töpel, Alexei Starovoitov,
	Daniel Borkmann, Network Development, Jesper Dangaard Brouer,
	Maxim Mikityanskiy, bpf, bruce.richardson, ciara.loftus,
	Jakub Kicinski, Ye Xiaolong, Zhang, Qi Z, Samudrala, Sridhar,
	Kevin Laatz, ilias.apalodimas, Kiran, axboe, Fijalkowski, Maciej,
	Maciej Fijalkowski, intel-wired-lan
In-Reply-To: <3B2C7C21-4AAC-4126-A31D-58A61D941709@gmail.com>

On Wed, Aug 14, 2019 at 4:48 PM Jonathan Lemon <jonathan.lemon@gmail.com> wrote:
>
>
>
> On 14 Aug 2019, at 0:27, Magnus Karlsson wrote:
>
> > This patch adds support for the need_wakeup feature of AF_XDP. If the
> > application has told the kernel that it might sleep using the new bind
> > flag XDP_USE_NEED_WAKEUP, the driver will then set this flag if it has
> > no more buffers on the NIC Rx ring and yield to the application. For
> > Tx, it will set the flag if it has no outstanding Tx completion
> > interrupts and return to the application.
> >
> > Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
> > ---
> >  drivers/net/ethernet/intel/i40e/i40e_xsk.c | 18 ++++++++++++++++++
> >  1 file changed, 18 insertions(+)
> >
> > diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> > b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> > index d0ff5d8..42c9012 100644
> > --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> > +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> > @@ -626,6 +626,15 @@ int i40e_clean_rx_irq_zc(struct i40e_ring
> > *rx_ring, int budget)
> >
> >       i40e_finalize_xdp_rx(rx_ring, xdp_xmit);
> >       i40e_update_rx_stats(rx_ring, total_rx_bytes, total_rx_packets);
> > +
> > +     if (xsk_umem_uses_need_wakeup(rx_ring->xsk_umem)) {
> > +             if (failure || rx_ring->next_to_clean == rx_ring->next_to_use)
> > +                     xsk_set_rx_need_wakeup(rx_ring->xsk_umem);
> > +             else
> > +                     xsk_clear_rx_need_wakeup(rx_ring->xsk_umem);
> > +
> > +             return (int)total_rx_packets;
> > +     }
> >       return failure ? budget : (int)total_rx_packets;
>
> Can you elaborate why we're not returning the total budget on failure
> for the wakeup case?

In the non need_wakeup case (the old behavior), when allocation fails
from the fill queue we want to retry right away basically busy
spinning on the fill queue until we find at least one entry and then
go on processing packets. Works well when the app and the driver are
on different cores, but a lousy strategy when they execute on the same
core. That is why in the need_wakeup feature case, we do not return
the total budget if there is a failure. We will just come back at a
later point in time from a syscall since the need_wakeup flag will
have been set and check the fill queue again. We do not want a
busy-spinning behavior in this case.

Thanks: Magnus

^ permalink raw reply

* can: tcan4x5x: spi bits_per_word issue on Raspberry PI
From: FIXED-TERM Buecheler Konstantin (ETAS-SEC/ECT-Mu) @ 2019-08-14 15:01 UTC (permalink / raw)
  To: linux-can@vger.kernel.org, netdev@vger.kernel.org; +Cc: Dan Murphy

> Hi all,
> 
> I am trying to use a tcan4550 together with a Raspberry PI 3 B. I am using the
> tcan4x5x driver from net-next.
> I always get the following error during startup.
> 	tcan4x5x spi0.0: Probe failed, err=-22
> 	tcan4x5x: probe of spi0.0 failed with error -22
> 
> I realized that this happens because the Raspberry PI does only support 8/9 bit
> words. https://elinux.org/index.php?title=RPi_SPI#Supported_bits_per_word
> In the driver it is set to 32.
> 	spi->bits_per_word = 32;
> 
> Setting this to 8 does not help of course since the tcan chip expects a multiple of
> 32 per spi transaction.
> I don't know if this is a Raspberry Pi specific problem or if there are more devices
> with this hardware limitation.
> 
> Does anyone have a workaround for that?

It seems to be enough to just change the bits_per_word value to 8

> 
> If this a common issue it might be a good idea to patch the driver. I will check if I
> can find proper a way to do so.
> 
> Regards,
> Konstantin

Now I have another really confusing problem. Anything I write to SPI is written little endian. The tcan chip expects big endian. 
Anything I read from SPI is treated as little endian but is big endian. Does anyone know why this happens? 
Is there a flag or something I can set for the SPI device/wire to fix this?

Regards,
Konstantin

^ permalink raw reply

* Re: [PATCH v2 bpf-next] mm: mmap: increase sockets maximum memory size pgoff for 32bits
From: Ivan Khoronzhuk @ 2019-08-14 15:09 UTC (permalink / raw)
  To: Andrew Morton
  Cc: bjorn.topel, linux-mm, xdp-newbies, netdev, bpf, linux-kernel,
	ast, magnus.karlsson
In-Reply-To: <20190812141924.32136e040904d0c5a819dcb1@linux-foundation.org>

On Mon, Aug 12, 2019 at 02:19:24PM -0700, Andrew Morton wrote:

Hi, Andrew

>On Mon, 12 Aug 2019 15:43:26 +0300 Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org> wrote:
>
>> The AF_XDP sockets umem mapping interface uses XDP_UMEM_PGOFF_FILL_RING
>> and XDP_UMEM_PGOFF_COMPLETION_RING offsets. The offsets seems like are
>> established already and are part of configuration interface.
>>
>> But for 32-bit systems, while AF_XDP socket configuration, the values
>> are to large to pass maximum allowed file size verification.
>> The offsets can be tuned ofc, but instead of changing existent
>> interface - extend max allowed file size for sockets.
>
>
>What are the implications of this?  That all code in the kernel which
>handles mapped sockets needs to be audited (and tested) for correctly
>handling mappings larger than 4G on 32-bit machines?  Has that been

That's to allow only offset to be passed, mapping length is less than 4Gb.
I have verified all list of mmap for sockets and all of them contain dummy
cb sock_no_mmap() except the following:

xsk_mmap()
tcp_mmap()
packet_mmap()

xsk_mmap() - it's what this fix is needed for.
tcp_mmap() - doesn't have obvious issues with pgoff - no any references on it.
packet_mmap() - return -EINVAL if it's even set.


>done?  Are we confident that we aren't introducing user-visible buggy
>behaviour into unsuspecting legacy code?
>
>Also...  what are the user-visible runtime effects of this change?
>Please send along a paragraph which explains this, for the changelog.
>Does this patch fix some user-visible problem?  If so, should be code
>be backported into -stable kernels?
It should go to linux-next, no one has been using it till this patch
with 32 bits as w/o this fix af_xdp sockets can't be used at all.
It unblocks af_xdp socket usage for 32bit systems.


That's example of potential next commit message:
Subject: mm: mmap: increase sockets maximum memory size pgoff for 32bits

The AF_XDP sockets umem mapping interface uses XDP_UMEM_PGOFF_FILL_RING
and XDP_UMEM_PGOFF_COMPLETION_RING offsets.  These offsets are established
already and are part of the configuration interface.

But for 32-bit systems, using AF_XDP socket configuration, these values
are too large to pass the maximum allowed file size verification.  The
offsets can be tuned off, but instead of changing the existing interface,
let's extend the max allowed file size for sockets.

No one has been using it till this patch with 32 bits as w/o this fix
af_xdp sockets can't be used at all, so it unblocks af_xdp socket usage
for 32bit systems.

All list of mmap cbs for sockets were verified on side effects and
all of them contain dummy cb - sock_no_mmap() at this moment, except the
following:

xsk_mmap() - it's what this fix is needed for.
tcp_mmap() - doesn't have obvious issues with pgoff - no any references on it.
packet_mmap() - return -EINVAL if it's even set.




Is it ok to be replicated in PATCH v2 or this explanation is enough here
to use v1?

-- 
Regards,
Ivan Khoronzhuk

^ permalink raw reply

* [PATCH v2 2/2] treewide: remove dummy Makefiles for single targets
From: Masahiro Yamada @ 2019-08-14 15:19 UTC (permalink / raw)
  To: linux-kbuild
  Cc: Masahiro Yamada, Alexei Starovoitov, Boris Pismenny,
	Daniel Borkmann, David S. Miller, Igor Russkikh, Jakub Kicinski,
	Leon Romanovsky, Martin KaFai Lau, Saeed Mahameed, Song Liu,
	Yonghong Song, bpf, linux-kernel, linux-rdma, netdev, oss-drivers
In-Reply-To: <20190814151919.16300-1-yamada.masahiro@socionext.com>

Now that the single target build descends into sub-directories in the
same way as the normal build, these dummy Makefiles are not needed
any more.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---

Changes in v2: None

 drivers/net/ethernet/aquantia/atlantic/hw_atl/Makefile      | 2 --
 drivers/net/ethernet/mellanox/mlx5/core/accel/Makefile      | 2 --
 drivers/net/ethernet/mellanox/mlx5/core/diag/Makefile       | 2 --
 drivers/net/ethernet/mellanox/mlx5/core/en/Makefile         | 2 --
 drivers/net/ethernet/mellanox/mlx5/core/en/xsk/Makefile     | 1 -
 drivers/net/ethernet/mellanox/mlx5/core/en_accel/Makefile   | 2 --
 drivers/net/ethernet/mellanox/mlx5/core/fpga/Makefile       | 2 --
 drivers/net/ethernet/mellanox/mlx5/core/ipoib/Makefile      | 2 --
 drivers/net/ethernet/mellanox/mlx5/core/lib/Makefile        | 2 --
 drivers/net/ethernet/netronome/nfp/bpf/Makefile             | 2 --
 drivers/net/ethernet/netronome/nfp/flower/Makefile          | 2 --
 drivers/net/ethernet/netronome/nfp/nfpcore/Makefile         | 2 --
 drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000/Makefile | 2 --
 drivers/net/ethernet/netronome/nfp/nic/Makefile             | 2 --
 14 files changed, 27 deletions(-)
 delete mode 100644 drivers/net/ethernet/aquantia/atlantic/hw_atl/Makefile
 delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/accel/Makefile
 delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/Makefile
 delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/Makefile
 delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/xsk/Makefile
 delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/Makefile
 delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fpga/Makefile
 delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/ipoib/Makefile
 delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/Makefile
 delete mode 100644 drivers/net/ethernet/netronome/nfp/bpf/Makefile
 delete mode 100644 drivers/net/ethernet/netronome/nfp/flower/Makefile
 delete mode 100644 drivers/net/ethernet/netronome/nfp/nfpcore/Makefile
 delete mode 100644 drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000/Makefile
 delete mode 100644 drivers/net/ethernet/netronome/nfp/nic/Makefile

diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/Makefile b/drivers/net/ethernet/aquantia/atlantic/hw_atl/Makefile
deleted file mode 100644
index 805fa28f391a..000000000000
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-# kbuild requires Makefile in a directory to build individual objects
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/accel/Makefile
deleted file mode 100644
index c78512eed8d7..000000000000
--- a/drivers/net/ethernet/mellanox/mlx5/core/accel/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-subdir-ccflags-y += -I$(src)/..
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/diag/Makefile
deleted file mode 100644
index c78512eed8d7..000000000000
--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-subdir-ccflags-y += -I$(src)/..
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/en/Makefile
deleted file mode 100644
index c78512eed8d7..000000000000
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-subdir-ccflags-y += -I$(src)/..
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/Makefile
deleted file mode 100644
index 5ee42991900a..000000000000
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-subdir-ccflags-y += -I$(src)/../..
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/Makefile
deleted file mode 100644
index c78512eed8d7..000000000000
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-subdir-ccflags-y += -I$(src)/..
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/fpga/Makefile
deleted file mode 100644
index c78512eed8d7..000000000000
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-subdir-ccflags-y += -I$(src)/..
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/Makefile
deleted file mode 100644
index c78512eed8d7..000000000000
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-subdir-ccflags-y += -I$(src)/..
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/lib/Makefile
deleted file mode 100644
index c78512eed8d7..000000000000
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-subdir-ccflags-y += -I$(src)/..
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/Makefile b/drivers/net/ethernet/netronome/nfp/bpf/Makefile
deleted file mode 100644
index 805fa28f391a..000000000000
--- a/drivers/net/ethernet/netronome/nfp/bpf/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-# kbuild requires Makefile in a directory to build individual objects
diff --git a/drivers/net/ethernet/netronome/nfp/flower/Makefile b/drivers/net/ethernet/netronome/nfp/flower/Makefile
deleted file mode 100644
index 805fa28f391a..000000000000
--- a/drivers/net/ethernet/netronome/nfp/flower/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-# kbuild requires Makefile in a directory to build individual objects
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/Makefile b/drivers/net/ethernet/netronome/nfp/nfpcore/Makefile
deleted file mode 100644
index 805fa28f391a..000000000000
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-# kbuild requires Makefile in a directory to build individual objects
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000/Makefile b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000/Makefile
deleted file mode 100644
index 805fa28f391a..000000000000
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-# kbuild requires Makefile in a directory to build individual objects
diff --git a/drivers/net/ethernet/netronome/nfp/nic/Makefile b/drivers/net/ethernet/netronome/nfp/nic/Makefile
deleted file mode 100644
index 805fa28f391a..000000000000
--- a/drivers/net/ethernet/netronome/nfp/nic/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-# kbuild requires Makefile in a directory to build individual objects
-- 
2.17.1


^ permalink raw reply related

* [patch net-next v2 0/2] selftests: netdevsim: add devlink paramstests
From: Jiri Pirko @ 2019-08-14 15:26 UTC (permalink / raw)
  To: netdev; +Cc: davem, jakub.kicinski, mlxsw

From: Jiri Pirko <jiri@mellanox.com>

The first patch is just a helper addition as a dependency of the actual
test in patch number two.

Jiri Pirko (2):
  selftests: net: push jq workaround into separate helper
  selftests: netdevsim: add devlink params tests

 .../drivers/net/netdevsim/devlink.sh          | 62 ++++++++++++++++++-
 tools/testing/selftests/net/forwarding/lib.sh | 16 +++++
 .../selftests/net/forwarding/tc_common.sh     | 17 ++---
 3 files changed, 81 insertions(+), 14 deletions(-)

-- 
2.21.0


^ permalink raw reply

* [patch net-next v2 1/2] selftests: net: push jq workaround into separate helper
From: Jiri Pirko @ 2019-08-14 15:26 UTC (permalink / raw)
  To: netdev; +Cc: davem, jakub.kicinski, mlxsw
In-Reply-To: <20190814152604.6385-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Push the jq return value workaround code into a separate helper so it
could be used by the rest of the code.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
v1->v2:
-new patch
---
 tools/testing/selftests/net/forwarding/lib.sh   | 16 ++++++++++++++++
 .../selftests/net/forwarding/tc_common.sh       | 17 ++++-------------
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 9385dc971269..9d78841efef6 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -250,6 +250,22 @@ setup_wait()
 	sleep $WAIT_TIME
 }
 
+cmd_jq()
+{
+	local cmd=$1
+	local jq_exp=$2
+	local ret
+	local output
+
+	output="$($cmd)"
+	# workaround the jq bug which causes jq to return 0 in case input is ""
+	ret=$?
+	if [[ $ret -ne 0 ]]; then
+		return $ret
+	fi
+	echo $output | jq -r -e "$jq_exp"
+}
+
 lldpad_app_wait_set()
 {
 	local dev=$1; shift
diff --git a/tools/testing/selftests/net/forwarding/tc_common.sh b/tools/testing/selftests/net/forwarding/tc_common.sh
index 9d3b64a2a264..315e934358d4 100644
--- a/tools/testing/selftests/net/forwarding/tc_common.sh
+++ b/tools/testing/selftests/net/forwarding/tc_common.sh
@@ -8,18 +8,9 @@ tc_check_packets()
 	local id=$1
 	local handle=$2
 	local count=$3
-	local ret
 
-	output="$(tc -j -s filter show $id)"
-	# workaround the jq bug which causes jq to return 0 in case input is ""
-	ret=$?
-	if [[ $ret -ne 0 ]]; then
-		return $ret
-	fi
-	echo $output | \
-		jq -e ".[] \
-		| select(.options.handle == $handle) \
-		| select(.options.actions[0].stats.packets == $count)" \
-		&> /dev/null
-	return $?
+	cmd_jq "tc -j -s filter show $id" \
+	       ".[] | select(.options.handle == $handle) | \
+	              select(.options.actions[0].stats.packets == $count)" \
+	       &> /dev/null
 }
-- 
2.21.0


^ permalink raw reply related

* [patch net-next v2 2/2] selftests: netdevsim: add devlink params tests
From: Jiri Pirko @ 2019-08-14 15:26 UTC (permalink / raw)
  To: netdev; +Cc: davem, jakub.kicinski, mlxsw
In-Reply-To: <20190814152604.6385-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Test recently added netdevsim devlink param implementation.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
v1->v2:
-using cmd_jq helper
---
 .../drivers/net/netdevsim/devlink.sh          | 62 ++++++++++++++++++-
 1 file changed, 61 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
index 9d8baf5d14b3..6828e9404460 100755
--- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
+++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
@@ -3,7 +3,7 @@
 
 lib_dir=$(dirname $0)/../../../net/forwarding
 
-ALL_TESTS="fw_flash_test"
+ALL_TESTS="fw_flash_test params_test"
 NUM_NETIFS=0
 source $lib_dir/lib.sh
 
@@ -30,6 +30,66 @@ fw_flash_test()
 	log_test "fw flash test"
 }
 
+param_get()
+{
+	local name=$1
+
+	cmd_jq "devlink dev param show $DL_HANDLE name $name -j" \
+	       '.[][][].values[] | select(.cmode == "driverinit").value'
+}
+
+param_set()
+{
+	local name=$1
+	local value=$2
+
+	devlink dev param set $DL_HANDLE name $name cmode driverinit value $value
+}
+
+check_value()
+{
+	local name=$1
+	local phase_name=$2
+	local expected_param_value=$3
+	local expected_debugfs_value=$4
+	local value
+
+	value=$(param_get $name)
+	check_err $? "Failed to get $name param value"
+	[ "$value" == "$expected_param_value" ]
+	check_err $? "Unexpected $phase_name $name param value"
+	value=$(<$DEBUGFS_DIR/$name)
+	check_err $? "Failed to get $name debugfs value"
+	[ "$value" == "$expected_debugfs_value" ]
+	check_err $? "Unexpected $phase_name $name debugfs value"
+}
+
+params_test()
+{
+	RET=0
+
+	local max_macs
+	local test1
+
+	check_value max_macs initial 32 32
+	check_value test1 initial true Y
+
+	param_set max_macs 16
+	check_err $? "Failed to set max_macs param value"
+	param_set test1 false
+	check_err $? "Failed to set test1 param value"
+
+	check_value max_macs post-set 16 32
+	check_value test1 post-set false Y
+
+	devlink dev reload $DL_HANDLE
+
+	check_value max_macs post-reload 16 16
+	check_value test1 post-reload false N
+
+	log_test "params test"
+}
+
 setup_prepare()
 {
 	modprobe netdevsim
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH RFC 1/4] net: phy: swphy: emulate register MII_ESTATUS
From: Andrew Lunn @ 2019-08-14 15:34 UTC (permalink / raw)
  To: Heiner Kallweit
  Cc: Florian Fainelli, Marek Behun, David Miller,
	netdev@vger.kernel.org
In-Reply-To: <5e3d85cd-43b6-c581-be99-b6b0cf025771@gmail.com>

On Tue, Aug 13, 2019 at 11:24:56PM +0200, Heiner Kallweit wrote:
> When the genphy driver binds to a swphy it will call
> genphy_read_abilites that will try to read MII_ESTATUS if BMSR_ESTATEN
> is set in MII_BMSR. So far this would read the default value 0xffff
> and 1000FD and 1000HD are reported as supported just by chance.
> Better add explicit support for emulating MII_ESTATUS.
> 
> Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* [patch net-next v3 0/2] netdevsim: implement support for devlink region and snapshots
From: Jiri Pirko @ 2019-08-14 15:37 UTC (permalink / raw)
  To: netdev; +Cc: davem, jakub.kicinski, mlxsw

From: Jiri Pirko <jiri@mellanox.com>

Implement devlink region support for netdevsim and test it.

---
Note the selftest patch depends on "[patch net-next] selftests:
netdevsim: add devlink params tests" patch sent earlier today.

Jiri Pirko (2):
  netdevsim: implement support for devlink region and snapshots
  selftests: netdevsim: add devlink regions tests

 drivers/net/netdevsim/dev.c                   | 63 ++++++++++++++++++-
 drivers/net/netdevsim/netdevsim.h             |  1 +
 .../drivers/net/netdevsim/devlink.sh          | 54 +++++++++++++++-
 3 files changed, 116 insertions(+), 2 deletions(-)

-- 
2.21.0


^ permalink raw reply

* [patch net-next v3 1/2] netdevsim: implement support for devlink region and snapshots
From: Jiri Pirko @ 2019-08-14 15:37 UTC (permalink / raw)
  To: netdev; +Cc: davem, jakub.kicinski, mlxsw
In-Reply-To: <20190814153735.6923-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Implement dummy region of size 32K and allow user to create snapshots
or random data using debugfs file trigger.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
v2->v3:
- return -ENOMEM in case dummy_data cannot be allocated
  and don't print out error message.
- return err in case snapshot creation fails and kfree dummy_data.
- use PTR_ERR_OR_ZERO in nsim_dev_dummy_region_init().
---
 drivers/net/netdevsim/dev.c       | 63 ++++++++++++++++++++++++++++++-
 drivers/net/netdevsim/netdevsim.h |  1 +
 2 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c
index 08ca59fc189b..125a0358bc04 100644
--- a/drivers/net/netdevsim/dev.c
+++ b/drivers/net/netdevsim/dev.c
@@ -27,6 +27,41 @@
 
 static struct dentry *nsim_dev_ddir;
 
+#define NSIM_DEV_DUMMY_REGION_SIZE (1024 * 32)
+
+static ssize_t nsim_dev_take_snapshot_write(struct file *file,
+					    const char __user *data,
+					    size_t count, loff_t *ppos)
+{
+	struct nsim_dev *nsim_dev = file->private_data;
+	void *dummy_data;
+	u32 id;
+	int err;
+
+	dummy_data = kmalloc(NSIM_DEV_DUMMY_REGION_SIZE, GFP_KERNEL);
+	if (!dummy_data)
+		return -ENOMEM;
+
+	get_random_bytes(dummy_data, NSIM_DEV_DUMMY_REGION_SIZE);
+
+	id = devlink_region_shapshot_id_get(priv_to_devlink(nsim_dev));
+	err = devlink_region_snapshot_create(nsim_dev->dummy_region,
+					     dummy_data, id, kfree);
+	if (err) {
+		pr_err("Failed to create region snapshot\n");
+		kfree(dummy_data);
+		return err;
+	}
+
+	return count;
+}
+
+static const struct file_operations nsim_dev_take_snapshot_fops = {
+	.open = simple_open,
+	.write = nsim_dev_take_snapshot_write,
+	.llseek = generic_file_llseek,
+};
+
 static int nsim_dev_debugfs_init(struct nsim_dev *nsim_dev)
 {
 	char dev_ddir_name[16];
@@ -44,6 +79,8 @@ static int nsim_dev_debugfs_init(struct nsim_dev *nsim_dev)
 			   &nsim_dev->max_macs);
 	debugfs_create_bool("test1", 0600, nsim_dev->ddir,
 			    &nsim_dev->test1);
+	debugfs_create_file("take_snapshot", 0200, nsim_dev->ddir, nsim_dev,
+			    &nsim_dev_take_snapshot_fops);
 	return 0;
 }
 
@@ -248,6 +285,23 @@ static void nsim_devlink_param_load_driverinit_values(struct devlink *devlink)
 		nsim_dev->test1 = saved_value.vbool;
 }
 
+#define NSIM_DEV_DUMMY_REGION_SNAPSHOT_MAX 16
+
+static int nsim_dev_dummy_region_init(struct nsim_dev *nsim_dev,
+				      struct devlink *devlink)
+{
+	nsim_dev->dummy_region =
+		devlink_region_create(devlink, "dummy",
+				      NSIM_DEV_DUMMY_REGION_SNAPSHOT_MAX,
+				      NSIM_DEV_DUMMY_REGION_SIZE);
+	return PTR_ERR_OR_ZERO(nsim_dev->dummy_region);
+}
+
+static void nsim_dev_dummy_region_exit(struct nsim_dev *nsim_dev)
+{
+	devlink_region_destroy(nsim_dev->dummy_region);
+}
+
 static int nsim_dev_reload(struct devlink *devlink,
 			   struct netlink_ext_ack *extack)
 {
@@ -363,10 +417,14 @@ nsim_dev_create(struct nsim_bus_dev *nsim_bus_dev, unsigned int port_count)
 		goto err_dl_unregister;
 	nsim_devlink_set_params_init_values(nsim_dev, devlink);
 
-	err = nsim_dev_debugfs_init(nsim_dev);
+	err = nsim_dev_dummy_region_init(nsim_dev, devlink);
 	if (err)
 		goto err_params_unregister;
 
+	err = nsim_dev_debugfs_init(nsim_dev);
+	if (err)
+		goto err_dummy_region_exit;
+
 	err = nsim_bpf_dev_init(nsim_dev);
 	if (err)
 		goto err_debugfs_exit;
@@ -376,6 +434,8 @@ nsim_dev_create(struct nsim_bus_dev *nsim_bus_dev, unsigned int port_count)
 
 err_debugfs_exit:
 	nsim_dev_debugfs_exit(nsim_dev);
+err_dummy_region_exit:
+	nsim_dev_dummy_region_exit(nsim_dev);
 err_params_unregister:
 	devlink_params_unregister(devlink, nsim_devlink_params,
 				  ARRAY_SIZE(nsim_devlink_params));
@@ -396,6 +456,7 @@ static void nsim_dev_destroy(struct nsim_dev *nsim_dev)
 
 	nsim_bpf_dev_exit(nsim_dev);
 	nsim_dev_debugfs_exit(nsim_dev);
+	nsim_dev_dummy_region_exit(nsim_dev);
 	devlink_params_unregister(devlink, nsim_devlink_params,
 				  ARRAY_SIZE(nsim_devlink_params));
 	devlink_unregister(devlink);
diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h
index 95751a817508..4c758c6919f5 100644
--- a/drivers/net/netdevsim/netdevsim.h
+++ b/drivers/net/netdevsim/netdevsim.h
@@ -160,6 +160,7 @@ struct nsim_dev {
 	bool fw_update_status;
 	u32 max_macs;
 	bool test1;
+	struct devlink_region *dummy_region;
 };
 
 int nsim_dev_init(void);
-- 
2.21.0


^ permalink raw reply related

* [patch net-next v3 2/2] selftests: netdevsim: add devlink regions tests
From: Jiri Pirko @ 2019-08-14 15:37 UTC (permalink / raw)
  To: netdev; +Cc: davem, jakub.kicinski, mlxsw
In-Reply-To: <20190814153735.6923-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Test netdevsim devlink region implementation.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
v1->v2:
- new patch
---
 .../drivers/net/netdevsim/devlink.sh          | 54 ++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
index 6828e9404460..115837355eaf 100755
--- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
+++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
@@ -3,7 +3,7 @@
 
 lib_dir=$(dirname $0)/../../../net/forwarding
 
-ALL_TESTS="fw_flash_test params_test"
+ALL_TESTS="fw_flash_test params_test regions_test"
 NUM_NETIFS=0
 source $lib_dir/lib.sh
 
@@ -90,6 +90,58 @@ params_test()
 	log_test "params test"
 }
 
+check_region_size()
+{
+	local name=$1
+	local size
+
+	size=$(devlink region show $DL_HANDLE/$name -j | jq -e -r '.[][].size')
+	check_err $? "Failed to get $name region size"
+	[ $size -eq 32768 ]
+	check_err $? "Invalid $name region size"
+}
+
+check_region_snapshot_count()
+{
+	local name=$1
+	local phase_name=$2
+	local expected_count=$3
+	local count
+
+	count=$(devlink region show $DL_HANDLE/$name -j | jq -e -r '.[][].snapshot | length')
+	[ $count -eq $expected_count ]
+	check_err $? "Unexpected $phase_name snapshot count"
+}
+
+regions_test()
+{
+	RET=0
+
+	local count
+
+	check_region_size dummy
+	check_region_snapshot_count dummy initial 0
+
+	echo ""> $DEBUGFS_DIR/take_snapshot
+	check_err $? "Failed to take first dummy region snapshot"
+	check_region_snapshot_count dummy post-first-snapshot 1
+
+	echo ""> $DEBUGFS_DIR/take_snapshot
+	check_err $? "Failed to take second dummy region snapshot"
+	check_region_snapshot_count dummy post-second-snapshot 2
+
+	echo ""> $DEBUGFS_DIR/take_snapshot
+	check_err $? "Failed to take third dummy region snapshot"
+	check_region_snapshot_count dummy post-third-snapshot 3
+
+	devlink region del $DL_HANDLE/dummy snapshot 1
+	check_err $? "Failed to delete first dummy region snapshot"
+
+	check_region_snapshot_count dummy post-first-delete 2
+
+	log_test "regions test"
+}
+
 setup_prepare()
 {
 	modprobe netdevsim
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH bpf-next v4 8/8] net/mlx5e: Add AF_XDP need_wakeup support
From: Jonathan Lemon @ 2019-08-14 15:40 UTC (permalink / raw)
  To: Magnus Karlsson
  Cc: bjorn.topel, ast, daniel, netdev, brouer, maximmi, bpf,
	bruce.richardson, ciara.loftus, jakub.kicinski, xiaolong.ye,
	qi.z.zhang, sridhar.samudrala, kevin.laatz, ilias.apalodimas,
	kiran.patil, axboe, maciej.fijalkowski, maciejromanfijalkowski,
	intel-wired-lan
In-Reply-To: <1565767643-4908-9-git-send-email-magnus.karlsson@intel.com>

On 14 Aug 2019, at 0:27, Magnus Karlsson wrote:

> From: Maxim Mikityanskiy <maximmi@mellanox.com>
>
> This commit adds support for the new need_wakeup feature of AF_XDP. The
> applications can opt-in by using the XDP_USE_NEED_WAKEUP bind() flag.
> When this feature is enabled, some behavior changes:
>
> RX side: If the Fill Ring is empty, instead of busy-polling, set the
> flag to tell the application to kick the driver when it refills the Fill
> Ring.
>
> TX side: If there are pending completions or packets queued for
> transmission, set the flag to tell the application that it can skip the
> sendto() syscall and save time.
>
> The performance testing was performed on a machine with the following
> configuration:
>
> - 24 cores of Intel Xeon E5-2620 v3 @ 2.40 GHz
> - Mellanox ConnectX-5 Ex with 100 Gbit/s link
>
> The results with retpoline disabled:
>
>        | without need_wakeup  | with need_wakeup     |
>        |----------------------|----------------------|
>        | one core | two cores | one core | two cores |
> -------|----------|-----------|----------|-----------|
> txonly | 20.1     | 33.5      | 29.0     | 34.2      |
> rxdrop | 0.065    | 14.1      | 12.0     | 14.1      |
> l2fwd  | 0.032    | 7.3       | 6.6      | 7.2       |
>
> "One core" means the application and NAPI run on the same core. "Two
> cores" means they are pinned to different cores.
>
> Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
> Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
> Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>

Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>

^ permalink raw reply

* Re: [PATCH net v2] ibmveth: Convert multicast list size for little-endian system
From: Thomas Falcon @ 2019-08-14 15:41 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: netdev, liuhangbin, davem, joe
In-Reply-To: <20190813194037.464bea2c@cakuba.netronome.com>


On 8/13/19 9:43 PM, Jakub Kicinski wrote:
> On Mon, 12 Aug 2019 16:13:06 -0500, Thomas Falcon wrote:
>> The ibm,mac-address-filters property defines the maximum number of
>> addresses the hypervisor's multicast filter list can support. It is
>> encoded as a big-endian integer in the OF device tree, but the virtual
>> ethernet driver does not convert it for use by little-endian systems.
>> As a result, the driver is not behaving as it should on affected systems
>> when a large number of multicast addresses are assigned to the device.
>>
>> Reported-by: Hangbin Liu <liuhangbin@gmail.com>
>> Signed-off-by: Thomas Falcon <tlfalcon@linux.ibm.com>
> Okay, applied, but:

Thanks!

...

> ibmveth_init_link_settings() is part of your net-next patch which
> you're respining, so I had to apply manually. Please double check your
> patches apply cleanly to the designated tree.

Sorry about that, I thought I fixed that but maybe I got patches mixed 
up somehow.  Thanks again.

Tom


^ permalink raw reply

* Re: [PATCH bpf-next v4 3/8] i40e: add support for AF_XDP need_wakeup feature
From: Jonathan Lemon @ 2019-08-14 15:41 UTC (permalink / raw)
  To: Magnus Karlsson
  Cc: bjorn.topel, ast, daniel, netdev, brouer, maximmi, bpf,
	bruce.richardson, ciara.loftus, jakub.kicinski, xiaolong.ye,
	qi.z.zhang, sridhar.samudrala, kevin.laatz, ilias.apalodimas,
	kiran.patil, axboe, maciej.fijalkowski, maciejromanfijalkowski,
	intel-wired-lan
In-Reply-To: <1565767643-4908-4-git-send-email-magnus.karlsson@intel.com>

On 14 Aug 2019, at 0:27, Magnus Karlsson wrote:

> This patch adds support for the need_wakeup feature of AF_XDP. If the
> application has told the kernel that it might sleep using the new bind
> flag XDP_USE_NEED_WAKEUP, the driver will then set this flag if it has
> no more buffers on the NIC Rx ring and yield to the application. For
> Tx, it will set the flag if it has no outstanding Tx completion
> interrupts and return to the application.
>
> Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>

Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>

^ permalink raw reply

* Re: [PATCH bpf-next v4 4/8] ixgbe: add support for AF_XDP need_wakeup feature
From: Jonathan Lemon @ 2019-08-14 15:41 UTC (permalink / raw)
  To: Magnus Karlsson
  Cc: bjorn.topel, ast, daniel, netdev, brouer, maximmi, bpf,
	bruce.richardson, ciara.loftus, jakub.kicinski, xiaolong.ye,
	qi.z.zhang, sridhar.samudrala, kevin.laatz, ilias.apalodimas,
	kiran.patil, axboe, maciej.fijalkowski, maciejromanfijalkowski,
	intel-wired-lan
In-Reply-To: <1565767643-4908-5-git-send-email-magnus.karlsson@intel.com>



On 14 Aug 2019, at 0:27, Magnus Karlsson wrote:

> This patch adds support for the need_wakeup feature of AF_XDP. If the
> application has told the kernel that it might sleep using the new bind
> flag XDP_USE_NEED_WAKEUP, the driver will then set this flag if it has
> no more buffers on the NIC Rx ring and yield to the application. For
> Tx, it will set the flag if it has no outstanding Tx completion
> interrupts and return to the application.
>
> Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>

Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>

^ permalink raw reply

* Re: [PATCH bpf-next v4 3/8] i40e: add support for AF_XDP need_wakeup feature
From: Jonathan Lemon @ 2019-08-14 15:42 UTC (permalink / raw)
  To: Magnus Karlsson
  Cc: Magnus Karlsson, Björn Töpel, Alexei Starovoitov,
	Daniel Borkmann, Network Development, Jesper Dangaard Brouer,
	Maxim Mikityanskiy, bpf, bruce.richardson, ciara.loftus,
	Jakub Kicinski, Ye Xiaolong, Zhang, Qi Z, Samudrala, Sridhar,
	Kevin Laatz, ilias.apalodimas, Kiran, axboe, Fijalkowski, Maciej,
	Maciej Fijalkowski, intel-wired-lan
In-Reply-To: <CAJ8uoz0Tnb=i-LkGqLU87be9BuYqxmu2pN1Mte0UEWA2+f8bTQ@mail.gmail.com>



On 14 Aug 2019, at 7:59, Magnus Karlsson wrote:

> On Wed, Aug 14, 2019 at 4:48 PM Jonathan Lemon 
> <jonathan.lemon@gmail.com> wrote:
>>
>>
>>
>> On 14 Aug 2019, at 0:27, Magnus Karlsson wrote:
>>
>>> This patch adds support for the need_wakeup feature of AF_XDP. If 
>>> the
>>> application has told the kernel that it might sleep using the new 
>>> bind
>>> flag XDP_USE_NEED_WAKEUP, the driver will then set this flag if it 
>>> has
>>> no more buffers on the NIC Rx ring and yield to the application. For
>>> Tx, it will set the flag if it has no outstanding Tx completion
>>> interrupts and return to the application.
>>>
>>> Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
>>> ---
>>>  drivers/net/ethernet/intel/i40e/i40e_xsk.c | 18 ++++++++++++++++++
>>>  1 file changed, 18 insertions(+)
>>>
>>> diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
>>> b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
>>> index d0ff5d8..42c9012 100644
>>> --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
>>> +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
>>> @@ -626,6 +626,15 @@ int i40e_clean_rx_irq_zc(struct i40e_ring
>>> *rx_ring, int budget)
>>>
>>>       i40e_finalize_xdp_rx(rx_ring, xdp_xmit);
>>>       i40e_update_rx_stats(rx_ring, total_rx_bytes, 
>>> total_rx_packets);
>>> +
>>> +     if (xsk_umem_uses_need_wakeup(rx_ring->xsk_umem)) {
>>> +             if (failure || rx_ring->next_to_clean == 
>>> rx_ring->next_to_use)
>>> +                     xsk_set_rx_need_wakeup(rx_ring->xsk_umem);
>>> +             else
>>> +                     xsk_clear_rx_need_wakeup(rx_ring->xsk_umem);
>>> +
>>> +             return (int)total_rx_packets;
>>> +     }
>>>       return failure ? budget : (int)total_rx_packets;
>>
>> Can you elaborate why we're not returning the total budget on failure
>> for the wakeup case?
>
> In the non need_wakeup case (the old behavior), when allocation fails
> from the fill queue we want to retry right away basically busy
> spinning on the fill queue until we find at least one entry and then
> go on processing packets. Works well when the app and the driver are
> on different cores, but a lousy strategy when they execute on the same
> core. That is why in the need_wakeup feature case, we do not return
> the total budget if there is a failure. We will just come back at a
> later point in time from a syscall since the need_wakeup flag will
> have been set and check the fill queue again. We do not want a
> busy-spinning behavior in this case.

That makes sense.  Thanks for all the work on this, Magnus!
-- 
Jonathan

^ permalink raw reply

* Re: [PATCH net-next] net: openvswitch: Set OvS recirc_id from tc chain index
From: Pravin Shelar @ 2019-08-14 15:53 UTC (permalink / raw)
  To: Paul Blakey
  Cc: Linux Kernel Network Developers, David S. Miller, Justin Pettit,
	Simon Horman, Marcelo Ricardo Leitner, Vlad Buslov, Jiri Pirko,
	Roi Dayan, Yossi Kuperman, Rony Efraim, Oz Shlomo
In-Reply-To: <68e7a65c-162a-8bc5-4d80-f4f245944b9c@mellanox.com>

On Tue, Aug 13, 2019 at 1:29 AM Paul Blakey <paulb@mellanox.com> wrote:
>
>
> On 8/12/2019 7:18 PM, Pravin Shelar wrote:
> > On Sun, Aug 11, 2019 at 3:46 AM Paul Blakey <paulb@mellanox.com> wrote:
> >>
> >> On 8/8/2019 11:53 PM, Pravin Shelar wrote:
> >>> On Wed, Aug 7, 2019 at 5:08 AM Paul Blakey <paulb@mellanox.com> wrote:
> >>>> Offloaded OvS datapath rules are translated one to one to tc rules,
> >>>> for example the following simplified OvS rule:
> >>>>
> >>>> recirc_id(0),in_port(dev1),eth_type(0x0800),ct_state(-trk) actions:ct(),recirc(2)
> >>>>
> >>>> Will be translated to the following tc rule:
> >>>>
> >>>> $ tc filter add dev dev1 ingress \
> >>>>               prio 1 chain 0 proto ip \
> >>>>                   flower tcp ct_state -trk \
> >>>>                   action ct pipe \
> >>>>                   action goto chain 2
> >>>>
> >>>> Received packets will first travel though tc, and if they aren't stolen
> >>>> by it, like in the above rule, they will continue to OvS datapath.
> >>>> Since we already did some actions (action ct in this case) which might
> >>>> modify the packets, and updated action stats, we would like to continue
> >>>> the proccessing with the correct recirc_id in OvS (here recirc_id(2))
> >>>> where we left off.
> >>>>
> >>>> To support this, introduce a new skb extension for tc, which
> >>>> will be used for translating tc chain to ovs recirc_id to
> >>>> handle these miss cases. Last tc chain index will be set
> >>>> by tc goto chain action and read by OvS datapath.
> >>>>
> >>>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
> >>>> Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
> >>>> Acked-by: Jiri Pirko <jiri@mellanox.com>
> >>>> ---
> >>>>    include/linux/skbuff.h    | 13 +++++++++++++
> >>>>    include/net/sch_generic.h |  5 ++++-
> >>>>    net/core/skbuff.c         |  6 ++++++
> >>>>    net/openvswitch/flow.c    |  9 +++++++++
> >>>>    net/sched/Kconfig         | 13 +++++++++++++
> >>>>    net/sched/act_api.c       |  1 +
> >>>>    net/sched/cls_api.c       | 12 ++++++++++++
> >>>>    7 files changed, 58 insertions(+), 1 deletion(-)
> >>>>
> >>>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> >>>> index 3aef8d8..fb2a792 100644
> >>>> --- a/include/linux/skbuff.h
> >>>> +++ b/include/linux/skbuff.h
> >>>> @@ -279,6 +279,16 @@ struct nf_bridge_info {
> >>>>    };
> >>>>    #endif
> >>>>
> >>>> +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
> >>>> +/* Chain in tc_skb_ext will be used to share the tc chain with
> >>>> + * ovs recirc_id. It will be set to the current chain by tc
> >>>> + * and read by ovs to recirc_id.
> >>>> + */
> >>>> +struct tc_skb_ext {
> >>>> +       __u32 chain;
> >>>> +};
> >>>> +#endif
> >>>> +
> >>>>    struct sk_buff_head {
> >>>>           /* These two members must be first. */
> >>>>           struct sk_buff  *next;
> >>>> @@ -4050,6 +4060,9 @@ enum skb_ext_id {
> >>>>    #ifdef CONFIG_XFRM
> >>>>           SKB_EXT_SEC_PATH,
> >>>>    #endif
> >>>> +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
> >>>> +       TC_SKB_EXT,
> >>>> +#endif
> >>>>           SKB_EXT_NUM, /* must be last */
> >>>>    };
> >>>>
> >>>> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> >>>> index 6b6b012..871feea 100644
> >>>> --- a/include/net/sch_generic.h
> >>>> +++ b/include/net/sch_generic.h
> >>>> @@ -275,7 +275,10 @@ struct tcf_result {
> >>>>                           unsigned long   class;
> >>>>                           u32             classid;
> >>>>                   };
> >>>> -               const struct tcf_proto *goto_tp;
> >>>> +               struct {
> >>>> +                       const struct tcf_proto *goto_tp;
> >>>> +                       u32 goto_index;
> >>>> +               };
> >>>>
> >>>>                   /* used in the skb_tc_reinsert function */
> >>>>                   struct {
> >>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> >>>> index ea8e8d3..2b40b5a 100644
> >>>> --- a/net/core/skbuff.c
> >>>> +++ b/net/core/skbuff.c
> >>>> @@ -4087,6 +4087,9 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
> >>>>    #ifdef CONFIG_XFRM
> >>>>           [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path),
> >>>>    #endif
> >>>> +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
> >>>> +       [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext),
> >>>> +#endif
> >>>>    };
> >>>>
> >>>>    static __always_inline unsigned int skb_ext_total_length(void)
> >>>> @@ -4098,6 +4101,9 @@ static __always_inline unsigned int skb_ext_total_length(void)
> >>>>    #ifdef CONFIG_XFRM
> >>>>                   skb_ext_type_len[SKB_EXT_SEC_PATH] +
> >>>>    #endif
> >>>> +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
> >>>> +               skb_ext_type_len[TC_SKB_EXT] +
> >>>> +#endif
> >>>>                   0;
> >>>>    }
> >>>>
> >>>> diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
> >>>> index bc89e16..0287ead 100644
> >>>> --- a/net/openvswitch/flow.c
> >>>> +++ b/net/openvswitch/flow.c
> >>>> @@ -816,6 +816,9 @@ static int key_extract_mac_proto(struct sk_buff *skb)
> >>>>    int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
> >>>>                            struct sk_buff *skb, struct sw_flow_key *key)
> >>>>    {
> >>>> +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
> >>>> +       struct tc_skb_ext *tc_ext;
> >>>> +#endif
> >>>>           int res, err;
> >>>>
> >>>>           /* Extract metadata from packet. */
> >>>> @@ -848,7 +851,13 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
> >>>>           if (res < 0)
> >>>>                   return res;
> >>>>           key->mac_proto = res;
> >>>> +
> >>>> +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
> >>>> +       tc_ext = skb_ext_find(skb, TC_SKB_EXT);
> >>>> +       key->recirc_id = tc_ext ? tc_ext->chain : 0;
> >>>> +#else
> >>>>           key->recirc_id = 0;
> >>>> +#endif
> >>>>
> >>> Most of cases the config would be turned on, so the ifdef is not that
> >>> useful. Can you add static key to avoid searching the skb-ext in non
> >>> offload cases.
> >> Hi,
> >>
> >> What do you mean by a static key?
> >>
> > https://www.kernel.org/doc/Documentation/static-keys.txt
> >
> > Static key can be enabled when a flow is added to the tc filter.
>
> Hi and thanks for the feedback,
>
> The skb_ext_find() just checks a single bit on the
> skb->active_extensions, and if so returns an offset. Do you think it
> will impact performance much?
>
I do not see much down side of adding static key here.

>
> But to your suggestion, do you mean that the first tc goto action
> instance with the relevant ifdef (CONFIG_NET_TC_SKB_EXT) it will enable
> the OvS static key that guards this skb_ext_find()?
>
> I guess calling it in tcf_action_set_ctrlact() if goto_chain != 0.
>
> This will expose some OvS helper function (or static key) to
> net/sched/act_api.c right?

Right, The patch adds this dependency anyways, so this symbol
definition in OVS would make it explicit to user.

^ permalink raw reply

* Re: [PATCH bpf-next 1/3] libbpf: add asm/unistd.h to xsk to get __NR_mmap2
From: Yonghong Song @ 2019-08-14 15:51 UTC (permalink / raw)
  To: Andrii Nakryiko, Magnus Karlsson, Björn Töpel,
	David S. Miller, Jesper Dangaard Brouer, john fastabend,
	Jakub Kicinski, Daniel Borkmann, Networking, bpf,
	xdp-newbies@vger.kernel.org, open list
In-Reply-To: <20190814092403.GA4142@khorivan>



On 8/14/19 2:24 AM, Ivan Khoronzhuk wrote:
> On Tue, Aug 13, 2019 at 04:38:13PM -0700, Andrii Nakryiko wrote:
> 
> Hi, Andrii
> 
>> On Tue, Aug 13, 2019 at 3:24 AM Ivan Khoronzhuk
>> <ivan.khoronzhuk@linaro.org> wrote:
>>>
>>> That's needed to get __NR_mmap2 when mmap2 syscall is used.
>>>
>>> Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
>>> ---
>>>  tools/lib/bpf/xsk.c | 1 +
>>>  1 file changed, 1 insertion(+)
>>>
>>> diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
>>> index 5007b5d4fd2c..f2fc40f9804c 100644
>>> --- a/tools/lib/bpf/xsk.c
>>> +++ b/tools/lib/bpf/xsk.c
>>> @@ -12,6 +12,7 @@
>>>  #include <stdlib.h>
>>>  #include <string.h>
>>>  #include <unistd.h>
>>> +#include <asm/unistd.h>
>>
>> asm/unistd.h is not present in Github libbpf projection. Is there any
> 
> Look on includes from
> tools/lib/bpf/libpf.c
> tools/lib/bpf/bpf.c
> 
> That's how it's done... Copping headers to arch/arm will not
> solve this, it includes both of them anyway, and anyway it needs
> asm/unistd.h inclusion here, only because xsk.c needs __NR_*
> 
> 
>> way to avoid including this header? Generally, libbpf can't easily use
>> all of kernel headers, we need to re-implemented all the extra used
>> stuff for Github version of libbpf, so we try to minimize usage of new
>> headers that are not just plain uapi headers from include/uapi.
> 
> Yes I know, it's far away from real number of changes needed.
> I faced enough about this already and kernel headers, especially
> for arm32 it's a bit decency problem. But this patch it's part of
> normal one. I have couple issues despite this normally fixed mmap2
> that is the same even if uapi includes are coppied to tools/arch/arm.
> 
> In continuation of kernel headers inclusion and arm build:
> 
> For instance, what about this rough "kernel headers" hack:
> https://github.com/ikhorn/af_xdp_stuff/commit/aa645ccca4d844f404ec3c2b27402d4d7848d1b5 

The ".syntax unified" is mentioned a couple of times
in bcc mailing list as well. llvm bpf backend might
be able to solve it. I have not looked at the details though.

> 
> or this one related for arm32 only:
> https://github.com/ikhorn/af_xdp_stuff/commit/2c6c6d538605aac39600dcb3c9b66de11c70b963 

This may not work if bpf program tries to handle kernel headers.
bpf program may get wrong layout.

Anyway, the above two comments are irrelevant to this patch set
and if needed should be discussed separately.

> 
> 
> I have more...
> 
>>
>>>  #include <arpa/inet.h>
>>>  #include <asm/barrier.h>
>>>  #include <linux/compiler.h>
>>> -- 
>>> 2.17.1
>>>
> 

^ permalink raw reply

* Re: [PATCH net-next 1/5] RDS: Re-add pf/sol access via sysctl
From: Doug Ledford @ 2019-08-14 15:56 UTC (permalink / raw)
  To: Gerd Rausch, Santosh Shilimkar, netdev, linux-rdma, rds-devel
  Cc: David Miller
In-Reply-To: <e0397d30-7405-a7af-286c-fe76887caf0a@oracle.com>

[-- Attachment #1: Type: text/plain, Size: 2686 bytes --]

On Tue, 2019-08-13 at 11:20 -0700, Gerd Rausch wrote:
> From: Andy Grover <andy.grover@oracle.com>
> Date: Tue, 24 Nov 2009 15:35:51 -0800
> 
> Although RDS has an official PF_RDS value now, existing software
> expects to look for rds sysctls to determine it. We need to maintain
> these for now, for backwards compatibility.
> 
> Signed-off-by: Andy Grover <andy.grover@oracle.com>
> Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
> ---
>  net/rds/sysctl.c | 21 +++++++++++++++++++++
>  1 file changed, 21 insertions(+)
> 
> diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
> index e381bbcd9cc1..9760292a0af4 100644
> --- a/net/rds/sysctl.c
> +++ b/net/rds/sysctl.c
> @@ -49,6 +49,13 @@ unsigned int  rds_sysctl_max_unacked_bytes = (16 <<
> 20);
>  
>  unsigned int rds_sysctl_ping_enable = 1;
>  
> +/*
> + * We have official values, but must maintain the sysctl interface
> for existing
> + * software that expects to find these values here.
> + */
> +static int rds_sysctl_pf_rds = PF_RDS;
> +static int rds_sysctl_sol_rds = SOL_RDS;
> +
>  static struct ctl_table rds_sysctl_rds_table[] = {
>  	{
>  		.procname       = "reconnect_min_delay_ms",
> @@ -68,6 +75,20 @@ static struct ctl_table rds_sysctl_rds_table[] = {
>  		.extra1		= &rds_sysctl_reconnect_min_jiffies,
>  		.extra2		= &rds_sysctl_reconnect_max,
>  	},
> +	{
> +		.procname       = "pf_rds",
> +		.data		= &rds_sysctl_pf_rds,
> +		.maxlen         = sizeof(int),
> +		.mode           = 0444,
> +		.proc_handler   = &proc_dointvec,
> +	},
> +	{
> +		.procname       = "sol_rds",
> +		.data		= &rds_sysctl_sol_rds,
> +		.maxlen         = sizeof(int),
> +		.mode           = 0444,
> +		.proc_handler   = &proc_dointvec,
> +	},
>  	{
>  		.procname	= "max_unacked_packets",
>  		.data		= &rds_sysctl_max_unacked_packets,

Good Lord...RDS was taken into the kernel in Feb of 2009, so over 10
years ago.  The patch to put PF_RDS/AF_RDS/SOL_RDS was taken into
include/linux/socket.h Feb 26, 2009.  The RDS ports were allocated by
IANA on Feb 27 and May 20, 2009.  And you *still* have software that
needs this?  The only software that has ever used RDS was Oracle
software.  I would have expected you guys to update your source code to
do the right thing long before now.  In fact, I would expect you were
ready to retire all of the legacy software that needs this by now.  As
of today, does your current build of Oracle software still require this,
or have you at least fixed it up in your modern builds?

-- 
Doug Ledford <dledford@redhat.com>
    GPG KeyID: B826A3330E572FDD
    Fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57 2FDD

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH net-next 0/5] net/rds: Fixes from internal Oracle repo
From: Doug Ledford @ 2019-08-14 15:59 UTC (permalink / raw)
  To: Gerd Rausch, Santosh Shilimkar, netdev, linux-rdma, rds-devel
  Cc: David Miller
In-Reply-To: <0b05ed75-6772-e339-11e6-1fb5714981c0@oracle.com>

[-- Attachment #1: Type: text/plain, Size: 1110 bytes --]

On Tue, 2019-08-13 at 11:20 -0700, Gerd Rausch wrote:
> This is the first set of (mostly old) patches from our internal
> repository
> in an effort to synchronize what Oracle had been using internally
> with what is shipped with the Linux kernel.
> 
> Andy Grover (2):
>   RDS: Re-add pf/sol access via sysctl
>   rds: check for excessive looping in rds_send_xmit
> 
> Chris Mason (2):
>   RDS: limit the number of times we loop in rds_send_xmit
>   RDS: don't use GFP_ATOMIC for sk_alloc in rds_create
> 
> Gerd Rausch (1):
>   net/rds: Add a few missing rds_stat_names entries
> 
>  net/rds/af_rds.c  |  2 +-
>  net/rds/ib_recv.c | 12 +++++++++++-
>  net/rds/rds.h     |  2 +-
>  net/rds/send.c    | 12 ++++++++++++
>  net/rds/stats.c   |  3 +++
>  net/rds/sysctl.c  | 21 +++++++++++++++++++++
>  6 files changed, 49 insertions(+), 3 deletions(-)
> 

That first patch looks like a total bogon to me, but the remaining four
look fine.

-- 
Doug Ledford <dledford@redhat.com>
    GPG KeyID: B826A3330E572FDD
    Fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57 2FDD

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH net-next 0/4] net: bridge: mdb: allow dump/add/del of host-joined entries
From: Nikolay Aleksandrov @ 2019-08-14 16:01 UTC (permalink / raw)
  To: netdev; +Cc: davem, roopa, bridge
In-Reply-To: <20190814144024.9710-1-nikolay@cumulusnetworks.com>

On 8/14/19 5:40 PM, Nikolay Aleksandrov wrote:
> Hi,
> This set makes the bridge dump host-joined mdb entries, they should be
> treated as normal entries since they take a slot and are aging out.
> We already have notifications for them but we couldn't dump them until
> now so they remained hidden. We dump them similar to how they're
> notified, in order to keep user-space compatibility with the dumped
> objects (e.g. iproute2 dumps mdbs in a format which can be fed into
> add/del commands) we allow host-joined groups also to be added/deleted via
> mdb commands. That can later be used for L2 mcast MAC manipulation as
> was recently discussed. Note that iproute2 changes are not necessary,
> this set will work with the current user-space mdb code.
> 
> Patch 01 - a trivial comment move
> Patch 02 - factors out the mdb filling code so it can be
>            re-used for the host-joined entries
> Patch 03 - dumps host-joined entries
> Patch 04 - allows manipulation of host-joined entries via standard mdb
>            calls
> 
> Thanks,
>  Nik
> 
> Nikolay Aleksandrov (4):
>   net: bridge: mdb: move vlan comments
>   net: bridge: mdb: factor out mdb filling
>   net: bridge: mdb: dump host-joined entries as well
>   net: bridge: mdb: allow add/delete for host-joined groups
> 
>  net/bridge/br_mdb.c       | 171 +++++++++++++++++++++++++-------------
>  net/bridge/br_multicast.c |  24 ++++--
>  net/bridge/br_private.h   |   2 +
>  3 files changed, 133 insertions(+), 64 deletions(-)
> 

Self-NAK
There's a double notification sent for manual add/del of host groups.
It's a trivial fix, I'll spin v2 later after running more tests.


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox