Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next v3 4/4] net: vhost: add rx busy polling in tx path
From: xiangxia.m.yue @ 2018-06-30  6:33 UTC (permalink / raw)
  To: jasowang
  Cc: mst, makita.toshiaki, virtualization, netdev, Tonghao Zhang,
	Tonghao Zhang
In-Reply-To: <1530340438-3039-1-git-send-email-xiangxia.m.yue@gmail.com>

From: Tonghao Zhang <xiangxia.m.yue@gmail.com>

This patch improves the guest receive and transmit performance.
On the handle_tx side, we poll the sock receive queue at the
same time. handle_rx do that in the same way.

We set the poll-us=100us and use the iperf3 to test
its bandwidth, use the netperf to test throughput and mean
latency. When running the tests, the vhost-net kthread of
that VM, is alway 100% CPU. The commands are shown as below.

iperf3  -s -D
iperf3  -c IP -i 1 -P 1 -t 20 -M 1400

or
netserver
netperf -H IP -t TCP_RR -l 20 -- -O "THROUGHPUT,MEAN_LATENCY"

host -> guest:
iperf3:
* With the patch:     27.0 Gbits/sec
* Without the patch:  14.4 Gbits/sec

netperf (TCP_RR):
* With the patch:     48039.56 trans/s, 20.64us mean latency
* Without the patch:  46027.07 trans/s, 21.58us mean latency

This patch also improves the guest transmit performance.

guest -> host:
iperf3:
* With the patch:     27.2 Gbits/sec
* Without the patch:  24.4 Gbits/sec

netperf (TCP_RR):
* With the patch:     47963.25 trans/s, 20.71us mean latency
* Without the patch:  45796.70 trans/s, 21.68us mean latency

Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
---
 drivers/vhost/net.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 458f81d..fb43d82 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -478,17 +478,13 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 				    struct iovec iov[], unsigned int iov_size,
 				    unsigned int *out_num, unsigned int *in_num)
 {
-	unsigned long uninitialized_var(endtime);
+	struct vhost_net_virtqueue *nvq_rx = &net->vqs[VHOST_NET_VQ_RX];
 	int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
 				  out_num, in_num, NULL, NULL);
 
 	if (r == vq->num && vq->busyloop_timeout) {
-		preempt_disable();
-		endtime = busy_clock() + vq->busyloop_timeout;
-		while (vhost_can_busy_poll(vq->dev, endtime) &&
-		       vhost_vq_avail_empty(vq->dev, vq))
-			cpu_relax();
-		preempt_enable();
+		vhost_net_busy_poll(net, &nvq_rx->vq, vq, false);
+
 		r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
 				      out_num, in_num, NULL, NULL);
 	}
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next v3 3/4] net: vhost: factor out busy polling logic to vhost_net_busy_poll()
From: xiangxia.m.yue @ 2018-06-30  6:33 UTC (permalink / raw)
  To: jasowang
  Cc: mst, makita.toshiaki, virtualization, netdev, Tonghao Zhang,
	Tonghao Zhang
In-Reply-To: <1530340438-3039-1-git-send-email-xiangxia.m.yue@gmail.com>

From: Tonghao Zhang <xiangxia.m.yue@gmail.com>

Factor out generic busy polling logic and will be
used for tx path in the next patch. And with the patch,
qemu can set differently the busyloop_timeout for rx queue.

Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
---
 drivers/vhost/net.c | 92 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 53 insertions(+), 39 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 62bb8e8..458f81d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -429,6 +429,50 @@ static int vhost_net_enable_vq(struct vhost_net *n,
 	return vhost_poll_start(poll, sock->file);
 }
 
+static int sk_has_rx_data(struct sock *sk)
+{
+	struct socket *sock = sk->sk_socket;
+
+	if (sock->ops->peek_len)
+		return sock->ops->peek_len(sock);
+
+	return skb_queue_empty(&sk->sk_receive_queue);
+}
+
+static void vhost_net_busy_poll(struct vhost_net *net,
+				struct vhost_virtqueue *rvq,
+				struct vhost_virtqueue *tvq,
+				bool rx)
+{
+	unsigned long uninitialized_var(endtime);
+	struct socket *sock = rvq->private_data;
+	struct vhost_virtqueue *vq = rx ? tvq : rvq;
+	unsigned long busyloop_timeout = rx ? rvq->busyloop_timeout :
+					      tvq->busyloop_timeout;
+
+	mutex_lock_nested(&vq->mutex, rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
+	vhost_disable_notify(&net->dev, vq);
+
+	preempt_disable();
+	endtime = busy_clock() + busyloop_timeout;
+	while (vhost_can_busy_poll(tvq->dev, endtime) &&
+	       !(sock && sk_has_rx_data(sock->sk)) &&
+	       vhost_vq_avail_empty(tvq->dev, tvq))
+		cpu_relax();
+	preempt_enable();
+
+	if ((rx && !vhost_vq_avail_empty(&net->dev, vq)) ||
+	    (!rx && (sock && sk_has_rx_data(sock->sk)))) {
+		vhost_poll_queue(&vq->poll);
+	} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
+		vhost_disable_notify(&net->dev, vq);
+		vhost_poll_queue(&vq->poll);
+	}
+
+	mutex_unlock(&vq->mutex);
+}
+
+
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 				    struct vhost_virtqueue *vq,
 				    struct iovec iov[], unsigned int iov_size,
@@ -621,16 +665,6 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
 	return len;
 }
 
-static int sk_has_rx_data(struct sock *sk)
-{
-	struct socket *sock = sk->sk_socket;
-
-	if (sock->ops->peek_len)
-		return sock->ops->peek_len(sock);
-
-	return skb_queue_empty(&sk->sk_receive_queue);
-}
-
 static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
 {
 	struct vhost_virtqueue *vq = &nvq->vq;
@@ -645,39 +679,19 @@ static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
 
 static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
 {
-	struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
-	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
-	struct vhost_virtqueue *vq = &nvq->vq;
-	unsigned long uninitialized_var(endtime);
-	int len = peek_head_len(rvq, sk);
+	struct vhost_net_virtqueue *nvq_rx = &net->vqs[VHOST_NET_VQ_RX];
+	struct vhost_net_virtqueue *nvq_tx = &net->vqs[VHOST_NET_VQ_TX];
 
-	if (!len && vq->busyloop_timeout) {
-		/* Flush batched heads first */
-		vhost_rx_signal_used(rvq);
-		/* Both tx vq and rx socket were polled here */
-		mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
-		vhost_disable_notify(&net->dev, vq);
+	int len = peek_head_len(nvq_rx, sk);
 
-		preempt_disable();
-		endtime = busy_clock() + vq->busyloop_timeout;
-
-		while (vhost_can_busy_poll(&net->dev, endtime) &&
-		       !sk_has_rx_data(sk) &&
-		       vhost_vq_avail_empty(&net->dev, vq))
-			cpu_relax();
-
-		preempt_enable();
-
-		if (!vhost_vq_avail_empty(&net->dev, vq))
-			vhost_poll_queue(&vq->poll);
-		else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
-			vhost_disable_notify(&net->dev, vq);
-			vhost_poll_queue(&vq->poll);
-		}
+	if (!len && nvq_rx->vq.busyloop_timeout) {
+		/* Flush batched heads first */
+		vhost_rx_signal_used(nvq_rx);
 
-		mutex_unlock(&vq->mutex);
+		/* Both tx vq and rx socket were polled here */
+		vhost_net_busy_poll(net, &nvq_rx->vq, &nvq_tx->vq, true);
 
-		len = peek_head_len(rvq, sk);
+		len = peek_head_len(nvq_rx, sk);
 	}
 
 	return len;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next v3 2/4] net: vhost: replace magic number of lock annotation
From: xiangxia.m.yue @ 2018-06-30  6:33 UTC (permalink / raw)
  To: jasowang
  Cc: mst, makita.toshiaki, virtualization, netdev, Tonghao Zhang,
	Tonghao Zhang
In-Reply-To: <1530340438-3039-1-git-send-email-xiangxia.m.yue@gmail.com>

From: Tonghao Zhang <xiangxia.m.yue@gmail.com>

Use the VHOST_NET_VQ_XXX as a subclass for mutex_lock_nested.

Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
---
 drivers/vhost/net.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e7cf7d2..62bb8e8 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -484,7 +484,7 @@ static void handle_tx(struct vhost_net *net)
 	bool zcopy, zcopy_used;
 	int sent_pkts = 0;
 
-	mutex_lock(&vq->mutex);
+	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
 	sock = vq->private_data;
 	if (!sock)
 		goto out;
@@ -655,7 +655,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
 		/* Flush batched heads first */
 		vhost_rx_signal_used(rvq);
 		/* Both tx vq and rx socket were polled here */
-		mutex_lock_nested(&vq->mutex, 1);
+		mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
 		vhost_disable_notify(&net->dev, vq);
 
 		preempt_disable();
@@ -789,7 +789,7 @@ static void handle_rx(struct vhost_net *net)
 	__virtio16 num_buffers;
 	int recv_pkts = 0;
 
-	mutex_lock_nested(&vq->mutex, 0);
+	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
 	sock = vq->private_data;
 	if (!sock)
 		goto out;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next v3 1/4] net: vhost: lock the vqs one by one
From: xiangxia.m.yue @ 2018-06-30  6:33 UTC (permalink / raw)
  To: jasowang
  Cc: mst, makita.toshiaki, virtualization, netdev, Tonghao Zhang,
	Tonghao Zhang
In-Reply-To: <1530340438-3039-1-git-send-email-xiangxia.m.yue@gmail.com>

From: Tonghao Zhang <xiangxia.m.yue@gmail.com>

This patch changes the way that lock all vqs
at the same, to lock them one by one. It will
be used for next patch to avoid the deadlock.

Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
---
 drivers/vhost/vhost.c | 24 +++++++-----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 895eaa2..4ca9383 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -294,8 +294,11 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
 {
 	int i;
 
-	for (i = 0; i < d->nvqs; ++i)
+	for (i = 0; i < d->nvqs; ++i) {
+		mutex_lock(&d->vqs[i]->mutex);
 		__vhost_vq_meta_reset(d->vqs[i]);
+		mutex_unlock(&d->vqs[i]->mutex);
+	}
 }
 
 static void vhost_vq_reset(struct vhost_dev *dev,
@@ -887,20 +890,6 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
 #define vhost_get_used(vq, x, ptr) \
 	vhost_get_user(vq, x, ptr, VHOST_ADDR_USED)
 
-static void vhost_dev_lock_vqs(struct vhost_dev *d)
-{
-	int i = 0;
-	for (i = 0; i < d->nvqs; ++i)
-		mutex_lock_nested(&d->vqs[i]->mutex, i);
-}
-
-static void vhost_dev_unlock_vqs(struct vhost_dev *d)
-{
-	int i = 0;
-	for (i = 0; i < d->nvqs; ++i)
-		mutex_unlock(&d->vqs[i]->mutex);
-}
-
 static int vhost_new_umem_range(struct vhost_umem *umem,
 				u64 start, u64 size, u64 end,
 				u64 userspace_addr, int perm)
@@ -950,7 +939,10 @@ static void vhost_iotlb_notify_vq(struct vhost_dev *d,
 		if (msg->iova <= vq_msg->iova &&
 		    msg->iova + msg->size - 1 > vq_msg->iova &&
 		    vq_msg->type == VHOST_IOTLB_MISS) {
+			mutex_lock(&node->vq->mutex);
 			vhost_poll_queue(&node->vq->poll);
+			mutex_unlock(&node->vq->mutex);
+
 			list_del(&node->node);
 			kfree(node);
 		}
@@ -982,7 +974,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
 	int ret = 0;
 
 	mutex_lock(&dev->mutex);
-	vhost_dev_lock_vqs(dev);
 	switch (msg->type) {
 	case VHOST_IOTLB_UPDATE:
 		if (!dev->iotlb) {
@@ -1016,7 +1007,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
 		break;
 	}
 
-	vhost_dev_unlock_vqs(dev);
 	mutex_unlock(&dev->mutex);
 
 	return ret;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next v3 0/4] net: vhost: improve performance when enable busyloop
From: xiangxia.m.yue @ 2018-06-30  6:33 UTC (permalink / raw)
  To: jasowang; +Cc: mst, makita.toshiaki, virtualization, netdev, Tonghao Zhang

From: Tonghao Zhang <xiangxia.m.yue@gmail.com>

This patches improve the guest receive and transmit performance.
On the handle_tx side, we poll the sock receive queue at the same time.
handle_rx do that in the same way.

This patches are splited from previous big patch:
http://patchwork.ozlabs.org/patch/934673/

For more performance report, see patch 4.

Tonghao Zhang (4):
  net: vhost: lock the vqs one by one
  net: vhost: replace magic number of lock annotation
  net: vhost: factor out busy polling logic to vhost_net_busy_poll()
  net: vhost: add rx busy polling in tx path

 drivers/vhost/net.c   | 106 +++++++++++++++++++++++++++-----------------------
 drivers/vhost/vhost.c |  24 ++++--------
 2 files changed, 65 insertions(+), 65 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* [PATCH v2] atmel: using strlcpy() to avoid possible buffer overflows
From: YueHaibing @ 2018-06-30  6:33 UTC (permalink / raw)
  To: simon, kvalo
  Cc: linux-kernel, netdev, linux-wireless, davem, andy.shevchenko,
	YueHaibing

'firmware' is a module param which may been longer than firmware_id,
so using strlcpy() to guard against overflows. Also priv is allocated
with zeroed memory,no need to set firmware_id[0] to '\0'.

v1 -> v2: remove priv->firmware_id[0] = '\0';

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
---
 drivers/net/wireless/atmel/atmel.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/wireless/atmel/atmel.c b/drivers/net/wireless/atmel/atmel.c
index b01dc34..cec715b 100644
--- a/drivers/net/wireless/atmel/atmel.c
+++ b/drivers/net/wireless/atmel/atmel.c
@@ -1516,10 +1516,9 @@ struct net_device *init_atmel_card(unsigned short irq, unsigned long port,
 	priv->present_callback = card_present;
 	priv->card = card;
 	priv->firmware = NULL;
-	priv->firmware_id[0] = '\0';
 	priv->firmware_type = fw_type;
 	if (firmware) /* module parameter */
-		strcpy(priv->firmware_id, firmware);
+		strlcpy(priv->firmware_id, firmware, sizeof(priv->firmware_id));
 	priv->bus_type = card_present ? BUS_TYPE_PCCARD : BUS_TYPE_PCI;
 	priv->station_state = STATION_STATE_DOWN;
 	priv->do_rx_crc = 0;
-- 
2.7.0

^ permalink raw reply related

* Re: [PATCH] atmel: using strlcpy() to avoid possible buffer overflows
From: YueHaibing @ 2018-06-30  6:30 UTC (permalink / raw)
  To: Andy Shevchenko
  Cc: simon, Kalle Valo, Linux Kernel Mailing List, netdev,
	open list:TI WILINK WIRELES..., David S. Miller
In-Reply-To: <CAHp75VcTVtzLuwH=Ff6DBKJyQE4t=HT7hB2O=gvgN+V-38LH5g@mail.gmail.com>


On 2018/6/30 5:51, Andy Shevchenko wrote:
> On Sat, Jun 30, 2018 at 12:47 AM, Andy Shevchenko
> <andy.shevchenko@gmail.com> wrote:
>> On Fri, Jun 29, 2018 at 5:51 AM, YueHaibing <yuehaibing@huawei.com> wrote:
>>> 'firmware' is a module param which may been longer than firmware_id,
>>> so using strlcpy() to guard against overflows
>>
>> strncat() is against overflow, this does a bit more.
>>
>>>         priv->firmware_id[0] = '\0';
>> ...
>>>         if (firmware) /* module parameter */
>>> -               strcpy(priv->firmware_id, firmware);
>>> +               strlcpy(priv->firmware_id, firmware, sizeof(priv->firmware_id));
>>
>> In either case the above '\0' is not needed.
>> But it looks like the intention was to use strncat() / strlcat().
> 
> Ah, this is under condition, yes. If no parameter supplied, this needs
> to be clean, but
> priv is allocated with zeroed memory
> https://elixir.bootlin.com/linux/latest/source/net/core/dev.c#L8369

so just remove this line:
	 priv->firmware_id[0] = '\0';

and using strlcpy while 'firmware' NOT null is enough.

Will send v2.

> 

^ permalink raw reply

* [PATCH net-next 0/2] tcp: fix high tail latencies in DCTCP
From: Lawrence Brakmo @ 2018-06-30  1:48 UTC (permalink / raw)
  To: netdev
  Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Neal Cardwell,
	Yuchung Cheng, Steve Ibanez, Eric Dumazet

When have observed high tail latencies when using DCTCP for RPCs as
compared to using Cubic. For example, in one setup there are 2 hosts
sending to a 3rd one, with each sender having 3 flows (1 stream,
1 1MB back-to-back RPCs and 1 10KB back-to-back RPCs). The following
table shows the 99% and 99.9% latencies for both Cubic and dctcp:

           Cubic 99%  Cubic 99.9%   dctcp 99%    dctcp 99.9%
  1MB RPCs    2.6ms       5.5ms         43ms          208ms
 10KB RPCs    1.1ms       1.3ms         53ms          212ms

Looking at tcpdump traces showed that there are two causes for the
latency.  

  1) RTOs caused by the receiver sending a dup ACK and not ACKing
     the last (and only) packet sent.
  2) Delaying ACKs when the sender has a cwnd of 1, so everything
     pauses for the duration of the delayed ACK.

The first patch fixes the cause of the dup ACKs, not updating DCTCP
state when an ACK that was initially delayed has been sent with a
data packet.

The second patch insures that an ACK is sent immediately when a
CWR marked packet arrives.

With the patches the latencies for DCTCP now look like:

           dctcp 99%  dctcp 99.9% 
  1MB RPCs    4.8ms       6.5ms
 10KB RPCs    143us       184us

Note that while the 1MB RPCs tail latencies are higher than Cubic's,
the 10KB latencies are much smaller than Cubic's. These patches fix
issues on the receiver, but tcpdump traces indicate there is an
opportunity to also fix an issue at the sender that adds about 3ms
to the tail latencies.

The following trace shows the issue that tiggers an RTO (fixed by these patches):

   Host A sends the last packets of the request
   Host B receives them, and the last packet is marked with congestion (CE)
   Host B sends ACKs for packets not marked with congestion
   Host B sends data packet with reply and ACK for packet marked with
          congestion (TCP flag ECE)
   Host A receives ACKs with no ECE flag
   Host A receives data packet with ACK for the last packet of request
          and which has TCP ECE bit set
   Host A sends 1st data packet of the next request with TCP flag CWR
   Host B receives the packet (as seen in tcpdump at B), no CE flag
   Host B sends a dup ACK that also has the TCP ECE flag
   Host A RTO timer fires!
   Host A to send the next packet
   Host A receives an ACK for everything it has sent (i.e. Host B
          did receive 1st packet of request)
   Host A send more packets…

[PATCH net-next 1/2] tcp: notify when a delayed ack is sent
[PATCH net-next 2/2] tcp: ack immediately when a cwr packet arrives

 net/ipv4/tcp_input.c  | 25 +++++++++++++++++--------
 net/ipv4/tcp_output.c |  2 ++
 2 files changed, 19 insertions(+), 8 deletions(-)

^ permalink raw reply

* [PATCH net-next 2/2] tcp: ack immediately when a cwr packet arrives
From: Lawrence Brakmo @ 2018-06-30  1:48 UTC (permalink / raw)
  To: netdev
  Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Neal Cardwell,
	Yuchung Cheng, Steve Ibanez, Eric Dumazet
In-Reply-To: <20180630014815.2881895-1-brakmo@fb.com>

We observed high 99 and 99.9% latencies when doing RPCs with DCTCP. The
problem is triggered when the last packet of a request arrives CE
marked. The reply will carry the ECE mark causing TCP to shrink its cwnd
to 1 (because there are no packets in flight). When the 1st packet of
the next request arrives it was sometimes delayed adding up to 40ms to
the latency.

This patch insures that CWR makred data packets arriving will be acked
immediately.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 net/ipv4/tcp_input.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 76ca88f63b70..b024d36f0d56 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -98,6 +98,8 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 #define FLAG_UPDATE_TS_RECENT	0x4000 /* tcp_replace_ts_recent() */
 #define FLAG_NO_CHALLENGE_ACK	0x8000 /* do not call tcp_send_challenge_ack()	*/
 #define FLAG_ACK_MAYBE_DELAYED	0x10000 /* Likely a delayed ACK */
+#define FLAG_OFO_POSSIBLE	0x20000 /* Possible OFO */
+#define FLAG_CWR		0x40000 /* CWR in this ACK */
 
 #define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
 #define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
@@ -5087,7 +5089,7 @@ static inline void tcp_data_snd_check(struct sock *sk)
 /*
  * Check if sending an ack is needed.
  */
-static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
+static void __tcp_ack_snd_check(struct sock *sk, int flags)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned long rtt, delay;
@@ -5102,13 +5104,16 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 	    (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
 	     __tcp_select_window(sk) >= tp->rcv_wnd)) ||
 	    /* We ACK each frame or... */
-	    tcp_in_quickack_mode(sk)) {
+	    tcp_in_quickack_mode(sk) ||
+	    /* We received CWR */
+	    flags & FLAG_CWR) {
 send_now:
 		tcp_send_ack(sk);
 		return;
 	}
 
-	if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
+	if (!(flags & FLAG_OFO_POSSIBLE) ||
+	    RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
 		tcp_send_delayed_ack(sk);
 		return;
 	}
@@ -5134,13 +5139,13 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 		      HRTIMER_MODE_REL_PINNED_SOFT);
 }
 
-static inline void tcp_ack_snd_check(struct sock *sk)
+static inline void tcp_ack_snd_check(struct sock *sk, int flags)
 {
 	if (!inet_csk_ack_scheduled(sk)) {
 		/* We sent a data segment already. */
 		return;
 	}
-	__tcp_ack_snd_check(sk, 1);
+	__tcp_ack_snd_check(sk, flags | FLAG_OFO_POSSIBLE);
 }
 
 /*
@@ -5489,6 +5494,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
 				goto discard;
 			}
 		} else {
+			int flags;
 			int eaten = 0;
 			bool fragstolen = false;
 
@@ -5525,7 +5531,9 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
 					goto no_ack;
 			}
 
-			__tcp_ack_snd_check(sk, 0);
+			flags = (tcp_flag_word(th) & TCP_FLAG_CWR) ?
+				FLAG_CWR : 0;
+			__tcp_ack_snd_check(sk, flags);
 no_ack:
 			if (eaten)
 				kfree_skb_partial(skb, fragstolen);
@@ -5561,7 +5569,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
 	tcp_data_queue(sk, skb);
 
 	tcp_data_snd_check(sk);
-	tcp_ack_snd_check(sk);
+	tcp_ack_snd_check(sk, (tcp_flag_word(th) & TCP_FLAG_CWR) ?
+			  FLAG_CWR : 0);
 	return;
 
 csum_error:
@@ -6150,7 +6159,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 	/* tcp_data could move socket to TIME-WAIT */
 	if (sk->sk_state != TCP_CLOSE) {
 		tcp_data_snd_check(sk);
-		tcp_ack_snd_check(sk);
+		tcp_ack_snd_check(sk, 0);
 	}
 
 	if (!queued) {
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 1/2] tcp: notify when a delayed ack is sent
From: Lawrence Brakmo @ 2018-06-30  1:48 UTC (permalink / raw)
  To: netdev
  Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Neal Cardwell,
	Yuchung Cheng, Steve Ibanez, Eric Dumazet
In-Reply-To: <20180630014815.2881895-1-brakmo@fb.com>

DCTCP depends on the CA_EVENT_NON_DELAYED_ACK and CA_EVENT_DELAYED_ACK
notifications to keep track if it needs to send an ACK for packets that
were received with a particular ECN state but whose ACK was delayed.

Under some circumstances, for example when a delayed ACK is sent with a
data packet, DCTCP state was not being updated due to a lack of
notification that the previously delayed ACK was sent. As a result, it
would sometimes send a duplicate ACK when a new data packet arrived.

This patch insures that DCTCP's state is correctly updated so it will
not send the duplicate ACK.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 net/ipv4/tcp_output.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f8f6129160dd..41f6ad7a21e4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -172,6 +172,8 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
 			__sock_put(sk);
 	}
 	tcp_dec_quickack_mode(sk, pkts);
+	if (inet_csk_ack_scheduled(sk))
+		tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
 	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
 }
 
-- 
2.17.1

^ permalink raw reply related

* Re: [RFC v2 PATCH 1/4] eBPF: Add new eBPF prog type BPF_PROG_TYPE_SOCKET_SG_FILTER
From: Tushar Dave @ 2018-06-30  0:46 UTC (permalink / raw)
  To: Daniel Borkmann, ast, davem, jakub.kicinski, quentin.monnet,
	jiong.wang, guro, sandipan, john.fastabend, kafai, rdna, brakmo,
	netdev, acme, sowmini.varadhan
In-Reply-To: <9795dd0e-c5d8-2970-bef3-9cca6000d68a@iogearbox.net>



On 06/29/2018 01:27 AM, Daniel Borkmann wrote:
> On 06/19/2018 08:00 PM, Tushar Dave wrote:
>> Add new eBPF prog type BPF_PROG_TYPE_SOCKET_SG_FILTER which uses the
>> existing socket filter infrastructure for bpf program attach and load.
>> SOCKET_SG_FILTER eBPF program receives struct scatterlist as bpf context
>> contrast to SOCKET_FILTER which deals with struct skb. This is useful
>> for kernel entities that don't have skb to represent packet data but
>> want to run eBPF socket filter on packet data that is in form of struct
>> scatterlist e.g. IB/RDMA
>>
>> Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
>> Acked-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
>> ---
>>   include/linux/bpf_types.h      |  1 +
>>   include/linux/filter.h         |  8 +++++
>>   include/uapi/linux/bpf.h       |  7 ++++
>>   kernel/bpf/syscall.c           |  1 +
>>   kernel/bpf/verifier.c          |  1 +
>>   net/core/filter.c              | 77 ++++++++++++++++++++++++++++++++++++++++--
>>   samples/bpf/bpf_load.c         | 11 ++++--
>>   tools/bpf/bpftool/prog.c       |  1 +
>>   tools/include/uapi/linux/bpf.h |  7 ++++
>>   tools/lib/bpf/libbpf.c         |  3 ++
>>   tools/lib/bpf/libbpf.h         |  2 ++
>>   11 files changed, 114 insertions(+), 5 deletions(-)
>>
> [...]
>>   
>> +static bool socksg_filter_is_valid_access(int off, int size,
>> +					  enum bpf_access_type type,
>> +					  const struct bpf_prog *prog,
>> +					  struct bpf_insn_access_aux *info)
>> +{
>> +	switch (off) {
>> +	case offsetof(struct sg_filter_md, data):
>> +		info->reg_type = PTR_TO_PACKET;
>> +		break;
>> +	case offsetof(struct sg_filter_md, data_end):
>> +		info->reg_type = PTR_TO_PACKET_END;
>> +		break;
>> +	}
>> +
>> +	if (off < 0 || off >= sizeof(struct sg_filter_md))
>> +		return false;
>> +	if (off % size != 0)
>> +		return false;
>> +	if (size != sizeof(__u64))
>> +		return false;
>> +
>> +	return true;
>> +}
> 
> Just a note, don't know much about rds, but when you make this writeable for
> rds/tcp you definitely must make sure that it can be handled properly in there,
> meaning when program rewrites packet data that this data is private to the BPF
> prog (to avoid races/corruption) and that the rewritten data is correctly handled
> from there.

Sure thing. When I add something like bpf_sg_store_bytes(), I will make
sure to take care of rewrites.

Thanks.

-Tushar
> 

^ permalink raw reply

* [PATCH net-next 13/13] selftests: mlxsw: Add scale test for resources
From: Petr Machata @ 2018-06-30  0:53 UTC (permalink / raw)
  To: netdev, linux-kselftest; +Cc: jiri, idosch, shuah, davem
In-Reply-To: <cover.1530319109.git.petrm@mellanox.com>

From: Yuval Mintz <yuvalm@mellanox.com>

Add a scale test capable of validating that offloaded network
functionality is indeed functional at scale when configured to
the different KVD profiles available.

Start by testing offloaded routes are functional at scale by
passing traffic on each one of them in turn.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Signed-off-by: Petr Machata <petrm@mellanox.com>
---
 .../drivers/net/mlxsw/spectrum/resource_scale.sh   | 55 ++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100755 tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh

diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh
new file mode 100755
index 000000000000..a0a80e1a69e8
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+NUM_NETIFS=6
+source ../../../../net/forwarding/lib.sh
+source ../../../../net/forwarding/tc_common.sh
+source devlink_lib_spectrum.sh
+
+current_test=""
+
+cleanup()
+{
+	pre_cleanup
+	if [ ! -z $current_test ]; then
+		${current_test}_cleanup
+	fi
+	devlink_sp_size_kvd_to_default
+}
+
+devlink_sp_read_kvd_defaults
+trap cleanup EXIT
+
+ALL_TESTS="router tc_flower mirror_gre"
+for current_test in ${TESTS:-$ALL_TESTS}; do
+	source ${current_test}_scale.sh
+
+	num_netifs_var=${current_test^^}_NUM_NETIFS
+	num_netifs=${!num_netifs_var:-$NUM_NETIFS}
+
+	for profile in $KVD_PROFILES; do
+		RET=0
+		devlink_sp_resource_kvd_profile_set $profile
+		if [[ $RET -gt 0 ]]; then
+			log_test "'$current_test' [$profile] setting"
+			continue
+		fi
+
+		for should_fail in 0 1; do
+			RET=0
+			target=$(${current_test}_get_target "$should_fail")
+			${current_test}_setup_prepare
+			setup_wait $num_netifs
+			${current_test}_test "$target" "$should_fail"
+			${current_test}_cleanup
+			if [[ "$should_fail" -eq 0 ]]; then
+				log_test "'$current_test' [$profile] $target"
+			else
+				log_test "'$current_test' [$profile] overflow $target"
+			fi
+		done
+	done
+done
+current_test=""
+
+exit "$RET"
-- 
2.4.11

^ permalink raw reply related

* [PATCH net-next 12/13] selftests: mlxsw: Add target for mirror-to-gretap test on spectrum
From: Petr Machata @ 2018-06-30  0:53 UTC (permalink / raw)
  To: netdev, linux-kselftest; +Cc: jiri, idosch, shuah, davem
In-Reply-To: <cover.1530319109.git.petrm@mellanox.com>

Add a wrapper around mlxsw/mirror_gre_scale.sh that parameterized number
of offloadable mirrors on Spectrum machines.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
---
 .../drivers/net/mlxsw/spectrum/mirror_gre_scale.sh          | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 tools/testing/selftests/drivers/net/mlxsw/spectrum/mirror_gre_scale.sh

diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/mirror_gre_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/mirror_gre_scale.sh
new file mode 100644
index 000000000000..8d2186c7c62b
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/mirror_gre_scale.sh
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
+source ../mirror_gre_scale.sh
+
+mirror_gre_get_target()
+{
+	local should_fail=$1; shift
+
+	if ((! should_fail)); then
+		echo 3
+	else
+		echo 4
+	fi
+}
-- 
2.4.11

^ permalink raw reply related

* [PATCH net-next 11/13] selftests: mlxsw: Add scale test for mirror-to-gretap
From: Petr Machata @ 2018-06-30  0:52 UTC (permalink / raw)
  To: netdev, linux-kselftest; +Cc: jiri, idosch, shuah, davem
In-Reply-To: <cover.1530319109.git.petrm@mellanox.com>

Test that it's possible to offload a given number of mirrors.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
---
 .../drivers/net/mlxsw/mirror_gre_scale.sh          | 197 +++++++++++++++++++++
 1 file changed, 197 insertions(+)
 create mode 100644 tools/testing/selftests/drivers/net/mlxsw/mirror_gre_scale.sh

diff --git a/tools/testing/selftests/drivers/net/mlxsw/mirror_gre_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/mirror_gre_scale.sh
new file mode 100644
index 000000000000..6f3a70df63bc
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/mirror_gre_scale.sh
@@ -0,0 +1,197 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# Test offloading a number of mirrors-to-gretap. The test creates a number of
+# tunnels. Then it adds one flower mirror for each of the tunnels, matching a
+# given host IP. Then it generates traffic at each of the host IPs and checks
+# that the traffic has been mirrored at the appropriate tunnel.
+#
+#   +--------------------------+                   +--------------------------+
+#   | H1                       |                   |                       H2 |
+#   |     + $h1                |                   |                $h2 +     |
+#   |     | 2001:db8:1:X::1/64 |                   | 2001:db8:1:X::2/64 |     |
+#   +-----|--------------------+                   +--------------------|-----+
+#         |                                                             |
+#   +-----|-------------------------------------------------------------|-----+
+#   | SW  o--> mirrors                                                  |     |
+#   | +---|-------------------------------------------------------------|---+ |
+#   | |   + $swp1                    BR                           $swp2 +   | |
+#   | +---------------------------------------------------------------------+ |
+#   |                                                                         |
+#   |     + $swp3                          + gt6-<X> (ip6gretap)              |
+#   |     | 2001:db8:2:X::1/64             : loc=2001:db8:2:X::1              |
+#   |     |                                : rem=2001:db8:2:X::2              |
+#   |     |                                : ttl=100                          |
+#   |     |                                : tos=inherit                      |
+#   |     |                                :                                  |
+#   +-----|--------------------------------:----------------------------------+
+#         |                                :
+#   +-----|--------------------------------:----------------------------------+
+#   | H3  + $h3                            + h3-gt6-<X> (ip6gretap)           |
+#   |       2001:db8:2:X::2/64               loc=2001:db8:2:X::2              |
+#   |                                        rem=2001:db8:2:X::1              |
+#   |                                        ttl=100                          |
+#   |                                        tos=inherit                      |
+#   |                                                                         |
+#   +-------------------------------------------------------------------------+
+
+source ../../../../net/forwarding/mirror_lib.sh
+
+MIRROR_NUM_NETIFS=6
+
+mirror_gre_ipv6_addr()
+{
+	local net=$1; shift
+	local num=$1; shift
+
+	printf "2001:db8:%x:%x" $net $num
+}
+
+mirror_gre_tunnels_create()
+{
+	local count=$1; shift
+	local should_fail=$1; shift
+
+	MIRROR_GRE_BATCH_FILE="$(mktemp)"
+	for ((i=0; i < count; ++i)); do
+		local match_dip=$(mirror_gre_ipv6_addr 1 $i)::2
+		local htun=h3-gt6-$i
+		local tun=gt6-$i
+
+		((mirror_gre_tunnels++))
+
+		ip address add dev $h1 $(mirror_gre_ipv6_addr 1 $i)::1/64
+		ip address add dev $h2 $(mirror_gre_ipv6_addr 1 $i)::2/64
+
+		ip address add dev $swp3 $(mirror_gre_ipv6_addr 2 $i)::1/64
+		ip address add dev $h3 $(mirror_gre_ipv6_addr 2 $i)::2/64
+
+		tunnel_create $tun ip6gretap \
+			      $(mirror_gre_ipv6_addr 2 $i)::1 \
+			      $(mirror_gre_ipv6_addr 2 $i)::2 \
+			      ttl 100 tos inherit allow-localremote
+
+		tunnel_create $htun ip6gretap \
+			      $(mirror_gre_ipv6_addr 2 $i)::2 \
+			      $(mirror_gre_ipv6_addr 2 $i)::1
+		ip link set $htun vrf v$h3
+		matchall_sink_create $htun
+
+		cat >> $MIRROR_GRE_BATCH_FILE <<-EOF
+			filter add dev $swp1 ingress pref 1000 \
+				protocol ipv6 \
+				flower $tcflags dst_ip $match_dip \
+				action mirred egress mirror dev $tun
+		EOF
+	done
+
+	tc -b $MIRROR_GRE_BATCH_FILE
+	check_err_fail $should_fail $? "Mirror rule insertion"
+}
+
+mirror_gre_tunnels_destroy()
+{
+	local count=$1; shift
+
+	for ((i=0; i < count; ++i)); do
+		local htun=h3-gt6-$i
+		local tun=gt6-$i
+
+		ip address del dev $h3 $(mirror_gre_ipv6_addr 2 $i)::2/64
+		ip address del dev $swp3 $(mirror_gre_ipv6_addr 2 $i)::1/64
+
+		ip address del dev $h2 $(mirror_gre_ipv6_addr 1 $i)::2/64
+		ip address del dev $h1 $(mirror_gre_ipv6_addr 1 $i)::1/64
+
+		tunnel_destroy $htun
+		tunnel_destroy $tun
+	done
+}
+
+__mirror_gre_test()
+{
+	local count=$1; shift
+	local should_fail=$1; shift
+
+	mirror_gre_tunnels_create $count $should_fail
+	if ((should_fail)); then
+	    return
+	fi
+
+	sleep 5
+
+	for ((i = 0; i < count; ++i)); do
+		local dip=$(mirror_gre_ipv6_addr 1 $i)::2
+		local htun=h3-gt6-$i
+		local message
+
+		icmp6_capture_install $htun
+		mirror_test v$h1 "" $dip $htun 100 10
+		icmp6_capture_uninstall $htun
+	done
+}
+
+mirror_gre_test()
+{
+	local count=$1; shift
+	local should_fail=$1; shift
+
+	if ! tc_offload_check $TC_FLOWER_NUM_NETIFS; then
+		check_err 1 "Could not test offloaded functionality"
+		return
+	fi
+
+	tcflags="skip_sw"
+	__mirror_gre_test $count $should_fail
+}
+
+mirror_gre_setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	swp2=${NETIFS[p3]}
+	h2=${NETIFS[p4]}
+
+	swp3=${NETIFS[p5]}
+	h3=${NETIFS[p6]}
+
+	mirror_gre_tunnels=0
+
+	vrf_prepare
+
+	simple_if_init $h1
+	simple_if_init $h2
+	simple_if_init $h3
+
+	ip link add name br1 type bridge vlan_filtering 1
+	ip link set dev br1 up
+
+	ip link set dev $swp1 master br1
+	ip link set dev $swp1 up
+	tc qdisc add dev $swp1 clsact
+
+	ip link set dev $swp2 master br1
+	ip link set dev $swp2 up
+
+	ip link set dev $swp3 up
+}
+
+mirror_gre_cleanup()
+{
+	mirror_gre_tunnels_destroy $mirror_gre_tunnels
+
+	ip link set dev $swp3 down
+
+	ip link set dev $swp2 down
+
+	tc qdisc del dev $swp1 clsact
+	ip link set dev $swp1 down
+
+	ip link del dev br1
+
+	simple_if_fini $h3
+	simple_if_fini $h2
+	simple_if_fini $h1
+
+	vrf_cleanup
+}
-- 
2.4.11

^ permalink raw reply related

* [PATCH net-next 10/13] selftests: mlxsw: Add target for tc flower test on spectrum
From: Petr Machata @ 2018-06-30  0:51 UTC (permalink / raw)
  To: netdev, linux-kselftest; +Cc: jiri, idosch, shuah, davem
In-Reply-To: <cover.1530319109.git.petrm@mellanox.com>

Add a wrapper around mlxsw/tc_flower_scale.sh that parameterizes the
generic tc flower scale test template with Spectrum-specific target
values.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Yuval Mintz <yuvalm@mellanox.com>
---
 .../drivers/net/mlxsw/spectrum/tc_flower_scale.sh     | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 tools/testing/selftests/drivers/net/mlxsw/spectrum/tc_flower_scale.sh

diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/tc_flower_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/tc_flower_scale.sh
new file mode 100644
index 000000000000..f9bfd8937765
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/tc_flower_scale.sh
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0
+source ../tc_flower_scale.sh
+
+tc_flower_get_target()
+{
+	local should_fail=$1; shift
+
+	# 6144 (6x1024) is the theoretical maximum.
+	# One bank of 512 rules is taken by the 18-byte MC router rule.
+	# One rule is the ACL catch-all.
+	# 6144 - 512 - 1 = 5631
+	local target=5631
+
+	if ((! should_fail)); then
+		echo $target
+	else
+		echo $((target + 1))
+	fi
+}
-- 
2.4.11

^ permalink raw reply related

* [PATCH net-next 09/13] selftests: mlxsw: Add tc flower scale test
From: Petr Machata @ 2018-06-30  0:51 UTC (permalink / raw)
  To: netdev, linux-kselftest; +Cc: jiri, idosch, shuah, davem
In-Reply-To: <cover.1530319109.git.petrm@mellanox.com>

Add test of capacity to offload flower.

This is a generic portion of the test that is meant to be called from a
driver that supplies a particular number of rules to be tested with.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Yuval Mintz <yuvalm@mellanox.com>
---
 .../selftests/drivers/net/mlxsw/tc_flower_scale.sh | 134 +++++++++++++++++++++
 1 file changed, 134 insertions(+)
 create mode 100644 tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh

diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh
new file mode 100644
index 000000000000..a6d733d2a4b4
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh
@@ -0,0 +1,134 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test for resource limit of offloaded flower rules. The test adds a given
+# number of flower matches for different IPv6 addresses, then generates traffic,
+# and ensures each was hit exactly once. This file contains functions to set up
+# a testing topology and run the test, and is meant to be sourced from a test
+# script that calls the testing routine with a given number of rules.
+
+TC_FLOWER_NUM_NETIFS=2
+
+tc_flower_h1_create()
+{
+	simple_if_init $h1
+	tc qdisc add dev $h1 clsact
+}
+
+tc_flower_h1_destroy()
+{
+	tc qdisc del dev $h1 clsact
+	simple_if_fini $h1
+}
+
+tc_flower_h2_create()
+{
+	simple_if_init $h2
+	tc qdisc add dev $h2 clsact
+}
+
+tc_flower_h2_destroy()
+{
+	tc qdisc del dev $h2 clsact
+	simple_if_fini $h2
+}
+
+tc_flower_setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	h2=${NETIFS[p2]}
+
+	vrf_prepare
+
+	tc_flower_h1_create
+	tc_flower_h2_create
+}
+
+tc_flower_cleanup()
+{
+	pre_cleanup
+
+	tc_flower_h2_destroy
+	tc_flower_h1_destroy
+
+	vrf_cleanup
+
+	if [[ -v TC_FLOWER_BATCH_FILE ]]; then
+		rm -f $TC_FLOWER_BATCH_FILE
+	fi
+}
+
+tc_flower_addr()
+{
+	local num=$1; shift
+
+	printf "2001:db8:1::%x" $num
+}
+
+tc_flower_rules_create()
+{
+	local count=$1; shift
+	local should_fail=$1; shift
+
+	TC_FLOWER_BATCH_FILE="$(mktemp)"
+
+	for ((i = 0; i < count; ++i)); do
+		cat >> $TC_FLOWER_BATCH_FILE <<-EOF
+			filter add dev $h2 ingress \
+				prot ipv6 \
+				pref 1000 \
+				flower $tcflags dst_ip $(tc_flower_addr $i) \
+				action drop
+		EOF
+	done
+
+	tc -b $TC_FLOWER_BATCH_FILE
+	check_err_fail $should_fail $? "Rule insertion"
+}
+
+__tc_flower_test()
+{
+	local count=$1; shift
+	local should_fail=$1; shift
+	local last=$((count - 1))
+
+	tc_flower_rules_create $count $should_fail
+
+	for ((i = 0; i < count; ++i)); do
+		$MZ $h1 -q -c 1 -t ip -p 20 -b bc -6 \
+			-A 2001:db8:2::1 \
+			-B $(tc_flower_addr $i)
+	done
+
+	MISMATCHES=$(
+		tc -j -s filter show dev $h2 ingress |
+		jq -r '[ .[] | select(.kind == "flower") | .options |
+		         values as $rule | .actions[].stats.packets |
+		         select(. != 1) | "\(.) on \($rule.keys.dst_ip)" ] |
+		       join(", ")'
+	)
+
+	test -z "$MISMATCHES"
+	check_err $? "Expected to capture 1 packet for each IP, but got $MISMATCHES"
+}
+
+tc_flower_test()
+{
+	local count=$1; shift
+	local should_fail=$1; shift
+
+	# We use lower 16 bits of IPv6 address for match. Also there are only 16
+	# bits of rule priority space.
+	if ((count > 65536)); then
+		check_err 1 "Invalid count of $count. At most 65536 rules supported"
+		return
+	fi
+
+	if ! tc_offload_check $TC_FLOWER_NUM_NETIFS; then
+		check_err 1 "Could not test offloaded functionality"
+		return
+	fi
+
+	tcflags="skip_sw"
+	__tc_flower_test $count $should_fail
+}
-- 
2.4.11

^ permalink raw reply related

* [PATCH net-next 08/13] selftests: mlxsw: Add target for router test on spectrum
From: Petr Machata @ 2018-06-30  0:50 UTC (permalink / raw)
  To: netdev, linux-kselftest; +Cc: jiri, idosch, shuah, davem
In-Reply-To: <cover.1530319109.git.petrm@mellanox.com>

From: Yuval Mintz <yuvalm@mellanox.com>

IPv4 routes in Spectrum are based on the kvd single-hash, but as it's
a hash we need to assume we cannot reach 100% of its capacity.

Add a wrapper that provides us with good/bad target numbers for the
Spectrum ASIC.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
[petrm@mellanox.com: Drop shebang.]
Signed-off-by: Petr Machata <petrm@mellanox.com>
---
 .../drivers/net/mlxsw/spectrum/router_scale.sh         | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 tools/testing/selftests/drivers/net/mlxsw/spectrum/router_scale.sh

diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/router_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/router_scale.sh
new file mode 100644
index 000000000000..21c4697d5bab
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/router_scale.sh
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0
+source ../router_scale.sh
+
+router_get_target()
+{
+	local should_fail=$1
+	local target
+
+	target=$(devlink_resource_size_get kvd hash_single)
+
+	if [[ $should_fail -eq 0 ]]; then
+		target=$((target * 85 / 100))
+	else
+		target=$((target + 1))
+	fi
+
+	echo $target
+}
-- 
2.4.11

^ permalink raw reply related

* [PATCH net-next 07/13] selftests: mlxsw: Add router test
From: Petr Machata @ 2018-06-30  0:49 UTC (permalink / raw)
  To: netdev, linux-kselftest; +Cc: jiri, idosch, shuah, davem
In-Reply-To: <cover.1530319109.git.petrm@mellanox.com>

From: Arkadi Sharshevsky <arkadis@mellanox.com>

This test aims for both stand alone and internal usage by the resource
infra. The test receives the number routes to offload and checks:
- The routes were offloaded correctly
- Traffic for each route.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Petr Machata <petrm@mellanox.com>
---
 .../selftests/drivers/net/mlxsw/router_scale.sh    | 167 +++++++++++++++++++++
 1 file changed, 167 insertions(+)
 create mode 100644 tools/testing/selftests/drivers/net/mlxsw/router_scale.sh

diff --git a/tools/testing/selftests/drivers/net/mlxsw/router_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/router_scale.sh
new file mode 100644
index 000000000000..d231649b4f01
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/router_scale.sh
@@ -0,0 +1,167 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ROUTER_NUM_NETIFS=4
+
+router_h1_create()
+{
+	simple_if_init $h1 192.0.1.1/24
+	ip route add 193.0.0.0/8 via 192.0.1.2 dev $h1
+}
+
+router_h1_destroy()
+{
+	ip route del 193.0.0.0/8 via 192.0.1.2 dev $h1
+	simple_if_fini $h1 192.0.1.1/24
+}
+
+router_h2_create()
+{
+	simple_if_init $h2 192.0.2.1/24
+	tc qdisc add dev $h2 handle ffff: ingress
+}
+
+router_h2_destroy()
+{
+	tc qdisc del dev $h2 handle ffff: ingress
+	simple_if_fini $h2 192.0.2.1/24
+}
+
+router_create()
+{
+	ip link set dev $rp1 up
+	ip link set dev $rp2 up
+
+	ip address add 192.0.1.2/24 dev $rp1
+	ip address add 192.0.2.2/24 dev $rp2
+}
+
+router_destroy()
+{
+	ip address del 192.0.2.2/24 dev $rp2
+	ip address del 192.0.1.2/24 dev $rp1
+
+	ip link set dev $rp2 down
+	ip link set dev $rp1 down
+}
+
+router_setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	rp1=${NETIFS[p2]}
+
+	rp2=${NETIFS[p3]}
+	h2=${NETIFS[p4]}
+
+	h1mac=$(mac_get $h1)
+	rp1mac=$(mac_get $rp1)
+
+	vrf_prepare
+
+	router_h1_create
+	router_h2_create
+
+	router_create
+}
+
+router_offload_validate()
+{
+	local route_count=$1
+	local offloaded_count
+
+	offloaded_count=$(ip route | grep -o 'offload' | wc -l)
+	[[ $offloaded_count -ge $route_count ]]
+}
+
+router_routes_create()
+{
+	local route_count=$1
+	local count=0
+
+	ROUTE_FILE="$(mktemp)"
+
+	for i in {0..255}
+	do
+		for j in {0..255}
+		do
+			for k in {0..255}
+			do
+				if [[ $count -eq $route_count ]]; then
+					break 3
+				fi
+
+				echo route add 193.${i}.${j}.${k}/32 via \
+				       192.0.2.1 dev $rp2  >> $ROUTE_FILE
+				((count++))
+			done
+		done
+	done
+
+	ip -b $ROUTE_FILE &> /dev/null
+}
+
+router_routes_destroy()
+{
+	if [[ -v ROUTE_FILE ]]; then
+		rm -f $ROUTE_FILE
+	fi
+}
+
+router_test()
+{
+	local route_count=$1
+	local should_fail=$2
+	local count=0
+
+	RET=0
+
+	router_routes_create $route_count
+
+	router_offload_validate $route_count
+	check_err_fail $should_fail $? "Offload of $route_count routes"
+	if [[ $RET -ne 0 ]] || [[ $should_fail -eq 1 ]]; then
+		return
+	fi
+
+	tc filter add dev $h2 ingress protocol ip pref 1 flower \
+		skip_sw dst_ip 193.0.0.0/8 action drop
+
+	for i in {0..255}
+	do
+		for j in {0..255}
+		do
+			for k in {0..255}
+			do
+				if [[ $count -eq $route_count ]]; then
+					break 3
+				fi
+
+				$MZ $h1 -c 1 -p 64 -a $h1mac -b $rp1mac \
+					-A 192.0.1.1 -B 193.${i}.${j}.${k} \
+					-t ip -q
+				((count++))
+			done
+		done
+	done
+
+	tc_check_packets "dev $h2 ingress" 1 $route_count
+	check_err $? "Offload mismatch"
+
+	tc filter del dev $h2 ingress protocol ip pref 1 flower \
+		skip_sw dst_ip 193.0.0.0/8 action drop
+
+	router_routes_destroy
+}
+
+router_cleanup()
+{
+	pre_cleanup
+
+	router_routes_destroy
+	router_destroy
+
+	router_h2_destroy
+	router_h1_destroy
+
+	vrf_cleanup
+}
-- 
2.4.11

^ permalink raw reply related

* [PATCH net-next 06/13] selftests: mlxsw: Add devlink KVD resource test
From: Petr Machata @ 2018-06-30  0:48 UTC (permalink / raw)
  To: netdev, linux-kselftest; +Cc: jiri, idosch, shuah, davem
In-Reply-To: <cover.1530319109.git.petrm@mellanox.com>

From: Yuval Mintz <yuvalm@mellanox.com>

Add a selftest that can be used to perform basic sanity of the devlink
resource API as well as test the behavior of KVD manipulation in the
driver.

This is the first case of a HW-only test - in order to test the devlink
resource a driver capable of exposing resources has to be provided
first.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
[petrm@mellanox.com: Extracted two patches out of this patch. Tweaked
commit message.]
Signed-off-by: Petr Machata <petrm@mellanox.com>
---
 .../net/mlxsw/spectrum/devlink_resources.sh        | 117 +++++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100755 tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_resources.sh

diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_resources.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_resources.sh
new file mode 100755
index 000000000000..b1fe960e398a
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_resources.sh
@@ -0,0 +1,117 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+NUM_NETIFS=1
+source devlink_lib_spectrum.sh
+
+setup_prepare()
+{
+	devlink_sp_read_kvd_defaults
+}
+
+cleanup()
+{
+	pre_cleanup
+	devlink_sp_size_kvd_to_default
+}
+
+trap cleanup EXIT
+
+setup_prepare
+
+profiles_test()
+{
+	local i
+
+	log_info "Running profile tests"
+
+	for i in $KVD_PROFILES; do
+		RET=0
+		devlink_sp_resource_kvd_profile_set $i
+		log_test "'$i' profile"
+	done
+
+	# Default is explicitly tested at end to ensure it's actually applied
+	RET=0
+	devlink_sp_resource_kvd_profile_set "default"
+	log_test "'default' profile"
+}
+
+resources_min_test()
+{
+	local size
+	local i
+	local j
+
+	log_info "Running KVD-minimum tests"
+
+	for i in $KVD_CHILDREN; do
+		RET=0
+		size=$(devlink_resource_get kvd "$i" | jq '.["size_min"]')
+		devlink_resource_size_set "$size" kvd "$i"
+
+		# In case of linear, need to minimize sub-resources as well
+		if [[ "$i" == "linear" ]]; then
+			for j in $KVDL_CHILDREN; do
+				devlink_resource_size_set 0 kvd linear "$j"
+			done
+		fi
+
+		devlink_reload
+		devlink_sp_size_kvd_to_default
+		log_test "'$i' minimize [$size]"
+	done
+}
+
+resources_max_test()
+{
+	local min_size
+	local size
+	local i
+	local j
+
+	log_info "Running KVD-maximum tests"
+	for i in $KVD_CHILDREN; do
+		RET=0
+		devlink_sp_resource_minimize
+
+		# Calculate the maximum possible size for the given partition
+		size=$(devlink_resource_size_get kvd)
+		for j in $KVD_CHILDREN; do
+			if [ "$i" != "$j" ]; then
+				min_size=$(devlink_resource_get kvd "$j" | \
+					   jq '.["size_min"]')
+				size=$((size - min_size))
+			fi
+		done
+
+		# Test almost maximum size
+		devlink_resource_size_set "$((size - 128))" kvd "$i"
+		devlink_reload
+		log_test "'$i' almost maximize [$((size - 128))]"
+
+		# Test above maximum size
+		devlink resource set "$DEVLINK_DEV" \
+			path "kvd/$i" size $((size + 128)) &> /dev/null
+		check_fail $? "Set kvd/$i to size $((size + 128)) should fail"
+		log_test "'$i' Overflow rejection [$((size + 128))]"
+
+		# Test maximum size
+		if [ "$i" == "hash_single" ] || [ "$i" == "hash_double" ]; then
+			echo "SKIP: Observed problem with exact max $i"
+			continue
+		fi
+
+		devlink_resource_size_set "$size" kvd "$i"
+		devlink_reload
+		log_test "'$i' maximize [$size]"
+
+		devlink_sp_size_kvd_to_default
+	done
+}
+
+profiles_test
+resources_min_test
+resources_max_test
+
+exit "$RET"
-- 
2.4.11

^ permalink raw reply related

* [PATCH net-next 05/13] selftests: mlxsw: Add devlink_lib_spectrum.sh
From: Petr Machata @ 2018-06-30  0:48 UTC (permalink / raw)
  To: netdev, linux-kselftest; +Cc: jiri, idosch, shuah, davem
In-Reply-To: <cover.1530319109.git.petrm@mellanox.com>

This library builds on top of devlink_lib.sh and contains functionality
specific to Spectrum ASICs, e.g., re-partitioning the various KVD
sub-parts.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
[petrm@mellanox.com: Split this out from another patch. Fix line length
in devlink_sp_read_kvd_defaults().]
Signed-off-by: Petr Machata <petrm@mellanox.com>
---
 MAINTAINERS                                        |   1 +
 .../net/mlxsw/spectrum/devlink_lib_spectrum.sh     | 119 +++++++++++++++++++++
 2 files changed, 120 insertions(+)
 create mode 100644 tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_lib_spectrum.sh

diff --git a/MAINTAINERS b/MAINTAINERS
index f3e766e504d8..f3dd6c4f61e1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9154,6 +9154,7 @@ S:	Supported
 W:	http://www.mellanox.com
 Q:	http://patchwork.ozlabs.org/project/netdev/list/
 F:	drivers/net/ethernet/mellanox/mlxsw/
+F:	tools/testing/selftests/drivers/net/mlxsw/
 
 MELLANOX FIRMWARE FLASH LIBRARY (mlxfw)
 M:	mlxsw@mellanox.com
diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_lib_spectrum.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_lib_spectrum.sh
new file mode 100644
index 000000000000..73035e25085d
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_lib_spectrum.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source "../../../../net/forwarding/devlink_lib.sh"
+
+if [ "$DEVLINK_VIDDID" != "15b3:cb84" ]; then
+	echo "SKIP: test is tailored for Mellanox Spectrum"
+	exit 1
+fi
+
+# Needed for returning to default
+declare -A KVD_DEFAULTS
+
+KVD_CHILDREN="linear hash_single hash_double"
+KVDL_CHILDREN="singles chunks large_chunks"
+
+devlink_sp_resource_minimize()
+{
+	local size
+	local i
+
+	for i in $KVD_CHILDREN; do
+		size=$(devlink_resource_get kvd "$i" | jq '.["size_min"]')
+		devlink_resource_size_set "$size" kvd "$i"
+	done
+
+	for i in $KVDL_CHILDREN; do
+		size=$(devlink_resource_get kvd linear "$i" | \
+		       jq '.["size_min"]')
+		devlink_resource_size_set "$size" kvd linear "$i"
+	done
+}
+
+devlink_sp_size_kvd_to_default()
+{
+	local need_reload=0
+	local i
+
+	for i in $KVD_CHILDREN; do
+		local size=$(echo "${KVD_DEFAULTS[kvd_$i]}" | jq '.["size"]')
+		current_size=$(devlink_resource_size_get kvd "$i")
+
+		if [ "$size" -ne "$current_size" ]; then
+			devlink_resource_size_set "$size" kvd "$i"
+			need_reload=1
+		fi
+	done
+
+	for i in $KVDL_CHILDREN; do
+		local size=$(echo "${KVD_DEFAULTS[kvd_linear_$i]}" | \
+			     jq '.["size"]')
+		current_size=$(devlink_resource_size_get kvd linear "$i")
+
+		if [ "$size" -ne "$current_size" ]; then
+			devlink_resource_size_set "$size" kvd linear "$i"
+			need_reload=1
+		fi
+	done
+
+	if [ "$need_reload" -ne "0" ]; then
+		devlink_reload
+	fi
+}
+
+devlink_sp_read_kvd_defaults()
+{
+	local key
+	local i
+
+	KVD_DEFAULTS[kvd]=$(devlink_resource_get "kvd")
+	for i in $KVD_CHILDREN; do
+		key=kvd_$i
+		KVD_DEFAULTS[$key]=$(devlink_resource_get kvd "$i")
+	done
+
+	for i in $KVDL_CHILDREN; do
+		key=kvd_linear_$i
+		KVD_DEFAULTS[$key]=$(devlink_resource_get kvd linear "$i")
+	done
+}
+
+KVD_PROFILES="default scale ipv4_max"
+
+devlink_sp_resource_kvd_profile_set()
+{
+	local profile=$1
+
+	case "$profile" in
+	scale)
+		devlink_resource_size_set 64000 kvd linear
+		devlink_resource_size_set 15616 kvd linear singles
+		devlink_resource_size_set 32000 kvd linear chunks
+		devlink_resource_size_set 16384 kvd linear large_chunks
+		devlink_resource_size_set 128000 kvd hash_single
+		devlink_resource_size_set 48000 kvd hash_double
+		devlink_reload
+		;;
+	ipv4_max)
+		devlink_resource_size_set 64000 kvd linear
+		devlink_resource_size_set 15616 kvd linear singles
+		devlink_resource_size_set 32000 kvd linear chunks
+		devlink_resource_size_set 16384 kvd linear large_chunks
+		devlink_resource_size_set 144000 kvd hash_single
+		devlink_resource_size_set 32768 kvd hash_double
+		devlink_reload
+		;;
+	default)
+		devlink_resource_size_set 98304 kvd linear
+		devlink_resource_size_set 16384 kvd linear singles
+		devlink_resource_size_set 49152 kvd linear chunks
+		devlink_resource_size_set 32768 kvd linear large_chunks
+		devlink_resource_size_set 87040 kvd hash_single
+		devlink_resource_size_set 60416 kvd hash_double
+		devlink_reload
+		;;
+	*)
+		check_err 1 "Unknown profile $profile"
+	esac
+}
-- 
2.4.11

^ permalink raw reply related

* [PATCH net-next 04/13] selftests: forwarding: Add devlink_lib.sh
From: Petr Machata @ 2018-06-30  0:47 UTC (permalink / raw)
  To: netdev, linux-kselftest; +Cc: jiri, idosch, shuah, davem
In-Reply-To: <cover.1530319109.git.petrm@mellanox.com>

This helper library contains wrappers to devlink functionality agnostic
to the underlying device.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
[petrm@mellanox.com: Split this out from another patch.]
Signed-off-by: Petr Machata <petrm@mellanox.com>
---
 .../selftests/net/forwarding/devlink_lib.sh        | 108 +++++++++++++++++++++
 1 file changed, 108 insertions(+)
 create mode 100644 tools/testing/selftests/net/forwarding/devlink_lib.sh

diff --git a/tools/testing/selftests/net/forwarding/devlink_lib.sh b/tools/testing/selftests/net/forwarding/devlink_lib.sh
new file mode 100644
index 000000000000..5ab1e5f43022
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/devlink_lib.sh
@@ -0,0 +1,108 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+##############################################################################
+# Source library
+
+relative_path="${BASH_SOURCE%/*}"
+if [[ "$relative_path" == "${BASH_SOURCE}" ]]; then
+	relative_path="."
+fi
+
+source "$relative_path/lib.sh"
+
+##############################################################################
+# Defines
+
+DEVLINK_DEV=$(devlink port show | grep "${NETIFS[p1]}" | \
+	      grep -v "${NETIFS[p1]}[0-9]" | cut -d" " -f1 | \
+	      rev | cut -d"/" -f2- | rev)
+if [ -z "$DEVLINK_DEV" ]; then
+	echo "SKIP: ${NETIFS[p1]} has no devlink device registered for it"
+	exit 1
+fi
+if [[ "$(echo $DEVLINK_DEV | grep -c pci)" -eq 0 ]]; then
+	echo "SKIP: devlink device's bus is not PCI"
+	exit 1
+fi
+
+DEVLINK_VIDDID=$(lspci -s $(echo $DEVLINK_DEV | cut -d"/" -f2) \
+		 -n | cut -d" " -f3)
+
+##############################################################################
+# Sanity checks
+
+devlink -j resource show "$DEVLINK_DEV" &> /dev/null
+if [ $? -ne 0 ]; then
+	echo "SKIP: iproute2 too old, missing devlink resource support"
+	exit 1
+fi
+
+##############################################################################
+# Devlink helpers
+
+devlink_resource_names_to_path()
+{
+	local resource
+	local path=""
+
+	for resource in "${@}"; do
+		if [ "$path" == "" ]; then
+			path="$resource"
+		else
+			path="${path}/$resource"
+		fi
+	done
+
+	echo "$path"
+}
+
+devlink_resource_get()
+{
+	local name=$1
+	local resource_name=.[][\"$DEVLINK_DEV\"]
+
+	resource_name="$resource_name | .[] | select (.name == \"$name\")"
+
+	shift
+	for resource in "${@}"; do
+		resource_name="${resource_name} | .[\"resources\"][] | \
+			       select (.name == \"$resource\")"
+	done
+
+	devlink -j resource show "$DEVLINK_DEV" | jq "$resource_name"
+}
+
+devlink_resource_size_get()
+{
+	local size=$(devlink_resource_get "$@" | jq '.["size_new"]')
+
+	if [ "$size" == "null" ]; then
+		devlink_resource_get "$@" | jq '.["size"]'
+	else
+		echo "$size"
+	fi
+}
+
+devlink_resource_size_set()
+{
+	local new_size=$1
+	local path
+
+	shift
+	path=$(devlink_resource_names_to_path "$@")
+	devlink resource set "$DEVLINK_DEV" path "$path" size "$new_size"
+	check_err $? "Failed setting path $path to size $size"
+}
+
+devlink_reload()
+{
+	local still_pending
+
+	devlink dev reload "$DEVLINK_DEV" &> /dev/null
+	check_err $? "Failed reload"
+
+	still_pending=$(devlink resource show "$DEVLINK_DEV" | \
+			grep -c "size_new")
+	check_err $still_pending "Failed reload - There are still unset sizes"
+}
-- 
2.4.11

^ permalink raw reply related

* [PATCH net-next 03/13] selftests: forwarding: lib: Parameterize NUM_NETIFS in two functions
From: Petr Machata @ 2018-06-30  0:46 UTC (permalink / raw)
  To: netdev, linux-kselftest; +Cc: jiri, idosch, shuah, davem
In-Reply-To: <cover.1530319109.git.petrm@mellanox.com>

setup_wait() and tc_offload_check() both assume that all NUM_NETIFS
interfaces are relevant for a given test. However, the scale test script
acts as an umbrella for a number of sub-tests, some of which may not
require all the interfaces.

Thus it's suboptimal for tc_offload_check() to query all the interfaces.
In case of setup_wait() it's incorrect, because the sub-test in question
of course doesn't configure any interfaces beyond what it needs, and
setup_wait() then ends up waiting indefinitely for the extraneous
interfaces to come up.

For that reason, give setup_wait() and tc_offload_check() an optional
parameter with a number of interfaces to probe. Fall back to global
NUM_NETIFS if the parameter is not given.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Yuval Mintz <yuvalm@mellanox.com>
---
 tools/testing/selftests/net/forwarding/lib.sh | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 5f4b7ed2c65a..e073918bbe15 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -220,7 +220,9 @@ setup_wait_dev()
 
 setup_wait()
 {
-	for i in $(eval echo {1..$NUM_NETIFS}); do
+	local num_netifs=${1:-$NUM_NETIFS}
+
+	for ((i = 1; i <= num_netifs; ++i)); do
 		setup_wait_dev ${NETIFS[p$i]}
 	done
 
@@ -481,7 +483,9 @@ forwarding_restore()
 
 tc_offload_check()
 {
-	for i in $(eval echo {1..$NUM_NETIFS}); do
+	local num_netifs=${1:-$NUM_NETIFS}
+
+	for ((i = 1; i <= num_netifs; ++i)); do
 		ethtool -k ${NETIFS[p$i]} \
 			| grep "hw-tc-offload: on" &> /dev/null
 		if [[ $? -ne 0 ]]; then
-- 
2.4.11

^ permalink raw reply related

* [PATCH net-next 02/13] selftests: forwarding: lib: Add check_err_fail()
From: Petr Machata @ 2018-06-30  0:45 UTC (permalink / raw)
  To: netdev, linux-kselftest; +Cc: jiri, idosch, shuah, davem
In-Reply-To: <cover.1530319109.git.petrm@mellanox.com>

In the scale testing scenarios, one usually has a condition that is
expected to either fail, or pass, depending on which side of the scale
is being tested.

To capture this logic, add a function check_err_fail(), which dispatches
either to check_err() or check_fail(), depending on the value of the
first argument, should_fail.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Yuval Mintz <yuvalm@mellanox.com>
---
 tools/testing/selftests/net/forwarding/lib.sh | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 59272824ef37..5f4b7ed2c65a 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -156,6 +156,19 @@ check_fail()
 	fi
 }
 
+check_err_fail()
+{
+	local should_fail=$1; shift
+	local err=$1; shift
+	local what=$1; shift
+
+	if ((should_fail)); then
+		check_fail $err "$what succeeded, but should have failed"
+	else
+		check_err $err "$what failed"
+	fi
+}
+
 log_test()
 {
 	local test_name=$1
-- 
2.4.11

^ permalink raw reply related

* [PATCH net-next 01/13] selftests: forwarding: Allow lib.sh sourcing from other directories
From: Petr Machata @ 2018-06-30  0:45 UTC (permalink / raw)
  To: netdev, linux-kselftest; +Cc: jiri, idosch, shuah, davem
In-Reply-To: <cover.1530319109.git.petrm@mellanox.com>

From: Yuval Mintz <yuvalm@mellanox.com>

The devlink related scripts are mlxsw-specific. As a result, they'll
reside in a different directory - but would still need the common logic
implemented in lib.sh.
So as a preliminary step, allow lib.sh to be sourced from other
directories as well.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Signed-off-by: Petr Machata <petrm@mellanox.com>
---
 tools/testing/selftests/net/forwarding/lib.sh | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index d1f14f83979e..59272824ef37 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -14,8 +14,13 @@ PAUSE_ON_CLEANUP=${PAUSE_ON_CLEANUP:=no}
 NETIF_TYPE=${NETIF_TYPE:=veth}
 NETIF_CREATE=${NETIF_CREATE:=yes}
 
-if [[ -f forwarding.config ]]; then
-	source forwarding.config
+relative_path="${BASH_SOURCE%/*}"
+if [[ "$relative_path" == "${BASH_SOURCE}" ]]; then
+	relative_path="."
+fi
+
+if [[ -f $relative_path/forwarding.config ]]; then
+	source "$relative_path/forwarding.config"
 fi
 
 ##############################################################################
-- 
2.4.11

^ permalink raw reply related

* [PATCH net-next 00/13] mlxsw: Add resource scale tests
From: Petr Machata @ 2018-06-30  0:44 UTC (permalink / raw)
  To: netdev, linux-kselftest; +Cc: jiri, idosch, shuah, davem

There are a number of tests that check features of the Linux networking
stack. By running them on suitable interfaces, one can exercise the
mlxsw offloading code. However none of these tests attempts to push
mlxsw to the limits supported by the ASIC.

As an additional wrinkle, the "limits supported by the ASIC" themselves
may not be a set of fixed numbers, but rather depend on a profile that
determines how the ASIC resources are allocated for different purposes.

This patchset introduces several tests that verify capability of mlxsw
to offload amounts of routes, flower rules, and mirroring sessions that
match predicted ASIC capacity, at different configuration profiles.
Additionally they verify that amounts exceeding the predicted capacity
can *not* be offloaded.

These are not generic tests, but ones that are tailored for mlxsw
specifically. For that reason they are not added to net/forwarding
selftests subdirectory, but rather to a newly-added drivers/net/mlxsw.

Patches #1, #2 and #3 tweak the generic forwarding/lib.sh to support the
new additions.

In patches #4 and #5, new libraries for interfacing with devlink are
introduced, first a generic one, then a Spectrum-specific one.

In patch #6, a devlink resource test is introduced.

Patches #7 and #8, #9 and #10, and #11 and #12 introduce three scale
tests: router, flower and mirror-to-gretap. The first of each pair of
patches introduces a generic portion of the test (mlxsw-specific), the
second introduces a Spectrum-specific wrapper.

Patch #13 then introduces a scale test driver that runs (possibly a
subset of) the tests introduced by patches from previous paragraph.

Arkadi Sharshevsky (1):
  selftests: mlxsw: Add router test

Petr Machata (8):
  selftests: forwarding: lib: Add check_err_fail()
  selftests: forwarding: lib: Parameterize NUM_NETIFS in two functions
  selftests: forwarding: Add devlink_lib.sh
  selftests: mlxsw: Add devlink_lib_spectrum.sh
  selftests: mlxsw: Add tc flower scale test
  selftests: mlxsw: Add target for tc flower test on spectrum
  selftests: mlxsw: Add scale test for mirror-to-gretap
  selftests: mlxsw: Add target for mirror-to-gretap test on spectrum

Yuval Mintz (4):
  selftests: forwarding: Allow lib.sh sourcing from other directories
  selftests: mlxsw: Add devlink KVD resource test
  selftests: mlxsw: Add target for router test on spectrum
  selftests: mlxsw: Add scale test for resources

 MAINTAINERS                                        |   1 +
 .../drivers/net/mlxsw/mirror_gre_scale.sh          | 197 +++++++++++++++++++++
 .../selftests/drivers/net/mlxsw/router_scale.sh    | 167 +++++++++++++++++
 .../net/mlxsw/spectrum/devlink_lib_spectrum.sh     | 119 +++++++++++++
 .../net/mlxsw/spectrum/devlink_resources.sh        | 117 ++++++++++++
 .../drivers/net/mlxsw/spectrum/mirror_gre_scale.sh |  13 ++
 .../drivers/net/mlxsw/spectrum/resource_scale.sh   |  55 ++++++
 .../drivers/net/mlxsw/spectrum/router_scale.sh     |  18 ++
 .../drivers/net/mlxsw/spectrum/tc_flower_scale.sh  |  19 ++
 .../selftests/drivers/net/mlxsw/tc_flower_scale.sh | 134 ++++++++++++++
 .../selftests/net/forwarding/devlink_lib.sh        | 108 +++++++++++
 tools/testing/selftests/net/forwarding/lib.sh      |  30 +++-
 12 files changed, 974 insertions(+), 4 deletions(-)
 create mode 100644 tools/testing/selftests/drivers/net/mlxsw/mirror_gre_scale.sh
 create mode 100644 tools/testing/selftests/drivers/net/mlxsw/router_scale.sh
 create mode 100644 tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_lib_spectrum.sh
 create mode 100755 tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_resources.sh
 create mode 100644 tools/testing/selftests/drivers/net/mlxsw/spectrum/mirror_gre_scale.sh
 create mode 100755 tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh
 create mode 100644 tools/testing/selftests/drivers/net/mlxsw/spectrum/router_scale.sh
 create mode 100644 tools/testing/selftests/drivers/net/mlxsw/spectrum/tc_flower_scale.sh
 create mode 100644 tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh
 create mode 100644 tools/testing/selftests/net/forwarding/devlink_lib.sh

-- 
2.4.11

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox