Linux-HyperV List
 help / color / mirror / Atom feed
* [PATCH v2] hv_sock: Add support for delayed close
From: Sunil Muthuswamy @ 2019-05-15  0:56 UTC (permalink / raw)
  To: KY Srinivasan, Haiyang Zhang, Stephen Hemminger, Sasha Levin,
	David S. Miller, Dexuan Cui, Michael Kelley
  Cc: netdev@vger.kernel.org, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org

Currently, hvsock does not implement any delayed or background close
logic. Whenever the hvsock socket is closed, a FIN is sent to the peer, and
the last reference to the socket is dropped, which leads to a call to
.destruct where the socket can hang indefinitely waiting for the peer to
close it's side. The can cause the user application to hang in the close()
call.

This change implements proper STREAM(TCP) closing handshake mechanism by
sending the FIN to the peer and the waiting for the peer's FIN to arrive
for a given timeout. On timeout, it will try to terminate the connection
(i.e. a RST). This is in-line with other socket providers such as virtio.

This change does not address the hang in the vmbus_hvsock_device_unregister
where it waits indefinitely for the host to rescind the channel. That
should be taken up as a separate fix.

Signed-off-by: Sunil Muthuswamy <sunilmut@microsoft.com>
---
Changes since v1:
- Updated the title and description to better reflect the change. The title
was previously called 'hv_sock: Fix data loss upon socket close'
- Removed the sk_state_change call to keep the fix focused.
- Removed 'inline' keyword from the .c file and letting compiler do it.

 net/vmw_vsock/hyperv_transport.c | 108 ++++++++++++++++++++++++++++-----------
 1 file changed, 77 insertions(+), 31 deletions(-)

diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index a827547..982a8dc 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -35,6 +35,9 @@
 /* The MTU is 16KB per the host side's design */
 #define HVS_MTU_SIZE		(1024 * 16)
 
+/* How long to wait for graceful shutdown of a connection */
+#define HVS_CLOSE_TIMEOUT (8 * HZ)
+
 struct vmpipe_proto_header {
 	u32 pkt_type;
 	u32 data_size;
@@ -305,19 +308,32 @@ static void hvs_channel_cb(void *ctx)
 		sk->sk_write_space(sk);
 }
 
-static void hvs_close_connection(struct vmbus_channel *chan)
+static void hvs_do_close_lock_held(struct vsock_sock *vsk,
+				   bool cancel_timeout)
 {
-	struct sock *sk = get_per_channel_state(chan);
-	struct vsock_sock *vsk = vsock_sk(sk);
-
-	lock_sock(sk);
+	struct sock *sk = sk_vsock(vsk);
 
-	sk->sk_state = TCP_CLOSE;
 	sock_set_flag(sk, SOCK_DONE);
-	vsk->peer_shutdown |= SEND_SHUTDOWN | RCV_SHUTDOWN;
-
+	vsk->peer_shutdown = SHUTDOWN_MASK;
+	if (vsock_stream_has_data(vsk) <= 0)
+		sk->sk_state = TCP_CLOSING;
 	sk->sk_state_change(sk);
+	if (vsk->close_work_scheduled &&
+	    (!cancel_timeout || cancel_delayed_work(&vsk->close_work))) {
+		vsk->close_work_scheduled = false;
+		vsock_remove_sock(vsk);
 
+		/* Release the reference taken while scheduling the timeout */
+		sock_put(sk);
+	}
+}
+
+static void hvs_close_connection(struct vmbus_channel *chan)
+{
+	struct sock *sk = get_per_channel_state(chan);
+
+	lock_sock(sk);
+	hvs_do_close_lock_held(vsock_sk(sk), true);
 	release_sock(sk);
 }
 
@@ -452,50 +468,80 @@ static int hvs_connect(struct vsock_sock *vsk)
 	return vmbus_send_tl_connect_request(&h->vm_srv_id, &h->host_srv_id);
 }
 
+static void hvs_shutdown_lock_held(struct hvsock *hvs, int mode)
+{
+	struct vmpipe_proto_header hdr;
+
+	if (hvs->fin_sent || !hvs->chan)
+		return;
+
+	/* It can't fail: see hvs_channel_writable_bytes(). */
+	(void)hvs_send_data(hvs->chan, (struct hvs_send_buf *)&hdr, 0);
+	hvs->fin_sent = true;
+}
+
 static int hvs_shutdown(struct vsock_sock *vsk, int mode)
 {
 	struct sock *sk = sk_vsock(vsk);
-	struct vmpipe_proto_header hdr;
-	struct hvs_send_buf *send_buf;
-	struct hvsock *hvs;
 
 	if (!(mode & SEND_SHUTDOWN))
 		return 0;
 
 	lock_sock(sk);
+	hvs_shutdown_lock_held(vsk->trans, mode);
+	release_sock(sk);
+	return 0;
+}
 
-	hvs = vsk->trans;
-	if (hvs->fin_sent)
-		goto out;
-
-	send_buf = (struct hvs_send_buf *)&hdr;
+static void hvs_close_timeout(struct work_struct *work)
+{
+	struct vsock_sock *vsk =
+		container_of(work, struct vsock_sock, close_work.work);
+	struct sock *sk = sk_vsock(vsk);
 
-	/* It can't fail: see hvs_channel_writable_bytes(). */
-	(void)hvs_send_data(hvs->chan, send_buf, 0);
+	sock_hold(sk);
+	lock_sock(sk);
+	if (!sock_flag(sk, SOCK_DONE))
+		hvs_do_close_lock_held(vsk, false);
 
-	hvs->fin_sent = true;
-out:
+	vsk->close_work_scheduled = false;
 	release_sock(sk);
-	return 0;
+	sock_put(sk);
 }
 
-static void hvs_release(struct vsock_sock *vsk)
+/* Returns true, if it is safe to remove socket; false otherwise */
+static bool hvs_close_lock_held(struct vsock_sock *vsk)
 {
 	struct sock *sk = sk_vsock(vsk);
-	struct hvsock *hvs = vsk->trans;
-	struct vmbus_channel *chan;
 
-	lock_sock(sk);
+	if (!(sk->sk_state == TCP_ESTABLISHED ||
+	      sk->sk_state == TCP_CLOSING))
+		return true;
 
-	sk->sk_state = TCP_CLOSING;
-	vsock_remove_sock(vsk);
+	if ((sk->sk_shutdown & SHUTDOWN_MASK) != SHUTDOWN_MASK)
+		hvs_shutdown_lock_held(vsk->trans, SHUTDOWN_MASK);
 
-	release_sock(sk);
+	if (sock_flag(sk, SOCK_DONE))
+		return true;
 
-	chan = hvs->chan;
-	if (chan)
-		hvs_shutdown(vsk, RCV_SHUTDOWN | SEND_SHUTDOWN);
+	/* This reference will be dropped by the delayed close routine */
+	sock_hold(sk);
+	INIT_DELAYED_WORK(&vsk->close_work, hvs_close_timeout);
+	vsk->close_work_scheduled = true;
+	schedule_delayed_work(&vsk->close_work, HVS_CLOSE_TIMEOUT);
+	return false;
+}
 
+static void hvs_release(struct vsock_sock *vsk)
+{
+	struct sock *sk = sk_vsock(vsk);
+	bool remove_sock;
+
+	lock_sock(sk);
+	remove_sock = hvs_close_lock_held(vsk);
+	release_sock(sk);
+	if (remove_sock)
+		vsock_remove_sock(vsk);
 }
 
 static void hvs_destruct(struct vsock_sock *vsk)
-- 
2.7.4


^ permalink raw reply related

* RE: [PATCH v2] hv_sock: Add support for delayed close
From: Dexuan Cui @ 2019-05-16  4:34 UTC (permalink / raw)
  To: Sunil Muthuswamy, KY Srinivasan, Haiyang Zhang, Stephen Hemminger,
	Sasha Levin, David S. Miller, Michael Kelley
  Cc: netdev@vger.kernel.org, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <BN6PR21MB0465043C08E519774EE73E99C0090@BN6PR21MB0465.namprd21.prod.outlook.com>

> From: Sunil Muthuswamy <sunilmut@microsoft.com>
> Sent: Tuesday, May 14, 2019 5:56 PM
> ...
> +static bool hvs_close_lock_held(struct vsock_sock *vsk)
>  {
> ...
> +	schedule_delayed_work(&vsk->close_work, HVS_CLOSE_TIMEOUT);

Reviewed-by: Dexuan Cui <decui@microsoft.com>

The patch looks good to me. Thank you, Sunil!

Next, we're going to remove the "channel->rescind" check in 
vmbus_hvsock_device_unregister() -- when doing that, IMO we need to
fix a potential race revealed by the schedule_delayed_work() in this
patch:

When hvs_close_timeout() finishes, the "sk" struct has been freed, but
vmbus_onoffer_rescind() -> channel->chn_rescind_callback(), i.e. 
hvs_close_connection(), may be still running and referencing the "chan"
and "sk" structs (), which should no longer be referenced when 
hvs_close_timeout() finishes, i.e. "get_per_channel_state(chan)" is no
longer safe. The problem is: currently there is no sync mechanism
between vmbus_onoffer_rescind() and hvs_close_timeout().

The race is a real issue only after we remove the "channel->rescind"
in vmbus_hvsock_device_unregister().

I guess we need to introduce a new single-threaded workqueue in the
vmbus driver, and offload both vmbus_onoffer_rescind() and 
hvs_close_timeout() onto the new workqueue.

Thanks,
-- Dexuan


^ permalink raw reply

* RE: [PATCH v2] hv_sock: Add support for delayed close
From: Dexuan Cui @ 2019-05-16 17:16 UTC (permalink / raw)
  To: Sunil Muthuswamy, KY Srinivasan, Haiyang Zhang, Stephen Hemminger,
	Sasha Levin, David S. Miller, Michael Kelley
  Cc: netdev@vger.kernel.org, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <PU1P153MB01698261307593C5D58AAF4FBF0A0@PU1P153MB0169.APCP153.PROD.OUTLOOK.COM>

> From: linux-hyperv-owner@vger.kernel.org
> <linux-hyperv-owner@vger.kernel.org> On Behalf Of Dexuan Cui
> Sent: Wednesday, May 15, 2019 9:34 PM
> ...

Hi Sunil,
To make it clear, your patch itself is good, and I was just talking about
the next change we're going to make. Once we make the next change,
IMO we need a further patch to schedule hvs_close_timeout() to the new
single-threaded workqueue rather than the global "system_wq".

> Next, we're going to remove the "channel->rescind" check in
> vmbus_hvsock_device_unregister() -- when doing that, IMO we need to
> fix a potential race revealed by the schedule_delayed_work() in this
> patch:
> 
> When hvs_close_timeout() finishes, the "sk" struct has been freed, but
> vmbus_onoffer_rescind() -> channel->chn_rescind_callback(), i.e.
> hvs_close_connection(), may be still running and referencing the "chan"
> and "sk" structs (), which should no longer be referenced when
> hvs_close_timeout() finishes, i.e. "get_per_channel_state(chan)" is no
> longer safe. The problem is: currently there is no sync mechanism
> between vmbus_onoffer_rescind() and hvs_close_timeout().
> 
> The race is a real issue only after we remove the "channel->rescind"
> in vmbus_hvsock_device_unregister().

A correction: IMO the race is real even for the current code, i.e. without
your patch: in vmbus_onoffer_rescind(), between we set channel->rescind
and we call channel->chn_rescind_callback(), the channel may have been
freed by vmbus_hvsock_device_unregister().

This race window is small and I guess that's why we never noticed it.

> I guess we need to introduce a new single-threaded workqueue in the
> vmbus driver, and offload both vmbus_onoffer_rescind() and
> hvs_close_timeout() onto the new workqueue.
 
Thanks,
-- Dexuan


^ permalink raw reply

* RE: [PATCH v2] hv_sock: Add support for delayed close
From: Sunil Muthuswamy @ 2019-05-16 18:11 UTC (permalink / raw)
  To: Dexuan Cui, KY Srinivasan, Haiyang Zhang, Stephen Hemminger,
	Sasha Levin, David S. Miller, Michael Kelley
  Cc: netdev@vger.kernel.org, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <PU1P153MB01693DB2206CD639AF356DBDBF0A0@PU1P153MB0169.APCP153.PROD.OUTLOOK.COM>



> -----Original Message-----
> From: Dexuan Cui <decui@microsoft.com>
> Sent: Thursday, May 16, 2019 10:17 AM
> To: Sunil Muthuswamy <sunilmut@microsoft.com>; KY Srinivasan <kys@microsoft.com>; Haiyang Zhang <haiyangz@microsoft.com>;
> Stephen Hemminger <sthemmin@microsoft.com>; Sasha Levin <sashal@kernel.org>; David S. Miller <davem@davemloft.net>;
> Michael Kelley <mikelley@microsoft.com>
> Cc: netdev@vger.kernel.org; linux-hyperv@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: RE: [PATCH v2] hv_sock: Add support for delayed close
> 
> > From: linux-hyperv-owner@vger.kernel.org
> > <linux-hyperv-owner@vger.kernel.org> On Behalf Of Dexuan Cui
> > Sent: Wednesday, May 15, 2019 9:34 PM
> > ...
> 
> Hi Sunil,
> To make it clear, your patch itself is good, and I was just talking about
> the next change we're going to make. Once we make the next change,
> IMO we need a further patch to schedule hvs_close_timeout() to the new
> single-threaded workqueue rather than the global "system_wq".
> 
Thanks for your review. Can you add a 'signed-off' from your side to the patch.
> > Next, we're going to remove the "channel->rescind" check in
> > vmbus_hvsock_device_unregister() -- when doing that, IMO we need to
> > fix a potential race revealed by the schedule_delayed_work() in this
> > patch:
> >
> > When hvs_close_timeout() finishes, the "sk" struct has been freed, but
> > vmbus_onoffer_rescind() -> channel->chn_rescind_callback(), i.e.
> > hvs_close_connection(), may be still running and referencing the "chan"
> > and "sk" structs (), which should no longer be referenced when
> > hvs_close_timeout() finishes, i.e. "get_per_channel_state(chan)" is no
> > longer safe. The problem is: currently there is no sync mechanism
> > between vmbus_onoffer_rescind() and hvs_close_timeout().
> >
> > The race is a real issue only after we remove the "channel->rescind"
> > in vmbus_hvsock_device_unregister().
> 
> A correction: IMO the race is real even for the current code, i.e. without
> your patch: in vmbus_onoffer_rescind(), between we set channel->rescind
> and we call channel->chn_rescind_callback(), the channel may have been
> freed by vmbus_hvsock_device_unregister().
> 
> This race window is small and I guess that's why we never noticed it.
> 
> > I guess we need to introduce a new single-threaded workqueue in the
> > vmbus driver, and offload both vmbus_onoffer_rescind() and
> > hvs_close_timeout() onto the new workqueue.
> 
Something is a miss if the guest has to wait for the host to close the channel
prior to cleaning it up from it's side. That's waste of resources, doesn't matter
if you do it in a system thread, dedicated pool or anyway else. I think the right
way to deal with this is to unregister the rescind callback routine, wait for any
running rescind callback routine to finish and then drop the last reference to
the socket, which should lead to all the cleanup. I understand that some of the
facility of unregistering the rescind callback might not exist today.

> Thanks,
> -- Dexuan


^ permalink raw reply

* RE: [PATCH v2] hv_sock: Add support for delayed close
From: Dexuan Cui @ 2019-05-16 18:55 UTC (permalink / raw)
  To: Sunil Muthuswamy, KY Srinivasan, Haiyang Zhang, Stephen Hemminger,
	Sasha Levin, David S. Miller, Michael Kelley
  Cc: netdev@vger.kernel.org, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <BN6PR21MB0465373CC2D240A47BED9717C00A0@BN6PR21MB0465.namprd21.prod.outlook.com>

> From: Sunil Muthuswamy <sunilmut@microsoft.com>
> Sent: Thursday, May 16, 2019 11:11 AM
> > Hi Sunil,
> > To make it clear, your patch itself is good, and I was just talking about
> > the next change we're going to make. Once we make the next change,
> > IMO we need a further patch to schedule hvs_close_timeout() to the new
> > single-threaded workqueue rather than the global "system_wq".
> >
> Thanks for your review. Can you add a 'signed-off' from your side to the patch.

I have provided my Reviewed-by. I guess this should be enough. Of course,
David makes the final call. It would be great if the maintaners of the Hyper-V
drivers listed in the "To:" could provide their Signed-off-by.

> > > Next, we're going to remove the "channel->rescind" check in
> > > vmbus_hvsock_device_unregister() -- when doing that, IMO we need to
> > > fix a potential race revealed by the schedule_delayed_work() in this
> > > patch:
> > >
> > > When hvs_close_timeout() finishes, the "sk" struct has been freed, but
> > > vmbus_onoffer_rescind() -> channel->chn_rescind_callback(), i.e.
> > > hvs_close_connection(), may be still running and referencing the "chan"
> > > and "sk" structs (), which should no longer be referenced when
> > > hvs_close_timeout() finishes, i.e. "get_per_channel_state(chan)" is no
> > > longer safe. The problem is: currently there is no sync mechanism
> > > between vmbus_onoffer_rescind() and hvs_close_timeout().
> > >
> > > The race is a real issue only after we remove the "channel->rescind"
> > > in vmbus_hvsock_device_unregister().
> >
> > A correction: IMO the race is real even for the current code, i.e. without
> > your patch: in vmbus_onoffer_rescind(), between we set channel->rescind
> > and we call channel->chn_rescind_callback(), the channel may have been
> > freed by vmbus_hvsock_device_unregister().
> >
> > This race window is small and I guess that's why we never noticed it.
> >
> > > I guess we need to introduce a new single-threaded workqueue in the
> > > vmbus driver, and offload both vmbus_onoffer_rescind() and
> > > hvs_close_timeout() onto the new workqueue.
> >
> Something is a miss if the guest has to wait for the host to close the channel
> prior to cleaning it up from it's side. That's waste of resources, doesn't matter

I agree. 

> if you do it in a system thread, dedicated pool or anyway else. I think the right
> way to deal with this is to unregister the rescind callback routine, wait for any
> running rescind callback routine to finish and then drop the last reference to
> the socket, which should lead to all the cleanup. I understand that some of the
> facility of unregistering the rescind callback might not exist today.

Considering the concurrency, I'm not sure if it's easy or possible to safely
unregister the chn_rescind_callback. My hunch is: doing that may be more
difficult than adding a new single-thread workqueue.

Thanks,
-- Dexuan


^ permalink raw reply

* Re: [PATCH v2] hv_sock: Add support for delayed close
From: David Miller @ 2019-05-16 19:12 UTC (permalink / raw)
  To: sunilmut
  Cc: kys, haiyangz, sthemmin, sashal, decui, mikelley, netdev,
	linux-hyperv, linux-kernel
In-Reply-To: <BN6PR21MB0465043C08E519774EE73E99C0090@BN6PR21MB0465.namprd21.prod.outlook.com>

From: Sunil Muthuswamy <sunilmut@microsoft.com>
Date: Wed, 15 May 2019 00:56:05 +0000

> Currently, hvsock does not implement any delayed or background close
> logic. Whenever the hvsock socket is closed, a FIN is sent to the peer, and
> the last reference to the socket is dropped, which leads to a call to
> .destruct where the socket can hang indefinitely waiting for the peer to
> close it's side. The can cause the user application to hang in the close()
> call.
> 
> This change implements proper STREAM(TCP) closing handshake mechanism by
> sending the FIN to the peer and the waiting for the peer's FIN to arrive
> for a given timeout. On timeout, it will try to terminate the connection
> (i.e. a RST). This is in-line with other socket providers such as virtio.
> 
> This change does not address the hang in the vmbus_hvsock_device_unregister
> where it waits indefinitely for the host to rescind the channel. That
> should be taken up as a separate fix.
> 
> Signed-off-by: Sunil Muthuswamy <sunilmut@microsoft.com>

Applied, thanks.

^ permalink raw reply

* [PATCH] hv_sock: perf: Allow the socket buffer size options to influence the actual socket buffers
From: Sunil Muthuswamy @ 2019-05-17  0:16 UTC (permalink / raw)
  To: KY Srinivasan, Haiyang Zhang, Stephen Hemminger, Sasha Levin,
	David S. Miller, Dexuan Cui, Michael Kelley
  Cc: netdev@vger.kernel.org, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org

Currently, the hv_sock buffer size is static and can't scale to the
bandwidth requirements of the application. This change allows the
applications to influence the socket buffer sizes using the SO_SNDBUF and
the SO_RCVBUF socket options.

Few interesting points to note:
1. Since the VMBUS does not allow a resize operation of the ring size, the
socket buffer size option should be set prior to establishing the
connection for it to take effect.
2. Setting the socket option comes with the cost of that much memory being
reserved/allocated by the kernel, for the lifetime of the connection.

Perf data:
Total Data Transfer: 1GB
Single threaded reader/writer
Results below are summarized over 10 iterations.

Linux hvsocket writer + Windows hvsocket reader:
|---------------------------------------------------------------------------------------------|
|Packet size ->   |      128B       |       1KB       |       4KB       |        64KB         |
|---------------------------------------------------------------------------------------------|
|SO_SNDBUF size | |                 Throughput in MB/s (min/max/avg/median):                  |
|               v |                                                                           |
|---------------------------------------------------------------------------------------------|
|      Default    | 109/118/114/116 | 636/774/701/700 | 435/507/480/476 |   410/491/462/470   |
|      16KB       | 110/116/112/111 | 575/705/662/671 | 749/900/854/869 |   592/824/692/676   |
|      32KB       | 108/120/115/115 | 703/823/767/772 | 718/878/850/866 | 1593/2124/2000/2085 |
|      64KB       | 108/119/114/114 | 592/732/683/688 | 805/934/903/911 | 1784/1943/1862/1843 |
|---------------------------------------------------------------------------------------------|

Windows hvsocket writer + Linux hvsocket reader:
|---------------------------------------------------------------------------------------------|
|Packet size ->   |     128B    |      1KB        |          4KB        |        64KB         |
|---------------------------------------------------------------------------------------------|
|SO_RCVBUF size | |               Throughput in MB/s (min/max/avg/median):                    |
|               v |                                                                           |
|---------------------------------------------------------------------------------------------|
|      Default    | 69/82/75/73 | 313/343/333/336 |   418/477/446/445   |   659/701/676/678   |
|      16KB       | 69/83/76/77 | 350/401/375/382 |   506/548/517/516   |   602/624/615/615   |
|      32KB       | 62/83/73/73 | 471/529/496/494 |   830/1046/935/939  | 944/1180/1070/1100  |
|      64KB       | 64/70/68/69 | 467/533/501/497 | 1260/1590/1430/1431 | 1605/1819/1670/1660 |
|---------------------------------------------------------------------------------------------|

Signed-off-by: Sunil Muthuswamy <sunilmut@microsoft.com>
---
- The table above exceeds the 75char limit for the patch. If that's a
problem, I can try to squeeze it in somehow within the limit.

 net/vmw_vsock/hyperv_transport.c | 50 ++++++++++++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 10 deletions(-)

diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 982a8dc..8d3a7b0 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -23,14 +23,14 @@
 #include <net/sock.h>
 #include <net/af_vsock.h>
 
-/* The host side's design of the feature requires 6 exact 4KB pages for
- * recv/send rings respectively -- this is suboptimal considering memory
- * consumption, however unluckily we have to live with it, before the
- * host comes up with a better design in the future.
+/* Older (VMBUS version 'VERSION_WIN10' or before) Windows hosts have some
+ * stricter requirements on the hv_sock ring buffer size of six 4K pages. Newer
+ * hosts don't have this limitation; but, keep the defaults the same for compat.
  */
 #define PAGE_SIZE_4K		4096
 #define RINGBUFFER_HVS_RCV_SIZE (PAGE_SIZE_4K * 6)
 #define RINGBUFFER_HVS_SND_SIZE (PAGE_SIZE_4K * 6)
+#define RINGBUFFER_HVS_MAX_SIZE (PAGE_SIZE_4K * 64)
 
 /* The MTU is 16KB per the host side's design */
 #define HVS_MTU_SIZE		(1024 * 16)
@@ -344,9 +344,12 @@ static void hvs_open_connection(struct vmbus_channel *chan)
 
 	struct sockaddr_vm addr;
 	struct sock *sk, *new = NULL;
-	struct vsock_sock *vnew;
-	struct hvsock *hvs, *hvs_new;
+	struct vsock_sock *vnew = NULL;
+	struct hvsock *hvs = NULL;
+	struct hvsock *hvs_new = NULL;
+	int rcvbuf;
 	int ret;
+	int sndbuf;
 
 	if_type = &chan->offermsg.offer.if_type;
 	if_instance = &chan->offermsg.offer.if_instance;
@@ -388,9 +391,34 @@ static void hvs_open_connection(struct vmbus_channel *chan)
 	}
 
 	set_channel_read_mode(chan, HV_CALL_DIRECT);
-	ret = vmbus_open(chan, RINGBUFFER_HVS_SND_SIZE,
-			 RINGBUFFER_HVS_RCV_SIZE, NULL, 0,
-			 hvs_channel_cb, conn_from_host ? new : sk);
+
+	/* Use the socket buffer sizes as hints for the VMBUS ring size. For
+	 * server side sockets, 'sk' is the parent socket and thus, this will
+	 * allow the child sockets to inherit the size from the parent. Keep
+	 * the mins to the default value and align to page size as per VMBUS
+	 * requirements.
+	 * For the max, the socket core library will limit the socket buffer
+	 * size that can be set by the user, but, since currently, the hv_sock
+	 * VMBUS ring buffer is physically contiguous allocation, restrict it
+	 * further.
+	 * Older versions of hv_sock host side code cannot handle bigger VMBUS
+	 * ring buffer size. Use the version number to limit the change to newer
+	 * versions.
+	 */
+	if (vmbus_proto_version < VERSION_WIN10_V5) {
+		sndbuf = RINGBUFFER_HVS_SND_SIZE;
+		rcvbuf = RINGBUFFER_HVS_RCV_SIZE;
+	} else {
+		sndbuf = max_t(int, sk->sk_sndbuf, RINGBUFFER_HVS_SND_SIZE);
+		sndbuf = min_t(int, sndbuf, RINGBUFFER_HVS_MAX_SIZE);
+		sndbuf = ALIGN(sndbuf, PAGE_SIZE);
+		rcvbuf = max_t(int, sk->sk_rcvbuf, RINGBUFFER_HVS_RCV_SIZE);
+		rcvbuf = min_t(int, rcvbuf, RINGBUFFER_HVS_MAX_SIZE);
+		rcvbuf = ALIGN(rcvbuf, PAGE_SIZE);
+	}
+
+	ret = vmbus_open(chan, sndbuf, rcvbuf, NULL, 0, hvs_channel_cb,
+			 conn_from_host ? new : sk);
 	if (ret != 0) {
 		if (conn_from_host) {
 			hvs_new->chan = NULL;
@@ -441,6 +469,7 @@ static u32 hvs_get_local_cid(void)
 static int hvs_sock_init(struct vsock_sock *vsk, struct vsock_sock *psk)
 {
 	struct hvsock *hvs;
+	struct sock *sk = sk_vsock(vsk);
 
 	hvs = kzalloc(sizeof(*hvs), GFP_KERNEL);
 	if (!hvs)
@@ -448,7 +477,8 @@ static int hvs_sock_init(struct vsock_sock *vsk, struct vsock_sock *psk)
 
 	vsk->trans = hvs;
 	hvs->vsk = vsk;
-
+	sk->sk_sndbuf = RINGBUFFER_HVS_SND_SIZE;
+	sk->sk_rcvbuf = RINGBUFFER_HVS_RCV_SIZE;
 	return 0;
 }
 
-- 
2.7.4


^ permalink raw reply related

* RE: [PATCH] hv_sock: perf: Allow the socket buffer size options to influence the actual socket buffers
From: Dexuan Cui @ 2019-05-17  0:47 UTC (permalink / raw)
  To: Sunil Muthuswamy, KY Srinivasan, Haiyang Zhang, Stephen Hemminger,
	Sasha Levin, David S. Miller, Michael Kelley
  Cc: netdev@vger.kernel.org, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <BN6PR21MB046528E2099CDE2C6C2200A7C00B0@BN6PR21MB0465.namprd21.prod.outlook.com>

> From: Sunil Muthuswamy <sunilmut@microsoft.com>
> Sent: Thursday, May 16, 2019 5:17 PM
> 
> Currently, the hv_sock buffer size is static and can't scale to the
> bandwidth requirements of the application. This change allows the
> applications to influence the socket buffer sizes using the SO_SNDBUF and
> the SO_RCVBUF socket options.
> 
> Few interesting points to note:
> 1. Since the VMBUS does not allow a resize operation of the ring size, the
> socket buffer size option should be set prior to establishing the
> connection for it to take effect.
> 2. Setting the socket option comes with the cost of that much memory being
> reserved/allocated by the kernel, for the lifetime of the connection.

Reviewed-by: Dexuan Cui <decui@microsoft.com>

The patch looks good to me. Thanks, Sunil!

Thanks,
-- Dexuan

^ permalink raw reply

* [PATCH] hv_sock: perf: loop in send() to maximize bandwidth
From: Sunil Muthuswamy @ 2019-05-17  2:05 UTC (permalink / raw)
  To: KY Srinivasan, Haiyang Zhang, Stephen Hemminger, Sasha Levin,
	David S. Miller, Dexuan Cui, Michael Kelley
  Cc: netdev@vger.kernel.org, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org

Currently, the hv_sock send() iterates once over the buffer, puts data into
the VMBUS channel and returns. It doesn't maximize on the case when there
is a simultaneous reader draining data from the channel. In such a case,
the send() can maximize the bandwidth (and consequently minimize the cpu
cycles) by iterating until the channel is found to be full.

Perf data:
Total Data Transfer: 10GB/iteration
Single threaded reader/writer, Linux hvsocket writer with Windows hvsocket
reader
Packet size: 64KB
CPU sys time was captured using the 'time' command for the writer to send
10GB of data.
'Send Buffer Loop' is with the patch applied.
The values below are over 10 iterations.

|--------------------------------------------------------|
|        |        Current        |   Send Buffer Loop    |
|--------------------------------------------------------|
|        | Throughput | CPU sys  | Throughput | CPU sys  |
|        | (MB/s)     | time (s) | (MB/s)     | time (s) |
|--------------------------------------------------------|
| Min    |     407    |   7.048  |    401     |  5.958   |
|--------------------------------------------------------|
| Max    |     455    |   7.563  |    542     |  6.993   |
|--------------------------------------------------------|
| Avg    |     440    |   7.411  |    451     |  6.639   |
|--------------------------------------------------------|
| Median |     446    |   7.417  |    447     |  6.761   |
|--------------------------------------------------------|

Observation:
1. The avg throughput doesn't really change much with this change for this
scenario. This is most probably because the bottleneck on throughput is
somewhere else.
2. The average system (or kernel) cpu time goes down by 10%+ with this
change, for the same amount of data transfer.

Signed-off-by: Sunil Muthuswamy <sunilmut@microsoft.com>
---
 net/vmw_vsock/hyperv_transport.c | 45 +++++++++++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 14 deletions(-)

diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 982a8dc..7c13032 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -55,8 +55,9 @@ struct hvs_recv_buf {
 };
 
 /* We can send up to HVS_MTU_SIZE bytes of payload to the host, but let's use
- * a small size, i.e. HVS_SEND_BUF_SIZE, to minimize the dynamically-allocated
- * buffer, because tests show there is no significant performance difference.
+ * a smaller size, i.e. HVS_SEND_BUF_SIZE, to maximize concurrency between the
+ * guest and the host processing as one VMBUS packet is the smallest processing
+ * unit.
  *
  * Note: the buffer can be eliminated in the future when we add new VMBus
  * ringbuffer APIs that allow us to directly copy data from userspace buffer
@@ -644,7 +645,9 @@ static ssize_t hvs_stream_enqueue(struct vsock_sock *vsk, struct msghdr *msg,
 	struct hvsock *hvs = vsk->trans;
 	struct vmbus_channel *chan = hvs->chan;
 	struct hvs_send_buf *send_buf;
-	ssize_t to_write, max_writable, ret;
+	ssize_t to_write, max_writable;
+	ssize_t ret = 0;
+	ssize_t bytes_written = 0;
 
 	BUILD_BUG_ON(sizeof(*send_buf) != PAGE_SIZE_4K);
 
@@ -652,20 +655,34 @@ static ssize_t hvs_stream_enqueue(struct vsock_sock *vsk, struct msghdr *msg,
 	if (!send_buf)
 		return -ENOMEM;
 
-	max_writable = hvs_channel_writable_bytes(chan);
-	to_write = min_t(ssize_t, len, max_writable);
-	to_write = min_t(ssize_t, to_write, HVS_SEND_BUF_SIZE);
-
-	ret = memcpy_from_msg(send_buf->data, msg, to_write);
-	if (ret < 0)
-		goto out;
+	/* Reader(s) could be draining data from the channel as we write.
+	 * Maximize bandwidth, by iterating until the channel is found to be
+	 * full.
+	 */
+	while (len) {
+		max_writable = hvs_channel_writable_bytes(chan);
+		if (!max_writable)
+			break;
+		to_write = min_t(ssize_t, len, max_writable);
+		to_write = min_t(ssize_t, to_write, HVS_SEND_BUF_SIZE);
+		/* memcpy_from_msg is safe for loop as it advances the offsets
+		 * within the message iterator.
+		 */
+		ret = memcpy_from_msg(send_buf->data, msg, to_write);
+		if (ret < 0)
+			goto out;
 
-	ret = hvs_send_data(hvs->chan, send_buf, to_write);
-	if (ret < 0)
-		goto out;
+		ret = hvs_send_data(hvs->chan, send_buf, to_write);
+		if (ret < 0)
+			goto out;
 
-	ret = to_write;
+		bytes_written += to_write;
+		len -= to_write;
+	}
 out:
+	/* If any data has been sent, return that */
+	if (bytes_written)
+		ret = bytes_written;
 	kfree(send_buf);
 	return ret;
 }
-- 
2.7.4


^ permalink raw reply related

* Re: [PATCH] hv_sock: perf: Allow the socket buffer size options to influence the actual socket buffers
From: David Miller @ 2019-05-18 18:14 UTC (permalink / raw)
  To: sunilmut
  Cc: kys, haiyangz, sthemmin, sashal, decui, mikelley, netdev,
	linux-hyperv, linux-kernel
In-Reply-To: <BN6PR21MB046528E2099CDE2C6C2200A7C00B0@BN6PR21MB0465.namprd21.prod.outlook.com>


These two changes really look like net-next material, and net-next is
closed at the moment.

Please resubmit these when net-next opens back up.

Thank you.

^ permalink raw reply

* RE: [PATCH] hv_sock: perf: loop in send() to maximize bandwidth
From: Dexuan Cui @ 2019-05-19 19:30 UTC (permalink / raw)
  To: Sunil Muthuswamy, KY Srinivasan, Haiyang Zhang, Stephen Hemminger,
	Sasha Levin, David S. Miller, Michael Kelley
  Cc: netdev@vger.kernel.org, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <BN6PR21MB046557834D46216464A6BA08C00B0@BN6PR21MB0465.namprd21.prod.outlook.com>

> From: Sunil Muthuswamy <sunilmut@microsoft.com>
> Sent: Thursday, May 16, 2019 7:05 PM
> Currently, the hv_sock send() iterates once over the buffer, puts data into
> the VMBUS channel and returns. It doesn't maximize on the case when there
> is a simultaneous reader draining data from the channel. In such a case,
> the send() can maximize the bandwidth (and consequently minimize the cpu
> cycles) by iterating until the channel is found to be full.
>  ...
> Observation:
> 1. The avg throughput doesn't really change much with this change for this
> scenario. This is most probably because the bottleneck on throughput is
> somewhere else.
> 2. The average system (or kernel) cpu time goes down by 10%+ with this
> change, for the same amount of data transfer.
> 
> Signed-off-by: Sunil Muthuswamy <sunilmut@microsoft.com>

Reviewed-by: Dexuan Cui <decui@microsoft.com>

The patch looks good. Thanks, Sunil!

Thanks,
-- Dexuan

^ permalink raw reply

* [PATCH] PCI: hv: Detect and fix Hyper-V PCI domain number collision
From: Haiyang Zhang @ 2019-05-19 22:28 UTC (permalink / raw)
  To: sashal@kernel.org, bhelgaas@google.com, lorenzo.pieralisi@arm.com,
	linux-hyperv@vger.kernel.org, linux-pci@vger.kernel.org
  Cc: Haiyang Zhang, KY Srinivasan, Stephen Hemminger, olaf@aepfle.de,
	vkuznets, linux-kernel@vger.kernel.org

Due to Azure host agent settings, the device instance ID's bytes 8 and 9
are no longer unique. This causes some of the PCI devices not showing up
in VMs with multiple passthrough devices, such as GPUs. So, as recommended
by Azure host team, we now use the bytes 4 and 5 which usually provide
unique numbers.

In the rare cases of collision, we will detect and find another number
that is not in use.
Thanks to Michael Kelley <mikelley@microsoft.com> for proposing this idea.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
---
 drivers/pci/controller/pci-hyperv.c | 91 +++++++++++++++++++++++++++++++------
 1 file changed, 78 insertions(+), 13 deletions(-)

diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 82acd61..6b9cc6e60a 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -37,6 +37,8 @@
  * the PCI back-end driver in Hyper-V.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/pci.h>
@@ -2507,6 +2509,47 @@ static void put_hvpcibus(struct hv_pcibus_device *hbus)
 		complete(&hbus->remove_event);
 }
 
+#define HVPCI_DOM_MAP_SIZE (64 * 1024)
+static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
+
+/* PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
+ * as invalid for passthrough PCI devices of this driver.
+ */
+#define HVPCI_DOM_INVALID 0
+
+/**
+ * hv_get_dom_num() - Get a valid PCI domain number
+ * Check if the PCI domain number is in use, and return another number if
+ * it is in use.
+ *
+ * @dom: Requested domain number
+ *
+ * return: domain number on success, HVPCI_DOM_INVALID on failure
+ */
+static u16 hv_get_dom_num(u16 dom)
+{
+	unsigned int i;
+
+	if (test_and_set_bit(dom, hvpci_dom_map) == 0)
+		return dom;
+
+	for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
+		if (test_and_set_bit(i, hvpci_dom_map) == 0)
+			return i;
+	}
+
+	return HVPCI_DOM_INVALID;
+}
+
+/**
+ * hv_put_dom_num() - Mark the PCI domain number as free
+ * @dom: Domain number to be freed
+ */
+static void hv_put_dom_num(u16 dom)
+{
+	clear_bit(dom, hvpci_dom_map);
+}
+
 /**
  * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
  * @hdev:	VMBus's tracking struct for this root PCI bus
@@ -2518,6 +2561,7 @@ static int hv_pci_probe(struct hv_device *hdev,
 			const struct hv_vmbus_device_id *dev_id)
 {
 	struct hv_pcibus_device *hbus;
+	u16 dom_req, dom;
 	int ret;
 
 	/*
@@ -2532,19 +2576,32 @@ static int hv_pci_probe(struct hv_device *hdev,
 	hbus->state = hv_pcibus_init;
 
 	/*
-	 * The PCI bus "domain" is what is called "segment" in ACPI and
-	 * other specs.  Pull it from the instance ID, to get something
-	 * unique.  Bytes 8 and 9 are what is used in Windows guests, so
-	 * do the same thing for consistency.  Note that, since this code
-	 * only runs in a Hyper-V VM, Hyper-V can (and does) guarantee
-	 * that (1) the only domain in use for something that looks like
-	 * a physical PCI bus (which is actually emulated by the
-	 * hypervisor) is domain 0 and (2) there will be no overlap
-	 * between domains derived from these instance IDs in the same
-	 * VM.
+	 * The PCI bus "domain" is what is called "segment" in ACPI and other
+	 * specs. Pull it from the instance ID, to get something usually
+	 * unique. In rare cases of collision, we will find out another number
+	 * not in use.
+	 * Note that, since this code only runs in a Hyper-V VM, Hyper-V
+	 * together with this guest driver can guarantee that (1) The only
+	 * domain used by Gen1 VMs for something that looks like a physical
+	 * PCI bus (which is actually emulated by the hypervisor) is domain 0.
+	 * (2) There will be no overlap between domains (after fixing possible
+	 * collisions) in the same VM.
 	 */
-	hbus->sysdata.domain = hdev->dev_instance.b[9] |
-			       hdev->dev_instance.b[8] << 8;
+	dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
+	dom = hv_get_dom_num(dom_req);
+
+	if (dom == HVPCI_DOM_INVALID) {
+		pr_err("Unable to use dom# 0x%hx or other numbers",
+		       dom_req);
+		ret = -EINVAL;
+		goto free_bus;
+	}
+
+	if (dom != dom_req)
+		pr_info("PCI dom# 0x%hx has collision, using 0x%hx",
+			dom_req, dom);
+
+	hbus->sysdata.domain = dom;
 
 	hbus->hdev = hdev;
 	refcount_set(&hbus->remove_lock, 1);
@@ -2559,7 +2616,7 @@ static int hv_pci_probe(struct hv_device *hdev,
 					   hbus->sysdata.domain);
 	if (!hbus->wq) {
 		ret = -ENOMEM;
-		goto free_bus;
+		goto free_dom;
 	}
 
 	ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
@@ -2636,6 +2693,8 @@ static int hv_pci_probe(struct hv_device *hdev,
 	vmbus_close(hdev->channel);
 destroy_wq:
 	destroy_workqueue(hbus->wq);
+free_dom:
+	hv_put_dom_num(hbus->sysdata.domain);
 free_bus:
 	free_page((unsigned long)hbus);
 	return ret;
@@ -2717,6 +2776,9 @@ static int hv_pci_remove(struct hv_device *hdev)
 	put_hvpcibus(hbus);
 	wait_for_completion(&hbus->remove_event);
 	destroy_workqueue(hbus->wq);
+
+	hv_put_dom_num(hbus->sysdata.domain);
+
 	free_page((unsigned long)hbus);
 	return 0;
 }
@@ -2744,6 +2806,9 @@ static void __exit exit_hv_pci_drv(void)
 
 static int __init init_hv_pci_drv(void)
 {
+	/* Set the invalid domain number's bit, so it will not be used */
+	set_bit(HVPCI_DOM_INVALID, hvpci_dom_map);
+
 	return vmbus_driver_register(&hv_pci_drv);
 }
 
-- 
1.8.3.1


^ permalink raw reply related

* Inquiry 20/May/2019
From: Daniel Murray @ 2019-05-20 13:31 UTC (permalink / raw)
  To: linux-hyperv

Hi,friend,

This is Daniel Murray and i am from Sinara Group Co.Ltd in Russia.
We are glad to know about your company from the web and we are interested in your products.
Could you kindly send us your Latest catalog and price list for our trial order.

Best Regards,

Daniel Murray
Purchasing Manager



^ permalink raw reply

* RE: [PATCH] PCI: hv: Detect and fix Hyper-V PCI domain number collision
From: Dexuan Cui @ 2019-05-22  3:14 UTC (permalink / raw)
  To: Haiyang Zhang, sashal@kernel.org, bhelgaas@google.com,
	lorenzo.pieralisi@arm.com, linux-hyperv@vger.kernel.org,
	linux-pci@vger.kernel.org
  Cc: Haiyang Zhang, KY Srinivasan, Stephen Hemminger, olaf@aepfle.de,
	vkuznets, linux-kernel@vger.kernel.org
In-Reply-To: <1558304821-36038-1-git-send-email-haiyangz@microsoft.com>

> From: linux-hyperv-owner@vger.kernel.org
> Sent: Sunday, May 19, 2019 3:29 PM
> 
> Due to Azure host agent settings, the device instance ID's bytes 8 and 9
> are no longer unique. This causes some of the PCI devices not showing up
> in VMs with multiple passthrough devices, such as GPUs. So, as recommended
> by Azure host team, we now use the bytes 4 and 5 which usually provide
> unique numbers.
> 
> In the rare cases of collision, we will detect and find another number
> that is not in use.
> Thanks to Michael Kelley <mikelley@microsoft.com> for proposing this idea.
> 
> Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>

The patch looks good to me.

Reviewed-by: Dexuan Cui <decui@microsoft.com>


^ permalink raw reply

* RE: [PATCH] PCI: hv: Detect and fix Hyper-V PCI domain number collision
From: Dexuan Cui @ 2019-05-22  5:14 UTC (permalink / raw)
  To: Stephen Hemminger, Haiyang Zhang, sashal@kernel.org,
	bhelgaas@google.com, lorenzo.pieralisi@arm.com,
	linux-hyperv@vger.kernel.org, linux-pci@vger.kernel.org
  Cc: KY Srinivasan, olaf@aepfle.de, vkuznets,
	linux-kernel@vger.kernel.org
In-Reply-To: <BYAPR21MB131778CC3A356745BC74EE3DCC000@BYAPR21MB1317.namprd21.prod.outlook.com>

> From: Stephen Hemminger <sthemmin@microsoft.com> 
> Sent: Tuesday, May 21, 2019 9:40 PM
>
> Thanks for working this out with the host team.
> Now if we could just get some persistent slot information it would make udev in systemd happy.

The slot info comes from the serial number "hpdev->desc.ser", which may not guarantee
uniqueness: "...Hyper-V doesn't provide unique serial numbers": see the changelog of the below
commit:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=29927dfb7f69bcf2ae7fd1cda10997e646a5189c

And we can pass through multiple devices to a VM in any order (i.e. hot add/remove), so IMO
the "slot" info is not unique and persistent.

Thanks,
-- Dexuan

^ permalink raw reply

* [PATCH AUTOSEL 4.19 013/244] hv_netvsc: fix race that may miss tx queue wakeup
From: Sasha Levin @ 2019-05-22 19:22 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Haiyang Zhang, Stephan Klein, David S . Miller, Sasha Levin,
	linux-hyperv, netdev
In-Reply-To: <20190522192630.24917-1-sashal@kernel.org>

From: Haiyang Zhang <haiyangz@microsoft.com>

[ Upstream commit 93aa4792c3908eac87ddd368ee0fe0564148232b ]

When the ring buffer is almost full due to RX completion messages, a
TX packet may reach the "low watermark" and cause the queue stopped.
If the TX completion arrives earlier than queue stopping, the wakeup
may be missed.

This patch moves the check for the last pending packet to cover both
EAGAIN and success cases, so the queue will be reliably waked up when
necessary.

Reported-and-tested-by: Stephan Klein <stephan.klein@wegfinder.at>
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/net/hyperv/netvsc.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index fb12b63439c67..35413041dcf81 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -872,12 +872,6 @@ static inline int netvsc_send_pkt(
 	} else if (ret == -EAGAIN) {
 		netif_tx_stop_queue(txq);
 		ndev_ctx->eth_stats.stop_queue++;
-		if (atomic_read(&nvchan->queue_sends) < 1 &&
-		    !net_device->tx_disable) {
-			netif_tx_wake_queue(txq);
-			ndev_ctx->eth_stats.wake_queue++;
-			ret = -ENOSPC;
-		}
 	} else {
 		netdev_err(ndev,
 			   "Unable to send packet pages %u len %u, ret %d\n",
@@ -885,6 +879,15 @@ static inline int netvsc_send_pkt(
 			   ret);
 	}
 
+	if (netif_tx_queue_stopped(txq) &&
+	    atomic_read(&nvchan->queue_sends) < 1 &&
+	    !net_device->tx_disable) {
+		netif_tx_wake_queue(txq);
+		ndev_ctx->eth_stats.wake_queue++;
+		if (ret == -EAGAIN)
+			ret = -ENOSPC;
+	}
+
 	return ret;
 }
 
-- 
2.20.1


^ permalink raw reply related

* [PATCH AUTOSEL 5.0 019/317] hv_netvsc: fix race that may miss tx queue wakeup
From: Sasha Levin @ 2019-05-22 19:18 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Haiyang Zhang, Stephan Klein, David S . Miller, Sasha Levin,
	linux-hyperv, netdev
In-Reply-To: <20190522192338.23715-1-sashal@kernel.org>

From: Haiyang Zhang <haiyangz@microsoft.com>

[ Upstream commit 93aa4792c3908eac87ddd368ee0fe0564148232b ]

When the ring buffer is almost full due to RX completion messages, a
TX packet may reach the "low watermark" and cause the queue stopped.
If the TX completion arrives earlier than queue stopping, the wakeup
may be missed.

This patch moves the check for the last pending packet to cover both
EAGAIN and success cases, so the queue will be reliably waked up when
necessary.

Reported-and-tested-by: Stephan Klein <stephan.klein@wegfinder.at>
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/net/hyperv/netvsc.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index e0dce373cdd9d..3d4a166a49d58 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -875,12 +875,6 @@ static inline int netvsc_send_pkt(
 	} else if (ret == -EAGAIN) {
 		netif_tx_stop_queue(txq);
 		ndev_ctx->eth_stats.stop_queue++;
-		if (atomic_read(&nvchan->queue_sends) < 1 &&
-		    !net_device->tx_disable) {
-			netif_tx_wake_queue(txq);
-			ndev_ctx->eth_stats.wake_queue++;
-			ret = -ENOSPC;
-		}
 	} else {
 		netdev_err(ndev,
 			   "Unable to send packet pages %u len %u, ret %d\n",
@@ -888,6 +882,15 @@ static inline int netvsc_send_pkt(
 			   ret);
 	}
 
+	if (netif_tx_queue_stopped(txq) &&
+	    atomic_read(&nvchan->queue_sends) < 1 &&
+	    !net_device->tx_disable) {
+		netif_tx_wake_queue(txq);
+		ndev_ctx->eth_stats.wake_queue++;
+		if (ret == -EAGAIN)
+			ret = -ENOSPC;
+	}
+
 	return ret;
 }
 
-- 
2.20.1


^ permalink raw reply related

* [PATCH AUTOSEL 5.1 024/375] hv_netvsc: fix race that may miss tx queue wakeup
From: Sasha Levin @ 2019-05-22 19:15 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Haiyang Zhang, Stephan Klein, David S . Miller, Sasha Levin,
	linux-hyperv, netdev
In-Reply-To: <20190522192115.22666-1-sashal@kernel.org>

From: Haiyang Zhang <haiyangz@microsoft.com>

[ Upstream commit 93aa4792c3908eac87ddd368ee0fe0564148232b ]

When the ring buffer is almost full due to RX completion messages, a
TX packet may reach the "low watermark" and cause the queue stopped.
If the TX completion arrives earlier than queue stopping, the wakeup
may be missed.

This patch moves the check for the last pending packet to cover both
EAGAIN and success cases, so the queue will be reliably waked up when
necessary.

Reported-and-tested-by: Stephan Klein <stephan.klein@wegfinder.at>
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/net/hyperv/netvsc.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index e0dce373cdd9d..3d4a166a49d58 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -875,12 +875,6 @@ static inline int netvsc_send_pkt(
 	} else if (ret == -EAGAIN) {
 		netif_tx_stop_queue(txq);
 		ndev_ctx->eth_stats.stop_queue++;
-		if (atomic_read(&nvchan->queue_sends) < 1 &&
-		    !net_device->tx_disable) {
-			netif_tx_wake_queue(txq);
-			ndev_ctx->eth_stats.wake_queue++;
-			ret = -ENOSPC;
-		}
 	} else {
 		netdev_err(ndev,
 			   "Unable to send packet pages %u len %u, ret %d\n",
@@ -888,6 +882,15 @@ static inline int netvsc_send_pkt(
 			   ret);
 	}
 
+	if (netif_tx_queue_stopped(txq) &&
+	    atomic_read(&nvchan->queue_sends) < 1 &&
+	    !net_device->tx_disable) {
+		netif_tx_wake_queue(txq);
+		ndev_ctx->eth_stats.wake_queue++;
+		if (ret == -EAGAIN)
+			ret = -ENOSPC;
+	}
+
 	return ret;
 }
 
-- 
2.20.1


^ permalink raw reply related

* [PATCH net-next] hv_sock: perf: Allow the socket buffer size options to influence the actual socket buffers
From: Sunil Muthuswamy @ 2019-05-22 22:56 UTC (permalink / raw)
  To: KY Srinivasan, Haiyang Zhang, Stephen Hemminger, Sasha Levin,
	David S. Miller, Dexuan Cui, Michael Kelley
  Cc: netdev@vger.kernel.org, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org

Currently, the hv_sock buffer size is static and can't scale to the
bandwidth requirements of the application. This change allows the
applications to influence the socket buffer sizes using the SO_SNDBUF and
the SO_RCVBUF socket options.

Few interesting points to note:
1. Since the VMBUS does not allow a resize operation of the ring size, the
socket buffer size option should be set prior to establishing the
connection for it to take effect.
2. Setting the socket option comes with the cost of that much memory being
reserved/allocated by the kernel, for the lifetime of the connection.

Perf data:
Total Data Transfer: 1GB
Single threaded reader/writer
Results below are summarized over 10 iterations.

Linux hvsocket writer + Windows hvsocket reader:
|---------------------------------------------------------------------------------------------|
|Packet size ->   |      128B       |       1KB       |       4KB       |        64KB         |
|---------------------------------------------------------------------------------------------|
|SO_SNDBUF size | |                 Throughput in MB/s (min/max/avg/median):                  |
|               v |                                                                           |
|---------------------------------------------------------------------------------------------|
|      Default    | 109/118/114/116 | 636/774/701/700 | 435/507/480/476 |   410/491/462/470   |
|      16KB       | 110/116/112/111 | 575/705/662/671 | 749/900/854/869 |   592/824/692/676   |
|      32KB       | 108/120/115/115 | 703/823/767/772 | 718/878/850/866 | 1593/2124/2000/2085 |
|      64KB       | 108/119/114/114 | 592/732/683/688 | 805/934/903/911 | 1784/1943/1862/1843 |
|---------------------------------------------------------------------------------------------|

Windows hvsocket writer + Linux hvsocket reader:
|---------------------------------------------------------------------------------------------|
|Packet size ->   |     128B    |      1KB        |          4KB        |        64KB         |
|---------------------------------------------------------------------------------------------|
|SO_RCVBUF size | |               Throughput in MB/s (min/max/avg/median):                    |
|               v |                                                                           |
|---------------------------------------------------------------------------------------------|
|      Default    | 69/82/75/73 | 313/343/333/336 |   418/477/446/445   |   659/701/676/678   |
|      16KB       | 69/83/76/77 | 350/401/375/382 |   506/548/517/516   |   602/624/615/615   |
|      32KB       | 62/83/73/73 | 471/529/496/494 |   830/1046/935/939  | 944/1180/1070/1100  |
|      64KB       | 64/70/68/69 | 467/533/501/497 | 1260/1590/1430/1431 | 1605/1819/1670/1660 |
|---------------------------------------------------------------------------------------------|

Signed-off-by: Sunil Muthuswamy <sunilmut@microsoft.com>
---
- The table above exceeds the 75char limit for the patch. If that's a
problem, I can try to squeeze it in somehow within the limit.

- The patch has been previously submitted to net and reviewed. The
feedback was to submit it to net-next.

 net/vmw_vsock/hyperv_transport.c | 50 ++++++++++++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 10 deletions(-)

diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 982a8dc..8d3a7b0 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -23,14 +23,14 @@
 #include <net/sock.h>
 #include <net/af_vsock.h>
 
-/* The host side's design of the feature requires 6 exact 4KB pages for
- * recv/send rings respectively -- this is suboptimal considering memory
- * consumption, however unluckily we have to live with it, before the
- * host comes up with a better design in the future.
+/* Older (VMBUS version 'VERSION_WIN10' or before) Windows hosts have some
+ * stricter requirements on the hv_sock ring buffer size of six 4K pages. Newer
+ * hosts don't have this limitation; but, keep the defaults the same for compat.
  */
 #define PAGE_SIZE_4K		4096
 #define RINGBUFFER_HVS_RCV_SIZE (PAGE_SIZE_4K * 6)
 #define RINGBUFFER_HVS_SND_SIZE (PAGE_SIZE_4K * 6)
+#define RINGBUFFER_HVS_MAX_SIZE (PAGE_SIZE_4K * 64)
 
 /* The MTU is 16KB per the host side's design */
 #define HVS_MTU_SIZE		(1024 * 16)
@@ -344,9 +344,12 @@ static void hvs_open_connection(struct vmbus_channel *chan)
 
 	struct sockaddr_vm addr;
 	struct sock *sk, *new = NULL;
-	struct vsock_sock *vnew;
-	struct hvsock *hvs, *hvs_new;
+	struct vsock_sock *vnew = NULL;
+	struct hvsock *hvs = NULL;
+	struct hvsock *hvs_new = NULL;
+	int rcvbuf;
 	int ret;
+	int sndbuf;
 
 	if_type = &chan->offermsg.offer.if_type;
 	if_instance = &chan->offermsg.offer.if_instance;
@@ -388,9 +391,34 @@ static void hvs_open_connection(struct vmbus_channel *chan)
 	}
 
 	set_channel_read_mode(chan, HV_CALL_DIRECT);
-	ret = vmbus_open(chan, RINGBUFFER_HVS_SND_SIZE,
-			 RINGBUFFER_HVS_RCV_SIZE, NULL, 0,
-			 hvs_channel_cb, conn_from_host ? new : sk);
+
+	/* Use the socket buffer sizes as hints for the VMBUS ring size. For
+	 * server side sockets, 'sk' is the parent socket and thus, this will
+	 * allow the child sockets to inherit the size from the parent. Keep
+	 * the mins to the default value and align to page size as per VMBUS
+	 * requirements.
+	 * For the max, the socket core library will limit the socket buffer
+	 * size that can be set by the user, but, since currently, the hv_sock
+	 * VMBUS ring buffer is physically contiguous allocation, restrict it
+	 * further.
+	 * Older versions of hv_sock host side code cannot handle bigger VMBUS
+	 * ring buffer size. Use the version number to limit the change to newer
+	 * versions.
+	 */
+	if (vmbus_proto_version < VERSION_WIN10_V5) {
+		sndbuf = RINGBUFFER_HVS_SND_SIZE;
+		rcvbuf = RINGBUFFER_HVS_RCV_SIZE;
+	} else {
+		sndbuf = max_t(int, sk->sk_sndbuf, RINGBUFFER_HVS_SND_SIZE);
+		sndbuf = min_t(int, sndbuf, RINGBUFFER_HVS_MAX_SIZE);
+		sndbuf = ALIGN(sndbuf, PAGE_SIZE);
+		rcvbuf = max_t(int, sk->sk_rcvbuf, RINGBUFFER_HVS_RCV_SIZE);
+		rcvbuf = min_t(int, rcvbuf, RINGBUFFER_HVS_MAX_SIZE);
+		rcvbuf = ALIGN(rcvbuf, PAGE_SIZE);
+	}
+
+	ret = vmbus_open(chan, sndbuf, rcvbuf, NULL, 0, hvs_channel_cb,
+			 conn_from_host ? new : sk);
 	if (ret != 0) {
 		if (conn_from_host) {
 			hvs_new->chan = NULL;
@@ -441,6 +469,7 @@ static u32 hvs_get_local_cid(void)
 static int hvs_sock_init(struct vsock_sock *vsk, struct vsock_sock *psk)
 {
 	struct hvsock *hvs;
+	struct sock *sk = sk_vsock(vsk);
 
 	hvs = kzalloc(sizeof(*hvs), GFP_KERNEL);
 	if (!hvs)
@@ -448,7 +477,8 @@ static int hvs_sock_init(struct vsock_sock *vsk, struct vsock_sock *psk)
 
 	vsk->trans = hvs;
 	hvs->vsk = vsk;
-
+	sk->sk_sndbuf = RINGBUFFER_HVS_SND_SIZE;
+	sk->sk_rcvbuf = RINGBUFFER_HVS_RCV_SIZE;
 	return 0;
 }
 
-- 
2.7.4


^ permalink raw reply related

* RE: [PATCH net-next] hv_sock: perf: Allow the socket buffer size options to influence the actual socket buffers
From: Dexuan Cui @ 2019-05-22 23:06 UTC (permalink / raw)
  To: Sunil Muthuswamy, KY Srinivasan, Haiyang Zhang, Stephen Hemminger,
	Sasha Levin, David S. Miller, Michael Kelley
  Cc: netdev@vger.kernel.org, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <BN6PR21MB04652168EAE5D6D7D39BD4AAC0000@BN6PR21MB0465.namprd21.prod.outlook.com>

> From: Sunil Muthuswamy <sunilmut@microsoft.com>
> Sent: Wednesday, May 22, 2019 3:56 PM
> ...
> Currently, the hv_sock buffer size is static and can't scale to the
> bandwidth requirements of the application. This change allows the
> applications to influence the socket buffer sizes using the SO_SNDBUF and
> the SO_RCVBUF socket options.
>  ...
> 
> Signed-off-by: Sunil Muthuswamy <sunilmut@microsoft.com>

Reviewed-by: Dexuan Cui <decui@microsoft.com>

The patch looks good. Thanks, Sunil!

Thanks,
-- Dexuan

^ permalink raw reply

* [PATCH net-next] hv_sock: perf: loop in send() to maximize bandwidth
From: Sunil Muthuswamy @ 2019-05-22 23:10 UTC (permalink / raw)
  To: KY Srinivasan, Haiyang Zhang, Stephen Hemminger, Sasha Levin,
	David S. Miller, Michael Kelley
  Cc: netdev@vger.kernel.org, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org

Currently, the hv_sock send() iterates once over the buffer, puts data into
the VMBUS channel and returns. It doesn't maximize on the case when there
is a simultaneous reader draining data from the channel. In such a case,
the send() can maximize the bandwidth (and consequently minimize the cpu
cycles) by iterating until the channel is found to be full.

Perf data:
Total Data Transfer: 10GB/iteration
Single threaded reader/writer, Linux hvsocket writer with Windows hvsocket
reader
Packet size: 64KB
CPU sys time was captured using the 'time' command for the writer to send
10GB of data.
'Send Buffer Loop' is with the patch applied.
The values below are over 10 iterations.

|--------------------------------------------------------|
|        |        Current        |   Send Buffer Loop    |
|--------------------------------------------------------|
|        | Throughput | CPU sys  | Throughput | CPU sys  |
|        | (MB/s)     | time (s) | (MB/s)     | time (s) |
|--------------------------------------------------------|
| Min    |     407    |   7.048  |    401     |  5.958   |
|--------------------------------------------------------|
| Max    |     455    |   7.563  |    542     |  6.993   |
|--------------------------------------------------------|
| Avg    |     440    |   7.411  |    451     |  6.639   |
|--------------------------------------------------------|
| Median |     446    |   7.417  |    447     |  6.761   |
|--------------------------------------------------------|

Observation:
1. The avg throughput doesn't really change much with this change for this
scenario. This is most probably because the bottleneck on throughput is
somewhere else.
2. The average system (or kernel) cpu time goes down by 10%+ with this
change, for the same amount of data transfer.

Signed-off-by: Sunil Muthuswamy <sunilmut@microsoft.com>
---
- The patch has been previously submitted to net and reviewed. The
feedback was to submit it to net-next.

 net/vmw_vsock/hyperv_transport.c | 45 +++++++++++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 14 deletions(-)

diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 982a8dc..7c13032 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -55,8 +55,9 @@ struct hvs_recv_buf {
 };
 
 /* We can send up to HVS_MTU_SIZE bytes of payload to the host, but let's use
- * a small size, i.e. HVS_SEND_BUF_SIZE, to minimize the dynamically-allocated
- * buffer, because tests show there is no significant performance difference.
+ * a smaller size, i.e. HVS_SEND_BUF_SIZE, to maximize concurrency between the
+ * guest and the host processing as one VMBUS packet is the smallest processing
+ * unit.
  *
  * Note: the buffer can be eliminated in the future when we add new VMBus
  * ringbuffer APIs that allow us to directly copy data from userspace buffer
@@ -644,7 +645,9 @@ static ssize_t hvs_stream_enqueue(struct vsock_sock *vsk, struct msghdr *msg,
 	struct hvsock *hvs = vsk->trans;
 	struct vmbus_channel *chan = hvs->chan;
 	struct hvs_send_buf *send_buf;
-	ssize_t to_write, max_writable, ret;
+	ssize_t to_write, max_writable;
+	ssize_t ret = 0;
+	ssize_t bytes_written = 0;
 
 	BUILD_BUG_ON(sizeof(*send_buf) != PAGE_SIZE_4K);
 
@@ -652,20 +655,34 @@ static ssize_t hvs_stream_enqueue(struct vsock_sock *vsk, struct msghdr *msg,
 	if (!send_buf)
 		return -ENOMEM;
 
-	max_writable = hvs_channel_writable_bytes(chan);
-	to_write = min_t(ssize_t, len, max_writable);
-	to_write = min_t(ssize_t, to_write, HVS_SEND_BUF_SIZE);
-
-	ret = memcpy_from_msg(send_buf->data, msg, to_write);
-	if (ret < 0)
-		goto out;
+	/* Reader(s) could be draining data from the channel as we write.
+	 * Maximize bandwidth, by iterating until the channel is found to be
+	 * full.
+	 */
+	while (len) {
+		max_writable = hvs_channel_writable_bytes(chan);
+		if (!max_writable)
+			break;
+		to_write = min_t(ssize_t, len, max_writable);
+		to_write = min_t(ssize_t, to_write, HVS_SEND_BUF_SIZE);
+		/* memcpy_from_msg is safe for loop as it advances the offsets
+		 * within the message iterator.
+		 */
+		ret = memcpy_from_msg(send_buf->data, msg, to_write);
+		if (ret < 0)
+			goto out;
 
-	ret = hvs_send_data(hvs->chan, send_buf, to_write);
-	if (ret < 0)
-		goto out;
+		ret = hvs_send_data(hvs->chan, send_buf, to_write);
+		if (ret < 0)
+			goto out;
 
-	ret = to_write;
+		bytes_written += to_write;
+		len -= to_write;
+	}
 out:
+	/* If any data has been sent, return that */
+	if (bytes_written)
+		ret = bytes_written;
 	kfree(send_buf);
 	return ret;
 }
-- 
2.7.4


^ permalink raw reply related

* Re: [PATCH net-next] hv_sock: perf: Allow the socket buffer size options to influence the actual socket buffers
From: Stephen Hemminger @ 2019-05-22 23:12 UTC (permalink / raw)
  To: Sunil Muthuswamy
  Cc: KY Srinivasan, Haiyang Zhang, Stephen Hemminger, Sasha Levin,
	David S. Miller, Dexuan Cui, Michael Kelley,
	netdev@vger.kernel.org, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <BN6PR21MB04652168EAE5D6D7D39BD4AAC0000@BN6PR21MB0465.namprd21.prod.outlook.com>

On Wed, 22 May 2019 22:56:07 +0000
Sunil Muthuswamy <sunilmut@microsoft.com> wrote:

> Currently, the hv_sock buffer size is static and can't scale to the
> bandwidth requirements of the application. This change allows the
> applications to influence the socket buffer sizes using the SO_SNDBUF and
> the SO_RCVBUF socket options.
> 
> Few interesting points to note:
> 1. Since the VMBUS does not allow a resize operation of the ring size, the
> socket buffer size option should be set prior to establishing the
> connection for it to take effect.
> 2. Setting the socket option comes with the cost of that much memory being
> reserved/allocated by the kernel, for the lifetime of the connection.
> 
> Perf data:
> Total Data Transfer: 1GB
> Single threaded reader/writer
> Results below are summarized over 10 iterations.
> 
> Linux hvsocket writer + Windows hvsocket reader:
> |---------------------------------------------------------------------------------------------|
> |Packet size ->   |      128B       |       1KB       |       4KB       |        64KB         |
> |---------------------------------------------------------------------------------------------|
> |SO_SNDBUF size | |                 Throughput in MB/s (min/max/avg/median):                  |
> |               v |                                                                           |
> |---------------------------------------------------------------------------------------------|
> |      Default    | 109/118/114/116 | 636/774/701/700 | 435/507/480/476 |   410/491/462/470   |
> |      16KB       | 110/116/112/111 | 575/705/662/671 | 749/900/854/869 |   592/824/692/676   |
> |      32KB       | 108/120/115/115 | 703/823/767/772 | 718/878/850/866 | 1593/2124/2000/2085 |
> |      64KB       | 108/119/114/114 | 592/732/683/688 | 805/934/903/911 | 1784/1943/1862/1843 |
> |---------------------------------------------------------------------------------------------|
> 
> Windows hvsocket writer + Linux hvsocket reader:
> |---------------------------------------------------------------------------------------------|
> |Packet size ->   |     128B    |      1KB        |          4KB        |        64KB         |
> |---------------------------------------------------------------------------------------------|
> |SO_RCVBUF size | |               Throughput in MB/s (min/max/avg/median):                    |
> |               v |                                                                           |
> |---------------------------------------------------------------------------------------------|
> |      Default    | 69/82/75/73 | 313/343/333/336 |   418/477/446/445   |   659/701/676/678   |
> |      16KB       | 69/83/76/77 | 350/401/375/382 |   506/548/517/516   |   602/624/615/615   |
> |      32KB       | 62/83/73/73 | 471/529/496/494 |   830/1046/935/939  | 944/1180/1070/1100  |
> |      64KB       | 64/70/68/69 | 467/533/501/497 | 1260/1590/1430/1431 | 1605/1819/1670/1660 |
> |---------------------------------------------------------------------------------------------|
> 
> Signed-off-by: Sunil Muthuswamy <sunilmut@microsoft.com>

It looks like Exchange mangled you patch. It doesn't apply clean.


>  


^ permalink raw reply

* RE: [PATCH net-next] hv_sock: perf: loop in send() to maximize bandwidth
From: Dexuan Cui @ 2019-05-22 23:31 UTC (permalink / raw)
  To: Sunil Muthuswamy, KY Srinivasan, Haiyang Zhang, Stephen Hemminger,
	Sasha Levin, David S. Miller, Michael Kelley
  Cc: netdev@vger.kernel.org, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <BN6PR21MB0465FA591662A8B580AEF7D9C0000@BN6PR21MB0465.namprd21.prod.outlook.com>

> From: linux-hyperv-owner@vger.kernel.org
> <linux-hyperv-owner@vger.kernel.org> On Behalf Of Sunil Muthuswamy
> 
> Currently, the hv_sock send() iterates once over the buffer, puts data into
> the VMBUS channel and returns. It doesn't maximize on the case when there
> is a simultaneous reader draining data from the channel. In such a case,
> the send() can maximize the bandwidth (and consequently minimize the cpu
> cycles) by iterating until the channel is found to be full.
 
Reviewed-by: Dexuan Cui <decui@microsoft.com>

The patch looks good. Thanks, Sunil!

Thanks,
-- Dexuan

^ permalink raw reply

* Re: [PATCH net-next] hv_sock: perf: Allow the socket buffer size options to influence the actual socket buffers
From: David Miller @ 2019-05-23  1:00 UTC (permalink / raw)
  To: sunilmut
  Cc: kys, haiyangz, sthemmin, sashal, decui, mikelley, netdev,
	linux-hyperv, linux-kernel
In-Reply-To: <BN6PR21MB04652168EAE5D6D7D39BD4AAC0000@BN6PR21MB0465.namprd21.prod.outlook.com>

From: Sunil Muthuswamy <sunilmut@microsoft.com>
Date: Wed, 22 May 2019 22:56:07 +0000

> Currently, the hv_sock buffer size is static and can't scale to the
> bandwidth requirements of the application. This change allows the
> applications to influence the socket buffer sizes using the SO_SNDBUF and
> the SO_RCVBUF socket options.
> 
> Few interesting points to note:
> 1. Since the VMBUS does not allow a resize operation of the ring size, the
> socket buffer size option should be set prior to establishing the
> connection for it to take effect.
> 2. Setting the socket option comes with the cost of that much memory being
> reserved/allocated by the kernel, for the lifetime of the connection.
> 
> Perf data:
> Total Data Transfer: 1GB
> Single threaded reader/writer
> Results below are summarized over 10 iterations.
 ...
> Signed-off-by: Sunil Muthuswamy <sunilmut@microsoft.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-next] hv_sock: perf: loop in send() to maximize bandwidth
From: David Miller @ 2019-05-23  1:00 UTC (permalink / raw)
  To: sunilmut
  Cc: kys, haiyangz, sthemmin, sashal, mikelley, netdev, linux-hyperv,
	linux-kernel
In-Reply-To: <BN6PR21MB0465FA591662A8B580AEF7D9C0000@BN6PR21MB0465.namprd21.prod.outlook.com>

From: Sunil Muthuswamy <sunilmut@microsoft.com>
Date: Wed, 22 May 2019 23:10:44 +0000

> Currently, the hv_sock send() iterates once over the buffer, puts data into
> the VMBUS channel and returns. It doesn't maximize on the case when there
> is a simultaneous reader draining data from the channel. In such a case,
> the send() can maximize the bandwidth (and consequently minimize the cpu
> cycles) by iterating until the channel is found to be full.
> 
> Perf data:
> Total Data Transfer: 10GB/iteration
> Single threaded reader/writer, Linux hvsocket writer with Windows hvsocket
> reader
> Packet size: 64KB
> CPU sys time was captured using the 'time' command for the writer to send
> 10GB of data.
> 'Send Buffer Loop' is with the patch applied.
> The values below are over 10 iterations.
 ...
> Observation:
> 1. The avg throughput doesn't really change much with this change for this
> scenario. This is most probably because the bottleneck on throughput is
> somewhere else.
> 2. The average system (or kernel) cpu time goes down by 10%+ with this
> change, for the same amount of data transfer.
> 
> Signed-off-by: Sunil Muthuswamy <sunilmut@microsoft.com>

Applied.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox