From: "Kenneth Duda" <kduda@arastra.com>
To: qemu-devel@nongnu.org
Subject: [Qemu-devel] Re: Network Performance between Win Host and Linux
Date: Tue, 11 Apr 2006 15:36:32 -0700 [thread overview]
Message-ID: <6fe044190604111536k944383o99ab27411d3864db@mail.gmail.com> (raw)
In-Reply-To: <6fe044190604111020h47108190x23983325567fb51c@mail.gmail.com>
[-- Attachment #1: Type: text/plain, Size: 1476 bytes --]
The "qemu-slirp-performance" patch contains three improvements to qemu
slirp networking performance. Booting my virtual machine (which
NFS-mounts its root filesystem from the host) has been accelerated by
8x, from over 5 minutes to 40 seconds. TCP throughput has been
accelerated from about 2 megabytes/sec to 9 megabytes/sec, in both
directions (measured using a simple python script). The system is
subjectively more responsive (for activities such as logging in or
running simple python scripts).
The specific problems fixed are:
- the mss for the slirp-to-vm direction was 512 bytes (now 1460);
- qemu would block in select() for up to four milliseconds at a
time, even when data was waiting on slirp sockets;
- slirp was deliberately delaying acks until timer expiration
(TF_DELACK), preventing the vm from opening its send window, in
violation of rfc2581.
These fixes are together in one patch (qemu-slirp-performance.patch).
I have also attached some related patches that fix fairly serious
slirp bugs for IP datagrams larger than 4k. Before these patches,
large packets can corrupt the heap or get reassembled in some
entertaining but incorrect orders (because ip_off was being sorted as
though it was signed!) These patches are attached in the order I
apply them.
I hope they are helpful. If there's anything I can do to make them
more likely to be accepted into the mainline, please let me know.
Thanks,
-Ken
[-- Attachment #2: qemu-slirp-mbuf-bug.patch --]
[-- Type: text/plain, Size: 888 bytes --]
diff -BurN qemu-snapshot-2006-03-27_23.orig/slirp/mbuf.c qemu-snapshot-2006-03-27_23/slirp/mbuf.c
--- qemu-snapshot-2006-03-27_23.orig/slirp/mbuf.c 2004-04-22 00:10:47.000000000 +0000
+++ qemu-snapshot-2006-03-27_23/slirp/mbuf.c 2006-04-05 13:03:03.000000000 +0000
@@ -146,18 +146,19 @@
struct mbuf *m;
int size;
{
+ int datasize;
+
/* some compiles throw up on gotos. This one we can fake. */
if(m->m_size>size) return;
if (m->m_flags & M_EXT) {
- /* datasize = m->m_data - m->m_ext; */
+ datasize = m->m_data - m->m_ext;
m->m_ext = (char *)realloc(m->m_ext,size);
/* if (m->m_ext == NULL)
* return (struct mbuf *)NULL;
*/
- /* m->m_data = m->m_ext + datasize; */
+ m->m_data = m->m_ext + datasize;
} else {
- int datasize;
char *dat;
datasize = m->m_data - m->m_dat;
dat = (char *)malloc(size);
[-- Attachment #3: qemu-slirp-reassembly-bug.patch --]
[-- Type: text/plain, Size: 467 bytes --]
diff -BurN qemu-snapshot-2006-03-27_23.orig/slirp/ip_input.c qemu-snapshot-2006-03-27_23/slirp/ip_input.c
--- qemu-snapshot-2006-03-27_23.orig/slirp/ip_input.c 2004-04-22 00:10:47.000000000 +0000
+++ qemu-snapshot-2006-03-27_23/slirp/ip_input.c 2006-04-06 06:02:52.000000000 +0000
@@ -344,8 +344,8 @@
while (q != (struct ipasfrag *)fp) {
struct mbuf *t;
t = dtom(q);
- m_cat(m, t);
q = (struct ipasfrag *) q->ipf_next;
+ m_cat(m, t);
}
/*
[-- Attachment #4: qemu-slirp-32k-packets.patch --]
[-- Type: text/plain, Size: 4434 bytes --]
diff -burN qemu-snapshot-2006-03-27_23.orig/slirp/ip.h qemu-snapshot-2006-03-27_23/slirp/ip.h
--- qemu-snapshot-2006-03-27_23.orig/slirp/ip.h 2004-04-21 17:10:47.000000000 -0700
+++ qemu-snapshot-2006-03-27_23/slirp/ip.h 2006-04-06 00:28:49.000000000 -0700
@@ -79,6 +79,11 @@
* We declare ip_len and ip_off to be short, rather than u_short
* pragmatically since otherwise unsigned comparisons can result
* against negative integers quite easily, and fail in subtle ways.
+ *
+ * The only problem with the above theory is that these quantities
+ * are in fact unsigned, and sorting fragments by a signed version
+ * of ip_off doesn't work very well, nor does length checks on
+ * ip packets with a signed version of their length!
*/
struct ip {
#ifdef WORDS_BIGENDIAN
@@ -101,6 +106,9 @@
struct in_addr ip_src,ip_dst; /* source and dest address */
};
+#define IP_OFF(ip) (*(u_int16_t *)&((ip)->ip_off))
+#define IP_LEN(ip) (*(u_int16_t *)&((ip)->ip_len))
+
#define IP_MAXPACKET 65535 /* maximum packet size */
/*
diff -burN qemu-snapshot-2006-03-27_23.orig/slirp/ip_input.c qemu-snapshot-2006-03-27_23/slirp/ip_input.c
--- qemu-snapshot-2006-03-27_23.orig/slirp/ip_input.c 2004-04-21 17:10:47.000000000 -0700
+++ qemu-snapshot-2006-03-27_23/slirp/ip_input.c 2006-04-06 00:32:19.000000000 -0700
@@ -111,7 +111,7 @@
* Convert fields to host representation.
*/
NTOHS(ip->ip_len);
- if (ip->ip_len < hlen) {
+ if (IP_LEN(ip) < hlen) {
ipstat.ips_badlen++;
goto bad;
}
@@ -124,13 +124,13 @@
* Trim mbufs if longer than we expect.
* Drop packet if shorter than we expect.
*/
- if (m->m_len < ip->ip_len) {
+ if (m->m_len < IP_LEN(ip)) {
ipstat.ips_tooshort++;
goto bad;
}
/* Should drop packet if mbuf too long? hmmm... */
- if (m->m_len > ip->ip_len)
- m_adj(m, ip->ip_len - m->m_len);
+ if (m->m_len > IP_LEN(ip))
+ m_adj(m, IP_LEN(ip) - m->m_len);
/* check ip_ttl for a correct ICMP reply */
if(ip->ip_ttl==0 || ip->ip_ttl==1) {
@@ -191,7 +191,7 @@
* or if this is not the first fragment,
* attempt reassembly; if it succeeds, proceed.
*/
- if (((struct ipasfrag *)ip)->ipf_mff & 1 || ip->ip_off) {
+ if (((struct ipasfrag *)ip)->ipf_mff & 1 || IP_OFF(ip)) {
ipstat.ips_fragments++;
ip = ip_reass((struct ipasfrag *)ip, fp);
if (ip == 0)
@@ -281,7 +281,7 @@
*/
for (q = (struct ipasfrag *)fp->ipq_next; q != (struct ipasfrag *)fp;
q = (struct ipasfrag *)q->ipf_next)
- if (q->ip_off > ip->ip_off)
+ if (IP_OFF(q) > IP_OFF(ip))
break;
/*
@@ -290,10 +290,10 @@
* segment. If it provides all of our data, drop us.
*/
if (q->ipf_prev != (ipasfragp_32)fp) {
- i = ((struct ipasfrag *)(q->ipf_prev))->ip_off +
- ((struct ipasfrag *)(q->ipf_prev))->ip_len - ip->ip_off;
+ i = IP_OFF((struct ipasfrag *)(q->ipf_prev)) +
+ IP_LEN((struct ipasfrag *)(q->ipf_prev)) - IP_OFF(ip);
if (i > 0) {
- if (i >= ip->ip_len)
+ if (i >= IP_LEN(ip))
goto dropfrag;
m_adj(dtom(ip), i);
ip->ip_off += i;
@@ -305,9 +305,9 @@
* While we overlap succeeding segments trim them or,
* if they are completely covered, dequeue them.
*/
- while (q != (struct ipasfrag *)fp && ip->ip_off + ip->ip_len > q->ip_off) {
- i = (ip->ip_off + ip->ip_len) - q->ip_off;
- if (i < q->ip_len) {
+ while (q != (struct ipasfrag *)fp && IP_OFF(ip) + IP_LEN(ip) > IP_OFF(q)) {
+ i = (IP_OFF(ip) + IP_LEN(ip)) - IP_OFF(q);
+ if (i < IP_LEN(q)) {
q->ip_len -= i;
q->ip_off += i;
m_adj(dtom(q), i);
@@ -327,9 +327,9 @@
next = 0;
for (q = (struct ipasfrag *) fp->ipq_next; q != (struct ipasfrag *)fp;
q = (struct ipasfrag *) q->ipf_next) {
- if (q->ip_off != next)
+ if (IP_OFF(q) != next)
return (0);
- next += q->ip_len;
+ next += IP_LEN(q);
}
if (((struct ipasfrag *)(q->ipf_prev))->ipf_mff & 1)
return (0);
diff -burN qemu-snapshot-2006-03-27_23.orig/slirp/udp.c qemu-snapshot-2006-03-27_23/slirp/udp.c
--- qemu-snapshot-2006-03-27_23.orig/slirp/udp.c 2006-04-06 00:24:30.000000000 -0700
+++ qemu-snapshot-2006-03-27_23/slirp/udp.c 2006-04-06 00:32:55.000000000 -0700
@@ -111,12 +111,12 @@
*/
len = ntohs((u_int16_t)uh->uh_ulen);
- if (ip->ip_len != len) {
- if (len > ip->ip_len) {
+ if (IP_LEN(ip) != len) {
+ if (len > IP_LEN(ip)) {
udpstat.udps_badlen++;
goto bad;
}
- m_adj(m, len - ip->ip_len);
+ m_adj(m, len - IP_LEN(ip));
ip->ip_len = len;
}
[-- Attachment #5: qemu-slirp-performance.patch --]
[-- Type: text/plain, Size: 3196 bytes --]
diff -BurN qemu-snapshot-2006-03-27_23.orig/slirp/tcp.h qemu-snapshot-2006-03-27_23/slirp/tcp.h
--- qemu-snapshot-2006-03-27_23.orig/slirp/tcp.h 2004-04-21 17:10:47.000000000 -0700
+++ qemu-snapshot-2006-03-27_23/slirp/tcp.h 2006-04-11 15:22:05.000000000 -0700
@@ -100,8 +100,10 @@
* With an IP MSS of 576, this is 536,
* but 512 is probably more convenient.
* This should be defined as MIN(512, IP_MSS - sizeof (struct tcpiphdr)).
+ *
+ * We make this 1460 because we only care about Ethernet in the qemu context.
*/
-#define TCP_MSS 512
+#define TCP_MSS 1460
#define TCP_MAXWIN 65535 /* largest value for (unscaled) window */
diff -BurN qemu-snapshot-2006-03-27_23.orig/slirp/tcp_input.c qemu-snapshot-2006-03-27_23/slirp/tcp_input.c
--- qemu-snapshot-2006-03-27_23.orig/slirp/tcp_input.c 2004-10-07 16:27:35.000000000 -0700
+++ qemu-snapshot-2006-03-27_23/slirp/tcp_input.c 2006-04-11 15:22:05.000000000 -0700
@@ -580,28 +580,11 @@
* congestion avoidance sender won't send more until
* he gets an ACK.
*
- * Here are 3 interpretations of what should happen.
- * The best (for me) is to delay-ack everything except
- * if it's a one-byte packet containing an ESC
- * (this means it's an arrow key (or similar) sent using
- * Nagel, hence there will be no echo)
- * The first of these is the original, the second is the
- * middle ground between the other 2
+ * It is better to not delay acks at all to maximize
+ * TCP throughput. See RFC 2581.
*/
-/* if (((unsigned)ti->ti_len < tp->t_maxseg)) {
- */
-/* if (((unsigned)ti->ti_len < tp->t_maxseg &&
- * (so->so_iptos & IPTOS_LOWDELAY) == 0) ||
- * ((so->so_iptos & IPTOS_LOWDELAY) &&
- * ((struct tcpiphdr_2 *)ti)->first_char == (char)27)) {
- */
- if ((unsigned)ti->ti_len == 1 &&
- ((struct tcpiphdr_2 *)ti)->first_char == (char)27) {
- tp->t_flags |= TF_ACKNOW;
- tcp_output(tp);
- } else {
- tp->t_flags |= TF_DELACK;
- }
+ tp->t_flags |= TF_ACKNOW;
+ tcp_output(tp);
return;
}
} /* header prediction */
diff -BurN qemu-snapshot-2006-03-27_23.orig/vl.c qemu-snapshot-2006-03-27_23/vl.c
--- qemu-snapshot-2006-03-27_23.orig/vl.c 2006-04-11 15:21:27.000000000 -0700
+++ qemu-snapshot-2006-03-27_23/vl.c 2006-04-11 15:22:05.000000000 -0700
@@ -4026,7 +4026,7 @@
void main_loop_wait(int timeout)
{
IOHandlerRecord *ioh, *ioh_next;
- fd_set rfds, wfds;
+ fd_set rfds, wfds, xfds;
int ret, nfds;
struct timeval tv;
@@ -4041,6 +4041,7 @@
nfds = -1;
FD_ZERO(&rfds);
FD_ZERO(&wfds);
+ FD_ZERO(&xfds);
for(ioh = first_io_handler; ioh != NULL; ioh = ioh->next) {
if (ioh->fd_read &&
(!ioh->fd_read_poll ||
@@ -4062,7 +4063,12 @@
#else
tv.tv_usec = timeout * 1000;
#endif
- ret = select(nfds + 1, &rfds, &wfds, NULL, &tv);
+#if defined(CONFIG_SLIRP)
+ if (slirp_inited) {
+ slirp_select_fill(&nfds, &rfds, &wfds, &xfds);
+ }
+#endif
+ ret = select(nfds + 1, &rfds, &wfds, &xfds, &tv);
if (ret > 0) {
/* XXX: better handling of removal */
for(ioh = first_io_handler; ioh != NULL; ioh = ioh_next) {
next prev parent reply other threads:[~2006-04-11 22:36 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2006-04-11 17:20 [Qemu-devel] Network Performance between Win Host and Linux Kenneth Duda
2006-04-11 17:28 ` Paul Brook
2006-04-11 17:49 ` Kenneth Duda
2006-04-11 18:19 ` Helmut Auer
2006-04-12 2:10 ` Kazu
2006-04-11 20:40 ` Leonardo E. Reiter
2006-04-11 21:46 ` Kenneth Duda
2006-04-11 21:58 ` Leonardo E. Reiter
2006-04-11 22:42 ` Kenneth Duda
2006-04-11 21:00 ` Leonardo E. Reiter
2006-04-11 22:36 ` Kenneth Duda [this message]
2006-04-12 14:04 ` [Qemu-devel] " Leonardo E. Reiter
2006-04-12 18:19 ` Kenneth Duda
2006-04-12 18:26 ` Leonardo E. Reiter
2006-04-12 14:31 ` Leonardo E. Reiter
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=6fe044190604111536k944383o99ab27411d3864db@mail.gmail.com \
--to=kduda@arastra.com \
--cc=qemu-devel@nongnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).