* [Qemu-devel] [PATCH] net: add raw backend @ 2009-07-01 15:46 Or Gerlitz 2009-07-01 16:21 ` Jamie Lokier ` (2 more replies) 0 siblings, 3 replies; 30+ messages in thread From: Or Gerlitz @ 2009-07-01 15:46 UTC (permalink / raw) To: qemu-devel; +Cc: Herbert Xu Add raw network backend option which uses a packet socket to provide raw networking access. Once the socket is opened its bouned to a provided host interface, such that packets received on the interface are delivered to the VM and packets sent by the VM are sent to the interface. Signed-off-by: Or Gerlitz<ogerlitz@voltaire.com> diff --git a/net.c b/net.c index 55f70f2..f7ff381 100644 --- a/net.c +++ b/net.c @@ -93,6 +93,9 @@ #endif #endif +#include <netpacket/packet.h> +#include <net/ethernet.h> + #if defined(__OpenBSD__) #include <util.h> #endif @@ -1476,6 +1479,155 @@ static TAPState *net_tap_init(VLANState *vlan, const char *model, #endif /* !_WIN32 */ +typedef struct RAWState { + VLANClientState *vc; + int fd; + uint8_t buf[4096]; + int promisc; +} RAWState; + +static int net_raw_fd_init(Monitor *mon, const char *ifname, int promisc) +{ + int fd, ret; + struct ifreq req; + struct sockaddr_ll lladdr; + + fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); + if (fd < 0) + config_error(mon, "packet socket failed\n"); + + memset(&req, 0, sizeof(req)); + strncpy(req.ifr_name, ifname, IFNAMSIZ-1); + ret = ioctl(fd, SIOCGIFINDEX, &req); + if (ret < 0) + config_error(mon, "SIOCGIFINDEX failed\n"); + + memset(&lladdr, 0, sizeof(lladdr)); + lladdr.sll_family = AF_PACKET; + lladdr.sll_protocol = htons(ETH_P_ALL); + lladdr.sll_ifindex = req.ifr_ifindex; + ret = bind(fd, (const struct sockaddr *)&lladdr, sizeof(lladdr)); + if (ret < 0) + config_error(mon, "bind failed\n"); + + /* set iface to promiscuous mode (packets sent to the VM MAC) */ + if (promisc) { + ret = ioctl(fd, SIOCGIFFLAGS, &req); + if (ret < 0) + perror("SIOCGIFFLAGS failed\n"); + req.ifr_flags |= IFF_PROMISC; + ret = ioctl(fd, SIOCSIFFLAGS, &req); + 
if (ret < 0) + config_error(mon, "SIOCSIFFLAGS to promiscous failed\n"); + } + + ret = fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_NONBLOCK); + if (ret < 0) + config_error(mon, "O_NONBLOCK set failed\n"); + + return fd; +} + +static void raw_cleanup(VLANClientState *vc) +{ + struct ifreq req; + RAWState *s = vc->opaque; + + qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL); + if (s->promisc) { + ioctl(s->fd, SIOCGIFFLAGS, &req); + req.ifr_flags &= ~IFF_PROMISC; + ioctl(s->fd, SIOCSIFFLAGS, &req); + } + close(s->fd); + qemu_free(s); +} + +static void raw_send(void *opaque); + +static int raw_can_send(void *opaque) +{ + RAWState *s = opaque; + + return qemu_can_send_packet(s->vc); +} + +static void raw_send_completed(VLANClientState *vc, ssize_t len) +{ + RAWState *s = vc->opaque; + + qemu_set_fd_handler2(s->fd, raw_can_send, raw_send, NULL, s); +} + +static void raw_send(void *opaque) +{ + RAWState *s = opaque; + int size; + + do { + size = recv(s->fd, s->buf, sizeof(s->buf), MSG_TRUNC); + if (size <= 0) + break; + + size = qemu_send_packet_async(s->vc, s->buf, size, + raw_send_completed); + if (size == 0) + qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL); + + } while (size > 0); +} + +static ssize_t raw_receive_iov(VLANClientState *vc, const struct iovec *iov, + int iovcnt) +{ + ssize_t len; + RAWState *s = vc->opaque; + + do { + len = writev(s->fd, iov, iovcnt); + } while (len == -1 && (errno == EINTR || errno == EAGAIN)); + + return len; +} + +static ssize_t raw_receive(VLANClientState *vc, const uint8_t *buf, size_t size) +{ + struct iovec iov[1]; + + iov[0].iov_base = (char *)buf; + iov[0].iov_len = size; + + return raw_receive_iov(vc, iov, 1); +} + +static int net_raw_init(Monitor *mon, VLANState *vlan, const char *model, + const char *name, const char *ifname, + int promisc, int fd) +{ + RAWState *s; + + s = qemu_mallocz(sizeof(RAWState)); + + if (fd == -1) { + s->fd = net_raw_fd_init(mon, ifname, promisc); + s->promisc = promisc; + } else + s->fd = fd; + 
+ s->vc = qemu_new_vlan_client(vlan, model, name, NULL, raw_receive, + raw_receive_iov, raw_cleanup, s); + qemu_set_fd_handler2(s->fd, raw_can_send, raw_send, NULL, s); + + if (fd == -1) + snprintf(s->vc->info_str, sizeof(s->vc->info_str), + "raw: ifname=%s, promisc=%d", ifname, promisc); + else + snprintf(s->vc->info_str, sizeof(s->vc->info_str), + "raw: fd=%d", fd); + + return 0; +} + #if defined(CONFIG_VDE) typedef struct VDEState { VLANClientState *vc; @@ -2348,6 +2500,41 @@ int net_client_init(Monitor *mon, const char *device, const char *p) } } else #endif + if (!strcmp(device, "raw")) { + char chkbuf[64], ifname[64]; + int raw_fd = -1; + int promisc = 1; + if (get_param_value(buf, sizeof(buf), "fd", p) > 0) { + static const char * const fd_params[] = { + "vlan", "name", "fd", NULL + }; + if (check_params(chkbuf, sizeof(chkbuf), fd_params, p) < 0) { + config_error(mon, "invalid parameter '%s' in '%s'\n", chkbuf, p); + ret = -1; + goto out; + } + raw_fd = strtol(buf, NULL, 0); + fcntl(raw_fd, F_SETFL, fcntl(raw_fd, F_GETFL | O_NONBLOCK)); + } else { + static const char * const tap_params[] = { + "vlan", "name", "ifname", "promisc", NULL + }; + if (check_params(chkbuf, sizeof(chkbuf), tap_params, p) < 0) { + config_error(mon, "invalid parameter '%s' in '%s'\n", chkbuf, p); + ret = -1; + goto out; + } + if (get_param_value(ifname, sizeof(ifname), "ifname", p) <= 0) { + config_error(mon, "raw: no interface name\n"); + ret = -1; + goto out; + } + if (get_param_value(buf, sizeof(buf), "promisc", p)) + promisc = atoi(buf); + } + vlan->nb_host_devs++; + ret = net_raw_init(mon, vlan, device, name, ifname, promisc, raw_fd); + } else if (!strcmp(device, "socket")) { char chkbuf[64]; if (get_param_value(buf, sizeof(buf), "fd", p) > 0) { diff --git a/qemu-options.hx b/qemu-options.hx index 503da33..0a3c807 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -761,6 +761,10 @@ DEF("net", HAS_ARG, QEMU_OPTION_net, " use 'sndbuf=nbytes' to limit the size of the send 
buffer\n" #endif #endif + "-net raw[,vlan=n][,name=str],ifname=name[,promisc=m]\n" + " bound the host network interface to VLAN 'n' in a raw manner:\n" + " packets received on the interface are delivered to the vlan and\n" + " packets delivered on the vlan are sent to the interface\n" "-net socket[,vlan=n][,name=str][,fd=h][,listen=[host]:port][,connect=host:port]\n" " connect the vlan 'n' to another VLAN using a socket connection\n" "-net socket[,vlan=n][,name=str][,fd=h][,mcast=maddr:port]\n" ^ permalink raw reply related [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-01 15:46 [Qemu-devel] [PATCH] net: add raw backend Or Gerlitz @ 2009-07-01 16:21 ` Jamie Lokier 2009-07-02 12:25 ` Or Gerlitz [not found] ` <5b31733c0907011250i7afcdbcdnb844290de4ad64f2@mail.gmail.com> 2009-07-02 15:43 ` Michael S. Tsirkin 2 siblings, 1 reply; 30+ messages in thread From: Jamie Lokier @ 2009-07-01 16:21 UTC (permalink / raw) To: Or Gerlitz; +Cc: Herbert Xu, qemu-devel Or Gerlitz wrote: > Add raw network backend option which uses a packet socket to provide > raw networking access. Once the socket is opened its bouned to a > provided host interface, such that packets received on the interface > are delivered to the VM and packets sent by the VM are sent to the > interface. It looks like it would be functionally identical to the pcap network backend, with the same advantages and problems, but one less dependency on an external library (and removes any compatibility with ancient kernels that are in libpcap, but QEMU doesn't work on them anyway). -- Jamie ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-01 16:21 ` Jamie Lokier @ 2009-07-02 12:25 ` Or Gerlitz 2009-07-03 2:39 ` Jamie Lokier 0 siblings, 1 reply; 30+ messages in thread From: Or Gerlitz @ 2009-07-02 12:25 UTC (permalink / raw) To: Jamie Lokier; +Cc: Herbert Xu, qemu-devel Jamie Lokier wrote: > It looks like it would be functionally identical to the pcap network backend, > with the same advantages and problems, but one less dependency on an external > library (and removes any compatibility with ancient kernels that are in libpcap, > but QEMU doesn't work on them anyway). Assuming that by "pcap network backend" you refer to the -net dump backend, I don't think this (the idea/patch being functionally identical) is the case with the current code: the dump backend opens a file and writes there packets received from the vlan in pcap format, that's all. So first, there's no direct linking with libpcap, and more important, the packets aren't going to the network through the dump backend. With the raw backend, there's no pcap file, but packets are sent to and received from the network through packet socket. Or. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-02 12:25 ` Or Gerlitz @ 2009-07-03 2:39 ` Jamie Lokier 2009-07-07 13:33 ` Or Gerlitz 0 siblings, 1 reply; 30+ messages in thread From: Jamie Lokier @ 2009-07-03 2:39 UTC (permalink / raw) To: Or Gerlitz; +Cc: Herbert Xu, qemu-devel Or Gerlitz wrote: > Jamie Lokier wrote: > > It looks like it would be functionally identical to the pcap > > network backend, with the same advantages and problems, but one > > less dependency on an external library (and removes any > > compatibility with ancient kernels that are in libpcap, but QEMU > > doesn't work on them anyway). > > Assuming that by "pcap network backend" you refer to the -net dump > backend, I don't think this (the idea/patch being functionally > identical) is the case with the current code: the dump backend opens a > file and writes there packets received from the vlan in pcap format, > that's all. So first, there's no direct linking with libpcap, and more > important, the packets aren't going to the network through the dump > backend. With the raw backend, there's no pcap file, but packets are > sent to and received from the network through packet socket. No, I meant "-net pcap", whose patch is announced at http://lists.freebsd.org/pipermail/freebsd-emulation/2007-February/003108.html and found here: http://people.freebsd.org/~jkim/patch-qemu-pcap.diff I saw it mention on this list in the last few months. It sends and receives packets over a host network interface using libpcap. -- Jamie ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-03 2:39 ` Jamie Lokier @ 2009-07-07 13:33 ` Or Gerlitz 2009-07-07 14:57 ` Jamie Lokier 0 siblings, 1 reply; 30+ messages in thread From: Or Gerlitz @ 2009-07-07 13:33 UTC (permalink / raw) To: Jamie Lokier, Anthony Liguori; +Cc: Herbert Xu, Jan Kiszka, qemu-devel Jamie Lokier wrote: > No, I meant "-net pcap", whose patch is announced at > http://lists.freebsd.org/pipermail/freebsd-emulation/2007-February/003108.html > and found here: http://people.freebsd.org/~jkim/patch-qemu-pcap.diff I saw it mention on this list in the last few months. It sends and receives packets over a host network interface using libpcap. okay, looking a bit on the archives I realized that the -net pcap backend was suggested on the qemu-devel list at least twice, on 2007 (your pointer) and recently, e.g @ http://lists.gnu.org/archive/html/qemu-devel/2009-03/msg00895.html and the mail threads that followed from March, April and May 2009. Under Linux the pcap library uses a packet socket, so basically it could make some sense to go through libpcap and not directly to sockets, but there are also some disadvantages which will not let qemu implement some related optimizations which are not integrated into libpcap. Now, before going into libpcap vs. packet socket, I'd be happy if you or Anthony can help me understand the comments that with this approach guest <--> host communication is impossible. AFAIK, the only packet sent by Qemu is gratuitous ARP after migration, but it doesn't fall into guest <--> host communication over the NIC/vlan/back-end gang, so I don't see what is this traffic which is impossible with the bridge-less approach. Also, if for some reason one needs to communicate from the guest to the host the -net raw can be instructed to run over a veth Linux interface couple which are connected to a bridge. Or. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-07 13:33 ` Or Gerlitz @ 2009-07-07 14:57 ` Jamie Lokier 2009-07-08 14:45 ` Or Gerlitz 0 siblings, 1 reply; 30+ messages in thread From: Jamie Lokier @ 2009-07-07 14:57 UTC (permalink / raw) To: Or Gerlitz; +Cc: Herbert Xu, Jan Kiszka, qemu-devel Or Gerlitz wrote: > Now, before going into libpcap vs. packet socket, I'd be happy if you or > Antony can help me understand the comments that with this approach guest > <--> host communication is impossible. AFAIK, the only packet sent by > Qemu is gratuitous ARP after migration, but it doesn't fall into guest > <--> host communication over the NIC/vlan/back-end gang, so I don't see > what is this traffic which is impossible with the bridge-less approach. The problem is simply what the guest sends goes out on the network and is not looped back to the host network stack, and vice versa. So if your host is 192.168.1.1 and is running a DNS server (say), and the guest is 192.168.1.2, when the guest sends queries to 192.168.1.1 the host won't see those queries. Same if you're running an FTP server on the host and the guest wants to connect to it, etc. It also means multiple guests can't see each other, for the same reason. So it's much less useful than bridging, where the guests and host can all see each other and connect to each other. Unfortunately, bridging is a pain to set up, if your host has any complicated or automatic network configuration already. For example, it's impossible to make it work with NetworkManager, which manages a mobile host's wired and wireless network interfaces. (I want to use lots of VMs on my laptop, and I want them to appear on the same network as whatever my host is connecting to automatically. Bridging - the way Linux does it - makes that difficult). The main advantage of pcap/packet is you don't need potentially difficult to configure bridges set up on the host network especially to support VMs. 
pcap/packet just works - as long as you don't need any host<->guest or guest<->guest communication. The main advantage of bridging is you get simple, complete host<->guest and guest<->guest communication - (except the host can firewall them). It would be really nice to find a way which has the advantages of both. Either by adding a different bridging mode to Linux, where host interfaces can be configured for IP and the bridge hangs off the host interface, or by a modified tap interface, or by an alternative pcap/packet-like interface which forwards packets in a similar way to bridging. (Maybe those are different ways of saying the same thing.) > Also, if for some reason one need to communicate from the guest to the > host the -net raw can instructed to run over a veth Linux interface > couple which are connected to a bridge. But you might as well use a tap interface connected to a bridge instead. tap or veth+packet are virtually equivalent, and they both have the same bridging configuration problems if the host network is interesting. -- Jamie ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-07 14:57 ` Jamie Lokier @ 2009-07-08 14:45 ` Or Gerlitz 2009-07-14 13:54 ` Or Gerlitz 2009-07-15 20:38 ` Jamie Lokier 0 siblings, 2 replies; 30+ messages in thread From: Or Gerlitz @ 2009-07-08 14:45 UTC (permalink / raw) To: Jamie Lokier; +Cc: Herbert Xu, Jan Kiszka, qemu-devel Jamie Lokier wrote: > The problem is simply what the guest sends goes out on the network and is not looped backed to the host network stack, and vice versa. So if your host is 192.168.1.1 and is running a DNS server (say), and the guest is 192.168.1.2, when the guest sends queries to 192.168.1.1 the host won't see those queries. Same if you're running an FTP server on the host and the guest wants to connect to it, etc. It also means multiple guests can't see each other, for the same reason. So it's much less useful than bridging, where the guests and host can all see each other and connect to each other. I wasn't sure to follow if your example refers to the case when networking uses the bridge or NAT. If its bridge, then through which bridge interface the packet arrives the host stack? say you have a bridge whose attached interfaces are tap1(VM1), tap2(VM2) and eth0(NIC), in your example did you mean that the host IP address is assigned to the bridge interface? or you were referring a NAT based scheme? > Unfortunately, bridging is a pain to set up, if your host has any complicated or automatic network configuration already. As you said bridging requires more configuration, but not less important the performance (packets per second and cpu utilization) one can get with bridge+tap is much lower vs what you get with the raw mode approach. All in all, its clear that with this approach VM/VM and VM/Host communication would have to get switched either at the NIC (e.g SR/IOV capable NICs supporting a virtual bridge) or at an external switch and make a U turn. 
There's a bunch of reasons why people would like to do that, among them performance boost, the ability to shape, manage and monitor VM/VM traffic in external switches and more. > It would be really nice to find a way which has the advantages of both. Either by adding a different bridging mode to Linux, where host interfaces can be configured for IP and the bridge hangs off the host interface, or by a modified tap interface, or by an alternative > pcap/packet-like interface which forwards packets in a similar way to bridging. It seems that this will not yield the performance improvement we can get with going directly to the NIC. But if someone comes up and makes such a mode working, it can be merged into qemu as well along with the raw mode. Or. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-08 14:45 ` Or Gerlitz @ 2009-07-14 13:54 ` Or Gerlitz 2009-07-15 20:38 ` Jamie Lokier 1 sibling, 0 replies; 30+ messages in thread From: Or Gerlitz @ 2009-07-14 13:54 UTC (permalink / raw) To: Jamie Lokier; +Cc: Herbert Xu, Jan Kiszka, qemu-devel Or Gerlitz wrote: > Jamie Lokier wrote: >> The problem is simply what the guest sends goes out on the network >> and is not looped backed to the host network stack, and vice versa [...] > I wasn't sure to follow if your example refers to the case when > networking uses the bridge or NAT. If its bridge, then through which > bridge interface the packet arrives the host stack? say you have a > bridge whose attached interfaces are tap1(VM1), tap2(VM2) and > eth0(NIC), in your example did you mean that the host IP address is > assigned to the bridge interface? or you were referring a NAT based > scheme? Hi Jamie, Can you comment on my email? Or. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-08 14:45 ` Or Gerlitz 2009-07-14 13:54 ` Or Gerlitz @ 2009-07-15 20:38 ` Jamie Lokier 2009-07-15 21:06 ` Jan Kiszka ` (2 more replies) 1 sibling, 3 replies; 30+ messages in thread From: Jamie Lokier @ 2009-07-15 20:38 UTC (permalink / raw) To: Or Gerlitz; +Cc: Herbert Xu, Jan Kiszka, qemu-devel Or Gerlitz wrote: > Jamie Lokier wrote: > >The problem is simply what the guest sends goes out on the network and is > >not looped backed to the host network stack, and vice versa. So if your > >host is 192.168.1.1 and is running a DNS server (say), and the guest is > >192.168.1.2, when the guest sends queries to 192.168.1.1 the host won't > >see those queries. Same if you're running an FTP server on the host and > >the guest wants to connect to it, etc. It also means multiple guests can't > >see each other, for the same reason. So it's much less useful than > >bridging, where the guests and host can all see each other and connect to > >each other. > I wasn't sure to follow if your example refers to the case when > networking uses the bridge or NAT. If its bridge, then through which > bridge interface the packet arrives the host stack? say you have a > bridge whose attached interfaces are tap1(VM1), tap2(VM2) and eth0(NIC), > in your example did you mean that the host IP address is assigned to the > bridge interface? or you were referring a NAT based scheme? When using a bridge, you set the IP address on the bridge itself (for example, br0). DHCP runs on the bridge itself, so does the rest of the Linux host stack, although you can use raw sockets on the other interfaces. But reading and controlling the hardware is done on the interfaces. So if you have some program like NetworkManager which checks if you have a wire plugged into eth0, it has to read eth0 to get the wire status, but it has to run DHCP on br0. 
Those programs don't generally have that option, which makes bridges difficult to use for VMs in a transparent way. I wasn't referring to NAT, but you can use NAT with a bridge on Linux; it's called brouting :-) > >Unfortunately, bridging is a pain to set up, if your host has any > >complicated or automatic network configuration already. > As you said bridging requires more configuration A bridge is quite simple to configure. Unfortunately because Linux requires all the IP configuration on the bridge device, but network device control on the network device, bridges don't work well with automatic configuration tools. If you could apply host IP configuration to the network device and still have a bridge, that would be perfect. You would just create br0, add tap1(VM1), tap2(VM2) and eth0(NIC), and everything would work perfectly. > but not less important the performance (packets per second and cpu > utilization) one can get with bridge+tap is much lower vs what you > get with the raw mode approach. Have you measured it? > All in all, its clear that with this approach VM/VM and VM/Host > communication would have to get switched either at the NIC (e.g > SR/IOV capable NICs supporting a virtual bridge) or at an external > switch and make a U turn. Unfortunately that's usually impossible. Most switches don't do U turns, and a lot of simple networks don't have any switches except a home router. > There's a bunch of reasons why people would > like to do that, among them performance boost, No, it makes performance _much_ worse if you have packets leaving the host, do a U turn and come back on the same link. Much better to use a bridge inside the host. Probably ten times faster because host's internal networking is much faster than a typical gigabit link :-) > the ability to shape, > manage and monitor VM/VM traffic in external switches and more. 
That could be useful, but I think it's probably quite unusual for someone to want to shape traffic between a VM and its own host. Also if you want to do that, you can do it inside the host. Sometimes it would be useful to send it outside the host and U turn, but not very often; only for diagnostics I would think. And even that can be done with Linux bridges, using VLANs :-) > >It would be really nice to find a way which has the advantages of both. > >Either by adding a different bridging mode to Linux, where host interfaces > >can be configured for IP and the bridge hangs off the host interface, or > >by a modified tap interface, or by an alternative > >pcap/packet-like interface which forwards packets in a similar way to > >bridging. > It seems that this will not yield the performance improvement we can > get with going directly to the NIC. If you don't need any host<->VM networking, maybe a raw packet socket is faster. But are you sure it's faster? I'd want to see measurements before I believe it. If you need any host<->VM networking, most of the time the packet socket isn't an option at all. Not many switches will 'U turn' packets as you suggest. -- Jamie ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-15 20:38 ` Jamie Lokier @ 2009-07-15 21:06 ` Jan Kiszka 2009-07-15 21:52 ` Jamie Lokier 2009-07-16 8:29 ` Or Gerlitz 2009-07-20 14:13 ` [Qemu-devel] [PATCH] net: add raw backend - some performance measurements Or Gerlitz 2 siblings, 1 reply; 30+ messages in thread From: Jan Kiszka @ 2009-07-15 21:06 UTC (permalink / raw) To: Jamie Lokier; +Cc: Or Gerlitz, Herbert Xu, qemu-devel [-- Attachment #1: Type: text/plain, Size: 5249 bytes --] Jamie Lokier wrote: > Or Gerlitz wrote: >> Jamie Lokier wrote: >>> The problem is simply what the guest sends goes out on the network and is >>> not looped backed to the host network stack, and vice versa. So if your >>> host is 192.168.1.1 and is running a DNS server (say), and the guest is >>> 192.168.1.2, when the guest sends queries to 192.168.1.1 the host won't >>> see those queries. Same if you're running an FTP server on the host and >>> the guest wants to connect to it, etc. It also means multiple guests can't >>> see each other, for the same reason. So it's much less useful than >>> bridging, where the guests and host can all see each other and connect to >>> each other. >> I wasn't sure to follow if your example refers to the case when >> networking uses the bridge or NAT. If its bridge, then through which >> bridge interface the packet arrives the host stack? say you have a >> bridge whose attached interfaces are tap1(VM1), tap2(VM2) and eth0(NIC), >> in your example did you mean that the host IP address is assigned to the >> bridge interface? or you were referring a NAT based scheme? > > When using a bridge, you set the IP address on the bridge itself (for > example, br0). DHCP runs on the bridge itself, so does the rest of > the Linux host stack, although you can use raw sockets on the other > interfaces. > > But reading and controlling the hardware is done on the interfaces. 
> > So if you have some program like NetworkManager which checks if you > have a wire plugged into eth0, it has to read eth0 to get the wire > status, but it has to run DHCP on br0. > > Those programs don't generally have that option, which makes bridges > difficult to use for VMs in a transparent way. > > I wasn't referring to NAT, but you can use NAT with a bridge on Linux; > it's called brouting :-) > >>> Unfortunately, bridging is a pain to set up, if your host has any >>> complicated or automatic network configuration already. > >> As you said bridging requires more configuration > > A bridge is quite simple to configure. Unfortunately because Linux > requires all the IP configuration on the bridge device, but network > device control on the network device, bridges don't work well with > automatic configuration tools. > > If you could apply host IP configuration to the network device and > still have a bridge, that would be perfect. You would just create > br0, add tap1(VM1), tap2(VM2) and eth0(NIC), and everything would work > perfectly. > >> but not less important the performance (packets per second and cpu >> utilization) one can get with bridge+tap is much lower vs what you >> get with the raw mode approach. > > Have you measured it? > >> All in all, its clear that with this approach VM/VM and VM/Host >> communication would have to get switched either at the NIC (e.g >> SR/IOV capable NICs supporting a virtual bridge) or at an external >> switch and make a U turn. > > Unfortunately that's usually impossible. Most switches don't do U > turns, and a lot of simple networks don't have any switches except a > home router. > >> There's a bunch of reasons why people would >> like to do that, among them performance boost, > > No, it makes performance _much_ worse if you have packets leaving the > host, do a U turn and come back on the same link. Much better to use > a bridge inside the host. 
Probably ten times faster because host's > internal networking is much faster than a typical gigabit link :-) > >> the ability to shape, >> manage and monitor VM/VM traffic in external switches and more. > > That could be useful, but I think it's's probably quite unusual for > someone to want to shape traffic between a VM and it's own host. Also > if you want to do that, you can do it inside the host. > > Sometimes it would be useful to send it outside the host and U turn, > but not very often; only for diagnostics I would think. And even that > can be done with Linux bridges, using VLANs :-) > >>> It would be really nice to find a way which has the advantages of both. >>> Either by adding a different bridging mode to Linux, where host interfaces >>> can be configured for IP and the bridge hangs off the host interface, or >>> by a modified tap interface, or by an alternative >>> pcap/packet-like interface which forwards packets in a similar way to >>> bridging. > >> It seems that this will not yield the performance improvement we can >> get with going directly to the NIC. > > If you don't need any host<->VM networking, maybe a raw packet socket > is faster. > > But are you sure it's faster? > I'd want to see measurements before I believe it. > > If you need any host<->VM networking, most of the time the packet > socket isn't an option at all. Not many switches will 'U turn' > packets as you suggest. FWIW, the fastest local VM<->VM bridge I've happened to measure so far was using qemu's -net socket,listen/connect, ie. a plain local IP or unix domain socket between two qemu instances. No tap devices, no in-kernel bridges involved. But this picture may change once we have some in-kernel virtio-net backend. Jan [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 257 bytes --] ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-15 21:06 ` Jan Kiszka @ 2009-07-15 21:52 ` Jamie Lokier 0 siblings, 0 replies; 30+ messages in thread From: Jamie Lokier @ 2009-07-15 21:52 UTC (permalink / raw) To: Jan Kiszka; +Cc: Or Gerlitz, Herbert Xu, qemu-devel Jan Kiszka wrote: > > But are you sure it's faster? > > I'd want to see measurements before I believe it. > > > > If you need any host<->VM networking, most of the time the packet > > socket isn't an option at all. Not many switches will 'U turn' > > packets as you suggest. > > FWIW, the fastest local VM<->VM bridge I've happened to measure so far > was using qemu's -net socket,listen/connect, ie. a plain local IP or > unix domain socket between two qemu instances. No tap devices, no > in-kernel bridges involved. That's not surprising, but good to know. Packet sockets aren't much use for VM<->VM bridges either ;-) However on a positive note, if packet sockets give good performance for VM<->external, and a unix domain socket gives good performance for VM<->VM, maybe a packet socket on _lo_ (the loopback interface) can be used for VM<->host communication? Then with the right (ugly) hackery in QEMU, it could query the host's MAC addresses as well as other VMs on the same host, listen on all three types of interface, and send to the appropriate one depending on destination MAC address of each packet. That might give great performance in all cases and solve the bridge configuration problem at the same time, so that you can run VMs easily which Just Work(tm) on the host's network. Then again it might not. Code would be a bit complicated, it would interact with Linux iptables differently, and one of the most useful configurations which is VMs being NAT'd by the host (so invisible outside the host) would be difficult. > But this picture may change once we have some in-kernel virtio-net > backend. 
If there's a faster way to send/receive packets, especially if it _behaves_ differently from tap/packet, it would be nice if it were available from userspace too, not just from KVM. If virtio-net is growing a backend to send/receive packets via the host network stack, it would be nice if it solve the awkward bridge configuration problem at the same time. Do you know what direction that backend is going in? In my experience with VMs, they are always looked after by some host iptables rules for safety, and sometimes NAT rules depending on how they are to be visible outside, and with policy routing at times too. It would be great if those facilities still worked, and unfortunate if the new backend was only usable in quite limited configurations. -- Jamie ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-15 20:38 ` Jamie Lokier 2009-07-15 21:06 ` Jan Kiszka @ 2009-07-16 8:29 ` Or Gerlitz 2009-07-20 14:13 ` [Qemu-devel] [PATCH] net: add raw backend - some performance measurements Or Gerlitz 2 siblings, 0 replies; 30+ messages in thread From: Or Gerlitz @ 2009-07-16 8:29 UTC (permalink / raw) To: Jamie Lokier; +Cc: Herbert Xu, Jan Kiszka, qemu-devel Jamie Lokier wrote: > When using a bridge, you set the IP address on the bridge itself (for example, br0). DHCP runs on the bridge itself, so does the rest of the Linux host stack, although you can use raw sockets on the other interfaces. But reading and controlling the hardware is done on the interfaces. So if you have some program like NetworkManager which checks if you have a wire plugged into eth0, it has to read eth0 to get the wire status, but it has to run DHCP on br0. Yes, I understand that if the guest want to communicate with the host in a bridged environment, the IP has to be set on the bridge. With "DHCP" do you refer to dhcp server or to dhcp relay or something else? I assume its not a server, since you mentioned a NON NAT environment. > A bridge is quite simple to configure. Unfortunately because Linux requires all the IP configuration on the bridge device, but network device control on the network device, bridges don't work well with automatic configuration tools. seems like this scheme/problem is similar to bonding, where the IP configuration is done to the bond device but people may still want to do control the slave devices, I am not sure why such tools need the device to have an IP, but it seems less relevant for this thread. > Have you measured it? Yes, I will send soon some data. > Unfortunately that's usually impossible. 
Most switches don't do U turns, and a lot of simple networks don't have any switches except a home router Again, as I wrote, the U turn can be done in three places: software bridge, virtual HW bridge inside the NIC, or at an external switch. With virtualization becoming more common, options 2 and 3 will be more and more available, where the packet socket approach is valid for both of them. > No, it makes performance _much_ worse if you have packets leaving the host, do a U turn and come back on the same link. Much better to use a bridge inside the host. Probably ten times faster because host's internal networking is much faster than a typical gigabit link :-) My benchmark was focusing on packets per second for VM <--> world and not on VM/VM communication. I tend to think that with KVM and the raw mode or kernel virtio-net backend with both requiring U turn, the VM/VM performance will be no less in most if not all measures (namely, packets per second, cpu utilization, bandwidth, latency, etc). Still, it's quite clear that both these modes can be useful for people that want to max the VM <---> world communication performance. > Sometimes it would be useful to send it outside the host and U turn, but not very often; only for diagnostics I would think. And even that can be done with Linux bridges, using VLANs :-) mmm, I wasn't sure if you refer to Linux vlans (8021q devices) or the Qemu vlan... can you elaborate? > If you don't need any host<->VM networking, maybe a raw packet socket is faster > But are you sure it's faster? I'd want to see measurements before I believe it. fair enough Or. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend - some performance measurements 2009-07-15 20:38 ` Jamie Lokier 2009-07-15 21:06 ` Jan Kiszka 2009-07-16 8:29 ` Or Gerlitz @ 2009-07-20 14:13 ` Or Gerlitz 2009-07-20 15:53 ` Herbert Xu 2 siblings, 1 reply; 30+ messages in thread From: Or Gerlitz @ 2009-07-20 14:13 UTC (permalink / raw) To: Jamie Lokier Cc: Mark McLoughlin, Michael S. Tsirkin, Herbert Xu, Dor Laor, qemu-devel, Jan Kiszka Jamie Lokier wrote: > Or Gerlitz wrote: >> the performance (packets per second and cpu utilization) one can get >> with bridge+tap is much lower vs what you get with the raw mode approach. > Have you measured it? yes, here's some data: using 2.6.29.1 in the guest, 2.6.30 in the host, with 1Gbe connectivity (Intel 82575EB) between the two nodes, I see the following results: with -net raw (packet socket) pps cs us sys vm->phys 240k 200 7 8 phys->vm 160k 100 5 7 with -net tap (tap + bridge) pps cs us sys vm->phys 170k 600 5 10 phys->vm 150k 14k 5 20 where "pps" stands for packets-per-second, "cs", "us" and "sys" are taken from vmstat output, such that they represent the context switches per second, user and system time percents. The benchmark I use is netperf 2.4.4 / UDP_STREAM with 22 bytes payload length such that there are 64(=14+20+8+22) bytes on the wire. On this setup (udp, 64 byte frames), doing phys->phys test, netperf sends/receives 450K pps and pktgen sends 900K pps, all tests done without any interrupt moderation tuning. You can see that the raw mode has much better packets per second for the VM TX flow, and on the VM RX side, a bit better pps rate but much lower cpu utilization and context switches number. Or. All this on top of mainstream qemu whose head is commit 8676188b751ca28ab7c42baf20ea64391625b44d "Work around Solaris gas problem" ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend - some performance measurements 2009-07-20 14:13 ` [Qemu-devel] [PATCH] net: add raw backend - some performance measurements Or Gerlitz @ 2009-07-20 15:53 ` Herbert Xu 2009-07-20 18:20 ` Michael S. Tsirkin 2009-07-21 7:03 ` Or Gerlitz 0 siblings, 2 replies; 30+ messages in thread From: Herbert Xu @ 2009-07-20 15:53 UTC (permalink / raw) To: Or Gerlitz Cc: Mark McLoughlin, Michael S. Tsirkin, Dor Laor, qemu-devel, Jan Kiszka On Mon, Jul 20, 2009 at 05:13:06PM +0300, Or Gerlitz wrote: > > with -net tap (tap + bridge) Is netfilter enabled on the bridge? If so you need to turn it off because it's a huge security hole for virtualisation and slows it down heaps. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend - some performance measurements 2009-07-20 15:53 ` Herbert Xu @ 2009-07-20 18:20 ` Michael S. Tsirkin 2009-07-21 1:46 ` Herbert Xu 2009-07-21 7:03 ` Or Gerlitz 1 sibling, 1 reply; 30+ messages in thread From: Michael S. Tsirkin @ 2009-07-20 18:20 UTC (permalink / raw) To: Herbert Xu; +Cc: Mark McLoughlin, Dor Laor, qemu-devel, Or Gerlitz, Jan Kiszka On Mon, Jul 20, 2009 at 11:53:08PM +0800, Herbert Xu wrote: > On Mon, Jul 20, 2009 at 05:13:06PM +0300, Or Gerlitz wrote: > > > > with -net tap (tap + bridge) > > Is netfilter enabled on the bridge? If so you need to turn it off > because it's a huge security hole for virtualisation How is it a security hole? > and slows it > down heaps. > > Cheers, ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend - some performance measurements 2009-07-20 18:20 ` Michael S. Tsirkin @ 2009-07-21 1:46 ` Herbert Xu 0 siblings, 0 replies; 30+ messages in thread From: Herbert Xu @ 2009-07-21 1:46 UTC (permalink / raw) To: Michael S. Tsirkin Cc: Mark McLoughlin, Dor Laor, qemu-devel, Or Gerlitz, Jan Kiszka On Mon, Jul 20, 2009 at 09:20:32PM +0300, Michael S. Tsirkin wrote: > > > Is netfilter enabled on the bridge? If so you need to turn it off > > because it's a huge security hole for virtualisation > > How is it a security hole? Because bridge netfilter will perform defragmentation and conntrack, both of which are global in scope. That means packets from two unrelated bridges can be treated exactly as the same if their IP addresses/port numbers are identical, causing information leakage or worse, allowing an attacker to modify others' traffic. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend - some performance measurements 2009-07-20 15:53 ` Herbert Xu 2009-07-20 18:20 ` Michael S. Tsirkin @ 2009-07-21 7:03 ` Or Gerlitz 2009-07-21 7:25 ` Herbert Xu 1 sibling, 1 reply; 30+ messages in thread From: Or Gerlitz @ 2009-07-21 7:03 UTC (permalink / raw) To: Herbert Xu Cc: Mark McLoughlin, Michael S. Tsirkin, Dor Laor, qemu-devel, Jan Kiszka Herbert Xu wrote: > Is netfilter enabled on the bridge? If so you need to turn it off okay, when setting net.bridge.bridge-nf-call-iptables to zero, the VM TX / tap+bridge packet rate climbs from 170K to 195K but it is still way below the 240K rate achieved by the raw mode --> we have now a clear sign on the performance gain this approach provides. Or. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend - some performance measurements 2009-07-21 7:03 ` Or Gerlitz @ 2009-07-21 7:25 ` Herbert Xu 2009-07-21 10:17 ` Or Gerlitz 2009-07-21 10:27 ` Michael S. Tsirkin 0 siblings, 2 replies; 30+ messages in thread From: Herbert Xu @ 2009-07-21 7:25 UTC (permalink / raw) To: Or Gerlitz Cc: Mark McLoughlin, Michael S. Tsirkin, netdev, Dor Laor, qemu-devel, Jan Kiszka On Tue, Jul 21, 2009 at 10:03:00AM +0300, Or Gerlitz wrote: > > okay, when setting net.bridge.bridge-nf-call-iptables to zero, the VM TX / tap+bridge packet rate climbs from 170K to 195K but it still way beyond the 240K rate achieved by the raw mode --> we have now a clear sign on the performance gain this approach provides. I find this hard to believe this bridge sans netfilter does a single lookup based on the MAC address and then just passes the packet to the underlying driver. Can you do an oprofile run to see if something else is chewing up CPU time under the guise of bridging? Thanks, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend - some performance measurements 2009-07-21 7:25 ` Herbert Xu @ 2009-07-21 10:17 ` Or Gerlitz 2009-07-21 10:27 ` Michael S. Tsirkin 1 sibling, 0 replies; 30+ messages in thread From: Or Gerlitz @ 2009-07-21 10:17 UTC (permalink / raw) To: Herbert Xu Cc: Mark McLoughlin, Michael S. Tsirkin, netdev, Dor Laor, qemu-devel, Jan Kiszka Herbert Xu wrote: > I find this hard to believe this bridge sans netfilter does a single lookup based > on the MAC address and then just passes the packet to the underlying driver. > Can you do an oprofile run to see if something else is chewing > up CPU time under the guise of bridging? okay, here are the top twenty time consumers for the three VM TX modes, the bridge code is not anywhere high... I'll send you the complete oprofile logs. Or. VM TX with the raw mode --> samples % image name app name symbol name 697453 25.2468 kvm-intel.ko kvm_intel vmx_vcpu_run 105024 3.8017 vmlinux vmlinux _raw_spin_lock 95443 3.4549 igb.ko igb igb_xmit_frame_adv 68617 2.4838 vmlinux vmlinux __slab_free 68168 2.4676 qemu-system-x86_64 qemu-system-x86_64 cpu_physical_memory_rw 56272 2.0370 vmlinux vmlinux tg_shares_up 48573 1.7583 igb.ko igb igb_clean_tx_irq 46128 1.6698 libc-2.5.so libc-2.5.so memcpy 44371 1.6062 vmlinux vmlinux kmem_cache_alloc 41485 1.5017 vmlinux vmlinux __alloc_skb 38719 1.4016 qemu-system-x86_64 qemu-system-x86_64 phys_page_find_alloc 38016 1.3761 vmlinux vmlinux copy_user_generic_string 37690 1.3643 qemu-system-x86_64 qemu-system-x86_64 qemu_get_ram_ptr 34321 1.2424 vmlinux vmlinux dev_kfree_skb_irq 34313 1.2421 vmlinux vmlinux __kmalloc_track_caller 28726 1.0398 vmlinux vmlinux sock_alloc_send_pskb 25195 0.9120 vmlinux vmlinux kfree 24790 0.8974 vmlinux vmlinux __slab_alloc 23406 0.8473 vmlinux vmlinux dev_queue_xmit VM TX with the tap/bridge+netfilter OFF mode --> samples % image name app name symbol name 447119 21.5219 kvm-intel.ko kvm_intel vmx_vcpu_run 70774 3.4067 igb.ko igb 
igb_xmit_frame_adv 66324 3.1925 vmlinux vmlinux _raw_spin_lock 53817 2.5905 vmlinux vmlinux __slab_free 47494 2.2861 vmlinux vmlinux tg_shares_up 47213 2.2726 qemu-system-x86_64 qemu-system-x86_64 cpu_physical_memory_rw 40364 1.9429 igb.ko igb igb_clean_tx_irq 39545 1.9035 vmlinux vmlinux kmem_cache_alloc 36027 1.7341 libc-2.5.so libc-2.5.so memcpy 34945 1.6821 vmlinux vmlinux __alloc_skb 29747 1.4319 vmlinux vmlinux dev_kfree_skb_irq 29145 1.4029 vmlinux vmlinux __kmalloc_track_caller 28680 1.3805 vmlinux vmlinux copy_user_generic_string 26251 1.2636 qemu-system-x86_64 qemu-system-x86_64 phys_page_find_alloc 25123 1.2093 qemu-system-x86_64 qemu-system-x86_64 qemu_get_ram_ptr 23231 1.1182 vmlinux vmlinux eth_type_trans 22356 1.0761 vmlinux vmlinux sock_alloc_send_pskb 22108 1.0642 vmlinux vmlinux __slab_alloc 21288 1.0247 vmlinux vmlinux kfree VM TX with the tap/bridge+netfilter ON mode --> samples % image name app name symbol name 319271 21.1411 kvm-intel.ko kvm_intel vmx_vcpu_run 46559 3.0830 vmlinux vmlinux _raw_spin_lock 39703 2.6290 vmlinux vmlinux tg_shares_up 35773 2.3688 vmlinux vmlinux __slab_free 35045 2.3206 qemu-system-x86_64 qemu-system-x86_64 cpu_physical_memory_rw 32612 2.1595 igb.ko igb igb_xmit_frame_adv 31779 2.1043 vmlinux vmlinux kmem_cache_alloc 29134 1.9292 libc-2.5.so libc-2.5.so memcpy 23031 1.5250 vmlinux vmlinux copy_user_generic_string 19713 1.3053 vmlinux vmlinux __kmalloc_track_caller 19303 1.2782 qemu-system-x86_64 qemu-system-x86_64 phys_page_find_alloc 19038 1.2606 vmlinux vmlinux __alloc_skb 18559 1.2289 vmlinux vmlinux kfree 18460 1.2224 qemu-system-x86_64 qemu-system-x86_64 qemu_get_ram_ptr 18409 1.2190 vmlinux vmlinux eth_type_trans 17828 1.1805 igb.ko igb igb_clean_tx_irq 17622 1.1669 igb.ko igb igb_poll 17303 1.1457 vmlinux vmlinux __slab_alloc 17033 1.1279 vmlinux vmlinux dev_kfree_skb_irq ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend - some performance measurements 2009-07-21 7:25 ` Herbert Xu 2009-07-21 10:17 ` Or Gerlitz @ 2009-07-21 10:27 ` Michael S. Tsirkin 2009-07-21 11:05 ` Or Gerlitz 1 sibling, 1 reply; 30+ messages in thread From: Michael S. Tsirkin @ 2009-07-21 10:27 UTC (permalink / raw) To: Herbert Xu Cc: Mark McLoughlin, netdev, Dor Laor, qemu-devel, Or Gerlitz, Jan Kiszka On Tue, Jul 21, 2009 at 03:25:46PM +0800, Herbert Xu wrote: > On Tue, Jul 21, 2009 at 10:03:00AM +0300, Or Gerlitz wrote: > > > > okay, when setting net.bridge.bridge-nf-call-iptables to zero, the VM TX / tap+bridge packet rate climbs from 170K to 195K but it still way beyond the 240K rate achieved by the raw mode --> we have now a clear sign on the performance gain this approach provides. > > I find this hard to believe this bridge sans netfilter does a > single lookup based on the MAC address and then just passes the > packet to the underlying driver. One advantage that raw sockets have over tap+bridge, is that they do not do their own TX buffering, but use the TX queue for the device directly. With raw sockets, send will block or fail if the TX queue for device is full. With tap+bridge, the buffer in tap has to fill up instead, which is not the same. I'm not sure this is the issue here, but could be: the benchmark is UDP, isn't it? > Can you do an oprofile run to see if something else is chewing > up CPU time under the guise of bridging? > > Thanks, > -- > Visit Openswan at http://www.openswan.org/ > Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> > Home Page: http://gondor.apana.org.au/~herbert/ > PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend - some performance measurements 2009-07-21 10:27 ` Michael S. Tsirkin @ 2009-07-21 11:05 ` Or Gerlitz 2009-07-21 12:01 ` Michael S. Tsirkin 0 siblings, 1 reply; 30+ messages in thread From: Or Gerlitz @ 2009-07-21 11:05 UTC (permalink / raw) To: Michael S. Tsirkin Cc: Mark McLoughlin, Herbert Xu, netdev, Dor Laor, qemu-devel, Jan Kiszka Michael S. Tsirkin wrote: > With raw sockets, send will block or fail if the TX queue for device is > full. With tap+bridge, the buffer in tap has to fill up instead, which > is not the same. I'm not sure this is the issue here, but could be: the > benchmark is UDP, isn't it? Michael, What/where is this tap buffer? we're talking on VM TX, so looking on tun_get_user I see a call to skb_copy_datagram_from_iovec() to copy from the user buffer to an skb, then a call to netif_rx_ni() and that's it... As for your question, indeed udp, the VM runs netperf/UDP_STREAM Or. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend - some performance measurements 2009-07-21 11:05 ` Or Gerlitz @ 2009-07-21 12:01 ` Michael S. Tsirkin 2009-07-21 12:14 ` Herbert Xu 0 siblings, 1 reply; 30+ messages in thread From: Michael S. Tsirkin @ 2009-07-21 12:01 UTC (permalink / raw) To: Or Gerlitz Cc: Mark McLoughlin, Herbert Xu, netdev, Dor Laor, qemu-devel, Jan Kiszka On Tue, Jul 21, 2009 at 02:05:32PM +0300, Or Gerlitz wrote: > Michael S. Tsirkin wrote: > > With raw sockets, send will block or fail if the TX queue for device is > > full. With tap+bridge, the buffer in tap has to fill up instead, which > > is not the same. I'm not sure this is the issue here, but could be: the > > benchmark is UDP, isn't it? > > Michael, > > What/where is this tap buffer? > we're talking on VM TX, so looking on tun_get_user I see a call to > skb_copy_datagram_from_iovec() to copy from the user buffer to an skb, then a call to netif_rx_ni() and that's it... As for your question, indeed udp, the VM runs netperf/UDP_STREAM > > Or. Queue is not the right word, sorry. I was referring to the fact that, when bridge floods a packet to multiple interfaces, it clones the skb and frees the original, which breaks the send buffer accounting in tun and might let you overrun the tx queue in one of the devices. This does not usually happen with raw sockets. This is the code in question: if (prev != NULL) { struct sk_buff *skb2; if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) { br->dev->stats.tx_dropped++; kfree_skb(skb); return; } __packet_hook(prev, skb2); } the thing to check then would be that some kind of misconfiguration does not cause the bridge to flood your packets to multiple interfaces. -- MST ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend - some performance measurements 2009-07-21 12:01 ` Michael S. Tsirkin @ 2009-07-21 12:14 ` Herbert Xu 2009-07-21 13:41 ` Or Gerlitz 0 siblings, 1 reply; 30+ messages in thread From: Herbert Xu @ 2009-07-21 12:14 UTC (permalink / raw) To: Michael S. Tsirkin Cc: Mark McLoughlin, netdev, Dor Laor, qemu-devel, Or Gerlitz, Jan Kiszka On Tue, Jul 21, 2009 at 03:01:42PM +0300, Michael S. Tsirkin wrote: > > the thing to check then would be that some kind of misconfiguration > does not cause the bridge to flood your packets to multiple interfaces. Right, we should make sure that the interfaces are not in promiscous mode. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend - some performance measurements 2009-07-21 12:14 ` Herbert Xu @ 2009-07-21 13:41 ` Or Gerlitz 0 siblings, 0 replies; 30+ messages in thread From: Or Gerlitz @ 2009-07-21 13:41 UTC (permalink / raw) To: Herbert Xu Cc: Mark McLoughlin, Michael S. Tsirkin, netdev, Dor Laor, qemu-devel, Jan Kiszka Herbert Xu wrote: > On Tue, Jul 21, 2009 at 03:01:42PM +0300, Michael S. Tsirkin wrote: >> the thing to check then would be that some kind of misconfiguration >> does not cause the bridge to flood your packets to multiple interfaces. > Right, we should make sure that the interfaces are not in promiscuous mode Michael, Herbert, First, I don't see how flooding can happen in my setup, I have only two interfaces on the bridge (see below), a tap and a NIC (vlan) and the bridge will never attempt to forward a packet through the port it was received on. Second, the bridge always sets all interfaces attached to it to be in promiscuous mode, see the call to dev_set_promiscuity() from br_add_if() but this doesn't mean it applies flooding, it does mac learning... Or. # brctl show bridge name bridge id STP enabled interfaces br0 8000.0030485f9977 no eth1.4009 tap0 The VM mac is de:ab:be:01:01:09 and the remote node mac is 00:30:48:65:a6:2b, you can see that these two macs were learned by the bridge and hence no flooding is expected. # brctl showmacs br0 port no mac addr is local? ageing timer 1 00:30:48:5f:99:77 yes 0.00 1 00:30:48:65:a6:2b no 12.50 2 06:f5:76:64:a0:d4 yes 0.00 2 de:ab:be:01:01:09 no 0.00 ^ permalink raw reply [flat|nested] 30+ messages in thread
[parent not found: <5b31733c0907011250i7afcdbcdnb844290de4ad64f2@mail.gmail.com>]
* Re: [Qemu-devel] [PATCH] net: add raw backend [not found] ` <5b31733c0907011250i7afcdbcdnb844290de4ad64f2@mail.gmail.com> @ 2009-07-02 12:08 ` Or Gerlitz 0 siblings, 0 replies; 30+ messages in thread From: Or Gerlitz @ 2009-07-02 12:08 UTC (permalink / raw) To: Filip Navara, qemu-devel Filip Navara wrote: > This doesn't compile on Win32. In fact, even if it compiled, it > wouldn't work. All the code should be part of the #ifndef _WIN32 block okay, will change that in the next revision of the patch Or. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-01 15:46 [Qemu-devel] [PATCH] net: add raw backend Or Gerlitz 2009-07-01 16:21 ` Jamie Lokier [not found] ` <5b31733c0907011250i7afcdbcdnb844290de4ad64f2@mail.gmail.com> @ 2009-07-02 15:43 ` Michael S. Tsirkin 2009-07-07 14:45 ` Or Gerlitz 2 siblings, 1 reply; 30+ messages in thread From: Michael S. Tsirkin @ 2009-07-02 15:43 UTC (permalink / raw) To: Or Gerlitz; +Cc: Herbert Xu, qemu-devel On Wed, Jul 01, 2009 at 06:46:43PM +0300, Or Gerlitz wrote: > Add raw network backend option which uses a packet socket to provide > raw networking access. Once the socket is opened its bouned to a > provided host interface, such that packets received on the interface > are delivered to the VM and packets sent by the VM are sent to the > interface. > > Signed-off-by: Or Gerlitz<ogerlitz@voltaire.com> Looks good to me overall. A couple of comments: > diff --git a/net.c b/net.c > index 55f70f2..f7ff381 100644 > --- a/net.c > +++ b/net.c > @@ -93,6 +93,9 @@ > #endif > #endif > > +#include <netpacket/packet.h> > +#include <net/ethernet.h> > + > #if defined(__OpenBSD__) > #include <util.h> > #endif > @@ -1476,6 +1479,155 @@ static TAPState *net_tap_init(VLANState *vlan, const char *model, > > #endif /* !_WIN32 */ > > +typedef struct RAWState { > + VLANClientState *vc; > + int fd; > + uint8_t buf[4096]; > + int promisc; > +} RAWState; > + > +static int net_raw_fd_init(Monitor *mon, const char *ifname, int promisc) > +{ > + int fd, ret; > + struct ifreq req; > + struct sockaddr_ll lladdr; > + > + fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); > + if (fd < 0) > + config_error(mon, "packet socket failed\n"); > + > + memset(&req, 0, sizeof(req)); > + strncpy(req.ifr_name, ifname, IFNAMSIZ-1); > + ret = ioctl(fd, SIOCGIFINDEX, &req); > + if (ret < 0) > + config_error(mon, "SIOCGIFINDEX failed\n"); > + > + memset(&lladdr, 0, sizeof(lladdr)); > + lladdr.sll_family = AF_PACKET; > + lladdr.sll_protocol = htons(ETH_P_ALL); > + 
lladdr.sll_ifindex = req.ifr_ifindex; > + ret = bind(fd, (const struct sockaddr *)&lladdr, sizeof(lladdr)); > + if (ret < 0) > + config_error(mon, "bind failed\n"); > + > + /* set iface to promiscuous mode (packets sent to the VM MAC) */ > + if (promisc) { > + ret = ioctl(fd, SIOCGIFFLAGS, &req); > + if (ret < 0) > + perror("SIOCGIFFLAGS failed\n"); > + req.ifr_flags |= IFF_PROMISC; > + ret = ioctl(fd, SIOCSIFFLAGS, &req); > + if (ret < 0) > + config_error(mon, "SIOCSIFFLAGS to promiscous failed\n"); > + } > + > + ret = fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_NONBLOCK); > + if (ret < 0) > + config_error(mon, "O_NONBLOCK set failed\n"); > + > + return fd; > +} > + > +static void raw_cleanup(VLANClientState *vc) > +{ > + struct ifreq req; > + RAWState *s = vc->opaque; > + > + qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL); > + if (s->promisc) { > + ioctl(s->fd, SIOCGIFFLAGS, &req); > + req.ifr_flags &= ~IFF_PROMISC; > + ioctl(s->fd, SIOCSIFFLAGS, &req); > + } > + close(s->fd); > + qemu_free(s); > +} > + > +static void raw_send(void *opaque); > + > +static int raw_can_send(void *opaque) > +{ > + RAWState *s = opaque; > + > + return qemu_can_send_packet(s->vc); > +} > + > +static void raw_send_completed(VLANClientState *vc, ssize_t len) > +{ > + RAWState *s = vc->opaque; > + > + qemu_set_fd_handler2(s->fd, raw_can_send, raw_send, NULL, s); > +} > + > +static void raw_send(void *opaque) > +{ > + RAWState *s = opaque; > + int size; > + > + do { > + size = recv(s->fd, s->buf, sizeof(s->buf), MSG_TRUNC); > + if (size <= 0) > + break; A couple of improvement suggestions here: - You might get size > sizeof(s->buf). Should not happen, but you might want to check for this condition and report it + discard the packet. - It might be a good idea to request aux data and verify that checksum is set, calculate it if not. this will make it possible to bind to a local device as well. 
> + > + size = qemu_send_packet_async(s->vc, s->buf, size, > + raw_send_completed); > + if (size == 0) > + qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL); > + > + } while (size > 0); > +} > + > +static ssize_t raw_receive_iov(VLANClientState *vc, const struct iovec *iov, > + int iovcnt) > +{ > + ssize_t len; > + RAWState *s = vc->opaque; > + > + do { > + len = writev(s->fd, iov, iovcnt); > + } while (len == -1 && (errno == EINTR || errno == EAGAIN)); > + > + return len; > +} > + > +static ssize_t raw_receive(VLANClientState *vc, const uint8_t *buf, size_t size) > +{ > + struct iovec iov[1]; > + > + iov[0].iov_base = (char *)buf; > + iov[0].iov_len = size; > + > + return raw_receive_iov(vc, iov, 1); > +} > + > +static int net_raw_init(Monitor *mon, VLANState *vlan, const char *model, > + const char *name, const char *ifname, > + int promisc, int fd) > +{ > + RAWState *s; > + > + s = qemu_mallocz(sizeof(RAWState)); > + > + if (fd == -1) { > + s->fd = net_raw_fd_init(mon, ifname, promisc); > + s->promisc = promisc; > + } else > + s->fd = fd; > + > + s->vc = qemu_new_vlan_client(vlan, model, name, NULL, raw_receive, > + raw_receive_iov, raw_cleanup, s); > + qemu_set_fd_handler2(s->fd, raw_can_send, raw_send, NULL, s); > + > + if (fd == -1) > + snprintf(s->vc->info_str, sizeof(s->vc->info_str), > + "raw: ifname=%s, promisc=%d", ifname, promisc); > + else > + snprintf(s->vc->info_str, sizeof(s->vc->info_str), > + "raw: fd=%d", fd); > + > + return 0; > +} > + > #if defined(CONFIG_VDE) > typedef struct VDEState { > VLANClientState *vc; > @@ -2348,6 +2500,41 @@ int net_client_init(Monitor *mon, const char *device, const char *p) > } > } else > #endif > + if (!strcmp(device, "raw")) { > + char chkbuf[64], ifname[64]; > + int raw_fd = -1; > + int promisc = 1; promisc = 0 might be a safer default. 
> + if (get_param_value(buf, sizeof(buf), "fd", p) > 0) { > + static const char * const fd_params[] = { > + "vlan", "name", "fd", NULL > + }; > + if (check_params(chkbuf, sizeof(chkbuf), fd_params, p) < 0) { > + config_error(mon, "invalid parameter '%s' in '%s'\n", chkbuf, p); > + ret = -1; > + goto out; > + } > + raw_fd = strtol(buf, NULL, 0); > + fcntl(raw_fd, F_SETFL, fcntl(raw_fd, F_GETFL | O_NONBLOCK)); > + } else { > + static const char * const tap_params[] = { > + "vlan", "name", "ifname", "promisc", NULL > + }; > + if (check_params(chkbuf, sizeof(chkbuf), tap_params, p) < 0) { > + config_error(mon, "invalid parameter '%s' in '%s'\n", chkbuf, p); > + ret = -1; > + goto out; > + } > + if (get_param_value(ifname, sizeof(ifname), "ifname", p) <= 0) { > + config_error(mon, "raw: no interface name\n"); > + ret = -1; > + goto out; > + } > + if (get_param_value(buf, sizeof(buf), "promisc", p)) > + promisc = atoi(buf); > + } > + vlan->nb_host_devs++; > + ret = net_raw_init(mon, vlan, device, name, ifname, promisc, raw_fd); > + } else > if (!strcmp(device, "socket")) { > char chkbuf[64]; > if (get_param_value(buf, sizeof(buf), "fd", p) > 0) { > diff --git a/qemu-options.hx b/qemu-options.hx > index 503da33..0a3c807 100644 > --- a/qemu-options.hx > +++ b/qemu-options.hx > @@ -761,6 +761,10 @@ DEF("net", HAS_ARG, QEMU_OPTION_net, > " use 'sndbuf=nbytes' to limit the size of the send buffer\n" > #endif > #endif > + "-net raw[,vlan=n][,name=str],ifname=name[,promisc=m]\n" > + " bound the host network interface to VLAN 'n' in a raw manner:\n" in a raw manner -> using a raw packet socket > + " packets received on the interface are delivered to the vlan and\n" > + " packets delivered on the vlan are sent to the interface\n" document promisc option? 
> "-net socket[,vlan=n][,name=str][,fd=h][,listen=[host]:port][,connect=host:port]\n" > " connect the vlan 'n' to another VLAN using a socket connection\n" > "-net socket[,vlan=n][,name=str][,fd=h][,mcast=maddr:port]\n" > ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-02 15:43 ` Michael S. Tsirkin @ 2009-07-07 14:45 ` Or Gerlitz 2009-07-07 14:49 ` Michael S. Tsirkin 0 siblings, 1 reply; 30+ messages in thread From: Or Gerlitz @ 2009-07-07 14:45 UTC (permalink / raw) To: Michael S. Tsirkin; +Cc: Herbert Xu, qemu-devel Michael S. Tsirkin wrote: >> +static void raw_send(void *opaque) >> + do { >> + size = recv(s->fd, s->buf, sizeof(s->buf), MSG_TRUNC); >> + if (size <= 0) >> + break; > A couple of improvement suggestions here: > - You might get size > sizeof(s->buf). Should not happen, but you might want to > check for this condition and report it + discard the packet. okay, will do > - It might be a good idea to request aux data and verify that checksum is set, > calculate it if not. this will make it possible to bind to a local device as well. thanks for the heads up, I am still not sure to follow the documentation/logic wrt to checksum reporting of the af_packet kernel code. I'll look on this. What do you mean by "this will make it possible to bind to a local device as well"? >> @@ -2348,6 +2500,41 @@ int net_client_init(Monitor *mon, const char *device, const char *p) >> + if (!strcmp(device, "raw")) { >> + int promisc = 1; > promisc = 0 might be a safer default I can do that, but I am not sure why its safer, e.g bridge always set all interfaces to promisc >> + "-net raw[,vlan=n][,name=str],ifname=name[,promisc=m]\n" >> + " bound the host network interface to VLAN 'n' in a raw manner:\n" > in a raw manner -> using a raw packet socket okay >> + " packets received on the interface are delivered to the vlan and\n" >> + " packets delivered on the vlan are sent to the interface\n" > document promisc option? sure Or. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-07 14:45 ` Or Gerlitz @ 2009-07-07 14:49 ` Michael S. Tsirkin 2009-07-08 14:46 ` Or Gerlitz 2009-07-08 15:06 ` Or Gerlitz 0 siblings, 2 replies; 30+ messages in thread From: Michael S. Tsirkin @ 2009-07-07 14:49 UTC (permalink / raw) To: Or Gerlitz; +Cc: Herbert Xu, qemu-devel On Tue, Jul 07, 2009 at 05:45:39PM +0300, Or Gerlitz wrote: > Michael S. Tsirkin wrote: > >> +static void raw_send(void *opaque) > >> + do { > >> + size = recv(s->fd, s->buf, sizeof(s->buf), MSG_TRUNC); > >> + if (size <= 0) > >> + break; > > > A couple of improvement suggestions here: > > - You might get size > sizeof(s->buf). Should not happen, but you might want to > > check for this condition and report it + discard the packet. > > okay, will do > > > - It might be a good idea to request aux data and verify that checksum is set, > > calculate it if not. this will make it possible to bind to a local device as well. > > thanks for the heads up, I am still not sure to follow the documentation/logic wrt to checksum reporting of the af_packet kernel code. I'll look on this. What do you mean by "this will make it possible to bind to a local device as well"? See comment in qemu-kvm about dhclient as an example. > >> @@ -2348,6 +2500,41 @@ int net_client_init(Monitor *mon, const char *device, const char *p) > >> + if (!strcmp(device, "raw")) { > >> + int promisc = 1; > > > promisc = 0 might be a safer default > > I can do that, but I am not sure why its safer, e.g bridge always set all interfaces to promisc Think of what happens if someone does kill -9 on qemu This can not happen with bridge. 
> >> + "-net raw[,vlan=n][,name=str],ifname=name[,promisc=m]\n" > >> + " bound the host network interface to VLAN 'n' in a raw manner:\n" > > > in a raw manner -> using a raw packet socket > > okay > > >> + " packets received on the interface are delivered to the vlan and\n" > >> + " packets delivered on the vlan are sent to the interface\n" > > > document promisc option? > > sure > > > Or. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-07 14:49 ` Michael S. Tsirkin @ 2009-07-08 14:46 ` Or Gerlitz 2009-07-08 15:06 ` Or Gerlitz 1 sibling, 0 replies; 30+ messages in thread From: Or Gerlitz @ 2009-07-08 14:46 UTC (permalink / raw) To: Michael S. Tsirkin; +Cc: Herbert Xu, qemu-devel Michael S. Tsirkin wrote: > On Tue, Jul 07, 2009 at 05:45:39PM +0300, Or Gerlitz wrote: > >> I am not sure why its safer, e.g bridge always set all interfaces to promisc >> > Think of what happens if someone does kill -9 on qemu This can not happen with bridge > yes, I can make the default to be promisc=0 Or. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-07 14:49 ` Michael S. Tsirkin 2009-07-08 14:46 ` Or Gerlitz @ 2009-07-08 15:06 ` Or Gerlitz 1 sibling, 0 replies; 30+ messages in thread From: Or Gerlitz @ 2009-07-08 15:06 UTC (permalink / raw) To: Michael S. Tsirkin; +Cc: Herbert Xu, qemu-devel Michael S. Tsirkin wrote: > On Tue, Jul 07, 2009 at 05:45:39PM +0300, Or Gerlitz wrote: >> Michael S. Tsirkin wrote: >>> It might be a good idea to request aux data and verify that checksum is set, >>> calculate it if not. this will make it possible to bind to a local device as well. >> I am still not sure to follow the checksum reporting of the af_packet kernel code > See comment in qemu-kvm about dhclient as an example. yes, some sort of "host software checksum offload" when using a packet socket makes sense. I wasn't sure when the af_packet kernel code reports "checksum okay"; it only checks for "ip_summed == CHECKSUM_PARTIAL". Or. ^ permalink raw reply [flat|nested] 30+ messages in thread
end of thread, other threads:[~2009-07-21 13:41 UTC | newest] Thread overview: 30+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2009-07-01 15:46 [Qemu-devel] [PATCH] net: add raw backend Or Gerlitz 2009-07-01 16:21 ` Jamie Lokier 2009-07-02 12:25 ` Or Gerlitz 2009-07-03 2:39 ` Jamie Lokier 2009-07-07 13:33 ` Or Gerlitz 2009-07-07 14:57 ` Jamie Lokier 2009-07-08 14:45 ` Or Gerlitz 2009-07-14 13:54 ` Or Gerlitz 2009-07-15 20:38 ` Jamie Lokier 2009-07-15 21:06 ` Jan Kiszka 2009-07-15 21:52 ` Jamie Lokier 2009-07-16 8:29 ` Or Gerlitz 2009-07-20 14:13 ` [Qemu-devel] [PATCH] net: add raw backend - some performance measurements Or Gerlitz 2009-07-20 15:53 ` Herbert Xu 2009-07-20 18:20 ` Michael S. Tsirkin 2009-07-21 1:46 ` Herbert Xu 2009-07-21 7:03 ` Or Gerlitz 2009-07-21 7:25 ` Herbert Xu 2009-07-21 10:17 ` Or Gerlitz 2009-07-21 10:27 ` Michael S. Tsirkin 2009-07-21 11:05 ` Or Gerlitz 2009-07-21 12:01 ` Michael S. Tsirkin 2009-07-21 12:14 ` Herbert Xu 2009-07-21 13:41 ` Or Gerlitz [not found] ` <5b31733c0907011250i7afcdbcdnb844290de4ad64f2@mail.gmail.com> 2009-07-02 12:08 ` [Qemu-devel] [PATCH] net: add raw backend Or Gerlitz 2009-07-02 15:43 ` Michael S. Tsirkin 2009-07-07 14:45 ` Or Gerlitz 2009-07-07 14:49 ` Michael S. Tsirkin 2009-07-08 14:46 ` Or Gerlitz 2009-07-08 15:06 ` Or Gerlitz
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).