* [Qemu-devel] [PATCH] net: add raw backend @ 2009-07-01 15:46 Or Gerlitz 2009-07-01 16:21 ` Jamie Lokier ` (2 more replies) 0 siblings, 3 replies; 30+ messages in thread From: Or Gerlitz @ 2009-07-01 15:46 UTC (permalink / raw) To: qemu-devel; +Cc: Herbert Xu Add raw network backend option which uses a packet socket to provide raw networking access. Once the socket is opened its bouned to a provided host interface, such that packets received on the interface are delivered to the VM and packets sent by the VM are sent to the interface. Signed-off-by: Or Gerlitz<ogerlitz@voltaire.com> diff --git a/net.c b/net.c index 55f70f2..f7ff381 100644 --- a/net.c +++ b/net.c @@ -93,6 +93,9 @@ #endif #endif +#include <netpacket/packet.h> +#include <net/ethernet.h> + #if defined(__OpenBSD__) #include <util.h> #endif @@ -1476,6 +1479,155 @@ static TAPState *net_tap_init(VLANState *vlan, const char *model, #endif /* !_WIN32 */ +typedef struct RAWState { + VLANClientState *vc; + int fd; + uint8_t buf[4096]; + int promisc; +} RAWState; + +static int net_raw_fd_init(Monitor *mon, const char *ifname, int promisc) +{ + int fd, ret; + struct ifreq req; + struct sockaddr_ll lladdr; + + fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); + if (fd < 0) + config_error(mon, "packet socket failed\n"); + + memset(&req, 0, sizeof(req)); + strncpy(req.ifr_name, ifname, IFNAMSIZ-1); + ret = ioctl(fd, SIOCGIFINDEX, &req); + if (ret < 0) + config_error(mon, "SIOCGIFINDEX failed\n"); + + memset(&lladdr, 0, sizeof(lladdr)); + lladdr.sll_family = AF_PACKET; + lladdr.sll_protocol = htons(ETH_P_ALL); + lladdr.sll_ifindex = req.ifr_ifindex; + ret = bind(fd, (const struct sockaddr *)&lladdr, sizeof(lladdr)); + if (ret < 0) + config_error(mon, "bind failed\n"); + + /* set iface to promiscuous mode (packets sent to the VM MAC) */ + if (promisc) { + ret = ioctl(fd, SIOCGIFFLAGS, &req); + if (ret < 0) + perror("SIOCGIFFLAGS failed\n"); + req.ifr_flags |= IFF_PROMISC; + ret = ioctl(fd, SIOCSIFFLAGS, &req); + 
if (ret < 0) + config_error(mon, "SIOCSIFFLAGS to promiscous failed\n"); + } + + ret = fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_NONBLOCK); + if (ret < 0) + config_error(mon, "O_NONBLOCK set failed\n"); + + return fd; +} + +static void raw_cleanup(VLANClientState *vc) +{ + struct ifreq req; + RAWState *s = vc->opaque; + + qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL); + if (s->promisc) { + ioctl(s->fd, SIOCGIFFLAGS, &req); + req.ifr_flags &= ~IFF_PROMISC; + ioctl(s->fd, SIOCSIFFLAGS, &req); + } + close(s->fd); + qemu_free(s); +} + +static void raw_send(void *opaque); + +static int raw_can_send(void *opaque) +{ + RAWState *s = opaque; + + return qemu_can_send_packet(s->vc); +} + +static void raw_send_completed(VLANClientState *vc, ssize_t len) +{ + RAWState *s = vc->opaque; + + qemu_set_fd_handler2(s->fd, raw_can_send, raw_send, NULL, s); +} + +static void raw_send(void *opaque) +{ + RAWState *s = opaque; + int size; + + do { + size = recv(s->fd, s->buf, sizeof(s->buf), MSG_TRUNC); + if (size <= 0) + break; + + size = qemu_send_packet_async(s->vc, s->buf, size, + raw_send_completed); + if (size == 0) + qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL); + + } while (size > 0); +} + +static ssize_t raw_receive_iov(VLANClientState *vc, const struct iovec *iov, + int iovcnt) +{ + ssize_t len; + RAWState *s = vc->opaque; + + do { + len = writev(s->fd, iov, iovcnt); + } while (len == -1 && (errno == EINTR || errno == EAGAIN)); + + return len; +} + +static ssize_t raw_receive(VLANClientState *vc, const uint8_t *buf, size_t size) +{ + struct iovec iov[1]; + + iov[0].iov_base = (char *)buf; + iov[0].iov_len = size; + + return raw_receive_iov(vc, iov, 1); +} + +static int net_raw_init(Monitor *mon, VLANState *vlan, const char *model, + const char *name, const char *ifname, + int promisc, int fd) +{ + RAWState *s; + + s = qemu_mallocz(sizeof(RAWState)); + + if (fd == -1) { + s->fd = net_raw_fd_init(mon, ifname, promisc); + s->promisc = promisc; + } else + s->fd = fd; + 
+ s->vc = qemu_new_vlan_client(vlan, model, name, NULL, raw_receive, + raw_receive_iov, raw_cleanup, s); + qemu_set_fd_handler2(s->fd, raw_can_send, raw_send, NULL, s); + + if (fd == -1) + snprintf(s->vc->info_str, sizeof(s->vc->info_str), + "raw: ifname=%s, promisc=%d", ifname, promisc); + else + snprintf(s->vc->info_str, sizeof(s->vc->info_str), + "raw: fd=%d", fd); + + return 0; +} + #if defined(CONFIG_VDE) typedef struct VDEState { VLANClientState *vc; @@ -2348,6 +2500,41 @@ int net_client_init(Monitor *mon, const char *device, const char *p) } } else #endif + if (!strcmp(device, "raw")) { + char chkbuf[64], ifname[64]; + int raw_fd = -1; + int promisc = 1; + if (get_param_value(buf, sizeof(buf), "fd", p) > 0) { + static const char * const fd_params[] = { + "vlan", "name", "fd", NULL + }; + if (check_params(chkbuf, sizeof(chkbuf), fd_params, p) < 0) { + config_error(mon, "invalid parameter '%s' in '%s'\n", chkbuf, p); + ret = -1; + goto out; + } + raw_fd = strtol(buf, NULL, 0); + fcntl(raw_fd, F_SETFL, fcntl(raw_fd, F_GETFL | O_NONBLOCK)); + } else { + static const char * const tap_params[] = { + "vlan", "name", "ifname", "promisc", NULL + }; + if (check_params(chkbuf, sizeof(chkbuf), tap_params, p) < 0) { + config_error(mon, "invalid parameter '%s' in '%s'\n", chkbuf, p); + ret = -1; + goto out; + } + if (get_param_value(ifname, sizeof(ifname), "ifname", p) <= 0) { + config_error(mon, "raw: no interface name\n"); + ret = -1; + goto out; + } + if (get_param_value(buf, sizeof(buf), "promisc", p)) + promisc = atoi(buf); + } + vlan->nb_host_devs++; + ret = net_raw_init(mon, vlan, device, name, ifname, promisc, raw_fd); + } else if (!strcmp(device, "socket")) { char chkbuf[64]; if (get_param_value(buf, sizeof(buf), "fd", p) > 0) { diff --git a/qemu-options.hx b/qemu-options.hx index 503da33..0a3c807 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -761,6 +761,10 @@ DEF("net", HAS_ARG, QEMU_OPTION_net, " use 'sndbuf=nbytes' to limit the size of the send 
buffer\n" #endif #endif + "-net raw[,vlan=n][,name=str],ifname=name[,promisc=m]\n" + " bound the host network interface to VLAN 'n' in a raw manner:\n" + " packets received on the interface are delivered to the vlan and\n" + " packets delivered on the vlan are sent to the interface\n" "-net socket[,vlan=n][,name=str][,fd=h][,listen=[host]:port][,connect=host:port]\n" " connect the vlan 'n' to another VLAN using a socket connection\n" "-net socket[,vlan=n][,name=str][,fd=h][,mcast=maddr:port]\n" ^ permalink raw reply related [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-01 15:46 [Qemu-devel] [PATCH] net: add raw backend Or Gerlitz @ 2009-07-01 16:21 ` Jamie Lokier 2009-07-02 12:25 ` Or Gerlitz [not found] ` <5b31733c0907011250i7afcdbcdnb844290de4ad64f2@mail.gmail.com> 2009-07-02 15:43 ` Michael S. Tsirkin 2 siblings, 1 reply; 30+ messages in thread From: Jamie Lokier @ 2009-07-01 16:21 UTC (permalink / raw) To: Or Gerlitz; +Cc: Herbert Xu, qemu-devel Or Gerlitz wrote: > Add raw network backend option which uses a packet socket to provide > raw networking access. Once the socket is opened its bouned to a > provided host interface, such that packets received on the interface > are delivered to the VM and packets sent by the VM are sent to the > interface. It looks like it would be functionally identical to the pcap network backend, with the same advantages and problems, but one less dependency on an external library (and removes any compatibility with ancient kernels that are in libpcap, but QEMU doesn't work on them anyway). -- Jamie ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-01 16:21 ` Jamie Lokier @ 2009-07-02 12:25 ` Or Gerlitz 2009-07-03 2:39 ` Jamie Lokier 0 siblings, 1 reply; 30+ messages in thread From: Or Gerlitz @ 2009-07-02 12:25 UTC (permalink / raw) To: Jamie Lokier; +Cc: Herbert Xu, qemu-devel Jamie Lokier wrote: > It looks like it would be functionally identical to the pcap network backend, > with the same advantages and problems, but one less dependency on an external > library (and removes any compatibility with ancient kernels that are in libpcap, > but QEMU doesn't work on them anyway). Assuming that by "pcap network backend" you refer to the -net dump backend, I don't think this (the idea/patch being functionally identical) is the case with the current code: the dump backend opens a file and writes there packets received from the vlan in pcap format, that's all. So first, there's no direct linking with libpcap, and more important, the packets aren't going to the network through the dump backend. With the raw backend, there's no pcap file, but packets are sent to and received from the network through packet socket. Or. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-02 12:25 ` Or Gerlitz @ 2009-07-03 2:39 ` Jamie Lokier 2009-07-07 13:33 ` Or Gerlitz 0 siblings, 1 reply; 30+ messages in thread From: Jamie Lokier @ 2009-07-03 2:39 UTC (permalink / raw) To: Or Gerlitz; +Cc: Herbert Xu, qemu-devel Or Gerlitz wrote: > Jamie Lokier wrote: > > It looks like it would be functionally identical to the pcap > > network backend, with the same advantages and problems, but one > > less dependency on an external library (and removes any > > compatibility with ancient kernels that are in libpcap, but QEMU > > doesn't work on them anyway). > > Assuming that by "pcap network backend" you refer to the -net dump > backend, I don't think this (the idea/patch being functionally > identical) is the case with the current code: the dump backend opens a > file and writes there packets received from the vlan in pcap format, > that's all. So first, there's no direct linking with libpcap, and more > important, the packets aren't going to the network through the dump > backend. With the raw backend, there's no pcap file, but packets are > sent to and received from the network through packet socket. No, I meant "-net pcap", whose patch is announced at http://lists.freebsd.org/pipermail/freebsd-emulation/2007-February/003108.html and found here: http://people.freebsd.org/~jkim/patch-qemu-pcap.diff I saw it mention on this list in the last few months. It sends and receives packets over a host network interface using libpcap. -- Jamie ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-03 2:39 ` Jamie Lokier @ 2009-07-07 13:33 ` Or Gerlitz 2009-07-07 14:57 ` Jamie Lokier 0 siblings, 1 reply; 30+ messages in thread From: Or Gerlitz @ 2009-07-07 13:33 UTC (permalink / raw) To: Jamie Lokier, Anthony Liguori; +Cc: Herbert Xu, Jan Kiszka, qemu-devel Jamie Lokier wrote: > No, I meant "-net pcap", whose patch is announced at > http://lists.freebsd.org/pipermail/freebsd-emulation/2007-February/003108.html > and found here: http://people.freebsd.org/~jkim/patch-qemu-pcap.diff I saw it mention on this list in the last few months. It sends and receives packets over a host network interface using libpcap. okay, looking a bit on the archives I realized that the -net pcap backend was suggested on the qemu-devel list at least twice, on 2007 (your pointer) and recently, e.g @ http://lists.gnu.org/archive/html/qemu-devel/2009-03/msg00895.html and the mail threads that followed from March, April and May 2009. Under Linux the pcap library uses a packet socket, so basically it could make some sense to go through libpcap and not directly to sockets, but there are also some disadvantages which will not let qemu implement some related optimizations which are not integrated into libpcap. Now, before going into libpcap vs. packet socket, I'd be happy if you or Anthony can help me understand the comments that with this approach guest <--> host communication is impossible. AFAIK, the only packet sent by Qemu is gratuitous ARP after migration, but it doesn't fall into guest <--> host communication over the NIC/vlan/back-end gang, so I don't see what is this traffic which is impossible with the bridge-less approach. Also, if for some reason one needs to communicate from the guest to the host the -net raw can be instructed to run over a veth Linux interface couple which are connected to a bridge. Or. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-07 13:33 ` Or Gerlitz @ 2009-07-07 14:57 ` Jamie Lokier 2009-07-08 14:45 ` Or Gerlitz 0 siblings, 1 reply; 30+ messages in thread From: Jamie Lokier @ 2009-07-07 14:57 UTC (permalink / raw) To: Or Gerlitz; +Cc: Herbert Xu, Jan Kiszka, qemu-devel Or Gerlitz wrote: > Now, before going into libpcap vs. packet socket, I'd be happy if you or > Antony can help me understand the comments that with this approach guest > <--> host communication is impossible. AFAIK, the only packet sent by > Qemu is gratuitous ARP after migration, but it doesn't fall into guest > <--> host communication over the NIC/vlan/back-end gang, so I don't see > what is this traffic which is impossible with the bridge-less approach. The problem is simply what the guest sends goes out on the network and is not looped back to the host network stack, and vice versa. So if your host is 192.168.1.1 and is running a DNS server (say), and the guest is 192.168.1.2, when the guest sends queries to 192.168.1.1 the host won't see those queries. Same if you're running an FTP server on the host and the guest wants to connect to it, etc. It also means multiple guests can't see each other, for the same reason. So it's much less useful than bridging, where the guests and host can all see each other and connect to each other. Unfortunately, bridging is a pain to set up, if your host has any complicated or automatic network configuration already. For example, it's impossible to make it work with NetworkManager, which manages a mobile host's wired and wireless network interfaces. (I want to use lots of VMs on my laptop, and I want them to appear on the same network as whatever my host is connecting to automatically. Bridging - the way Linux does it - makes that difficult). The main advantage of pcap/packet is you don't need potentially difficult to configure bridges set up on the host network especially to support VMs. 
pcap/packet just works - as long as you don't need any host<->guest or guest<->guest communication. The main advantage of bridging is you get simple, complete host<->guest and guest<->guest communication - (except the host can firewall them). It would be really nice to find a way which has the advantages of both. Either by adding a different bridging mode to Linux, where host interfaces can be configured for IP and the bridge hangs off the host interface, or by a modified tap interface, or by an alternative pcap/packet-like interface which forwards packets in a similar way to bridging. (Maybe those are different ways of saying the same thing.) > Also, if for some reason one need to communicate from the guest to the > host the -net raw can instructed to run over a veth Linux interface > couple which are connected to a bridge. But you might as well use a tap interface connected to a bridge instead. tap or veth+packet are virtually equivalent, and they both have the same bridging configuration problems if the host network is interesting. -- Jamie ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-07 14:57 ` Jamie Lokier @ 2009-07-08 14:45 ` Or Gerlitz 2009-07-14 13:54 ` Or Gerlitz 2009-07-15 20:38 ` Jamie Lokier 0 siblings, 2 replies; 30+ messages in thread From: Or Gerlitz @ 2009-07-08 14:45 UTC (permalink / raw) To: Jamie Lokier; +Cc: Herbert Xu, Jan Kiszka, qemu-devel Jamie Lokier wrote: > The problem is simply what the guest sends goes out on the network and is not looped backed to the host network stack, and vice versa. So if your host is 192.168.1.1 and is running a DNS server (say), and the guest is 192.168.1.2, when the guest sends queries to 192.168.1.1 the host won't see those queries. Same if you're running an FTP server on the host and the guest wants to connect to it, etc. It also means multiple guests can't see each other, for the same reason. So it's much less useful than bridging, where the guests and host can all see each other and connect to each other. I wasn't sure to follow if your example refers to the case when networking uses the bridge or NAT. If its bridge, then through which bridge interface the packet arrives the host stack? say you have a bridge whose attached interfaces are tap1(VM1), tap2(VM2) and eth0(NIC), in your example did you mean that the host IP address is assigned to the bridge interface? or you were referring a NAT based scheme? > Unfortunately, bridging is a pain to set up, if your host has any complicated or automatic network configuration already. As you said bridging requires more configuration, but not less important the performance (packets per second and cpu utilization) one can get with bridge+tap is much lower vs what you get with the raw mode approach. All in all, its clear that with this approach VM/VM and VM/Host communication would have to get switched either at the NIC (e.g SR/IOV capable NICs supporting a virtual bridge) or at an external switch and make a U turn. 
There's a bunch of reasons why people would like to do that, among them performance boost, the ability to shape, manage and monitor VM/VM traffic in external switches and more. > It would be really nice to find a way which has the advantages of both. Either by adding a different bridging mode to Linux, where host interfaces can be configured for IP and the bridge hangs off the host interface, or by a modified tap interface, or by an alternative > pcap/packet-like interface which forwards packets in a similar way to bridging. It seems that this will not yield the performance improvement we can get with going directly to the NIC. But if someone comes up and makes such a mode working, it can be merged into qemu as well along with the raw mode. Or. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-08 14:45 ` Or Gerlitz @ 2009-07-14 13:54 ` Or Gerlitz 2009-07-15 20:38 ` Jamie Lokier 1 sibling, 0 replies; 30+ messages in thread From: Or Gerlitz @ 2009-07-14 13:54 UTC (permalink / raw) To: Jamie Lokier; +Cc: Herbert Xu, Jan Kiszka, qemu-devel Or Gerlitz wrote: > Jamie Lokier wrote: >> The problem is simply what the guest sends goes out on the network >> and is not looped backed to the host network stack, and vice versa [...] > I wasn't sure to follow if your example refers to the case when > networking uses the bridge or NAT. If its bridge, then through which > bridge interface the packet arrives the host stack? say you have a > bridge whose attached interfaces are tap1(VM1), tap2(VM2) and > eth0(NIC), in your example did you mean that the host IP address is > assigned to the bridge interface? or you were referring a NAT based > scheme? Hi Jamie, Can you comment on my email? Or. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-08 14:45 ` Or Gerlitz 2009-07-14 13:54 ` Or Gerlitz @ 2009-07-15 20:38 ` Jamie Lokier 2009-07-15 21:06 ` Jan Kiszka ` (2 more replies) 1 sibling, 3 replies; 30+ messages in thread From: Jamie Lokier @ 2009-07-15 20:38 UTC (permalink / raw) To: Or Gerlitz; +Cc: Herbert Xu, Jan Kiszka, qemu-devel Or Gerlitz wrote: > Jamie Lokier wrote: > >The problem is simply what the guest sends goes out on the network and is > >not looped backed to the host network stack, and vice versa. So if your > >host is 192.168.1.1 and is running a DNS server (say), and the guest is > >192.168.1.2, when the guest sends queries to 192.168.1.1 the host won't > >see those queries. Same if you're running an FTP server on the host and > >the guest wants to connect to it, etc. It also means multiple guests can't > >see each other, for the same reason. So it's much less useful than > >bridging, where the guests and host can all see each other and connect to > >each other. > I wasn't sure to follow if your example refers to the case when > networking uses the bridge or NAT. If its bridge, then through which > bridge interface the packet arrives the host stack? say you have a > bridge whose attached interfaces are tap1(VM1), tap2(VM2) and eth0(NIC), > in your example did you mean that the host IP address is assigned to the > bridge interface? or you were referring a NAT based scheme? When using a bridge, you set the IP address on the bridge itself (for example, br0). DHCP runs on the bridge itself, so does the rest of the Linux host stack, although you can use raw sockets on the other interfaces. But reading and controlling the hardware is done on the interfaces. So if you have some program like NetworkManager which checks if you have a wire plugged into eth0, it has to read eth0 to get the wire status, but it has to run DHCP on br0. 
Those programs don't generally have that option, which makes bridges difficult to use for VMs in a transparent way. I wasn't referring to NAT, but you can use NAT with a bridge on Linux; it's called brouting :-) > >Unfortunately, bridging is a pain to set up, if your host has any > >complicated or automatic network configuration already. > As you said bridging requires more configuration A bridge is quite simple to configure. Unfortunately because Linux requires all the IP configuration on the bridge device, but network device control on the network device, bridges don't work well with automatic configuration tools. If you could apply host IP configuration to the network device and still have a bridge, that would be perfect. You would just create br0, add tap1(VM1), tap2(VM2) and eth0(NIC), and everything would work perfectly. > but not less important the performance (packets per second and cpu > utilization) one can get with bridge+tap is much lower vs what you > get with the raw mode approach. Have you measured it? > All in all, its clear that with this approach VM/VM and VM/Host > communication would have to get switched either at the NIC (e.g > SR/IOV capable NICs supporting a virtual bridge) or at an external > switch and make a U turn. Unfortunately that's usually impossible. Most switches don't do U turns, and a lot of simple networks don't have any switches except a home router. > There's a bunch of reasons why people would > like to do that, among them performance boost, No, it makes performance _much_ worse if you have packets leaving the host, do a U turn and come back on the same link. Much better to use a bridge inside the host. Probably ten times faster because host's internal networking is much faster than a typical gigabit link :-) > the ability to shape, > manage and monitor VM/VM traffic in external switches and more. 
That could be useful, but I think it's probably quite unusual for someone to want to shape traffic between a VM and its own host. Also if you want to do that, you can do it inside the host. Sometimes it would be useful to send it outside the host and U turn, but not very often; only for diagnostics I would think. And even that can be done with Linux bridges, using VLANs :-) > >It would be really nice to find a way which has the advantages of both. > >Either by adding a different bridging mode to Linux, where host interfaces > >can be configured for IP and the bridge hangs off the host interface, or > >by a modified tap interface, or by an alternative > >pcap/packet-like interface which forwards packets in a similar way to > >bridging. > It seems that this will not yield the performance improvement we can > get with going directly to the NIC. If you don't need any host<->VM networking, maybe a raw packet socket is faster. But are you sure it's faster? I'd want to see measurements before I believe it. If you need any host<->VM networking, most of the time the packet socket isn't an option at all. Not many switches will 'U turn' packets as you suggest. -- Jamie ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-15 20:38 ` Jamie Lokier @ 2009-07-15 21:06 ` Jan Kiszka 2009-07-15 21:52 ` Jamie Lokier 2009-07-16 8:29 ` Or Gerlitz 2009-07-20 14:13 ` [Qemu-devel] [PATCH] net: add raw backend - some performance measurements Or Gerlitz 2 siblings, 1 reply; 30+ messages in thread From: Jan Kiszka @ 2009-07-15 21:06 UTC (permalink / raw) To: Jamie Lokier; +Cc: Or Gerlitz, Herbert Xu, qemu-devel [-- Attachment #1: Type: text/plain, Size: 5249 bytes --] Jamie Lokier wrote: > Or Gerlitz wrote: >> Jamie Lokier wrote: >>> The problem is simply what the guest sends goes out on the network and is >>> not looped backed to the host network stack, and vice versa. So if your >>> host is 192.168.1.1 and is running a DNS server (say), and the guest is >>> 192.168.1.2, when the guest sends queries to 192.168.1.1 the host won't >>> see those queries. Same if you're running an FTP server on the host and >>> the guest wants to connect to it, etc. It also means multiple guests can't >>> see each other, for the same reason. So it's much less useful than >>> bridging, where the guests and host can all see each other and connect to >>> each other. >> I wasn't sure to follow if your example refers to the case when >> networking uses the bridge or NAT. If its bridge, then through which >> bridge interface the packet arrives the host stack? say you have a >> bridge whose attached interfaces are tap1(VM1), tap2(VM2) and eth0(NIC), >> in your example did you mean that the host IP address is assigned to the >> bridge interface? or you were referring a NAT based scheme? > > When using a bridge, you set the IP address on the bridge itself (for > example, br0). DHCP runs on the bridge itself, so does the rest of > the Linux host stack, although you can use raw sockets on the other > interfaces. > > But reading and controlling the hardware is done on the interfaces. 
> > So if you have some program like NetworkManager which checks if you > have a wire plugged into eth0, it has to read eth0 to get the wire > status, but it has to run DHCP on br0. > > Those programs don't generally have that option, which makes bridges > difficult to use for VMs in a transparent way. > > I wasn't referring to NAT, but you can use NAT with a bridge on Linux; > it's called brouting :-) > >>> Unfortunately, bridging is a pain to set up, if your host has any >>> complicated or automatic network configuration already. > >> As you said bridging requires more configuration > > A bridge is quite simple to configure. Unfortunately because Linux > requires all the IP configuration on the bridge device, but network > device control on the network device, bridges don't work well with > automatic configuration tools. > > If you could apply host IP configuration to the network device and > still have a bridge, that would be perfect. You would just create > br0, add tap1(VM1), tap2(VM2) and eth0(NIC), and everything would work > perfectly. > >> but not less important the performance (packets per second and cpu >> utilization) one can get with bridge+tap is much lower vs what you >> get with the raw mode approach. > > Have you measured it? > >> All in all, its clear that with this approach VM/VM and VM/Host >> communication would have to get switched either at the NIC (e.g >> SR/IOV capable NICs supporting a virtual bridge) or at an external >> switch and make a U turn. > > Unfortunately that's usually impossible. Most switches don't do U > turns, and a lot of simple networks don't have any switches except a > home router. > >> There's a bunch of reasons why people would >> like to do that, among them performance boost, > > No, it makes performance _much_ worse if you have packets leaving the > host, do a U turn and come back on the same link. Much better to use > a bridge inside the host. 
Probably ten times faster because host's > internal networking is much faster than a typical gigabit link :-) > >> the ability to shape, >> manage and monitor VM/VM traffic in external switches and more. > > That could be useful, but I think it's's probably quite unusual for > someone to want to shape traffic between a VM and it's own host. Also > if you want to do that, you can do it inside the host. > > Sometimes it would be useful to send it outside the host and U turn, > but not very often; only for diagnostics I would think. And even that > can be done with Linux bridges, using VLANs :-) > >>> It would be really nice to find a way which has the advantages of both. >>> Either by adding a different bridging mode to Linux, where host interfaces >>> can be configured for IP and the bridge hangs off the host interface, or >>> by a modified tap interface, or by an alternative >>> pcap/packet-like interface which forwards packets in a similar way to >>> bridging. > >> It seems that this will not yield the performance improvement we can >> get with going directly to the NIC. > > If you don't need any host<->VM networking, maybe a raw packet socket > is faster. > > But are you sure it's faster? > I'd want to see measurements before I believe it. > > If you need any host<->VM networking, most of the time the packet > socket isn't an option at all. Not many switches will 'U turn' > packets as you suggest. FWIW, the fastest local VM<->VM bridge I've happened to measure so far was using qemu's -net socket,listen/connect, ie. a plain local IP or unix domain socket between two qemu instances. No tap devices, no in-kernel bridges involved. But this picture may change once we have some in-kernel virtio-net backend. Jan [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 257 bytes --] ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-15 21:06 ` Jan Kiszka @ 2009-07-15 21:52 ` Jamie Lokier 0 siblings, 0 replies; 30+ messages in thread From: Jamie Lokier @ 2009-07-15 21:52 UTC (permalink / raw) To: Jan Kiszka; +Cc: Or Gerlitz, Herbert Xu, qemu-devel Jan Kiszka wrote: > > But are you sure it's faster? > > I'd want to see measurements before I believe it. > > > > If you need any host<->VM networking, most of the time the packet > > socket isn't an option at all. Not many switches will 'U turn' > > packets as you suggest. > > FWIW, the fastest local VM<->VM bridge I've happened to measure so far > was using qemu's -net socket,listen/connect, ie. a plain local IP or > unix domain socket between two qemu instances. No tap devices, no > in-kernel bridges involved. That's not surprising, but good to know. Packet sockets aren't much use for VM<->VM bridges either ;-) However on a positive note, if packet sockets give good performance for VM<->external, and a unix domain socket gives good performance for VM<->VM, maybe a packet socket on _lo_ (the loopback interface) can be used for VM<->host communication? Then with the right (ugly) hackery in QEMU, it could query the host's MAC addresses as well as other VMs on the same host, listen on all three types of interface, and send to the appropriate one depending on destination MAC address of each packet. That might give great performance in all cases and solve the bridge configuration problem at the same time, so that you can run VMs easily which Just Work(tm) on the host's network. Then again it might not. Code would be a bit complicated, it would interact with Linux iptables differently, and one of the most useful configurations which is VMs being NAT'd by the host (so invisible outside the host) would be difficult. > But this picture may change once we have some in-kernel virtio-net > backend. 
If there's a faster way to send/receive packets, especially if it _behaves_ differently from tap/packet, it would be nice if it were available from userspace too, not just from KVM. If virtio-net is growing a backend to send/receive packets via the host network stack, it would be nice if it solve the awkward bridge configuration problem at the same time. Do you know what direction that backend is going in? In my experience with VMs, they are always looked after by some host iptables rules for safety, and sometimes NAT rules depending on how they are to be visible outside, and with policy routing at times too. It would be great if those facilities still worked, and unfortunate if the new backend was only usable in quite limited configurations. -- Jamie ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-15 20:38 ` Jamie Lokier 2009-07-15 21:06 ` Jan Kiszka @ 2009-07-16 8:29 ` Or Gerlitz 2009-07-20 14:13 ` [Qemu-devel] [PATCH] net: add raw backend - some performance measurements Or Gerlitz 2 siblings, 0 replies; 30+ messages in thread From: Or Gerlitz @ 2009-07-16 8:29 UTC (permalink / raw) To: Jamie Lokier; +Cc: Herbert Xu, Jan Kiszka, qemu-devel Jamie Lokier wrote: > When using a bridge, you set the IP address on the bridge itself (for example, br0). DHCP runs on the bridge itself, so does the rest of the Linux host stack, although you can use raw sockets on the other interfaces. But reading and controlling the hardware is done on the interfaces. So if you have some program like NetworkManager which checks if you have a wire plugged into eth0, it has to read eth0 to get the wire status, but it has to run DHCP on br0. Yes, I understand that if the guest want to communicate with the host in a bridged environment, the IP has to be set on the bridge. With "DHCP" do you refer to dhcp server or to dhcp relay or something else? I assume its not a server, since you mentioned a NON NAT environment. > A bridge is quite simple to configure. Unfortunately because Linux requires all the IP configuration on the bridge device, but network device control on the network device, bridges don't work well with automatic configuration tools. seems like this scheme/problem is similar to bonding, where the IP configuration is done to the bond device but people may still want to do control the slave devices, I am not sure why such tools need the device to have an IP, but it seems less relevant for this thread. > Have you measured it? Yes, I will send soon some data. > Unfortunately that's usually impossible. 
Most switches don't do U turns, and a lot of simple networks don't have any switches except a home router Again, as I wrote, the U turn can be done in three places: software bridge, virtual HW bridge inside the NIC, or at an external switch. With virtualization becoming more common, options 2 and 3 will be more and more available, where the packet socket approach is valid for both of them. > No, it makes performance _much_ worse if you have packets leaving the host, do a U turn and come back on the same link. Much better to use a bridge inside the host. Probably ten times faster because host's internal networking is much faster than a typical gigabit link :-) My benchmark was focusing on packets per second for VM <--> world and not on VM/VM communication. I tend to think that with KVM and the raw mode or kernel virtio-net backend with both requiring U turn, the VM/VM performance will be no less in most if not all measures (namely, packets per second, cpu utilization, bandwidth, latency, etc). Still, it's quite clear that both these modes can be useful for people that want to max the VM <---> world communication performance. > Sometimes it would be useful to send it outside the host and U turn, but not very often; only for diagnostics I would think. And even that can be done with Linux bridges, using VLANs :-) mmm, I wasn't sure if you refer to Linux vlans (8021q devices) or the Qemu vlan... can you elaborate? > If you don't need any host<->VM networking, maybe a raw packet socket is faster > But are you sure it's faster? I'd want to see measurements before I believe it. fair enough Or. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend - some performance measurements 2009-07-15 20:38 ` Jamie Lokier 2009-07-15 21:06 ` Jan Kiszka 2009-07-16 8:29 ` Or Gerlitz @ 2009-07-20 14:13 ` Or Gerlitz 2009-07-20 15:53 ` Herbert Xu 2 siblings, 1 reply; 30+ messages in thread From: Or Gerlitz @ 2009-07-20 14:13 UTC (permalink / raw) To: Jamie Lokier Cc: Mark McLoughlin, Michael S. Tsirkin, Herbert Xu, Dor Laor, qemu-devel, Jan Kiszka Jamie Lokier wrote: > Or Gerlitz wrote: >> the performance (packets per second and cpu utilization) one can get >> with bridge+tap is much lower vs what you get with the raw mode approach. > Have you measured it? yes, here's some data: using 2.6.29.1 in the guest, 2.6.30 in the host, with 1Gbe connectivity (Intel 82575EB) between the two nodes, I see the following results: with -net raw (packet socket) pps cs us sys vm->phys 240k 200 7 8 phys->vm 160k 100 5 7 with -net tap (tap + bridge) pps cs us sys vm->phys 170k 600 5 10 phys->vm 150k 14k 5 20 where "pps" stands for packets-per-second, "cs", "us" and "sys" are taken from vmstat output, such that they represent the context switches per second, user and system time percents. The benchmark I use is netperf 2.4.4 / UDP_STREAM with 22 bytes payload length such that there are 64(=14+20+8+22) bytes on the wire. On this setup (udp, 64 byte frames), doing phys->phys test, netperf sends/receives 450K pps and pktgen sends 900K pps, all tests done without any interrupt moderation tuning. You can see that the raw mode has much better packets per second for the VM TX flow, and on the VM RX side, a bit better pps rate but much lower cpu utilization and context switches number. Or. All this on top of mainstream qemu whose head is commit 8676188b751ca28ab7c42baf20ea64391625b44d "Work around Solaris gas problem" ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend - some performance measurements 2009-07-20 14:13 ` [Qemu-devel] [PATCH] net: add raw backend - some performance measurements Or Gerlitz @ 2009-07-20 15:53 ` Herbert Xu 2009-07-20 18:20 ` Michael S. Tsirkin 2009-07-21 7:03 ` Or Gerlitz 0 siblings, 2 replies; 30+ messages in thread From: Herbert Xu @ 2009-07-20 15:53 UTC (permalink / raw) To: Or Gerlitz Cc: Mark McLoughlin, Michael S. Tsirkin, Dor Laor, qemu-devel, Jan Kiszka On Mon, Jul 20, 2009 at 05:13:06PM +0300, Or Gerlitz wrote: > > with -net tap (tap + bridge) Is netfilter enabled on the bridge? If so you need to turn it off because it's a huge security hole for virtualisation and slows it down heaps. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend - some performance measurements 2009-07-20 15:53 ` Herbert Xu @ 2009-07-20 18:20 ` Michael S. Tsirkin 2009-07-21 1:46 ` Herbert Xu 2009-07-21 7:03 ` Or Gerlitz 1 sibling, 1 reply; 30+ messages in thread From: Michael S. Tsirkin @ 2009-07-20 18:20 UTC (permalink / raw) To: Herbert Xu; +Cc: Mark McLoughlin, Dor Laor, qemu-devel, Or Gerlitz, Jan Kiszka On Mon, Jul 20, 2009 at 11:53:08PM +0800, Herbert Xu wrote: > On Mon, Jul 20, 2009 at 05:13:06PM +0300, Or Gerlitz wrote: > > > > with -net tap (tap + bridge) > > Is netfilter enabled on the bridge? If so you need to turn it off > because it's a huge security hole for virtualisation How is it a security hole? > and slows it > down heaps. > > Cheers, ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend - some performance measurements 2009-07-20 18:20 ` Michael S. Tsirkin @ 2009-07-21 1:46 ` Herbert Xu 0 siblings, 0 replies; 30+ messages in thread From: Herbert Xu @ 2009-07-21 1:46 UTC (permalink / raw) To: Michael S. Tsirkin Cc: Mark McLoughlin, Dor Laor, qemu-devel, Or Gerlitz, Jan Kiszka On Mon, Jul 20, 2009 at 09:20:32PM +0300, Michael S. Tsirkin wrote: > > > Is netfilter enabled on the bridge? If so you need to turn it off > > because it's a huge security hole for virtualisation > > How is it a security hole? Because bridge netfilter will perform defragmentation and conntrack, both of which are global in scope. That means packets from two unrelated bridges can be treated exactly as the same if their IP addresses/port numbers are identical, causing information leakage or worse, allowing an attacker to modify others' traffic. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend - some performance measurements 2009-07-20 15:53 ` Herbert Xu 2009-07-20 18:20 ` Michael S. Tsirkin @ 2009-07-21 7:03 ` Or Gerlitz 2009-07-21 7:25 ` Herbert Xu 1 sibling, 1 reply; 30+ messages in thread From: Or Gerlitz @ 2009-07-21 7:03 UTC (permalink / raw) To: Herbert Xu Cc: Mark McLoughlin, Michael S. Tsirkin, Dor Laor, qemu-devel, Jan Kiszka Herbert Xu wrote: > Is netfilter enabled on the bridge? If so you need to turn it off okay, when setting net.bridge.bridge-nf-call-iptables to zero, the VM TX / tap+bridge packet rate climbs from 170K to 195K but it is still way below the 240K rate achieved by the raw mode --> we have now a clear sign on the performance gain this approach provides. Or. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend - some performance measurements 2009-07-21 7:03 ` Or Gerlitz @ 2009-07-21 7:25 ` Herbert Xu 2009-07-21 10:17 ` Or Gerlitz 2009-07-21 10:27 ` Michael S. Tsirkin 0 siblings, 2 replies; 30+ messages in thread From: Herbert Xu @ 2009-07-21 7:25 UTC (permalink / raw) To: Or Gerlitz Cc: Mark McLoughlin, Michael S. Tsirkin, netdev, Dor Laor, qemu-devel, Jan Kiszka On Tue, Jul 21, 2009 at 10:03:00AM +0300, Or Gerlitz wrote: > > okay, when setting net.bridge.bridge-nf-call-iptables to zero, the VM TX / tap+bridge packet rate climbs from 170K to 195K but it still way beyond the 240K rate achieved by the raw mode --> we have now a clear sign on the performance gain this approach provides. I find this hard to believe this bridge sans netfilter does a single lookup based on the MAC address and then just passes the packet to the underlying driver. Can you do an oprofile run to see if something else is chewing up CPU time under the guise of bridging? Thanks, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend - some performance measurements 2009-07-21 7:25 ` Herbert Xu @ 2009-07-21 10:17 ` Or Gerlitz 2009-07-21 10:27 ` Michael S. Tsirkin 1 sibling, 0 replies; 30+ messages in thread From: Or Gerlitz @ 2009-07-21 10:17 UTC (permalink / raw) To: Herbert Xu Cc: Mark McLoughlin, Michael S. Tsirkin, netdev, Dor Laor, qemu-devel, Jan Kiszka Herbert Xu wrote: > I find this hard to believe this bridge sans netfilter does a single lookup based > on the MAC address and then just passes the packet to the underlying driver. > Can you do an oprofile run to see if something else is chewing > up CPU time under the guise of bridging? okay, here are the top twenty time consumers for the three VM TX modes, the bridge code is not anywhere high... I'll send you the complete oprofile logs. Or. VM TX with the raw mode --> samples % image name app name symbol name 697453 25.2468 kvm-intel.ko kvm_intel vmx_vcpu_run 105024 3.8017 vmlinux vmlinux _raw_spin_lock 95443 3.4549 igb.ko igb igb_xmit_frame_adv 68617 2.4838 vmlinux vmlinux __slab_free 68168 2.4676 qemu-system-x86_64 qemu-system-x86_64 cpu_physical_memory_rw 56272 2.0370 vmlinux vmlinux tg_shares_up 48573 1.7583 igb.ko igb igb_clean_tx_irq 46128 1.6698 libc-2.5.so libc-2.5.so memcpy 44371 1.6062 vmlinux vmlinux kmem_cache_alloc 41485 1.5017 vmlinux vmlinux __alloc_skb 38719 1.4016 qemu-system-x86_64 qemu-system-x86_64 phys_page_find_alloc 38016 1.3761 vmlinux vmlinux copy_user_generic_string 37690 1.3643 qemu-system-x86_64 qemu-system-x86_64 qemu_get_ram_ptr 34321 1.2424 vmlinux vmlinux dev_kfree_skb_irq 34313 1.2421 vmlinux vmlinux __kmalloc_track_caller 28726 1.0398 vmlinux vmlinux sock_alloc_send_pskb 25195 0.9120 vmlinux vmlinux kfree 24790 0.8974 vmlinux vmlinux __slab_alloc 23406 0.8473 vmlinux vmlinux dev_queue_xmit VM TX with the tap/bridge+netfilter OFF mode --> samples % image name app name symbol name 447119 21.5219 kvm-intel.ko kvm_intel vmx_vcpu_run 70774 3.4067 igb.ko igb 
igb_xmit_frame_adv 66324 3.1925 vmlinux vmlinux _raw_spin_lock 53817 2.5905 vmlinux vmlinux __slab_free 47494 2.2861 vmlinux vmlinux tg_shares_up 47213 2.2726 qemu-system-x86_64 qemu-system-x86_64 cpu_physical_memory_rw 40364 1.9429 igb.ko igb igb_clean_tx_irq 39545 1.9035 vmlinux vmlinux kmem_cache_alloc 36027 1.7341 libc-2.5.so libc-2.5.so memcpy 34945 1.6821 vmlinux vmlinux __alloc_skb 29747 1.4319 vmlinux vmlinux dev_kfree_skb_irq 29145 1.4029 vmlinux vmlinux __kmalloc_track_caller 28680 1.3805 vmlinux vmlinux copy_user_generic_string 26251 1.2636 qemu-system-x86_64 qemu-system-x86_64 phys_page_find_alloc 25123 1.2093 qemu-system-x86_64 qemu-system-x86_64 qemu_get_ram_ptr 23231 1.1182 vmlinux vmlinux eth_type_trans 22356 1.0761 vmlinux vmlinux sock_alloc_send_pskb 22108 1.0642 vmlinux vmlinux __slab_alloc 21288 1.0247 vmlinux vmlinux kfree VM TX with the tap/bridge+netfilter ON mode --> samples % image name app name symbol name 319271 21.1411 kvm-intel.ko kvm_intel vmx_vcpu_run 46559 3.0830 vmlinux vmlinux _raw_spin_lock 39703 2.6290 vmlinux vmlinux tg_shares_up 35773 2.3688 vmlinux vmlinux __slab_free 35045 2.3206 qemu-system-x86_64 qemu-system-x86_64 cpu_physical_memory_rw 32612 2.1595 igb.ko igb igb_xmit_frame_adv 31779 2.1043 vmlinux vmlinux kmem_cache_alloc 29134 1.9292 libc-2.5.so libc-2.5.so memcpy 23031 1.5250 vmlinux vmlinux copy_user_generic_string 19713 1.3053 vmlinux vmlinux __kmalloc_track_caller 19303 1.2782 qemu-system-x86_64 qemu-system-x86_64 phys_page_find_alloc 19038 1.2606 vmlinux vmlinux __alloc_skb 18559 1.2289 vmlinux vmlinux kfree 18460 1.2224 qemu-system-x86_64 qemu-system-x86_64 qemu_get_ram_ptr 18409 1.2190 vmlinux vmlinux eth_type_trans 17828 1.1805 igb.ko igb igb_clean_tx_irq 17622 1.1669 igb.ko igb igb_poll 17303 1.1457 vmlinux vmlinux __slab_alloc 17033 1.1279 vmlinux vmlinux dev_kfree_skb_irq ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend - some performance measurements 2009-07-21 7:25 ` Herbert Xu 2009-07-21 10:17 ` Or Gerlitz @ 2009-07-21 10:27 ` Michael S. Tsirkin 2009-07-21 11:05 ` Or Gerlitz 1 sibling, 1 reply; 30+ messages in thread From: Michael S. Tsirkin @ 2009-07-21 10:27 UTC (permalink / raw) To: Herbert Xu Cc: Mark McLoughlin, netdev, Dor Laor, qemu-devel, Or Gerlitz, Jan Kiszka On Tue, Jul 21, 2009 at 03:25:46PM +0800, Herbert Xu wrote: > On Tue, Jul 21, 2009 at 10:03:00AM +0300, Or Gerlitz wrote: > > > > okay, when setting net.bridge.bridge-nf-call-iptables to zero, the VM TX / tap+bridge packet rate climbs from 170K to 195K but it still way beyond the 240K rate achieved by the raw mode --> we have now a clear sign on the performance gain this approach provides. > > I find this hard to believe this bridge sans netfilter does a > single lookup based on the MAC address and then just passes the > packet to the underlying driver. One advantage that raw sockets have over tap+bridge, is that they do not do their own TX buffering, but use the TX queue for the device directly. With raw sockets, send will block or fail if the TX queue for device is full. With tap+bridge, the buffer in tap has to fill up instead, which is not the same. I'm not sure this is the issue here, but could be: the benchmark is UDP, isn't it? > Can you do an oprofile run to see if something else is chewing > up CPU time under the guise of bridging? > > Thanks, > -- > Visit Openswan at http://www.openswan.org/ > Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> > Home Page: http://gondor.apana.org.au/~herbert/ > PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend - some performance measurements 2009-07-21 10:27 ` Michael S. Tsirkin @ 2009-07-21 11:05 ` Or Gerlitz 2009-07-21 12:01 ` Michael S. Tsirkin 0 siblings, 1 reply; 30+ messages in thread From: Or Gerlitz @ 2009-07-21 11:05 UTC (permalink / raw) To: Michael S. Tsirkin Cc: Mark McLoughlin, Herbert Xu, netdev, Dor Laor, qemu-devel, Jan Kiszka Michael S. Tsirkin wrote: > With raw sockets, send will block or fail if the TX queue for device is > full. With tap+bridge, the buffer in tap has to fill up instead, which > is not the same. I'm not sure this is the issue here, but could be: the > benchmark is UDP, isn't it? Michael, What/where is this tap buffer? we're talking on VM TX, so looking on tun_get_user I see a call to skb_copy_datagram_from_iovec() to copy from the user buffer to an skb, then a call to netif_rx_ni() and that's it... As for your question, indeed udp, the VM runs netperf/UDP_STREAM Or. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend - some performance measurements 2009-07-21 11:05 ` Or Gerlitz @ 2009-07-21 12:01 ` Michael S. Tsirkin 2009-07-21 12:14 ` Herbert Xu 0 siblings, 1 reply; 30+ messages in thread From: Michael S. Tsirkin @ 2009-07-21 12:01 UTC (permalink / raw) To: Or Gerlitz Cc: Mark McLoughlin, Herbert Xu, netdev, Dor Laor, qemu-devel, Jan Kiszka On Tue, Jul 21, 2009 at 02:05:32PM +0300, Or Gerlitz wrote: > Michael S. Tsirkin wrote: > > With raw sockets, send will block or fail if the TX queue for device is > > full. With tap+bridge, the buffer in tap has to fill up instead, which > > is not the same. I'm not sure this is the issue here, but could be: the > > benchmark is UDP, isn't it? > > Michael, > > What/where is this tap buffer? > we're talking on VM TX, so looking on tun_get_user I see a call to > skb_copy_datagram_from_iovec() to copy from the user buffer to an skb, then a call to netif_rx_ni() and that's it... As for your question, indeed udp, the VM runs netperf/UDP_STREAM > > Or. Queue is not the right word, sorry. I was referring to the fact that, when bridge floods a packet to multiple interfaces, it clones the skb and frees the original, which breaks the send buffer accounting in tun and might let you overrun the tx queue in one of the devices. This does not usually happen with raw sockets. This is the code in question: if (prev != NULL) { struct sk_buff *skb2; if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) { br->dev->stats.tx_dropped++; kfree_skb(skb); return; } __packet_hook(prev, skb2); } the thing to check then would be that some kind of misconfiguration does not cause the bridge to flood your packets to multiple interfaces. -- MST ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend - some performance measurements 2009-07-21 12:01 ` Michael S. Tsirkin @ 2009-07-21 12:14 ` Herbert Xu 2009-07-21 13:41 ` Or Gerlitz 0 siblings, 1 reply; 30+ messages in thread From: Herbert Xu @ 2009-07-21 12:14 UTC (permalink / raw) To: Michael S. Tsirkin Cc: Mark McLoughlin, netdev, Dor Laor, qemu-devel, Or Gerlitz, Jan Kiszka On Tue, Jul 21, 2009 at 03:01:42PM +0300, Michael S. Tsirkin wrote: > > the thing to check then would be that some kind of misconfiguration > does not cause the bridge to flood your packets to multiple interfaces. Right, we should make sure that the interfaces are not in promiscous mode. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend - some performance measurements 2009-07-21 12:14 ` Herbert Xu @ 2009-07-21 13:41 ` Or Gerlitz 0 siblings, 0 replies; 30+ messages in thread From: Or Gerlitz @ 2009-07-21 13:41 UTC (permalink / raw) To: Herbert Xu Cc: Mark McLoughlin, Michael S. Tsirkin, netdev, Dor Laor, qemu-devel, Jan Kiszka Herbert Xu wrote: > On Tue, Jul 21, 2009 at 03:01:42PM +0300, Michael S. Tsirkin wrote: >> the thing to check then would be that some kind of misconfiguration >> does not cause the bridge to flood your packets to multiple interfaces. > Right, we should make sure that the interfaces are not in promiscuous mode Michael, Herbert, First, I don't see how flooding can happen in my setup, I have only two interfaces on the bridge (see below), a tap and a NIC (vlan) and the bridge will never attempt to forward a packet through the port it was received on. Second, the bridge always sets all interfaces attached to it to be in promiscuous mode, see the call to dev_set_promiscuity() from br_add_if() but this doesn't mean it applies flooding, it does mac learning... Or. # brctl show bridge name bridge id STP enabled interfaces br0 8000.0030485f9977 no eth1.4009 tap0 The VM mac is de:ab:be:01:01:09 and the remote node mac is 00:30:48:65:a6:2b, you can see that these two macs were learned by the bridge and hence no flooding is expected. # brctl showmacs br0 port no mac addr is local? ageing timer 1 00:30:48:5f:99:77 yes 0.00 1 00:30:48:65:a6:2b no 12.50 2 06:f5:76:64:a0:d4 yes 0.00 2 de:ab:be:01:01:09 no 0.00 ^ permalink raw reply [flat|nested] 30+ messages in thread
[parent not found: <5b31733c0907011250i7afcdbcdnb844290de4ad64f2@mail.gmail.com>]
* Re: [Qemu-devel] [PATCH] net: add raw backend [not found] ` <5b31733c0907011250i7afcdbcdnb844290de4ad64f2@mail.gmail.com> @ 2009-07-02 12:08 ` Or Gerlitz 0 siblings, 0 replies; 30+ messages in thread From: Or Gerlitz @ 2009-07-02 12:08 UTC (permalink / raw) To: Filip Navara, qemu-devel Filip Navara wrote: > This doesn't compile on Win32. In fact, even if it compiled, it > wouldn't work. All the code should be part of the #ifndef _WIN32 block okay, will change that in the next revision of the patch Or. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-01 15:46 [Qemu-devel] [PATCH] net: add raw backend Or Gerlitz 2009-07-01 16:21 ` Jamie Lokier [not found] ` <5b31733c0907011250i7afcdbcdnb844290de4ad64f2@mail.gmail.com> @ 2009-07-02 15:43 ` Michael S. Tsirkin 2009-07-07 14:45 ` Or Gerlitz 2 siblings, 1 reply; 30+ messages in thread From: Michael S. Tsirkin @ 2009-07-02 15:43 UTC (permalink / raw) To: Or Gerlitz; +Cc: Herbert Xu, qemu-devel On Wed, Jul 01, 2009 at 06:46:43PM +0300, Or Gerlitz wrote: > Add raw network backend option which uses a packet socket to provide > raw networking access. Once the socket is opened its bouned to a > provided host interface, such that packets received on the interface > are delivered to the VM and packets sent by the VM are sent to the > interface. > > Signed-off-by: Or Gerlitz<ogerlitz@voltaire.com> Looks good to me overall. A couple of comments: > diff --git a/net.c b/net.c > index 55f70f2..f7ff381 100644 > --- a/net.c > +++ b/net.c > @@ -93,6 +93,9 @@ > #endif > #endif > > +#include <netpacket/packet.h> > +#include <net/ethernet.h> > + > #if defined(__OpenBSD__) > #include <util.h> > #endif > @@ -1476,6 +1479,155 @@ static TAPState *net_tap_init(VLANState *vlan, const char *model, > > #endif /* !_WIN32 */ > > +typedef struct RAWState { > + VLANClientState *vc; > + int fd; > + uint8_t buf[4096]; > + int promisc; > +} RAWState; > + > +static int net_raw_fd_init(Monitor *mon, const char *ifname, int promisc) > +{ > + int fd, ret; > + struct ifreq req; > + struct sockaddr_ll lladdr; > + > + fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); > + if (fd < 0) > + config_error(mon, "packet socket failed\n"); > + > + memset(&req, 0, sizeof(req)); > + strncpy(req.ifr_name, ifname, IFNAMSIZ-1); > + ret = ioctl(fd, SIOCGIFINDEX, &req); > + if (ret < 0) > + config_error(mon, "SIOCGIFINDEX failed\n"); > + > + memset(&lladdr, 0, sizeof(lladdr)); > + lladdr.sll_family = AF_PACKET; > + lladdr.sll_protocol = htons(ETH_P_ALL); > + 
lladdr.sll_ifindex = req.ifr_ifindex; > + ret = bind(fd, (const struct sockaddr *)&lladdr, sizeof(lladdr)); > + if (ret < 0) > + config_error(mon, "bind failed\n"); > + > + /* set iface to promiscuous mode (packets sent to the VM MAC) */ > + if (promisc) { > + ret = ioctl(fd, SIOCGIFFLAGS, &req); > + if (ret < 0) > + perror("SIOCGIFFLAGS failed\n"); > + req.ifr_flags |= IFF_PROMISC; > + ret = ioctl(fd, SIOCSIFFLAGS, &req); > + if (ret < 0) > + config_error(mon, "SIOCSIFFLAGS to promiscous failed\n"); > + } > + > + ret = fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_NONBLOCK); > + if (ret < 0) > + config_error(mon, "O_NONBLOCK set failed\n"); > + > + return fd; > +} > + > +static void raw_cleanup(VLANClientState *vc) > +{ > + struct ifreq req; > + RAWState *s = vc->opaque; > + > + qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL); > + if (s->promisc) { > + ioctl(s->fd, SIOCGIFFLAGS, &req); > + req.ifr_flags &= ~IFF_PROMISC; > + ioctl(s->fd, SIOCSIFFLAGS, &req); > + } > + close(s->fd); > + qemu_free(s); > +} > + > +static void raw_send(void *opaque); > + > +static int raw_can_send(void *opaque) > +{ > + RAWState *s = opaque; > + > + return qemu_can_send_packet(s->vc); > +} > + > +static void raw_send_completed(VLANClientState *vc, ssize_t len) > +{ > + RAWState *s = vc->opaque; > + > + qemu_set_fd_handler2(s->fd, raw_can_send, raw_send, NULL, s); > +} > + > +static void raw_send(void *opaque) > +{ > + RAWState *s = opaque; > + int size; > + > + do { > + size = recv(s->fd, s->buf, sizeof(s->buf), MSG_TRUNC); > + if (size <= 0) > + break; A couple of improvement suggestions here: - You might get size > sizeof(s->buf). Should not happen, but you might want to check for this condition and report it + discard the packet. - It might be a good idea to request aux data and verify that checksum is set, calculate it if not. this will make it possible to bind to a local device as well. 
> + > + size = qemu_send_packet_async(s->vc, s->buf, size, > + raw_send_completed); > + if (size == 0) > + qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL); > + > + } while (size > 0); > +} > + > +static ssize_t raw_receive_iov(VLANClientState *vc, const struct iovec *iov, > + int iovcnt) > +{ > + ssize_t len; > + RAWState *s = vc->opaque; > + > + do { > + len = writev(s->fd, iov, iovcnt); > + } while (len == -1 && (errno == EINTR || errno == EAGAIN)); > + > + return len; > +} > + > +static ssize_t raw_receive(VLANClientState *vc, const uint8_t *buf, size_t size) > +{ > + struct iovec iov[1]; > + > + iov[0].iov_base = (char *)buf; > + iov[0].iov_len = size; > + > + return raw_receive_iov(vc, iov, 1); > +} > + > +static int net_raw_init(Monitor *mon, VLANState *vlan, const char *model, > + const char *name, const char *ifname, > + int promisc, int fd) > +{ > + RAWState *s; > + > + s = qemu_mallocz(sizeof(RAWState)); > + > + if (fd == -1) { > + s->fd = net_raw_fd_init(mon, ifname, promisc); > + s->promisc = promisc; > + } else > + s->fd = fd; > + > + s->vc = qemu_new_vlan_client(vlan, model, name, NULL, raw_receive, > + raw_receive_iov, raw_cleanup, s); > + qemu_set_fd_handler2(s->fd, raw_can_send, raw_send, NULL, s); > + > + if (fd == -1) > + snprintf(s->vc->info_str, sizeof(s->vc->info_str), > + "raw: ifname=%s, promisc=%d", ifname, promisc); > + else > + snprintf(s->vc->info_str, sizeof(s->vc->info_str), > + "raw: fd=%d", fd); > + > + return 0; > +} > + > #if defined(CONFIG_VDE) > typedef struct VDEState { > VLANClientState *vc; > @@ -2348,6 +2500,41 @@ int net_client_init(Monitor *mon, const char *device, const char *p) > } > } else > #endif > + if (!strcmp(device, "raw")) { > + char chkbuf[64], ifname[64]; > + int raw_fd = -1; > + int promisc = 1; promisc = 0 might be a safer default. 
> + if (get_param_value(buf, sizeof(buf), "fd", p) > 0) { > + static const char * const fd_params[] = { > + "vlan", "name", "fd", NULL > + }; > + if (check_params(chkbuf, sizeof(chkbuf), fd_params, p) < 0) { > + config_error(mon, "invalid parameter '%s' in '%s'\n", chkbuf, p); > + ret = -1; > + goto out; > + } > + raw_fd = strtol(buf, NULL, 0); > + fcntl(raw_fd, F_SETFL, fcntl(raw_fd, F_GETFL | O_NONBLOCK)); > + } else { > + static const char * const tap_params[] = { > + "vlan", "name", "ifname", "promisc", NULL > + }; > + if (check_params(chkbuf, sizeof(chkbuf), tap_params, p) < 0) { > + config_error(mon, "invalid parameter '%s' in '%s'\n", chkbuf, p); > + ret = -1; > + goto out; > + } > + if (get_param_value(ifname, sizeof(ifname), "ifname", p) <= 0) { > + config_error(mon, "raw: no interface name\n"); > + ret = -1; > + goto out; > + } > + if (get_param_value(buf, sizeof(buf), "promisc", p)) > + promisc = atoi(buf); > + } > + vlan->nb_host_devs++; > + ret = net_raw_init(mon, vlan, device, name, ifname, promisc, raw_fd); > + } else > if (!strcmp(device, "socket")) { > char chkbuf[64]; > if (get_param_value(buf, sizeof(buf), "fd", p) > 0) { > diff --git a/qemu-options.hx b/qemu-options.hx > index 503da33..0a3c807 100644 > --- a/qemu-options.hx > +++ b/qemu-options.hx > @@ -761,6 +761,10 @@ DEF("net", HAS_ARG, QEMU_OPTION_net, > " use 'sndbuf=nbytes' to limit the size of the send buffer\n" > #endif > #endif > + "-net raw[,vlan=n][,name=str],ifname=name[,promisc=m]\n" > + " bound the host network interface to VLAN 'n' in a raw manner:\n" in a raw manner -> using a raw packet socket > + " packets received on the interface are delivered to the vlan and\n" > + " packets delivered on the vlan are sent to the interface\n" document promisc option? 
> "-net socket[,vlan=n][,name=str][,fd=h][,listen=[host]:port][,connect=host:port]\n" > " connect the vlan 'n' to another VLAN using a socket connection\n" > "-net socket[,vlan=n][,name=str][,fd=h][,mcast=maddr:port]\n" > ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-02 15:43 ` Michael S. Tsirkin @ 2009-07-07 14:45 ` Or Gerlitz 2009-07-07 14:49 ` Michael S. Tsirkin 0 siblings, 1 reply; 30+ messages in thread From: Or Gerlitz @ 2009-07-07 14:45 UTC (permalink / raw) To: Michael S. Tsirkin; +Cc: Herbert Xu, qemu-devel Michael S. Tsirkin wrote: >> +static void raw_send(void *opaque) >> + do { >> + size = recv(s->fd, s->buf, sizeof(s->buf), MSG_TRUNC); >> + if (size <= 0) >> + break; > A couple of improvement suggestions here: > - You might get size > sizeof(s->buf). Should not happen, but you might want to > check for this condition and report it + discard the packet. okay, will do > - It might be a good idea to request aux data and verify that checksum is set, > calculate it if not. this will make it possible to bind to a local device as well. thanks for the heads up, I am still not sure to follow the documentation/logic wrt to checksum reporting of the af_packet kernel code. I'll look on this. What do you mean by "this will make it possible to bind to a local device as well"? >> @@ -2348,6 +2500,41 @@ int net_client_init(Monitor *mon, const char *device, const char *p) >> + if (!strcmp(device, "raw")) { >> + int promisc = 1; > promisc = 0 might be a safer default I can do that, but I am not sure why its safer, e.g bridge always set all interfaces to promisc >> + "-net raw[,vlan=n][,name=str],ifname=name[,promisc=m]\n" >> + " bound the host network interface to VLAN 'n' in a raw manner:\n" > in a raw manner -> using a raw packet socket okay >> + " packets received on the interface are delivered to the vlan and\n" >> + " packets delivered on the vlan are sent to the interface\n" > document promisc option? sure Or. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-07 14:45 ` Or Gerlitz @ 2009-07-07 14:49 ` Michael S. Tsirkin 2009-07-08 14:46 ` Or Gerlitz 2009-07-08 15:06 ` Or Gerlitz 0 siblings, 2 replies; 30+ messages in thread From: Michael S. Tsirkin @ 2009-07-07 14:49 UTC (permalink / raw) To: Or Gerlitz; +Cc: Herbert Xu, qemu-devel On Tue, Jul 07, 2009 at 05:45:39PM +0300, Or Gerlitz wrote: > Michael S. Tsirkin wrote: > >> +static void raw_send(void *opaque) > >> + do { > >> + size = recv(s->fd, s->buf, sizeof(s->buf), MSG_TRUNC); > >> + if (size <= 0) > >> + break; > > > A couple of improvement suggestions here: > > - You might get size > sizeof(s->buf). Should not happen, but you might want to > > check for this condition and report it + discard the packet. > > okay, will do > > > - It might be a good idea to request aux data and verify that checksum is set, > > calculate it if not. this will make it possible to bind to a local device as well. > > thanks for the heads up, I am still not sure to follow the documentation/logic wrt to checksum reporting of the af_packet kernel code. I'll look on this. What do you mean by "this will make it possible to bind to a local device as well"? See comment in qemu-kvm about dhclient as an example. > >> @@ -2348,6 +2500,41 @@ int net_client_init(Monitor *mon, const char *device, const char *p) > >> + if (!strcmp(device, "raw")) { > >> + int promisc = 1; > > > promisc = 0 might be a safer default > > I can do that, but I am not sure why its safer, e.g bridge always set all interfaces to promisc Think of what happens if someone does kill -9 on qemu This can not happen with bridge. 
> >> + "-net raw[,vlan=n][,name=str],ifname=name[,promisc=m]\n" > >> + " bound the host network interface to VLAN 'n' in a raw manner:\n" > > > in a raw manner -> using a raw packet socket > > okay > > >> + " packets received on the interface are delivered to the vlan and\n" > >> + " packets delivered on the vlan are sent to the interface\n" > > > document promisc option? > > sure > > > Or. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-07 14:49 ` Michael S. Tsirkin @ 2009-07-08 14:46 ` Or Gerlitz 2009-07-08 15:06 ` Or Gerlitz 1 sibling, 0 replies; 30+ messages in thread From: Or Gerlitz @ 2009-07-08 14:46 UTC (permalink / raw) To: Michael S. Tsirkin; +Cc: Herbert Xu, qemu-devel Michael S. Tsirkin wrote: > On Tue, Jul 07, 2009 at 05:45:39PM +0300, Or Gerlitz wrote: > >> I am not sure why its safer, e.g bridge always set all interfaces to promisc >> > Think of what happens if someone does kill -9 on qemu This can not happen with bridge > yes, I can make the default to be promisc=0 Or. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [Qemu-devel] [PATCH] net: add raw backend 2009-07-07 14:49 ` Michael S. Tsirkin 2009-07-08 14:46 ` Or Gerlitz @ 2009-07-08 15:06 ` Or Gerlitz 1 sibling, 0 replies; 30+ messages in thread From: Or Gerlitz @ 2009-07-08 15:06 UTC (permalink / raw) To: Michael S. Tsirkin; +Cc: Herbert Xu, qemu-devel Michael S. Tsirkin wrote: > On Tue, Jul 07, 2009 at 05:45:39PM +0300, Or Gerlitz wrote: >> Michael S. Tsirkin wrote: >>> It might be a good idea to request aux data and verify that checksum is set, >>> calculate it if not. this will make it possible to bind to a local device as well. >> I am still not sure to follow the checksum reporting of the af_packet kernel code > See comment in qemu-kvm about dhclient as an example. yes, some sort of "host software checksum offload" when using a packet socket makes sense. I wasn't sure when the af_packet kernel code reports "checksum okay"; it only checks for "ip_summed == CHECKSUM_PARTIAL". Or. ^ permalink raw reply [flat|nested] 30+ messages in thread
end of thread, other threads:[~2009-07-21 13:41 UTC | newest] Thread overview: 30+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2009-07-01 15:46 [Qemu-devel] [PATCH] net: add raw backend Or Gerlitz 2009-07-01 16:21 ` Jamie Lokier 2009-07-02 12:25 ` Or Gerlitz 2009-07-03 2:39 ` Jamie Lokier 2009-07-07 13:33 ` Or Gerlitz 2009-07-07 14:57 ` Jamie Lokier 2009-07-08 14:45 ` Or Gerlitz 2009-07-14 13:54 ` Or Gerlitz 2009-07-15 20:38 ` Jamie Lokier 2009-07-15 21:06 ` Jan Kiszka 2009-07-15 21:52 ` Jamie Lokier 2009-07-16 8:29 ` Or Gerlitz 2009-07-20 14:13 ` [Qemu-devel] [PATCH] net: add raw backend - some performance measurements Or Gerlitz 2009-07-20 15:53 ` Herbert Xu 2009-07-20 18:20 ` Michael S. Tsirkin 2009-07-21 1:46 ` Herbert Xu 2009-07-21 7:03 ` Or Gerlitz 2009-07-21 7:25 ` Herbert Xu 2009-07-21 10:17 ` Or Gerlitz 2009-07-21 10:27 ` Michael S. Tsirkin 2009-07-21 11:05 ` Or Gerlitz 2009-07-21 12:01 ` Michael S. Tsirkin 2009-07-21 12:14 ` Herbert Xu 2009-07-21 13:41 ` Or Gerlitz [not found] ` <5b31733c0907011250i7afcdbcdnb844290de4ad64f2@mail.gmail.com> 2009-07-02 12:08 ` [Qemu-devel] [PATCH] net: add raw backend Or Gerlitz 2009-07-02 15:43 ` Michael S. Tsirkin 2009-07-07 14:45 ` Or Gerlitz 2009-07-07 14:49 ` Michael S. Tsirkin 2009-07-08 14:46 ` Or Gerlitz 2009-07-08 15:06 ` Or Gerlitz
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).