* Re: [PATCHv3 6/6] tun: experimental zero copy tx support [not found] ` <5ace2b9c3f15259cdf29af03f9231faac673e719.1338735323.git.mst@redhat.com> @ 2012-06-04 4:48 ` Jason Wang 0 siblings, 0 replies; 2+ messages in thread From: Jason Wang @ 2012-06-04 4:48 UTC (permalink / raw) To: Michael S. Tsirkin Cc: eric.dumazet, netdev, linux-kernel, ebiederm, davem, Ian Campbell On 05/13/2012 08:34 PM, Michael S. Tsirkin wrote: > Let vhost-net utilize zero copy tx when used with tun. > > Signed-off-by: Michael S. Tsirkin<mst@redhat.com> > --- > drivers/net/tun.c | 146 +++++++++++++++++++++++++++++++++++++++++++++++++----- > 1 file changed, 134 insertions(+), 12 deletions(-) > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index fe5cd2f3..74d7e5e 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -100,6 +100,8 @@ do { \ > } while (0) > #endif > > +#define GOODCOPY_LEN 128 > + > #define FLT_EXACT_COUNT 8 > struct tap_filter { > unsigned int count; /* Number of addrs. Zero means disabled */ > @@ -602,19 +604,100 @@ static struct sk_buff *tun_alloc_skb(struct tun_struct *tun, > return skb; > } > > +/* set skb frags from iovec, this can move to core network code for reuse */ > +static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, > + int offset, size_t count) > +{ > + int len = iov_length(from, count) - offset; > + int copy = skb_headlen(skb); > + int size, offset1 = 0; > + int i = 0; > + > + /* Skip over from offset */ > + while (count&& (offset>= from->iov_len)) { > + offset -= from->iov_len; > + ++from; > + --count; > + } > + > + /* copy up to skb headlen */ > + while (count&& (copy> 0)) { > + size = min_t(unsigned int, copy, from->iov_len - offset); > + if (copy_from_user(skb->data + offset1, from->iov_base + offset, > + size)) > + return -EFAULT; > + if (copy> size) { > + ++from; > + --count; > + offset = 0; > + } else > + offset += size; > + copy -= size; > + offset1 += size; > + } > + > + if (len == offset1) > + return 0; > 
+ > + while (count--) { > + struct page *page[MAX_SKB_FRAGS]; > + int num_pages; > + unsigned long base; > + unsigned long truesize; > + > + len = from->iov_len - offset; > + if (!len) { > + offset = 0; > + ++from; > + continue; > + } > + base = (unsigned long)from->iov_base + offset; > + size = ((base& ~PAGE_MASK) + len + ~PAGE_MASK)>> PAGE_SHIFT; > + if (i + size> MAX_SKB_FRAGS) > + return -EMSGSIZE; > + num_pages = get_user_pages_fast(base, size, 0,&page[i]); > + if (num_pages != size) { > + for (i = 0; i< num_pages; i++) > + put_page(page[i]); > + return -EFAULT; > + } > + truesize = size * PAGE_SIZE; > + skb->data_len += len; > + skb->len += len; > + skb->truesize += truesize; > + atomic_add(truesize,&skb->sk->sk_wmem_alloc); > + while (len) { > + int off = base& ~PAGE_MASK; > + int size = min_t(int, len, PAGE_SIZE - off); > + __skb_fill_page_desc(skb, i, page[i], off, size); > + skb_shinfo(skb)->nr_frags++; > + /* increase sk_wmem_alloc */ > + base += size; > + len -= size; > + i++; > + } > + offset = 0; > + ++from; > + } > + return 0; > +} > + > /* Get packet from user space buffer */ > -static ssize_t tun_get_user(struct tun_struct *tun, > - const struct iovec *iv, size_t count, > - int noblock) > +static ssize_t tun_get_user(struct tun_struct *tun, void *msg_control, > + const struct iovec *iv, size_t total_len, > + size_t count, int noblock) > { Looks like V2 uses count as the number of vectors and V3 correct this, so does V3 still have any issue during test? 
> struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; > struct sk_buff *skb; > - size_t len = count, align = NET_SKB_PAD; > + size_t len = total_len, align = NET_SKB_PAD; > struct virtio_net_hdr gso = { 0 }; > int offset = 0; > + int copylen; > + bool zerocopy = false; > + int err; > > if (!(tun->flags& TUN_NO_PI)) { > - if ((len -= sizeof(pi))> count) > + if ((len -= sizeof(pi))> total_len) > return -EINVAL; > > if (memcpy_fromiovecend((void *)&pi, iv, 0, sizeof(pi))) > @@ -623,7 +706,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, > } > > if (tun->flags& TUN_VNET_HDR) { > - if ((len -= tun->vnet_hdr_sz)> count) > + if ((len -= tun->vnet_hdr_sz)> total_len) > return -EINVAL; > > if (memcpy_fromiovecend((void *)&gso, iv, offset, sizeof(gso))) > @@ -645,14 +728,46 @@ static ssize_t tun_get_user(struct tun_struct *tun, > return -EINVAL; > } > Add a check of UIO_MAXIOV like macvtap? Other looks good to me. Thanks. > - skb = tun_alloc_skb(tun, align, len, gso.hdr_len, noblock); > + if (msg_control) > + zerocopy = true; > + > + if (zerocopy) { > + /* Userspace may produce vectors with count greater than > + * MAX_SKB_FRAGS, so we need to linearize parts of the skb > + * to let the rest of data to be fit in the frags. > + */ > + if (count> MAX_SKB_FRAGS) { > + copylen = iov_length(iv, count - MAX_SKB_FRAGS); > + if (copylen< offset) > + copylen = 0; > + else > + copylen -= offset; > + } else > + copylen = 0; > + /* There are 256 bytes to be copied in skb, so there is enough > + * room for skb expand head in case it is used. > + * The rest of the buffer is mapped from userspace. 
> + */ > + if (copylen< gso.hdr_len) > + copylen = gso.hdr_len; > + if (!copylen) > + copylen = GOODCOPY_LEN; > + } else > + copylen = len; > + > + skb = tun_alloc_skb(tun, align, copylen, gso.hdr_len, noblock); > if (IS_ERR(skb)) { > if (PTR_ERR(skb) != -EAGAIN) > tun->dev->stats.rx_dropped++; > return PTR_ERR(skb); > } > > - if (skb_copy_datagram_from_iovec(skb, 0, iv, offset, len)) { > + if (zerocopy) > + err = zerocopy_sg_from_iovec(skb, iv, offset, count); > + else > + err = skb_copy_datagram_from_iovec(skb, 0, iv, offset, len); > + > + if (err) { > tun->dev->stats.rx_dropped++; > kfree_skb(skb); > return -EFAULT; > @@ -726,12 +841,18 @@ static ssize_t tun_get_user(struct tun_struct *tun, > skb_shinfo(skb)->gso_segs = 0; > } > > + /* copy skb_ubuf_info for callback when skb has no error */ > + if (zerocopy) { > + skb_shinfo(skb)->destructor_arg = msg_control; > + skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; > + } > + > netif_rx_ni(skb); > > tun->dev->stats.rx_packets++; > tun->dev->stats.rx_bytes += len; > > - return count; > + return total_len; > } > > static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, > @@ -746,7 +867,7 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, > > tun_debug(KERN_INFO, tun, "tun_chr_write %ld\n", count); > > - result = tun_get_user(tun, iv, iov_length(iv, count), > + result = tun_get_user(tun, NULL, iv, iov_length(iv, count), count, > file->f_flags& O_NONBLOCK); > > tun_put(tun); > @@ -960,8 +1081,8 @@ static int tun_sendmsg(struct kiocb *iocb, struct socket *sock, > struct msghdr *m, size_t total_len) > { > struct tun_struct *tun = container_of(sock, struct tun_struct, socket); > - return tun_get_user(tun, m->msg_iov, total_len, > - m->msg_flags& MSG_DONTWAIT); > + return tun_get_user(tun, m->msg_control, m->msg_iov, total_len, > + m->msg_iovlen, m->msg_flags& MSG_DONTWAIT); > } > > static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, > @@ -1130,6 +1251,7 @@ 
static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) > sock_init_data(&tun->socket, sk); > sk->sk_write_space = tun_sock_write_space; > sk->sk_sndbuf = INT_MAX; > + sock_set_flag(sk, SOCK_ZEROCOPY); > > tun_sk(sk)->tun = tun; > ^ permalink raw reply [flat|nested] 2+ messages in thread
* [PATCHv3 0/6] tun zerocopy support @ 2012-07-20 19:23 Michael S. Tsirkin 2012-07-20 19:23 ` [PATCHv3 6/6] tun: experimental zero copy tx support Michael S. Tsirkin 0 siblings, 1 reply; 2+ messages in thread From: Michael S. Tsirkin @ 2012-07-20 19:23 UTC (permalink / raw) To: Michael S. Tsirkin Cc: Jason Wang, eric.dumazet, netdev, linux-kernel, ebiederm, davem This adds support for experimental zero copy transmit to tun. This includes some patches from Ian's patchset to support zerocopy with tun, so it should help that work progress: we are still trying to figure out how to make everything work properly with TCP, but tun seems easier, and it's helpful by itself since not everyone can use macvtap. Same as with macvtap, I get single-digit percentage wins in CPU utilization on guest to external from this patchset, and a performance regression on guest to host, so more work is needed until this feature can move out of experimental status, but I think it's useful for some people already. Please review and consider for 3.6. There's some code duplication between tun and macvtap now: common code could move to net/core/datagram.c; this patchset does not do this yet. Changes from v2: Fixed some bugs so it's stable now. Michael S. Tsirkin (6): skbuff: add an api to orphan frags skbuff: convert to skb_orphan_frags skbuff: export skb_copy_ubufs tun: orphan frags on xmit net: orphan frags on receive tun: experimental zero copy tx support drivers/net/tun.c | 148 +++++++++++++++++++++++++++++++++++++++++++++---- include/linux/skbuff.h | 16 ++++++ net/core/dev.c | 7 ++- net/core/skbuff.c | 24 +++----- 4 files changed, 167 insertions(+), 28 deletions(-) -- MST ^ permalink raw reply [flat|nested] 2+ messages in thread
* [PATCHv3 6/6] tun: experimental zero copy tx support 2012-07-20 19:23 [PATCHv3 0/6] tun zerocopy support Michael S. Tsirkin @ 2012-07-20 19:23 ` Michael S. Tsirkin 0 siblings, 0 replies; 2+ messages in thread From: Michael S. Tsirkin @ 2012-07-20 19:23 UTC (permalink / raw) To: Michael S. Tsirkin Cc: Jason Wang, eric.dumazet, netdev, linux-kernel, ebiederm, davem Let vhost-net utilize zero copy tx when used with tun. Signed-off-by: Michael S. Tsirkin <mst@redhat.com> --- drivers/net/tun.c | 146 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 134 insertions(+), 12 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index b95a7f4..c62163e 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -100,6 +100,8 @@ do { \ } while (0) #endif +#define GOODCOPY_LEN 128 + #define FLT_EXACT_COUNT 8 struct tap_filter { unsigned int count; /* Number of addrs. Zero means disabled */ @@ -604,19 +606,100 @@ static struct sk_buff *tun_alloc_skb(struct tun_struct *tun, return skb; } +/* set skb frags from iovec, this can move to core network code for reuse */ +static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, + int offset, size_t count) +{ + int len = iov_length(from, count) - offset; + int copy = skb_headlen(skb); + int size, offset1 = 0; + int i = 0; + + /* Skip over from offset */ + while (count && (offset >= from->iov_len)) { + offset -= from->iov_len; + ++from; + --count; + } + + /* copy up to skb headlen */ + while (count && (copy > 0)) { + size = min_t(unsigned int, copy, from->iov_len - offset); + if (copy_from_user(skb->data + offset1, from->iov_base + offset, + size)) + return -EFAULT; + if (copy > size) { + ++from; + --count; + offset = 0; + } else + offset += size; + copy -= size; + offset1 += size; + } + + if (len == offset1) + return 0; + + while (count--) { + struct page *page[MAX_SKB_FRAGS]; + int num_pages; + unsigned long base; + unsigned long truesize; + + len = from->iov_len - offset; + if 
(!len) { + offset = 0; + ++from; + continue; + } + base = (unsigned long)from->iov_base + offset; + size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; + if (i + size > MAX_SKB_FRAGS) + return -EMSGSIZE; + num_pages = get_user_pages_fast(base, size, 0, &page[i]); + if (num_pages != size) { + for (i = 0; i < num_pages; i++) + put_page(page[i]); + return -EFAULT; + } + truesize = size * PAGE_SIZE; + skb->data_len += len; + skb->len += len; + skb->truesize += truesize; + atomic_add(truesize, &skb->sk->sk_wmem_alloc); + while (len) { + int off = base & ~PAGE_MASK; + int size = min_t(int, len, PAGE_SIZE - off); + __skb_fill_page_desc(skb, i, page[i], off, size); + skb_shinfo(skb)->nr_frags++; + /* increase sk_wmem_alloc */ + base += size; + len -= size; + i++; + } + offset = 0; + ++from; + } + return 0; +} + /* Get packet from user space buffer */ -static ssize_t tun_get_user(struct tun_struct *tun, - const struct iovec *iv, size_t count, - int noblock) +static ssize_t tun_get_user(struct tun_struct *tun, void *msg_control, + const struct iovec *iv, size_t total_len, + size_t count, int noblock) { struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; struct sk_buff *skb; - size_t len = count, align = NET_SKB_PAD; + size_t len = total_len, align = NET_SKB_PAD; struct virtio_net_hdr gso = { 0 }; int offset = 0; + int copylen; + bool zerocopy = false; + int err; if (!(tun->flags & TUN_NO_PI)) { - if ((len -= sizeof(pi)) > count) + if ((len -= sizeof(pi)) > total_len) return -EINVAL; if (memcpy_fromiovecend((void *)&pi, iv, 0, sizeof(pi))) @@ -625,7 +708,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, } if (tun->flags & TUN_VNET_HDR) { - if ((len -= tun->vnet_hdr_sz) > count) + if ((len -= tun->vnet_hdr_sz) > total_len) return -EINVAL; if (memcpy_fromiovecend((void *)&gso, iv, offset, sizeof(gso))) @@ -647,14 +730,46 @@ static ssize_t tun_get_user(struct tun_struct *tun, return -EINVAL; } - skb = tun_alloc_skb(tun, align, len, gso.hdr_len, noblock); + if 
(msg_control) + zerocopy = true; + + if (zerocopy) { + /* Userspace may produce vectors with count greater than + * MAX_SKB_FRAGS, so we need to linearize parts of the skb + * to let the rest of data to be fit in the frags. + */ + if (count > MAX_SKB_FRAGS) { + copylen = iov_length(iv, count - MAX_SKB_FRAGS); + if (copylen < offset) + copylen = 0; + else + copylen -= offset; + } else + copylen = 0; + /* There are 256 bytes to be copied in skb, so there is enough + * room for skb expand head in case it is used. + * The rest of the buffer is mapped from userspace. + */ + if (copylen < gso.hdr_len) + copylen = gso.hdr_len; + if (!copylen) + copylen = GOODCOPY_LEN; + } else + copylen = len; + + skb = tun_alloc_skb(tun, align, copylen, gso.hdr_len, noblock); if (IS_ERR(skb)) { if (PTR_ERR(skb) != -EAGAIN) tun->dev->stats.rx_dropped++; return PTR_ERR(skb); } - if (skb_copy_datagram_from_iovec(skb, 0, iv, offset, len)) { + if (zerocopy) + err = zerocopy_sg_from_iovec(skb, iv, offset, count); + else + err = skb_copy_datagram_from_iovec(skb, 0, iv, offset, len); + + if (err) { tun->dev->stats.rx_dropped++; kfree_skb(skb); return -EFAULT; @@ -728,12 +843,18 @@ static ssize_t tun_get_user(struct tun_struct *tun, skb_shinfo(skb)->gso_segs = 0; } + /* copy skb_ubuf_info for callback when skb has no error */ + if (zerocopy) { + skb_shinfo(skb)->destructor_arg = msg_control; + skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; + } + netif_rx_ni(skb); tun->dev->stats.rx_packets++; tun->dev->stats.rx_bytes += len; - return count; + return total_len; } static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, @@ -748,7 +869,7 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, tun_debug(KERN_INFO, tun, "tun_chr_write %ld\n", count); - result = tun_get_user(tun, iv, iov_length(iv, count), + result = tun_get_user(tun, NULL, iv, iov_length(iv, count), count, file->f_flags & O_NONBLOCK); tun_put(tun); @@ -962,8 +1083,8 @@ static int 
tun_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, size_t total_len) { struct tun_struct *tun = container_of(sock, struct tun_struct, socket); - return tun_get_user(tun, m->msg_iov, total_len, - m->msg_flags & MSG_DONTWAIT); + return tun_get_user(tun, m->msg_control, m->msg_iov, total_len, + m->msg_iovlen, m->msg_flags & MSG_DONTWAIT); } static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, @@ -1133,6 +1254,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) sock_init_data(&tun->socket, sk); sk->sk_write_space = tun_sock_write_space; sk->sk_sndbuf = INT_MAX; + sock_set_flag(sk, SOCK_ZEROCOPY); tun_sk(sk)->tun = tun; -- MST ^ permalink raw reply related [flat|nested] 2+ messages in thread
end of thread, other threads:[~2012-07-20 19:23 UTC | newest] Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- [not found] <cover.1338735323.git.mst@redhat.com> [not found] ` <5ace2b9c3f15259cdf29af03f9231faac673e719.1338735323.git.mst@redhat.com> 2012-06-04 4:48 ` [PATCHv3 6/6] tun: experimental zero copy tx support Jason Wang 2012-07-20 19:23 [PATCHv3 0/6] tun zerocopy support Michael S. Tsirkin 2012-07-20 19:23 ` [PATCHv3 6/6] tun: experimental zero copy tx support Michael S. Tsirkin
This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox, as well as URLs for NNTP newsgroup(s).