* [PATCHv2-RFC 2/6] skbuff: convert to skb_orphan_frags
2012-05-16 21:15 [PATCHv2-RFC 0/6] tun zerocopy support Michael S. Tsirkin
2012-05-16 21:15 ` [PATCHv2-RFC 1/6] skbuff: add an api to orphan frags Michael S. Tsirkin
@ 2012-05-16 21:16 ` Michael S. Tsirkin
2012-05-16 21:16 ` [PATCHv2-RFC 3/6] skbuff: export skb_copy_ubufs Michael S. Tsirkin
` (3 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Michael S. Tsirkin @ 2012-05-16 21:16 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Jason Wang, eric.dumazet, netdev, linux-kernel, ebiederm, davem
Reduce code duplication a bit using the new helper.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
net/core/skbuff.c | 22 ++++++++--------------
1 files changed, 8 insertions(+), 14 deletions(-)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2a18719..596e392 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -761,10 +761,8 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
struct sk_buff *n;
- if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
- if (skb_copy_ubufs(skb, gfp_mask))
- return NULL;
- }
+ if (skb_orphan_frags(skb, gfp_mask))
+ return NULL;
n = skb + 1;
if (skb->fclone == SKB_FCLONE_ORIG &&
@@ -884,12 +882,10 @@ struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask)
if (skb_shinfo(skb)->nr_frags) {
int i;
- if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
- if (skb_copy_ubufs(skb, gfp_mask)) {
- kfree_skb(n);
- n = NULL;
- goto out;
- }
+ if (skb_orphan_frags(skb, gfp_mask)) {
+ kfree_skb(n);
+ n = NULL;
+ goto out;
}
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
@@ -962,10 +958,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
*/
if (skb_cloned(skb)) {
/* copy this zero copy skb frags */
- if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
- if (skb_copy_ubufs(skb, gfp_mask))
- goto nofrags;
- }
+ if (skb_orphan_frags(skb, gfp_mask))
+ goto nofrags;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
skb_frag_ref(skb, i);
--
MST
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCHv2-RFC 6/6] tun: experimental zero copy tx support
2012-05-16 21:15 [PATCHv2-RFC 0/6] tun zerocopy support Michael S. Tsirkin
` (4 preceding siblings ...)
2012-05-16 21:16 ` [PATCHv2-RFC 5/6] net: orphan frags on receive Michael S. Tsirkin
@ 2012-05-16 21:16 ` Michael S. Tsirkin
5 siblings, 0 replies; 7+ messages in thread
From: Michael S. Tsirkin @ 2012-05-16 21:16 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Jason Wang, eric.dumazet, netdev, linux-kernel, ebiederm, davem
Let vhost-net utilize zero copy tx when used with tun.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
drivers/net/tun.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++--
1 files changed, 127 insertions(+), 5 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index fe5cd2f3..c4459e0 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -100,6 +100,8 @@ do { \
} while (0)
#endif
+#define GOODCOPY_LEN 128
+
#define FLT_EXACT_COUNT 8
struct tap_filter {
unsigned int count; /* Number of addrs. Zero means disabled */
@@ -602,8 +604,86 @@ static struct sk_buff *tun_alloc_skb(struct tun_struct *tun,
return skb;
}
+/* set skb frags from iovec, this can move to core network code for reuse */
+static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
+ int offset, size_t count)
+{
+ int len = iov_length(from, count) - offset;
+ int copy = skb_headlen(skb);
+ int size, offset1 = 0;
+ int i = 0;
+
+ /* Skip over from offset */
+ while (count && (offset >= from->iov_len)) {
+ offset -= from->iov_len;
+ ++from;
+ --count;
+ }
+
+ /* copy up to skb headlen */
+ while (count && (copy > 0)) {
+ size = min_t(unsigned int, copy, from->iov_len - offset);
+ if (copy_from_user(skb->data + offset1, from->iov_base + offset,
+ size))
+ return -EFAULT;
+ if (copy > size) {
+ ++from;
+ --count;
+ offset = 0;
+ } else
+ offset += size;
+ copy -= size;
+ offset1 += size;
+ }
+
+ if (len == offset1)
+ return 0;
+
+ while (count--) {
+ struct page *page[MAX_SKB_FRAGS];
+ int num_pages;
+ unsigned long base;
+ unsigned long truesize;
+
+ len = from->iov_len - offset;
+ if (!len) {
+ offset = 0;
+ ++from;
+ continue;
+ }
+ base = (unsigned long)from->iov_base + offset;
+ size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+ if (i + size > MAX_SKB_FRAGS)
+ return -EMSGSIZE;
+ num_pages = get_user_pages_fast(base, size, 0, &page[i]);
+ if (num_pages != size) {
+ for (i = 0; i < num_pages; i++)
+ put_page(page[i]);
+ return -EFAULT;
+ }
+ truesize = size * PAGE_SIZE;
+ skb->data_len += len;
+ skb->len += len;
+ skb->truesize += truesize;
+ atomic_add(truesize, &skb->sk->sk_wmem_alloc);
+ while (len) {
+ int off = base & ~PAGE_MASK;
+ int size = min_t(int, len, PAGE_SIZE - off);
+ __skb_fill_page_desc(skb, i, page[i], off, size);
+ skb_shinfo(skb)->nr_frags++;
+ /* increase sk_wmem_alloc */
+ base += size;
+ len -= size;
+ i++;
+ }
+ offset = 0;
+ ++from;
+ }
+ return 0;
+}
+
/* Get packet from user space buffer */
-static ssize_t tun_get_user(struct tun_struct *tun,
+static ssize_t tun_get_user(struct tun_struct *tun, void *msg_control,
const struct iovec *iv, size_t count,
int noblock)
{
@@ -612,6 +692,9 @@ static ssize_t tun_get_user(struct tun_struct *tun,
size_t len = count, align = NET_SKB_PAD;
struct virtio_net_hdr gso = { 0 };
int offset = 0;
+ int copylen;
+ bool zerocopy = false;
+ int err;
if (!(tun->flags & TUN_NO_PI)) {
if ((len -= sizeof(pi)) > count)
@@ -645,14 +728,46 @@ static ssize_t tun_get_user(struct tun_struct *tun,
return -EINVAL;
}
- skb = tun_alloc_skb(tun, align, len, gso.hdr_len, noblock);
+ if (msg_control)
+ zerocopy = true;
+
+ if (zerocopy) {
+ /* Userspace may produce vectors with count greater than
+ * MAX_SKB_FRAGS, so we need to linearize parts of the skb
+ * to let the rest of data to be fit in the frags.
+ */
+ if (count > MAX_SKB_FRAGS) {
+ copylen = iov_length(iv, count - MAX_SKB_FRAGS);
+ if (copylen < offset)
+ copylen = 0;
+ else
+ copylen -= offset;
+ } else
+ copylen = 0;
+ /* There are 256 bytes to be copied in skb, so there is enough
+ * room for skb expand head in case it is used.
+ * The rest of the buffer is mapped from userspace.
+ */
+ if (copylen < gso.hdr_len)
+ copylen = gso.hdr_len;
+ if (!copylen)
+ copylen = GOODCOPY_LEN;
+ } else
+ copylen = len;
+
+ skb = tun_alloc_skb(tun, align, copylen, gso.hdr_len, noblock);
if (IS_ERR(skb)) {
if (PTR_ERR(skb) != -EAGAIN)
tun->dev->stats.rx_dropped++;
return PTR_ERR(skb);
}
- if (skb_copy_datagram_from_iovec(skb, 0, iv, offset, len)) {
+ if (zerocopy)
+ err = zerocopy_sg_from_iovec(skb, iv, offset, count);
+ else
+ err = skb_copy_datagram_from_iovec(skb, 0, iv, offset, len);
+
+ if (err) {
tun->dev->stats.rx_dropped++;
kfree_skb(skb);
return -EFAULT;
@@ -726,6 +841,12 @@ static ssize_t tun_get_user(struct tun_struct *tun,
skb_shinfo(skb)->gso_segs = 0;
}
+ /* copy skb_ubuf_info for callback when skb has no error */
+ if (zerocopy) {
+ skb_shinfo(skb)->destructor_arg = msg_control;
+ skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+ }
+
netif_rx_ni(skb);
tun->dev->stats.rx_packets++;
@@ -746,7 +867,7 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
tun_debug(KERN_INFO, tun, "tun_chr_write %ld\n", count);
- result = tun_get_user(tun, iv, iov_length(iv, count),
+ result = tun_get_user(tun, NULL, iv, iov_length(iv, count),
file->f_flags & O_NONBLOCK);
tun_put(tun);
@@ -960,7 +1081,7 @@ static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
struct msghdr *m, size_t total_len)
{
struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
- return tun_get_user(tun, m->msg_iov, total_len,
+ return tun_get_user(tun, m->msg_control, m->msg_iov, total_len,
m->msg_flags & MSG_DONTWAIT);
}
@@ -1130,6 +1251,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
sock_init_data(&tun->socket, sk);
sk->sk_write_space = tun_sock_write_space;
sk->sk_sndbuf = INT_MAX;
+ sock_set_flag(sk, SOCK_ZEROCOPY);
tun_sk(sk)->tun = tun;
--
MST
^ permalink raw reply related [flat|nested] 7+ messages in thread