* [RFC PATCH net-next 05/12] vhost_net: batch update used ring for datacopy TX
From: Jason Wang @ 2018-05-21 9:04 UTC (permalink / raw)
To: mst, jasowang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <1526893473-20128-1-git-send-email-jasowang@redhat.com>
Like commit e2b3b35eb989 ("vhost_net: batch used ring update in rx"),
this patches implements batch used ring update for datacopy TX
(zerocopy has already done some kind of batching).
Testpmd transmission from guest to ixgbe via XDP_REDIRECT shows about
15% improvement from 2.8Mpps to 3.2Mpps.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/net.c | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 4682fcc..f0639d7 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -511,6 +511,7 @@ static void handle_tx_copy(struct vhost_net *net)
struct socket *sock;
struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
int sent_pkts = 0;
+ s16 nheads = 0;
mutex_lock(&vq->mutex);
sock = vq->private_data;
@@ -550,6 +551,9 @@ static void handle_tx_copy(struct vhost_net *net)
if (len < 0)
break;
+ vq->heads[nheads].id = cpu_to_vhost32(vq, head);
+ vq->heads[nheads].len = 0;
+
total_len += len;
if (total_len < VHOST_NET_WEIGHT &&
vhost_has_more_pkts(net, vq)) {
@@ -568,13 +572,20 @@ static void handle_tx_copy(struct vhost_net *net)
if (err != len)
pr_debug("Truncated TX packet: "
" len %d != %zd\n", err, len);
- vhost_add_used_and_signal(&net->dev, vq, head, 0);
+ if (++nheads == VHOST_RX_BATCH) {
+ vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
+ nheads);
+ nheads = 0;
+ }
if (vhost_exceeds_weight(++sent_pkts, total_len)) {
vhost_poll_queue(&vq->poll);
break;
}
}
out:
+ if (nheads)
+ vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
+ nheads);
mutex_unlock(&vq->mutex);
}
--
2.7.4
^ permalink raw reply related
* [RFC PATCH net-next 06/12] tuntap: enable premmption early
From: Jason Wang @ 2018-05-21 9:04 UTC (permalink / raw)
To: mst, jasowang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <1526893473-20128-1-git-send-email-jasowang@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/tun.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 44d4f3d..24ecd82 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1697,6 +1697,8 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
goto err_xdp;
}
}
+ rcu_read_unlock();
+ preempt_enable();
skb = build_skb(buf, buflen);
if (!skb) {
@@ -1710,9 +1712,6 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
get_page(alloc_frag->page);
alloc_frag->offset += buflen;
- rcu_read_unlock();
- preempt_enable();
-
return skb;
err_redirect:
--
2.7.4
^ permalink raw reply related
* [RFC PATCH net-next 07/12] tuntap: simplify error handling in tun_build_skb()
From: Jason Wang @ 2018-05-21 9:04 UTC (permalink / raw)
To: mst, jasowang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <1526893473-20128-1-git-send-email-jasowang@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/tun.c | 36 ++++++++++++++----------------------
1 file changed, 14 insertions(+), 22 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 24ecd82..f6e0f96 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1612,7 +1612,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
int len, int *skb_xdp)
{
struct page_frag *alloc_frag = ¤t->task_frag;
- struct sk_buff *skb;
+ struct sk_buff *skb = NULL;
struct bpf_prog *xdp_prog;
int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
unsigned int delta = 0;
@@ -1638,6 +1638,9 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
if (copied != len)
return ERR_PTR(-EFAULT);
+ get_page(alloc_frag->page);
+ alloc_frag->offset += buflen;
+
/* There's a small window that XDP may be set after the check
* of xdp_prog above, this should be rare and for simplicity
* we do XDP on skb in case the headroom is not enough.
@@ -1665,24 +1668,16 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
switch (act) {
case XDP_REDIRECT:
- get_page(alloc_frag->page);
- alloc_frag->offset += buflen;
err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
xdp_do_flush_map();
if (err)
- goto err_redirect;
- rcu_read_unlock();
- preempt_enable();
- return NULL;
+ goto err_xdp;
+ goto out;
case XDP_TX:
- get_page(alloc_frag->page);
- alloc_frag->offset += buflen;
if (tun_xdp_tx(tun->dev, &xdp))
- goto err_redirect;
+ goto err_xdp;
tun_xdp_flush(tun->dev);
- rcu_read_unlock();
- preempt_enable();
- return NULL;
+ goto out;
case XDP_PASS:
delta = orig_data - xdp.data;
len = xdp.data_end - xdp.data;
@@ -1702,25 +1697,22 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
skb = build_skb(buf, buflen);
if (!skb) {
- rcu_read_unlock();
- preempt_enable();
- return ERR_PTR(-ENOMEM);
+ skb = ERR_PTR(-ENOMEM);
+ goto out;
}
skb_reserve(skb, pad - delta);
skb_put(skb, len);
- get_page(alloc_frag->page);
- alloc_frag->offset += buflen;
return skb;
-err_redirect:
- put_page(alloc_frag->page);
err_xdp:
+ alloc_frag->offset -= buflen;
+ put_page(alloc_frag->page);
+out:
rcu_read_unlock();
preempt_enable();
- this_cpu_inc(tun->pcpu_stats->rx_dropped);
- return NULL;
+ return skb;
}
/* Get packet from user space buffer */
--
2.7.4
^ permalink raw reply related
* [RFC PATCH net-next 08/12] tuntap: tweak on the path of non-xdp case in tun_build_skb()
From: Jason Wang @ 2018-05-21 9:04 UTC (permalink / raw)
To: mst, jasowang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <1526893473-20128-1-git-send-email-jasowang@redhat.com>
If we're sure not to go native XDP, there's no need for several things
like touching preemption counter and rcu stuffs.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/tun.c | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index f6e0f96..ed04291 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1645,10 +1645,12 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
* of xdp_prog above, this should be rare and for simplicity
* we do XDP on skb in case the headroom is not enough.
*/
- if (hdr->gso_type || !xdp_prog)
+ if (hdr->gso_type || !xdp_prog) {
*skb_xdp = 1;
- else
- *skb_xdp = 0;
+ goto build;
+ }
+
+ *skb_xdp = 0;
preempt_disable();
rcu_read_lock();
@@ -1695,6 +1697,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
rcu_read_unlock();
preempt_enable();
+build:
skb = build_skb(buf, buflen);
if (!skb) {
skb = ERR_PTR(-ENOMEM);
--
2.7.4
^ permalink raw reply related
* [RFC PATCH net-next 09/12] tuntap: split out XDP logic
From: Jason Wang @ 2018-05-21 9:04 UTC (permalink / raw)
To: mst, jasowang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <1526893473-20128-1-git-send-email-jasowang@redhat.com>
This patch split out XDP logic into a single function. This make it to
be reused by XDP batching path in the following patch.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/tun.c | 85 +++++++++++++++++++++++++++++++++----------------------
1 file changed, 51 insertions(+), 34 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index ed04291..2560378 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1605,6 +1605,44 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
return true;
}
+static u32 tun_do_xdp(struct tun_struct *tun,
+ struct tun_file *tfile,
+ struct bpf_prog *xdp_prog,
+ struct xdp_buff *xdp,
+ int *err)
+{
+ u32 act = bpf_prog_run_xdp(xdp_prog, xdp);
+
+ switch (act) {
+ case XDP_REDIRECT:
+ *err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
+ xdp_do_flush_map();
+ if (*err)
+ break;
+ goto out;
+ case XDP_TX:
+ *err = tun_xdp_tx(tun->dev, xdp);
+ if (*err)
+ break;
+ tun_xdp_flush(tun->dev);
+ goto out;
+ case XDP_PASS:
+ goto out;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ /* fall through */
+ case XDP_ABORTED:
+ trace_xdp_exception(tun->dev, xdp_prog, act);
+ /* fall through */
+ case XDP_DROP:
+ break;
+ }
+
+ put_page(virt_to_head_page(xdp->data_hard_start));
+out:
+ return act;
+}
+
static struct sk_buff *tun_build_skb(struct tun_struct *tun,
struct tun_file *tfile,
struct iov_iter *from,
@@ -1615,10 +1653,10 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
struct sk_buff *skb = NULL;
struct bpf_prog *xdp_prog;
int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- unsigned int delta = 0;
char *buf;
size_t copied;
- int err, pad = TUN_RX_PAD;
+ int pad = TUN_RX_PAD;
+ int err = 0;
rcu_read_lock();
xdp_prog = rcu_dereference(tun->xdp_prog);
@@ -1655,9 +1693,8 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
preempt_disable();
rcu_read_lock();
xdp_prog = rcu_dereference(tun->xdp_prog);
- if (xdp_prog && !*skb_xdp) {
+ if (xdp_prog) {
struct xdp_buff xdp;
- void *orig_data;
u32 act;
xdp.data_hard_start = buf;
@@ -1665,34 +1702,14 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + len;
xdp.rxq = &tfile->xdp_rxq;
- orig_data = xdp.data;
- act = bpf_prog_run_xdp(xdp_prog, &xdp);
-
- switch (act) {
- case XDP_REDIRECT:
- err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
- xdp_do_flush_map();
- if (err)
- goto err_xdp;
- goto out;
- case XDP_TX:
- if (tun_xdp_tx(tun->dev, &xdp))
- goto err_xdp;
- tun_xdp_flush(tun->dev);
- goto out;
- case XDP_PASS:
- delta = orig_data - xdp.data;
- len = xdp.data_end - xdp.data;
- break;
- default:
- bpf_warn_invalid_xdp_action(act);
- /* fall through */
- case XDP_ABORTED:
- trace_xdp_exception(tun->dev, xdp_prog, act);
- /* fall through */
- case XDP_DROP:
+ act = tun_do_xdp(tun, tfile, xdp_prog, &xdp, &err);
+ if (err)
goto err_xdp;
- }
+ if (act != XDP_PASS)
+ goto out;
+
+ pad = xdp.data - xdp.data_hard_start;
+ len = xdp.data_end - xdp.data;
}
rcu_read_unlock();
preempt_enable();
@@ -1700,18 +1717,18 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
build:
skb = build_skb(buf, buflen);
if (!skb) {
+ put_page(alloc_frag->page);
skb = ERR_PTR(-ENOMEM);
goto out;
}
- skb_reserve(skb, pad - delta);
+ skb_reserve(skb, pad);
skb_put(skb, len);
return skb;
err_xdp:
- alloc_frag->offset -= buflen;
- put_page(alloc_frag->page);
+ this_cpu_inc(tun->pcpu_stats->rx_dropped);
out:
rcu_read_unlock();
preempt_enable();
--
2.7.4
^ permalink raw reply related
* [RFC PATCH net-next 10/12] vhost_net: build xdp buff
From: Jason Wang @ 2018-05-21 9:04 UTC (permalink / raw)
To: mst, jasowang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <1526893473-20128-1-git-send-email-jasowang@redhat.com>
This patch implement build XDP buffers in vhost_net. The idea is do
userspace copy in vhost_net and build XDP buff based on the
page. Vhost_net can then submit one or an array of XDP buffs to
underlayer socket (e.g TUN). TUN can choose to do XDP or call
build_skb() to build skb. To support build skb, vnet header were also
stored into the header of the XDP buff.
This userspace copy and XDP buffs building is key to achieve XDP
batching in TUN, since TUN does not need to care about userspace copy
and then can disable premmption for several XDP buffs to achieve
batching from XDP.
TODO: reserve headroom based on the TUN XDP.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/net.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 74 insertions(+)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f0639d7..1209e84 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -492,6 +492,80 @@ static bool vhost_has_more_pkts(struct vhost_net *net,
likely(!vhost_exceeds_maxpend(net));
}
+#define VHOST_NET_HEADROOM 256
+#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
+
+static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
+ struct iov_iter *from,
+ struct xdp_buff *xdp)
+{
+ struct vhost_virtqueue *vq = &nvq->vq;
+ struct page_frag *alloc_frag = ¤t->task_frag;
+ struct virtio_net_hdr *gso;
+ size_t len = iov_iter_count(from);
+ int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + VHOST_NET_HEADROOM
+ + nvq->sock_hlen);
+ int sock_hlen = nvq->sock_hlen;
+ void *buf;
+ int copied;
+
+ if (len < nvq->sock_hlen)
+ return -EFAULT;
+
+ if (SKB_DATA_ALIGN(len + pad) +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
+ return -ENOSPC;
+
+ buflen += SKB_DATA_ALIGN(len + pad);
+ alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
+ if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
+ return -ENOMEM;
+
+ buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+
+ /* We store two kinds of metadata in the header which will be
+ * used for XDP_PASS to do build_skb():
+ * offset 0: buflen
+ * offset sizeof(int): vnet header
+ */
+ copied = copy_page_from_iter(alloc_frag->page,
+ alloc_frag->offset + sizeof(int), sock_hlen, from);
+ if (copied != sock_hlen)
+ return -EFAULT;
+
+ gso = (struct virtio_net_hdr *)(buf + sizeof(int));
+
+ if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
+ vhost16_to_cpu(vq, gso->csum_start) +
+ vhost16_to_cpu(vq, gso->csum_offset) + 2 >
+ vhost16_to_cpu(vq, gso->hdr_len)) {
+ gso->hdr_len = cpu_to_vhost16(vq,
+ vhost16_to_cpu(vq, gso->csum_start) +
+ vhost16_to_cpu(vq, gso->csum_offset) + 2);
+
+ if (vhost16_to_cpu(vq, gso->hdr_len) > len)
+ return -EINVAL;
+ }
+
+ len -= sock_hlen;
+ copied = copy_page_from_iter(alloc_frag->page,
+ alloc_frag->offset + pad,
+ len, from);
+ if (copied != len)
+ return -EFAULT;
+
+ xdp->data_hard_start = buf;
+ xdp->data = buf + pad;
+ xdp->data_end = xdp->data + len;
+ *(int *)(xdp->data_hard_start)= buflen;
+
+ get_page(alloc_frag->page);
+ alloc_frag->offset += buflen;
+
+ return 0;
+}
+
static void handle_tx_copy(struct vhost_net *net)
{
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
--
2.7.4
^ permalink raw reply related
* [RFC PATCH net-next 11/12] vhost_net: passing raw xdp buff to tun
From: Jason Wang @ 2018-05-21 9:04 UTC (permalink / raw)
To: mst, jasowang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <1526893473-20128-1-git-send-email-jasowang@redhat.com>
This patches implement a TUN specific msg_control:
#define TUN_MSG_UBUF 1
#define TUN_MSG_PTR 2
struct tun_msg_ctl {
int type;
void *ptr;
};
The first supported type is ubuf which is already used by vhost_net
zerocopy code. The second is XDP buff, which allows vhost_net to pass
XDP buff to TUN. This could be used to implement accepting an array of
XDP buffs from vhost_net in the following patches.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/tun.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++++-
drivers/vhost/net.c | 21 ++++++++++--
include/linux/if_tun.h | 7 ++++
3 files changed, 116 insertions(+), 3 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 2560378..b586b3f 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2387,18 +2387,107 @@ static void tun_sock_write_space(struct sock *sk)
kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
}
+static int tun_xdp_one(struct tun_struct *tun,
+ struct tun_file *tfile,
+ struct xdp_buff *xdp)
+{
+ struct virtio_net_hdr *gso = xdp->data_hard_start + sizeof(int);
+ struct tun_pcpu_stats *stats;
+ struct bpf_prog *xdp_prog;
+ struct sk_buff *skb = NULL;
+ u32 rxhash = 0, act;
+ int buflen = *(int *)xdp->data_hard_start;
+ int err = 0;
+ bool skb_xdp = false;
+
+ preempt_disable();
+ rcu_read_lock();
+
+ xdp_prog = rcu_dereference(tun->xdp_prog);
+ if (xdp_prog) {
+ if (gso->gso_type) {
+ skb_xdp = true;
+ goto build;
+ }
+ xdp_set_data_meta_invalid(xdp);
+ xdp->rxq = &tfile->xdp_rxq;
+ act = tun_do_xdp(tun, tfile, xdp_prog, xdp, &err);
+ if (err)
+ goto out;
+ if (act != XDP_PASS)
+ goto out;
+ }
+
+build:
+ skb = build_skb(xdp->data_hard_start, buflen);
+ if (!skb) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ if (skb_xdp) {
+ err = do_xdp_generic(xdp_prog, skb);
+ if (err != XDP_PASS)
+ goto out;
+ }
+
+ skb_reserve(skb, xdp->data - xdp->data_hard_start);
+ skb_put(skb, xdp->data_end - xdp->data);
+
+ if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) {
+ this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
+ kfree_skb(skb);
+ err = -EINVAL;
+ goto out;
+ }
+
+ skb->protocol = eth_type_trans(skb, tun->dev);
+ skb_reset_network_header(skb);
+ skb_probe_transport_header(skb, 0);
+
+ if (!rcu_dereference(tun->steering_prog))
+ rxhash = __skb_get_hash_symmetric(skb);
+
+ netif_receive_skb(skb);
+
+ stats = get_cpu_ptr(tun->pcpu_stats);
+ u64_stats_update_begin(&stats->syncp);
+ stats->rx_packets++;
+ stats->rx_bytes += skb->len;
+ u64_stats_update_end(&stats->syncp);
+ put_cpu_ptr(stats);
+
+ if (rxhash)
+ tun_flow_update(tun, rxhash, tfile);
+
+out:
+ rcu_read_unlock();
+ preempt_enable();
+
+ return err;
+}
+
static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
int ret;
struct tun_file *tfile = container_of(sock, struct tun_file, socket);
struct tun_struct *tun = tun_get(tfile);
+ struct tun_msg_ctl *ctl = m->msg_control;
if (!tun)
return -EBADFD;
- ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter,
+ if (ctl && ctl->type == TUN_MSG_PTR) {
+ ret = tun_xdp_one(tun, tfile, ctl->ptr);
+ if (!ret)
+ ret = total_len;
+ goto out;
+ }
+
+ ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
m->msg_flags & MSG_DONTWAIT,
m->msg_flags & MSG_MORE);
+out:
tun_put(tun);
return ret;
}
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 1209e84..0d84de6 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -117,6 +117,7 @@ struct vhost_net_virtqueue {
struct vhost_net_ubuf_ref *ubufs;
struct ptr_ring *rx_ring;
struct vhost_net_buf rxq;
+ struct xdp_buff xdp[VHOST_RX_BATCH];
};
struct vhost_net {
@@ -570,6 +571,7 @@ static void handle_tx_copy(struct vhost_net *net)
{
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
struct vhost_virtqueue *vq = &nvq->vq;
+ struct xdp_buff xdp;
unsigned out, in;
int head;
struct msghdr msg = {
@@ -584,6 +586,7 @@ static void handle_tx_copy(struct vhost_net *net)
size_t hdr_size;
struct socket *sock;
struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
+ struct tun_msg_ctl ctl;
int sent_pkts = 0;
s16 nheads = 0;
@@ -628,6 +631,14 @@ static void handle_tx_copy(struct vhost_net *net)
vq->heads[nheads].id = cpu_to_vhost32(vq, head);
vq->heads[nheads].len = 0;
+ err = vhost_net_build_xdp(nvq, &msg.msg_iter, &xdp);
+ if (!err) {
+ ctl.type = TUN_MSG_PTR;
+ ctl.ptr = &xdp;
+ msg.msg_control = &ctl;
+ } else
+ msg.msg_control = NULL;
+
total_len += len;
if (total_len < VHOST_NET_WEIGHT &&
vhost_has_more_pkts(net, vq)) {
@@ -734,16 +745,21 @@ static void handle_tx_zerocopy(struct vhost_net *net)
/* use msg_control to pass vhost zerocopy ubuf info to skb */
if (zcopy_used) {
struct ubuf_info *ubuf;
+ struct tun_msg_ctl ctl;
+
ubuf = nvq->ubuf_info + nvq->upend_idx;
+ ctl.type = TUN_MSG_UBUF;
+ ctl.ptr = ubuf;
+
vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
ubuf->callback = vhost_zerocopy_callback;
ubuf->ctx = nvq->ubufs;
ubuf->desc = nvq->upend_idx;
refcount_set(&ubuf->refcnt, 1);
- msg.msg_control = ubuf;
- msg.msg_controllen = sizeof(ubuf);
+ msg.msg_control = &ctl;
+ msg.msg_controllen = sizeof(ctl);
ubufs = nvq->ubufs;
atomic_inc(&ubufs->refcount);
nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
@@ -751,6 +767,7 @@ static void handle_tx_zerocopy(struct vhost_net *net)
msg.msg_control = NULL;
ubufs = NULL;
}
+
total_len += len;
if (total_len < VHOST_NET_WEIGHT &&
vhost_has_more_pkts(net, vq)) {
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 3d2996d..ba46dce 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -19,6 +19,13 @@
#define TUN_XDP_FLAG 0x1UL
+#define TUN_MSG_UBUF 1
+#define TUN_MSG_PTR 2
+struct tun_msg_ctl {
+ int type;
+ void *ptr;
+};
+
#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
struct socket *tun_get_socket(struct file *);
struct ptr_ring *tun_get_tx_ring(struct file *file);
--
2.7.4
^ permalink raw reply related
* [RFC PATCH net-next 12/12] vhost_net: batch submitting XDP buffers to underlayer sockets
From: Jason Wang @ 2018-05-21 9:04 UTC (permalink / raw)
To: mst, jasowang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <1526893473-20128-1-git-send-email-jasowang@redhat.com>
This patch implements XDP batching for vhost_net with tun. This is
done by batching XDP buffs in vhost and submit them when:
- vhost_net can not build XDP buff (mostly because of the size of packet)
- #batched exceeds the limitation (VHOST_NET_RX_BATCH).
- tun accept a batch of XDP buff through msg_control and process them
in a batch
With this tun XDP can benefit from e.g batch transmission during
XDP_REDIRECT or XDP_TX.
Tests shows 21% improvement on TX pps (from ~3.2Mpps to ~3.9Mpps)
while transmitting through testpmd from guest to host by
xdp_redirect_map between tap0 and ixgbe.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/tun.c | 36 +++++++++++++++++----------
drivers/vhost/net.c | 71 ++++++++++++++++++++++++++++++++++++-----------------
2 files changed, 71 insertions(+), 36 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index b586b3f..5d16d18 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1616,7 +1616,6 @@ static u32 tun_do_xdp(struct tun_struct *tun,
switch (act) {
case XDP_REDIRECT:
*err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
- xdp_do_flush_map();
if (*err)
break;
goto out;
@@ -1624,7 +1623,6 @@ static u32 tun_do_xdp(struct tun_struct *tun,
*err = tun_xdp_tx(tun->dev, xdp);
if (*err)
break;
- tun_xdp_flush(tun->dev);
goto out;
case XDP_PASS:
goto out;
@@ -2400,9 +2398,6 @@ static int tun_xdp_one(struct tun_struct *tun,
int err = 0;
bool skb_xdp = false;
- preempt_disable();
- rcu_read_lock();
-
xdp_prog = rcu_dereference(tun->xdp_prog);
if (xdp_prog) {
if (gso->gso_type) {
@@ -2461,15 +2456,12 @@ static int tun_xdp_one(struct tun_struct *tun,
tun_flow_update(tun, rxhash, tfile);
out:
- rcu_read_unlock();
- preempt_enable();
-
return err;
}
static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
- int ret;
+ int ret, i;
struct tun_file *tfile = container_of(sock, struct tun_file, socket);
struct tun_struct *tun = tun_get(tfile);
struct tun_msg_ctl *ctl = m->msg_control;
@@ -2477,10 +2469,28 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
if (!tun)
return -EBADFD;
- if (ctl && ctl->type == TUN_MSG_PTR) {
- ret = tun_xdp_one(tun, tfile, ctl->ptr);
- if (!ret)
- ret = total_len;
+ if (ctl && ((ctl->type & 0xF) == TUN_MSG_PTR)) {
+ int n = ctl->type >> 16;
+
+ preempt_disable();
+ rcu_read_lock();
+
+ for (i = 0; i < n; i++) {
+ struct xdp_buff *x = (struct xdp_buff *)ctl->ptr;
+ struct xdp_buff *xdp = &x[i];
+
+ xdp_set_data_meta_invalid(xdp);
+ xdp->rxq = &tfile->xdp_rxq;
+ tun_xdp_one(tun, tfile, xdp);
+ }
+
+ xdp_do_flush_map();
+ tun_xdp_flush(tun->dev);
+
+ rcu_read_unlock();
+ preempt_enable();
+
+ ret = total_len;
goto out;
}
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 0d84de6..bec4109 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -118,6 +118,7 @@ struct vhost_net_virtqueue {
struct ptr_ring *rx_ring;
struct vhost_net_buf rxq;
struct xdp_buff xdp[VHOST_RX_BATCH];
+ struct vring_used_elem heads[VHOST_RX_BATCH];
};
struct vhost_net {
@@ -511,7 +512,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
void *buf;
int copied;
- if (len < nvq->sock_hlen)
+ if (unlikely(len < nvq->sock_hlen))
return -EFAULT;
if (SKB_DATA_ALIGN(len + pad) +
@@ -567,11 +568,37 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
return 0;
}
+static void vhost_tx_batch(struct vhost_net *net,
+ struct vhost_net_virtqueue *nvq,
+ struct socket *sock,
+ struct msghdr *msghdr, int n)
+{
+ struct tun_msg_ctl ctl = {
+ .type = n << 16 | TUN_MSG_PTR,
+ .ptr = nvq->xdp,
+ };
+ int err;
+
+ if (n == 0)
+ return;
+
+ msghdr->msg_control = &ctl;
+ err = sock->ops->sendmsg(sock, msghdr, 0);
+
+ if (unlikely(err < 0)) {
+ /* FIXME vq_err() */
+ vq_err(&nvq->vq, "sendmsg err!\n");
+ return;
+ }
+ vhost_add_used_and_signal_n(&net->dev, &nvq->vq, nvq->vq.heads, n);
+}
+
+/* Expects to be always run from workqueue - which acts as
+ * read-size critical section for our kind of RCU. */
static void handle_tx_copy(struct vhost_net *net)
{
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
struct vhost_virtqueue *vq = &nvq->vq;
- struct xdp_buff xdp;
unsigned out, in;
int head;
struct msghdr msg = {
@@ -586,7 +613,6 @@ static void handle_tx_copy(struct vhost_net *net)
size_t hdr_size;
struct socket *sock;
struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
- struct tun_msg_ctl ctl;
int sent_pkts = 0;
s16 nheads = 0;
@@ -631,22 +657,24 @@ static void handle_tx_copy(struct vhost_net *net)
vq->heads[nheads].id = cpu_to_vhost32(vq, head);
vq->heads[nheads].len = 0;
- err = vhost_net_build_xdp(nvq, &msg.msg_iter, &xdp);
- if (!err) {
- ctl.type = TUN_MSG_PTR;
- ctl.ptr = &xdp;
- msg.msg_control = &ctl;
- } else
- msg.msg_control = NULL;
-
total_len += len;
- if (total_len < VHOST_NET_WEIGHT &&
- vhost_has_more_pkts(net, vq)) {
- msg.msg_flags |= MSG_MORE;
- } else {
- msg.msg_flags &= ~MSG_MORE;
+ err = vhost_net_build_xdp(nvq, &msg.msg_iter,
+ &nvq->xdp[nheads]);
+ if (!err) {
+ if (++nheads == VHOST_RX_BATCH) {
+ vhost_tx_batch(net, nvq, sock, &msg, nheads);
+ nheads = 0;
+ }
+ goto done;
+ } else if (unlikely(err != -ENOSPC)) {
+ vq_err(vq, "Fail to build XDP buffer\n");
+ break;
}
+ vhost_tx_batch(net, nvq, sock, &msg, nheads);
+ msg.msg_control = NULL;
+ nheads = 0;
+
/* TODO: Check specific error and bomb out unless ENOBUFS? */
err = sock->ops->sendmsg(sock, &msg, len);
if (unlikely(err < 0)) {
@@ -657,11 +685,9 @@ static void handle_tx_copy(struct vhost_net *net)
if (err != len)
pr_debug("Truncated TX packet: "
" len %d != %zd\n", err, len);
- if (++nheads == VHOST_RX_BATCH) {
- vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
- nheads);
- nheads = 0;
- }
+
+ vhost_add_used_and_signal(&net->dev, vq, head, 0);
+done:
if (vhost_exceeds_weight(++sent_pkts, total_len)) {
vhost_poll_queue(&vq->poll);
break;
@@ -669,8 +695,7 @@ static void handle_tx_copy(struct vhost_net *net)
}
out:
if (nheads)
- vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
- nheads);
+ vhost_tx_batch(net, nvq, sock, &msg, nheads);
mutex_unlock(&vq->mutex);
}
--
2.7.4
^ permalink raw reply related
* Re: [RFC V4 PATCH 0/8] Packed ring layout for vhost
From: Wei Xu @ 2018-05-21 12:25 UTC (permalink / raw)
To: Jason Wang; +Cc: kvm, mst, netdev, linux-kernel, virtualization
In-Reply-To: <9a8425be-5c39-eff7-d808-836b825ed890@redhat.com>
On Mon, May 21, 2018 at 10:33:30AM +0800, Jason Wang wrote:
>
>
> On 2018年05月21日 00:25, Wei Xu wrote:
> >On Wed, May 16, 2018 at 08:32:13PM +0800, Jason Wang wrote:
> >>Hi all:
> >>
> >>This RFC implement packed ring layout. The code were tested with
> >>Tiwei's RFC V3 ahttps://lkml.org/lkml/2018/4/25/34. Some fixups and
> >>tweaks were needed on top of Tiwei's code to make it run for event
> >>index.
> >Could you please show the change based on Tiwei's code to easy other's
> >test?
>
> Please try Tiwei's V4 instead of just waiting for the fixup. It should work
> unless you don't try zerocopy and vIOMMU.
Yeah, actually v3 of both you guys works well on my test bed except resetting.
>
>
> >>Pktgen reports about 20% improvement on PPS (event index is off). More
> >>testing is ongoing.
> >>
> >>Notes for tester:
> >>
> >>- Start from this version, vhost need qemu co-operation to work
> >> correctly. Or you can comment out the packed specific code for
> >> GET/SET_VRING_BASE.
> >Do you mean the code in vhost_virtqueue_start/stop?
>
> For qemu, probably.
>
> >Both Tiwei's and your v3
> >work fortunately correctly which should be avoided since the ring should be
> >definitely different.
>
> I don't understand this, you mean reset work?a
No, currently we have not handled vhost start/stop for split/packed ring relatively
which is what I am doing now.
Wei
>
> Thanks
>
> >
> >Wei
> >
> >>Changes from V3:
> >>- Fix math on event idx checking
> >>- Sync last avail wrap counter through GET/SET_VRING_BASE
> >>- remove desc_event prefix in the driver/device structure
> >>
> >>Changes from V2:
> >>- do not use & in checking desc_event_flags
> >>- off should be most significant bit
> >>- remove the workaround of mergeable buffer for dpdk prototype
> >>- id should be in the last descriptor in the chain
> >>- keep _F_WRITE for write descriptor when adding used
> >>- device flags updating should use ADDR_USED type
> >>- return error on unexpected unavail descriptor in a chain
> >>- return false in vhost_ve_avail_empty is descriptor is available
> >>- track last seen avail_wrap_counter
> >>- correctly examine available descriptor in get_indirect_packed()
> >>- vhost_idx_diff should return u16 instead of bool
> >>
> >>Changes from V1:
> >>
> >>- Refactor vhost used elem code to avoid open coding on used elem
> >>- Event suppression support (compile test only).
> >>- Indirect descriptor support (compile test only).
> >>- Zerocopy support.
> >>- vIOMMU support.
> >>- SCSI/VSOCK support (compile test only).
> >>- Fix several bugs
> >>
> >>Jason Wang (8):
> >> vhost: move get_rx_bufs to vhost.c
> >> vhost: hide used ring layout from device
> >> vhost: do not use vring_used_elem
> >> vhost_net: do not explicitly manipulate vhost_used_elem
> >> vhost: vhost_put_user() can accept metadata type
> >> virtio: introduce packed ring defines
> >> vhost: packed ring support
> >> vhost: event suppression for packed ring
> >>
> >> drivers/vhost/net.c | 136 ++----
> >> drivers/vhost/scsi.c | 62 +--
> >> drivers/vhost/vhost.c | 861 ++++++++++++++++++++++++++++++++-----
> >> drivers/vhost/vhost.h | 47 +-
> >> drivers/vhost/vsock.c | 42 +-
> >> include/uapi/linux/virtio_config.h | 9 +
> >> include/uapi/linux/virtio_ring.h | 32 ++
> >> 7 files changed, 928 insertions(+), 261 deletions(-)
> >>
> >>--
> >>2.7.4
> >>
>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [RFC PATCH net-next 06/12] tuntap: enable premmption early
From: Michael S. Tsirkin @ 2018-05-21 14:32 UTC (permalink / raw)
To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <1526893473-20128-7-git-send-email-jasowang@redhat.com>
On Mon, May 21, 2018 at 05:04:27PM +0800, Jason Wang wrote:
> Signed-off-by: Jason Wang <jasowang@redhat.com>
typo in subject
> ---
> drivers/net/tun.c | 5 ++---
> 1 file changed, 2 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 44d4f3d..24ecd82 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1697,6 +1697,8 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
> goto err_xdp;
> }
> }
> + rcu_read_unlock();
> + preempt_enable();
>
> skb = build_skb(buf, buflen);
> if (!skb) {
> @@ -1710,9 +1712,6 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
> get_page(alloc_frag->page);
> alloc_frag->offset += buflen;
>
> - rcu_read_unlock();
> - preempt_enable();
> -
> return skb;
>
> err_redirect:
> --
> 2.7.4
^ permalink raw reply
* Re: [RFC PATCH net-next 12/12] vhost_net: batch submitting XDP buffers to underlayer sockets
From: Michael S. Tsirkin @ 2018-05-21 14:33 UTC (permalink / raw)
To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <1526893473-20128-13-git-send-email-jasowang@redhat.com>
On Mon, May 21, 2018 at 05:04:33PM +0800, Jason Wang wrote:
> This patch implements XDP batching for vhost_net with tun. This is
> done by batching XDP buffs in vhost and submit them when:
>
> - vhost_net can not build XDP buff (mostly because of the size of packet)
> - #batched exceeds the limitation (VHOST_NET_RX_BATCH).
> - tun accept a batch of XDP buff through msg_control and process them
> in a batch
>
> With this tun XDP can benefit from e.g batch transmission during
> XDP_REDIRECT or XDP_TX.
>
> Tests shows 21% improvement on TX pps (from ~3.2Mpps to ~3.9Mpps)
> while transmitting through testpmd from guest to host by
> xdp_redirect_map between tap0 and ixgbe.
>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
s/underlayer/underlying/ ?
> ---
> drivers/net/tun.c | 36 +++++++++++++++++----------
> drivers/vhost/net.c | 71 ++++++++++++++++++++++++++++++++++++-----------------
> 2 files changed, 71 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index b586b3f..5d16d18 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1616,7 +1616,6 @@ static u32 tun_do_xdp(struct tun_struct *tun,
> switch (act) {
> case XDP_REDIRECT:
> *err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
> - xdp_do_flush_map();
> if (*err)
> break;
> goto out;
> @@ -1624,7 +1623,6 @@ static u32 tun_do_xdp(struct tun_struct *tun,
> *err = tun_xdp_tx(tun->dev, xdp);
> if (*err)
> break;
> - tun_xdp_flush(tun->dev);
> goto out;
> case XDP_PASS:
> goto out;
> @@ -2400,9 +2398,6 @@ static int tun_xdp_one(struct tun_struct *tun,
> int err = 0;
> bool skb_xdp = false;
>
> - preempt_disable();
> - rcu_read_lock();
> -
> xdp_prog = rcu_dereference(tun->xdp_prog);
> if (xdp_prog) {
> if (gso->gso_type) {
> @@ -2461,15 +2456,12 @@ static int tun_xdp_one(struct tun_struct *tun,
> tun_flow_update(tun, rxhash, tfile);
>
> out:
> - rcu_read_unlock();
> - preempt_enable();
> -
> return err;
> }
>
> static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
> {
> - int ret;
> + int ret, i;
> struct tun_file *tfile = container_of(sock, struct tun_file, socket);
> struct tun_struct *tun = tun_get(tfile);
> struct tun_msg_ctl *ctl = m->msg_control;
> @@ -2477,10 +2469,28 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
> if (!tun)
> return -EBADFD;
>
> - if (ctl && ctl->type == TUN_MSG_PTR) {
> - ret = tun_xdp_one(tun, tfile, ctl->ptr);
> - if (!ret)
> - ret = total_len;
> + if (ctl && ((ctl->type & 0xF) == TUN_MSG_PTR)) {
> + int n = ctl->type >> 16;
> +
> + preempt_disable();
> + rcu_read_lock();
> +
> + for (i = 0; i < n; i++) {
> + struct xdp_buff *x = (struct xdp_buff *)ctl->ptr;
> + struct xdp_buff *xdp = &x[i];
> +
> + xdp_set_data_meta_invalid(xdp);
> + xdp->rxq = &tfile->xdp_rxq;
> + tun_xdp_one(tun, tfile, xdp);
> + }
> +
> + xdp_do_flush_map();
> + tun_xdp_flush(tun->dev);
> +
> + rcu_read_unlock();
> + preempt_enable();
> +
> + ret = total_len;
> goto out;
> }
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 0d84de6..bec4109 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -118,6 +118,7 @@ struct vhost_net_virtqueue {
> struct ptr_ring *rx_ring;
> struct vhost_net_buf rxq;
> struct xdp_buff xdp[VHOST_RX_BATCH];
> + struct vring_used_elem heads[VHOST_RX_BATCH];
> };
>
> struct vhost_net {
> @@ -511,7 +512,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
> void *buf;
> int copied;
>
> - if (len < nvq->sock_hlen)
> + if (unlikely(len < nvq->sock_hlen))
> return -EFAULT;
>
> if (SKB_DATA_ALIGN(len + pad) +
> @@ -567,11 +568,37 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
> return 0;
> }
>
> +static void vhost_tx_batch(struct vhost_net *net,
> + struct vhost_net_virtqueue *nvq,
> + struct socket *sock,
> + struct msghdr *msghdr, int n)
> +{
> + struct tun_msg_ctl ctl = {
> + .type = n << 16 | TUN_MSG_PTR,
> + .ptr = nvq->xdp,
> + };
> + int err;
> +
> + if (n == 0)
> + return;
> +
> + msghdr->msg_control = &ctl;
> + err = sock->ops->sendmsg(sock, msghdr, 0);
> +
> + if (unlikely(err < 0)) {
> + /* FIXME vq_err() */
> + vq_err(&nvq->vq, "sendmsg err!\n");
> + return;
> + }
> + vhost_add_used_and_signal_n(&net->dev, &nvq->vq, nvq->vq.heads, n);
> +}
> +
> +/* Expects to be always run from workqueue - which acts as
> + * read-size critical section for our kind of RCU. */
> static void handle_tx_copy(struct vhost_net *net)
> {
> struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
> struct vhost_virtqueue *vq = &nvq->vq;
> - struct xdp_buff xdp;
> unsigned out, in;
> int head;
> struct msghdr msg = {
> @@ -586,7 +613,6 @@ static void handle_tx_copy(struct vhost_net *net)
> size_t hdr_size;
> struct socket *sock;
> struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
> - struct tun_msg_ctl ctl;
> int sent_pkts = 0;
> s16 nheads = 0;
>
> @@ -631,22 +657,24 @@ static void handle_tx_copy(struct vhost_net *net)
> vq->heads[nheads].id = cpu_to_vhost32(vq, head);
> vq->heads[nheads].len = 0;
>
> - err = vhost_net_build_xdp(nvq, &msg.msg_iter, &xdp);
> - if (!err) {
> - ctl.type = TUN_MSG_PTR;
> - ctl.ptr = &xdp;
> - msg.msg_control = &ctl;
> - } else
> - msg.msg_control = NULL;
> -
> total_len += len;
> - if (total_len < VHOST_NET_WEIGHT &&
> - vhost_has_more_pkts(net, vq)) {
> - msg.msg_flags |= MSG_MORE;
> - } else {
> - msg.msg_flags &= ~MSG_MORE;
> + err = vhost_net_build_xdp(nvq, &msg.msg_iter,
> + &nvq->xdp[nheads]);
> + if (!err) {
> + if (++nheads == VHOST_RX_BATCH) {
> + vhost_tx_batch(net, nvq, sock, &msg, nheads);
> + nheads = 0;
> + }
> + goto done;
> + } else if (unlikely(err != -ENOSPC)) {
> + vq_err(vq, "Fail to build XDP buffer\n");
> + break;
> }
>
> + vhost_tx_batch(net, nvq, sock, &msg, nheads);
> + msg.msg_control = NULL;
> + nheads = 0;
> +
> /* TODO: Check specific error and bomb out unless ENOBUFS? */
> err = sock->ops->sendmsg(sock, &msg, len);
> if (unlikely(err < 0)) {
> @@ -657,11 +685,9 @@ static void handle_tx_copy(struct vhost_net *net)
> if (err != len)
> pr_debug("Truncated TX packet: "
> " len %d != %zd\n", err, len);
> - if (++nheads == VHOST_RX_BATCH) {
> - vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
> - nheads);
> - nheads = 0;
> - }
> +
> + vhost_add_used_and_signal(&net->dev, vq, head, 0);
> +done:
> if (vhost_exceeds_weight(++sent_pkts, total_len)) {
> vhost_poll_queue(&vq->poll);
> break;
> @@ -669,8 +695,7 @@ static void handle_tx_copy(struct vhost_net *net)
> }
> out:
> if (nheads)
> - vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
> - nheads);
> + vhost_tx_batch(net, nvq, sock, &msg, nheads);
> mutex_unlock(&vq->mutex);
> }
>
> --
> 2.7.4
^ permalink raw reply
* Re: KASAN: use-after-free Read in vhost_chr_write_iter
From: Michael S. Tsirkin @ 2018-05-21 14:42 UTC (permalink / raw)
To: Jason Wang
Cc: bammanag, kt0755, kvm, netdev, linux-kernel, virtualization,
byoungyoung, DaeRyong Jeong
In-Reply-To: <fb27c1fd-5172-252a-cb8f-b53927a26d06@redhat.com>
On Mon, May 21, 2018 at 10:38:10AM +0800, Jason Wang wrote:
>
>
> On 2018年05月18日 17:24, Jason Wang wrote:
> >
> >
> > On 2018年05月17日 21:45, DaeRyong Jeong wrote:
> > > We report the crash: KASAN: use-after-free Read in vhost_chr_write_iter
> > >
> > > This crash has been found in v4.17-rc1 using RaceFuzzer (a modified
> > > version of Syzkaller), which we describe more at the end of this
> > > report. Our analysis shows that the race occurs when invoking two
> > > syscalls concurrently, write$vnet and ioctl$VHOST_RESET_OWNER.
> > >
> > >
> > > Analysis:
> > > We think the concurrent execution of vhost_process_iotlb_msg() and
> > > vhost_dev_cleanup() causes the crash.
> > > Both of functions can run concurrently (please see call sequence below),
> > > and possibly, there is a race on dev->iotlb.
> > > If the switch occurs right after vhost_dev_cleanup() frees
> > > dev->iotlb, vhost_process_iotlb_msg() still sees the non-null value
> > > and it
> > > keep executing without returning -EFAULT. Consequently, use-after-free
> > > occures
> > >
> > >
> > > Thread interleaving:
> > > CPU0 (vhost_process_iotlb_msg) CPU1 (vhost_dev_cleanup)
> > > (In the case of both VHOST_IOTLB_UPDATE and
> > > VHOST_IOTLB_INVALIDATE)
> > > ===== =====
> > > vhost_umem_clean(dev->iotlb);
> > > if (!dev->iotlb) {
> > > ret = -EFAULT;
> > > break;
> > > }
> > > dev->iotlb = NULL;
> > >
> > >
> > > Call Sequence:
> > > CPU0
> > > =====
> > > vhost_net_chr_write_iter
> > > vhost_chr_write_iter
> > > vhost_process_iotlb_msg
> > >
> > > CPU1
> > > =====
> > > vhost_net_ioctl
> > > vhost_net_reset_owner
> > > vhost_dev_reset_owner
> > > vhost_dev_cleanup
> >
> > Thanks a lot for the analysis.
> >
> > This could be addressed by simply protect it with dev mutex.
> >
> > Will post a patch.
> >
>
> Could you please help to test the attached patch? I've done some smoking
> test.
>
> Thanks
> >From 88328386f3f652e684ee33dc4cf63dcaed871aea Mon Sep 17 00:00:00 2001
> From: Jason Wang <jasowang@redhat.com>
> Date: Fri, 18 May 2018 17:33:27 +0800
> Subject: [PATCH] vhost: synchronize IOTLB message with dev cleanup
>
> DaeRyong Jeong reports a race between vhost_dev_cleanup() and
> vhost_process_iotlb_msg():
>
> Thread interleaving:
> CPU0 (vhost_process_iotlb_msg) CPU1 (vhost_dev_cleanup)
> (In the case of both VHOST_IOTLB_UPDATE and
> VHOST_IOTLB_INVALIDATE)
> ===== =====
> vhost_umem_clean(dev->iotlb);
> if (!dev->iotlb) {
> ret = -EFAULT;
> break;
> }
> dev->iotlb = NULL;
>
> The reason is we don't synchronize between them, fixing by protecting
> vhost_process_iotlb_msg() with dev mutex.
>
> Reported-by: DaeRyong Jeong <threeearcat@gmail.com>
> Fixes: 6b1e6cc7855b0 ("vhost: new device IOTLB API")
> Reported-by: DaeRyong Jeong <threeearcat@gmail.com>
Long terms we might want to move iotlb into vqs
so that messages can be processed in parallel.
Not sure how to do it yet.
> ---
> drivers/vhost/vhost.c | 3 +++
> 1 file changed, 3 insertions(+)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index f3bd8e9..f0be5f3 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -981,6 +981,7 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
> {
> int ret = 0;
>
> + mutex_lock(&dev->mutex);
> vhost_dev_lock_vqs(dev);
> switch (msg->type) {
> case VHOST_IOTLB_UPDATE:
> @@ -1016,6 +1017,8 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
> }
>
> vhost_dev_unlock_vqs(dev);
> + mutex_unlock(&dev->mutex);
> +
> return ret;
> }
> ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
> --
> 2.7.4
>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH net 3/4] virtio-net: reset num_buf to 1 after linearizing packet
From: Michael S. Tsirkin @ 2018-05-21 14:59 UTC (permalink / raw)
To: Jason Wang; +Cc: netdev, linux-kernel, virtualization
In-Reply-To: <1526891706-18516-4-git-send-email-jasowang@redhat.com>
On Mon, May 21, 2018 at 04:35:05PM +0800, Jason Wang wrote:
> If we successfully linearize the packets, num_buf were set to zero
> which was wrong since we now have only 1 buffer to be used for e.g in
> the error path of receive_mergeable(). Zero num_buf will lead the code
> try to pop the buffers of next packet and drop it. Fixing this by set
> num_buf to 1 if we successfully linearize the packet.
>
> Fixes: 4941d472bf95 ("virtio-net: do not reset during XDP set")
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> drivers/net/virtio_net.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 6260d65..165a922 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -722,6 +722,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> &len);
> if (!xdp_page)
> goto err_xdp;
> + num_buf = 1;
So this is tweaked here for the benefit of err_skb below.
That's confusing and we won't remember to change it
if we change the error handling.
How about fixing the error path?
- while (--num_buf) {
+ while (num_buf-- > 1) {
Seems more robust to me.
> offset = VIRTIO_XDP_HEADROOM;
> } else {
> xdp_page = page;
> --
> 2.7.4
^ permalink raw reply
* Re: [PATCH net 4/4] virito-net: fix leaking page for gso packet during mergeable XDP
From: Michael S. Tsirkin @ 2018-05-21 15:01 UTC (permalink / raw)
To: Jason Wang; +Cc: netdev, John Fastabend, linux-kernel, virtualization
In-Reply-To: <1526891706-18516-5-git-send-email-jasowang@redhat.com>
On Mon, May 21, 2018 at 04:35:06PM +0800, Jason Wang wrote:
> We need to drop refcnt to xdp_page if we see a gso packet. Otherwise
> it will be leaked. Fixing this by moving the check of gso packet above
> the linearizing logic.
>
> Cc: John Fastabend <john.fastabend@gmail.com>
> Fixes: 72979a6c3590 ("virtio_net: xdp, add slowpath case for non contiguous buffers")
> Signed-off-by: Jason Wang <jasowang@redhat.com>
typo in subject
> ---
> drivers/net/virtio_net.c | 16 ++++++++--------
> 1 file changed, 8 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 165a922..f8db809 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -707,6 +707,14 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> void *data;
> u32 act;
>
> + /* Transient failure which in theory could occur if
> + * in-flight packets from before XDP was enabled reach
> + * the receive path after XDP is loaded. In practice I
> + * was not able to create this condition.
BTW we should probably drop the last sentence. It says in theory, should be enough.
> + */
> + if (unlikely(hdr->hdr.gso_type))
> + goto err_xdp;
> +
> /* This happens when rx buffer size is underestimated
> * or headroom is not enough because of the buffer
> * was refilled before XDP is set. This should only
> @@ -728,14 +736,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> xdp_page = page;
> }
>
> - /* Transient failure which in theory could occur if
> - * in-flight packets from before XDP was enabled reach
> - * the receive path after XDP is loaded. In practice I
> - * was not able to create this condition.
> - */
> - if (unlikely(hdr->hdr.gso_type))
> - goto err_xdp;
> -
> /* Allow consuming headroom but reserve enough space to push
> * the descriptor on if we get an XDP_TX return code.
> */
> --
> 2.7.4
^ permalink raw reply
* Re: [PATCH net 1/4] virtio-net: correctly redirect linearized packet
From: Michael S. Tsirkin @ 2018-05-21 15:03 UTC (permalink / raw)
To: Jason Wang; +Cc: netdev, linux-kernel, virtualization
In-Reply-To: <1526891706-18516-2-git-send-email-jasowang@redhat.com>
On Mon, May 21, 2018 at 04:35:03PM +0800, Jason Wang wrote:
> After a linearized packet was redirected by XDP, we should not go for
> the err path which will try to pop buffers for the next packet and
> increase the drop counter. Fixing this by just drop the page refcnt
> for the original page.
>
> Fixes: 186b3c998c50 ("virtio-net: support XDP_REDIRECT")
> Reported-by: David Ahern <dsahern@gmail.com>
> Tested-by: David Ahern <dsahern@gmail.com>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> drivers/net/virtio_net.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 770422e..c15d240 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -787,7 +787,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> }
> *xdp_xmit = true;
> if (unlikely(xdp_page != page))
> - goto err_xdp;
> + put_page(page);
> rcu_read_unlock();
> goto xdp_xmit;
> default:
> --
> 2.7.4
^ permalink raw reply
* Re: [PATCH net 2/4] virtio-net: correctly transmit XDP buff after linearizing
From: Michael S. Tsirkin @ 2018-05-21 15:03 UTC (permalink / raw)
To: Jason Wang; +Cc: netdev, John Fastabend, linux-kernel, virtualization
In-Reply-To: <1526891706-18516-3-git-send-email-jasowang@redhat.com>
On Mon, May 21, 2018 at 04:35:04PM +0800, Jason Wang wrote:
> We should not go for the error path after successfully transmitting a
> XDP buffer after linearizing. Since the error path may try to pop and
> drop next packet and increase the drop counters. Fixing this by simply
> drop the refcnt of original page and go for xmit path.
>
> Fixes: 72979a6c3590 ("virtio_net: xdp, add slowpath case for non contiguous buffers")
> Cc: John Fastabend <john.fastabend@gmail.com>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> drivers/net/virtio_net.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index c15d240..6260d65 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -775,7 +775,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> }
> *xdp_xmit = true;
> if (unlikely(xdp_page != page))
> - goto err_xdp;
> + put_page(page);
> rcu_read_unlock();
> goto xdp_xmit;
> case XDP_REDIRECT:
> --
> 2.7.4
^ permalink raw reply
* Re: [PATCH net 0/4] Fix several issues of virtio-net mergeable XDP
From: Michael S. Tsirkin @ 2018-05-21 15:04 UTC (permalink / raw)
To: Jason Wang; +Cc: netdev, linux-kernel, virtualization
In-Reply-To: <1526891706-18516-1-git-send-email-jasowang@redhat.com>
On Mon, May 21, 2018 at 04:35:02PM +0800, Jason Wang wrote:
> Hi:
>
> Please review the patches that tries to fix sevreal issues of
> virtio-net mergeable XDP.
>
> Thanks
I think we should do 3/4 differently.
The rest looks good, and probably needed on stable.
Thanks!
> Jason Wang (4):
> virtio-net: correctly redirect linearized packet
> virtio-net: correctly transmit XDP buff after linearizing
> virtio-net: reset num_buf to 1 after linearizing packet
> virito-net: fix leaking page for gso packet during mergeable XDP
>
> drivers/net/virtio_net.c | 21 +++++++++++----------
> 1 file changed, 11 insertions(+), 10 deletions(-)
>
> --
> 2.7.4
^ permalink raw reply
* Re: [RFC PATCH net-next 01/12] vhost_net: introduce helper to initialize tx iov iter
From: Jesse Brandeburg @ 2018-05-21 16:24 UTC (permalink / raw)
To: Jason Wang; +Cc: kvm, mst, netdev, virtualization, linux-kernel
In-Reply-To: <1526893473-20128-2-git-send-email-jasowang@redhat.com>
Hi Jason, a few nits.
On Mon, 21 May 2018 17:04:22 +0800 Jason wrote:
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> drivers/vhost/net.c | 34 +++++++++++++++++++++++-----------
> 1 file changed, 23 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index c4b49fc..15d191a 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -459,6 +459,26 @@ static bool vhost_exceeds_maxpend(struct vhost_net *net)
> min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2);
> }
>
> +static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter,
> + size_t hdr_size, int out)
> +{
> + /* Skip header. TODO: support TSO. */
> + size_t len = iov_length(vq->iov, out);
> +
> + iov_iter_init(iter, WRITE, vq->iov, out, len);
> + iov_iter_advance(iter, hdr_size);
> + /* Sanity check */
> + if (!iov_iter_count(iter)) {
> + vq_err(vq, "Unexpected header len for TX: "
> + "%zd expected %zd\n",
> + len, hdr_size);
ok, it was like this before, but please unwrap the string in " ", there
should be no line breaks in string declarations and they are allowed to
go over 80 characters.
> + return -EFAULT;
> + }
> + len = iov_iter_count(iter);
> +
> + return len;
> +}
> +
> /* Expects to be always run from workqueue - which acts as
> * read-size critical section for our kind of RCU. */
> static void handle_tx(struct vhost_net *net)
> @@ -521,18 +541,10 @@ static void handle_tx(struct vhost_net *net)
> "out %d, int %d\n", out, in);
> break;
> }
> - /* Skip header. TODO: support TSO. */
> - len = iov_length(vq->iov, out);
> - iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
> - iov_iter_advance(&msg.msg_iter, hdr_size);
> - /* Sanity check */
> - if (!msg_data_left(&msg)) {
> - vq_err(vq, "Unexpected header len for TX: "
> - "%zd expected %zd\n",
> - len, hdr_size);
> +
> + len = init_iov_iter(vq, &msg.msg_iter, hdr_size, out);
> + if (len < 0)
len is declared as size_t, which is unsigned, and can never be
negative. I'm pretty sure this is a bug.
> break;
> - }
> - len = msg_data_left(&msg);
>
> zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
> && !vhost_exceeds_maxpend(net)
^ permalink raw reply
* Re: [RFC PATCH net-next 02/12] vhost_net: introduce vhost_exceeds_weight()
From: Jesse Brandeburg @ 2018-05-21 16:29 UTC (permalink / raw)
To: Jason Wang; +Cc: kvm, mst, netdev, virtualization, linux-kernel
In-Reply-To: <1526893473-20128-3-git-send-email-jasowang@redhat.com>
On Mon, 21 May 2018 17:04:23 +0800 Jason wrote:
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> drivers/vhost/net.c | 13 ++++++++-----
> 1 file changed, 8 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 15d191a..de544ee 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -479,6 +479,12 @@ static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter,
> return len;
> }
>
> +static bool vhost_exceeds_weight(int pkts, int total_len)
> +{
> + return unlikely(total_len >= VHOST_NET_WEIGHT) ||
> + unlikely(pkts >= VHOST_NET_PKT_WEIGHT);
I was going to say just one unlikely, but then the caller of this
function also says unlikely(vhost_exceeds...), so I think you should
just drop the unlikely statements here (both of them)
> +}
> +
> /* Expects to be always run from workqueue - which acts as
> * read-size critical section for our kind of RCU. */
> static void handle_tx(struct vhost_net *net)
> @@ -570,7 +576,6 @@ static void handle_tx(struct vhost_net *net)
> msg.msg_control = NULL;
> ubufs = NULL;
> }
> -
unrelated whitespace changes?
> total_len += len;
> if (total_len < VHOST_NET_WEIGHT &&
> !vhost_vq_avail_empty(&net->dev, vq) &&
> @@ -600,8 +605,7 @@ static void handle_tx(struct vhost_net *net)
> else
> vhost_zerocopy_signal_used(net, vq);
> vhost_net_tx_packet(net);
> - if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
> - unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT)) {
> + if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) {
> vhost_poll_queue(&vq->poll);
> break;
> }
> @@ -887,8 +891,7 @@ static void handle_rx(struct vhost_net *net)
> if (unlikely(vq_log))
> vhost_log_write(vq, vq_log, log, vhost_len);
> total_len += vhost_len;
> - if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
> - unlikely(++recv_pkts >= VHOST_NET_PKT_WEIGHT)) {
> + if (unlikely(vhost_exceeds_weight(++recv_pkts, total_len))) {
> vhost_poll_queue(&vq->poll);
> goto out;
> }
^ permalink raw reply
* Re: [RFC PATCH net-next 03/12] vhost_net: introduce vhost_has_more_pkts()
From: Jesse Brandeburg @ 2018-05-21 16:39 UTC (permalink / raw)
To: Jason Wang; +Cc: kvm, mst, netdev, virtualization, linux-kernel
In-Reply-To: <1526893473-20128-4-git-send-email-jasowang@redhat.com>
On Mon, 21 May 2018 17:04:24 +0800 Jason wrote:
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> drivers/vhost/net.c | 12 +++++++++---
> 1 file changed, 9 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index de544ee..4ebac76 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -485,6 +485,13 @@ static bool vhost_exceeds_weight(int pkts, int total_len)
> unlikely(pkts >= VHOST_NET_PKT_WEIGHT);
> }
>
> +static bool vhost_has_more_pkts(struct vhost_net *net,
> + struct vhost_virtqueue *vq)
> +{
> + return !vhost_vq_avail_empty(&net->dev, vq) &&
> + likely(!vhost_exceeds_maxpend(net));
This really seems like mis-use of likely/unlikely, in the middle of a
sequence of operations that will always be run when this function is
called. I think you should remove the likely from this helper,
especially, and control the branch from the branch point.
> +}
> +
> /* Expects to be always run from workqueue - which acts as
> * read-size critical section for our kind of RCU. */
> static void handle_tx(struct vhost_net *net)
> @@ -578,8 +585,7 @@ static void handle_tx(struct vhost_net *net)
> }
> total_len += len;
> if (total_len < VHOST_NET_WEIGHT &&
> - !vhost_vq_avail_empty(&net->dev, vq) &&
> - likely(!vhost_exceeds_maxpend(net))) {
> + vhost_has_more_pkts(net, vq)) {
Yes, I know it came from here, but likely/unlikely are for branch
control, so they should encapsulate everything inside the if, unless
I'm mistaken.
> msg.msg_flags |= MSG_MORE;
> } else {
> msg.msg_flags &= ~MSG_MORE;
> @@ -605,7 +611,7 @@ static void handle_tx(struct vhost_net *net)
> else
> vhost_zerocopy_signal_used(net, vq);
> vhost_net_tx_packet(net);
> - if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) {
> + if (vhost_exceeds_weight(++sent_pkts, total_len)) {
You should have kept the unlikely here, and not had it inside the
helper (as per the previous patch. Also, why wasn't this change part
of the previous patch?
> vhost_poll_queue(&vq->poll);
> break;
> }
^ permalink raw reply
* Re: [RFC PATCH net-next 04/12] vhost_net: split out datacopy logic
From: Jesse Brandeburg @ 2018-05-21 16:46 UTC (permalink / raw)
To: Jason Wang; +Cc: kvm, mst, netdev, virtualization, linux-kernel
In-Reply-To: <1526893473-20128-5-git-send-email-jasowang@redhat.com>
On Mon, 21 May 2018 17:04:25 +0800 Jason wrote:
> Instead of mixing zerocopy and datacopy logics, this patch tries to
> split datacopy logic out. This results for a more compact code and
> specific optimization could be done on top more easily.
>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> drivers/vhost/net.c | 111 +++++++++++++++++++++++++++++++++++++++++++++++-----
> 1 file changed, 102 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 4ebac76..4682fcc 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -492,9 +492,95 @@ static bool vhost_has_more_pkts(struct vhost_net *net,
> likely(!vhost_exceeds_maxpend(net));
> }
>
> +static void handle_tx_copy(struct vhost_net *net)
> +{
> + struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
> + struct vhost_virtqueue *vq = &nvq->vq;
> + unsigned out, in;
move "out" and "in" down to inside the for loop as well.
> + int head;
move this "head" declaration inside for-loop below.
> + struct msghdr msg = {
> + .msg_name = NULL,
> + .msg_namelen = 0,
> + .msg_control = NULL,
> + .msg_controllen = 0,
> + .msg_flags = MSG_DONTWAIT,
> + };
> + size_t len, total_len = 0;
> + int err;
> + size_t hdr_size;
> + struct socket *sock;
> + struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
> + int sent_pkts = 0;
why do we need so many locals?
> +
> + mutex_lock(&vq->mutex);
> + sock = vq->private_data;
> + if (!sock)
> + goto out;
> +
> + if (!vq_iotlb_prefetch(vq))
> + goto out;
> +
> + vhost_disable_notify(&net->dev, vq);
> + vhost_net_disable_vq(net, vq);
> +
> + hdr_size = nvq->vhost_hlen;
> +
> + for (;;) {
> + head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
> + ARRAY_SIZE(vq->iov),
> + &out, &in);
> + /* On error, stop handling until the next kick. */
> + if (unlikely(head < 0))
> + break;
> + /* Nothing new? Wait for eventfd to tell us they refilled. */
> + if (head == vq->num) {
> + if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> + vhost_disable_notify(&net->dev, vq);
> + continue;
> + }
> + break;
> + }
> + if (in) {
> + vq_err(vq, "Unexpected descriptor format for TX: "
> + "out %d, int %d\n", out, in);
don't break strings, keep all the bits between " " together, even if it
is longer than 80 chars.
> + break;
> + }
> +
> + len = init_iov_iter(vq, &msg.msg_iter, hdr_size, out);
> + if (len < 0)
> + break;
same comment as previous patch, len is a size_t which is unsigned.
> +
> + total_len += len;
> + if (total_len < VHOST_NET_WEIGHT &&
> + vhost_has_more_pkts(net, vq)) {
> + msg.msg_flags |= MSG_MORE;
> + } else {
> + msg.msg_flags &= ~MSG_MORE;
> + }
don't need { } here.
> +
> + /* TODO: Check specific error and bomb out unless ENOBUFS? */
> + err = sock->ops->sendmsg(sock, &msg, len);
> + if (unlikely(err < 0)) {
> + vhost_discard_vq_desc(vq, 1);
> + vhost_net_enable_vq(net, vq);
> + break;
> + }
> + if (err != len)
> + pr_debug("Truncated TX packet: "
> + " len %d != %zd\n", err, len);
single line " " string please
^ permalink raw reply
* Re: [RFC PATCH net-next 10/12] vhost_net: build xdp buff
From: Jesse Brandeburg @ 2018-05-21 16:56 UTC (permalink / raw)
To: Jason Wang; +Cc: kvm, mst, netdev, virtualization, linux-kernel
In-Reply-To: <1526893473-20128-11-git-send-email-jasowang@redhat.com>
On Mon, 21 May 2018 17:04:31 +0800 Jason wrote:
> This patch implement build XDP buffers in vhost_net. The idea is do
> userspace copy in vhost_net and build XDP buff based on the
> page. Vhost_net can then submit one or an array of XDP buffs to
> underlayer socket (e.g TUN). TUN can choose to do XDP or call
> build_skb() to build skb. To support build skb, vnet header were also
> stored into the header of the XDP buff.
>
> This userspace copy and XDP buffs building is key to achieve XDP
> batching in TUN, since TUN does not need to care about userspace copy
> and then can disable premmption for several XDP buffs to achieve
> batching from XDP.
>
> TODO: reserve headroom based on the TUN XDP.
>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> drivers/vhost/net.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 74 insertions(+)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index f0639d7..1209e84 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -492,6 +492,80 @@ static bool vhost_has_more_pkts(struct vhost_net *net,
> likely(!vhost_exceeds_maxpend(net));
> }
>
> +#define VHOST_NET_HEADROOM 256
> +#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
> +
> +static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
> + struct iov_iter *from,
> + struct xdp_buff *xdp)
> +{
> + struct vhost_virtqueue *vq = &nvq->vq;
> + struct page_frag *alloc_frag = ¤t->task_frag;
> + struct virtio_net_hdr *gso;
> + size_t len = iov_iter_count(from);
> + int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> + int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + VHOST_NET_HEADROOM
> + + nvq->sock_hlen);
> + int sock_hlen = nvq->sock_hlen;
> + void *buf;
> + int copied;
> +
> + if (len < nvq->sock_hlen)
> + return -EFAULT;
> +
> + if (SKB_DATA_ALIGN(len + pad) +
> + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
> + return -ENOSPC;
> +
> + buflen += SKB_DATA_ALIGN(len + pad);
maybe store the result of SKB_DATA_ALIGN in a local instead of doing
the work twice?
> + alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
> + if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
> + return -ENOMEM;
> +
> + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> +
> + /* We store two kinds of metadata in the header which will be
> + * used for XDP_PASS to do build_skb():
> + * offset 0: buflen
> + * offset sizeof(int): vnet header
> + */
> + copied = copy_page_from_iter(alloc_frag->page,
> + alloc_frag->offset + sizeof(int), sock_hlen, from);
> + if (copied != sock_hlen)
> + return -EFAULT;
> +
> + gso = (struct virtio_net_hdr *)(buf + sizeof(int));
> +
> + if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
> + vhost16_to_cpu(vq, gso->csum_start) +
> + vhost16_to_cpu(vq, gso->csum_offset) + 2 >
> + vhost16_to_cpu(vq, gso->hdr_len)) {
> + gso->hdr_len = cpu_to_vhost16(vq,
> + vhost16_to_cpu(vq, gso->csum_start) +
> + vhost16_to_cpu(vq, gso->csum_offset) + 2);
> +
> + if (vhost16_to_cpu(vq, gso->hdr_len) > len)
> + return -EINVAL;
> + }
> +
> + len -= sock_hlen;
> + copied = copy_page_from_iter(alloc_frag->page,
> + alloc_frag->offset + pad,
> + len, from);
> + if (copied != len)
> + return -EFAULT;
> +
> + xdp->data_hard_start = buf;
> + xdp->data = buf + pad;
> + xdp->data_end = xdp->data + len;
> + *(int *)(xdp->data_hard_start)= buflen;
space before =
> +
> + get_page(alloc_frag->page);
> + alloc_frag->offset += buflen;
> +
> + return 0;
> +}
> +
> static void handle_tx_copy(struct vhost_net *net)
> {
> struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
^ permalink raw reply
* Re: [RFC PATCH net-next 10/12] vhost_net: build xdp buff
From: Michael S. Tsirkin @ 2018-05-21 22:21 UTC (permalink / raw)
To: Jesse Brandeburg; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180521095611.00005caa@intel.com>
On Mon, May 21, 2018 at 09:56:11AM -0700, Jesse Brandeburg wrote:
> On Mon, 21 May 2018 17:04:31 +0800 Jason wrote:
> > This patch implement build XDP buffers in vhost_net. The idea is do
> > userspace copy in vhost_net and build XDP buff based on the
> > page. Vhost_net can then submit one or an array of XDP buffs to
> > underlayer socket (e.g TUN). TUN can choose to do XDP or call
> > build_skb() to build skb. To support build skb, vnet header were also
> > stored into the header of the XDP buff.
> >
> > This userspace copy and XDP buffs building is key to achieve XDP
> > batching in TUN, since TUN does not need to care about userspace copy
> > and then can disable premmption for several XDP buffs to achieve
> > batching from XDP.
> >
> > TODO: reserve headroom based on the TUN XDP.
> >
> > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > ---
> > drivers/vhost/net.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 74 insertions(+)
> >
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index f0639d7..1209e84 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -492,6 +492,80 @@ static bool vhost_has_more_pkts(struct vhost_net *net,
> > likely(!vhost_exceeds_maxpend(net));
> > }
> >
> > +#define VHOST_NET_HEADROOM 256
> > +#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
> > +
> > +static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
> > + struct iov_iter *from,
> > + struct xdp_buff *xdp)
> > +{
> > + struct vhost_virtqueue *vq = &nvq->vq;
> > + struct page_frag *alloc_frag = ¤t->task_frag;
> > + struct virtio_net_hdr *gso;
> > + size_t len = iov_iter_count(from);
> > + int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> > + int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + VHOST_NET_HEADROOM
> > + + nvq->sock_hlen);
> > + int sock_hlen = nvq->sock_hlen;
> > + void *buf;
> > + int copied;
> > +
> > + if (len < nvq->sock_hlen)
> > + return -EFAULT;
> > +
> > + if (SKB_DATA_ALIGN(len + pad) +
> > + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
> > + return -ENOSPC;
> > +
> > + buflen += SKB_DATA_ALIGN(len + pad);
>
> maybe store the result of SKB_DATA_ALIGN in a local instead of doing
> the work twice?
I don't mind, but I guess gcc can always do it itself?
> > + alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
> > + if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
> > + return -ENOMEM;
> > +
> > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > +
> > + /* We store two kinds of metadata in the header which will be
> > + * used for XDP_PASS to do build_skb():
> > + * offset 0: buflen
> > + * offset sizeof(int): vnet header
> > + */
> > + copied = copy_page_from_iter(alloc_frag->page,
> > + alloc_frag->offset + sizeof(int), sock_hlen, from);
> > + if (copied != sock_hlen)
> > + return -EFAULT;
> > +
> > + gso = (struct virtio_net_hdr *)(buf + sizeof(int));
> > +
> > + if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
> > + vhost16_to_cpu(vq, gso->csum_start) +
> > + vhost16_to_cpu(vq, gso->csum_offset) + 2 >
> > + vhost16_to_cpu(vq, gso->hdr_len)) {
> > + gso->hdr_len = cpu_to_vhost16(vq,
> > + vhost16_to_cpu(vq, gso->csum_start) +
> > + vhost16_to_cpu(vq, gso->csum_offset) + 2);
> > +
> > + if (vhost16_to_cpu(vq, gso->hdr_len) > len)
> > + return -EINVAL;
> > + }
> > +
> > + len -= sock_hlen;
> > + copied = copy_page_from_iter(alloc_frag->page,
> > + alloc_frag->offset + pad,
> > + len, from);
> > + if (copied != len)
> > + return -EFAULT;
> > +
> > + xdp->data_hard_start = buf;
> > + xdp->data = buf + pad;
> > + xdp->data_end = xdp->data + len;
> > + *(int *)(xdp->data_hard_start)= buflen;
>
> space before =
>
> > +
> > + get_page(alloc_frag->page);
> > + alloc_frag->offset += buflen;
> > +
> > + return 0;
> > +}
> > +
> > static void handle_tx_copy(struct vhost_net *net)
> > {
> > struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
^ permalink raw reply
* [PATCH net-next v11 0/5] Enable virtio_net to act as a standby for a passthru device
From: Sridhar Samudrala @ 2018-05-22 2:06 UTC (permalink / raw)
To: mst, stephen, davem, netdev, virtualization, virtio-dev,
jesse.brandeburg, alexander.h.duyck, kubakici, sridhar.samudrala,
jasowang, loseweigh, jiri, aaron.f.brown, anjali.singhai
The main motivation for this patch is to enable cloud service providers
to provide an accelerated datapath to virtio-net enabled VMs in a
transparent manner with no/minimal guest userspace changes. This also
enables hypervisor controlled live migration to be supported with VMs that
have direct attached SR-IOV VF devices.
Patch 1 introduces a failover module that provides a generic interface for
paravirtual drivers to listen for netdev register/unregister/link change
events from pci ethernet devices with the same MAC and takeover their
datapath. The notifier and event handling code is based on the existing
netvsc implementation.
Patch 2 refactors netvsc to use the registration/notification framework
introduced by failover module.
Patch 3 introduces a net_failover driver that provides an automated
failover mechanism to paravirtual drivers via APIs to create and destroy
a failover master netdev and mananges a primary and standby slave netdevs
that get registered via the generic failover infrastructure.
Patch 4 introduces a new feature bit VIRTIO_NET_F_STANDBY to virtio-net
that can be used by hypervisor to indicate that virtio_net interface
should act as a standby for another device with the same MAC address.
Patch 5 extends virtio_net to use alternate datapath when available and
registered. When STANDBY feature is enabled, virtio_net driver uese the
net_failover API to create an additional 'failover' netdev that acts as
a master device and controls 2 slave devices. The original virtio_net
netdev is registered as 'standby' netdev and a passthru/vf device with
the same MAC gets registered as 'primary' netdev. Both 'standby' and
'failover' netdevs are associated with the same 'pci' device. The user
accesses the network interface via 'failover' netdev. The 'failover'
netdev chooses 'primary' netdev as default for transmits when it is
available with link up and running.
As this patch series is initially focusing on usecases where hypervisor
fully controls the VM networking and the guest is not expected to directly
configure any hardware settings, it doesn't expose all the ndo/ethtool ops
that are supported by virtio_net at this time. To support additional usecases,
it should be possible to enable additional ops later by caching the state
in failover netdev and replaying when the 'primary' netdev gets registered.
At the time of live migration, the hypervisor needs to unplug the VF device
from the guest on the source host and reset the MAC filter of the VF to
initiate failover of datapath to virtio before starting the migration. After
the migration is completed, the destination hypervisor sets the MAC filter
on the VF and plugs it back to the guest to switch over to VF datapath.
This patch is based on the discussion initiated by Jesse on this thread.
https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
v11:
- Tested live migration with virtio-net/AVF(i40evf) configured in failover
mode while running iperf in background. Tried static ip and dhcp
configurations using 'network' scripts and Network Manager.
- Build tested netvsc module.
Updates:
- Split net_failover module into 2 components.
1. 'failover' module that provides generic failover infrastructure
to register a failover instance and listen for slave events.
2. 'net_failover' driver that provides APIs to create/destroy upper
netdev and supports 3-netdev model used by virtio-net.
- Added documentation
v10:
- Tested live migration with virtio-net/AVF(i40evf) configured in failover
mode while running iperf in background. Tried static ip and dhcp
configurations using 'network' scripts and Network Manager.
- To avoid dhcp requests to be sent over the lower 'standby' and 'primary'
netdevs, ifcfg-xxx scripts for the lower devices can be created.
- Build tested netvsc module.
Updates:
- fix net_failover_open() to update failover CARRIER correctly based on
standby and primary states.
- fix net_failover_handle_frame() to handle frames received on standby
when primary is present.
- replace netdev_upper_dev_link with netdev_master_upper_dev_link and
handle lower dev state changes.
- fix net_failver_create() and net_failover_register() interfaces to
use ERR_PTR and avoid arg **
- disable setting mac address when virtio-net in STANDBY mode
- document exported symbols
- added entry to MAINTAINERS file
v9:
Select NET_FAILOVER automatically when VIRTIO_NET/HYPERV_NET
are enabled. (stephen)
v8:
- Made the failover managment routines more robust by updating the feature
bits/other fields in the failover netdev when slave netdevs are
registered/unregistered. (mst)
- added support for handling vlans.
- Limited the changes in netvsc to only use the notifier/event/lookups
from the failover module. The slave register/unregister/link-change
handlers are only updated to use the getbymac routine to get the
upper netdev. There is no change in their functionality. (stephen)
- renamed structs/function/file names to use net_failover prefix. (mst)
v7
- Rename 'bypass/active/backup' terminology with 'failover/primary/standy'
(jiri, mst)
- re-arranged dev_open() and dev_set_mtu() calls in the register routines
so that they don't get called for 2-netdev model. (stephen)
- fixed select_queue() routine to do queue selection based on VF if it is
registered as primary. (stephen)
- minor bugfixes
v6 RFC:
Simplified virtio_net changes by moving all the ndo_ops of the
bypass_netdev and create/destroy of bypass_netdev to 'bypass' module.
avoided 2 phase registration(driver + instances).
introduced IFF_BYPASS/IFF_BYPASS_SLAVE dev->priv_flags
replaced mutex with a spinlock
v5 RFC:
Based on Jiri's comments, moved the common functionality to a 'bypass'
module so that the same notifier and event handlers to handle child
register/unregister/link change events can be shared between virtio_net
and netvsc.
Improved error handling based on Siwei's comments.
v4:
- Based on the review comments on the v3 version of the RFC patch and
Jakub's suggestion for the naming issue with 3 netdev solution,
proposed 3 netdev in-driver bonding solution for virtio-net.
v3 RFC:
- Introduced 3 netdev model and pointed out a couple of issues with
that model and proposed 2 netdev model to avoid these issues.
- Removed broadcast/multicast optimization and only use virtio as
backup path when VF is unplugged.
v2 RFC:
- Changed VIRTIO_NET_F_MASTER to VIRTIO_NET_F_BACKUP (mst)
- made a small change to the virtio-net xmit path to only use VF datapath
for unicasts. Broadcasts/multicasts use virtio datapath. This avoids
east-west broadcasts to go over the PCI link.
- added suppport for the feature bit in qemu
Sridhar Samudrala (5):
net: Introduce generic failover module
netvsc: refactor notifier/event handling code to use the failover
framework
net: Introduce net_failover driver
virtio_net: Introduce VIRTIO_NET_F_STANDBY feature bit
virtio_net: Extend virtio to use VF datapath when available
Documentation/networking/failover.rst | 18 +
Documentation/networking/net_failover.rst | 116 +++++
MAINTAINERS | 16 +
drivers/net/Kconfig | 13 +
drivers/net/Makefile | 1 +
drivers/net/hyperv/Kconfig | 1 +
drivers/net/hyperv/hyperv_net.h | 2 +
drivers/net/hyperv/netvsc_drv.c | 133 +----
drivers/net/net_failover.c | 836 ++++++++++++++++++++++++++++++
drivers/net/virtio_net.c | 40 +-
include/linux/netdevice.h | 16 +
include/net/failover.h | 31 ++
include/net/net_failover.h | 40 ++
include/uapi/linux/virtio_net.h | 3 +
net/Kconfig | 13 +
net/core/Makefile | 1 +
net/core/failover.c | 273 ++++++++++
17 files changed, 1440 insertions(+), 113 deletions(-)
create mode 100644 Documentation/networking/failover.rst
create mode 100644 Documentation/networking/net_failover.rst
create mode 100644 drivers/net/net_failover.c
create mode 100644 include/net/failover.h
create mode 100644 include/net/net_failover.h
create mode 100644 net/core/failover.c
--
2.14.3
^ permalink raw reply
* [PATCH net-next v11 1/5] net: Introduce generic failover module
From: Sridhar Samudrala @ 2018-05-22 2:06 UTC (permalink / raw)
To: mst, stephen, davem, netdev, virtualization, virtio-dev,
jesse.brandeburg, alexander.h.duyck, kubakici, sridhar.samudrala,
jasowang, loseweigh, jiri, aaron.f.brown, anjali.singhai
In-Reply-To: <1526954781-35359-1-git-send-email-sridhar.samudrala@intel.com>
The failover module provides a generic interface for paravirtual drivers
to register a netdev and a set of ops with a failover instance. The ops
are used as event handlers that get called to handle netdev register/
unregister/link change/name change events on slave pci ethernet devices
with the same mac address as the failover netdev.
This enables paravirtual drivers to use a VF as an accelerated low latency
datapath. It also allows migration of VMs with direct attached VFs by
failing over to the paravirtual datapath when the VF is unplugged.
Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
---
Documentation/networking/failover.rst | 18 +++
MAINTAINERS | 8 +
include/linux/netdevice.h | 16 ++
include/net/failover.h | 31 ++++
net/Kconfig | 13 ++
net/core/Makefile | 1 +
net/core/failover.c | 273 ++++++++++++++++++++++++++++++++++
7 files changed, 360 insertions(+)
create mode 100644 Documentation/networking/failover.rst
create mode 100644 include/net/failover.h
create mode 100644 net/core/failover.c
diff --git a/Documentation/networking/failover.rst b/Documentation/networking/failover.rst
new file mode 100644
index 000000000000..f0c8483cdbf5
--- /dev/null
+++ b/Documentation/networking/failover.rst
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========
+FAILOVER
+========
+
+Overview
+========
+
+The failover module provides a generic interface for paravirtual drivers
+to register a netdev and a set of ops with a failover instance. The ops
+are used as event handlers that get called to handle netdev register/
+unregister/link change/name change events on slave pci ethernet devices
+with the same mac address as the failover netdev.
+
+This enables paravirtual drivers to use a VF as an accelerated low latency
+datapath. It also allows live migration of VMs with direct attached VFs by
+failing over to the paravirtual datapath when the VF is unplugged.
diff --git a/MAINTAINERS b/MAINTAINERS
index 658880464b9d..aed9575ddc53 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5413,6 +5413,14 @@ S: Maintained
F: Documentation/hwmon/f71805f
F: drivers/hwmon/f71805f.c
+FAILOVER MODULE
+M: Sridhar Samudrala <sridhar.samudrala@intel.com>
+L: netdev@vger.kernel.org
+S: Supported
+F: net/core/failover.c
+F: include/net/failover.h
+F: Documentation/networking/failover.rst
+
FANOTIFY
M: Jan Kara <jack@suse.cz>
R: Amir Goldstein <amir73il@gmail.com>
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 03ed492c4e14..0f4ba52b641d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1421,6 +1421,8 @@ struct net_device_ops {
* entity (i.e. the master device for bridged veth)
* @IFF_MACSEC: device is a MACsec device
* @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook
+ * @IFF_FAILOVER: device is a failover master device
+ * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
*/
enum netdev_priv_flags {
IFF_802_1Q_VLAN = 1<<0,
@@ -1450,6 +1452,8 @@ enum netdev_priv_flags {
IFF_PHONY_HEADROOM = 1<<24,
IFF_MACSEC = 1<<25,
IFF_NO_RX_HANDLER = 1<<26,
+ IFF_FAILOVER = 1<<27,
+ IFF_FAILOVER_SLAVE = 1<<28,
};
#define IFF_802_1Q_VLAN IFF_802_1Q_VLAN
@@ -1478,6 +1482,8 @@ enum netdev_priv_flags {
#define IFF_RXFH_CONFIGURED IFF_RXFH_CONFIGURED
#define IFF_MACSEC IFF_MACSEC
#define IFF_NO_RX_HANDLER IFF_NO_RX_HANDLER
+#define IFF_FAILOVER IFF_FAILOVER
+#define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE
/**
* struct net_device - The DEVICE structure.
@@ -4321,6 +4327,16 @@ static inline bool netif_is_rxfh_configured(const struct net_device *dev)
return dev->priv_flags & IFF_RXFH_CONFIGURED;
}
+static inline bool netif_is_failover(const struct net_device *dev)
+{
+ return dev->priv_flags & IFF_FAILOVER;
+}
+
+static inline bool netif_is_failover_slave(const struct net_device *dev)
+{
+ return dev->priv_flags & IFF_FAILOVER_SLAVE;
+}
+
/* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */
static inline void netif_keep_dst(struct net_device *dev)
{
diff --git a/include/net/failover.h b/include/net/failover.h
new file mode 100644
index 000000000000..fea9d0b978c8
--- /dev/null
+++ b/include/net/failover.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018, Intel Corporation. */
+
+#ifndef _FAILOVER_H
+#define _FAILOVER_H
+
+#include <linux/netdevice.h>
+
+struct failover_ops {
+ int (*slave_register)(struct net_device *slave_dev,
+ struct net_device *failover_dev);
+ int (*slave_unregister)(struct net_device *slave_dev,
+ struct net_device *failover_dev);
+ int (*slave_link_change)(struct net_device *slave_dev,
+ struct net_device *failover_dev);
+ int (*slave_name_change)(struct net_device *slave_dev,
+ struct net_device *failover_dev);
+};
+
+struct failover {
+ struct list_head list;
+ struct net_device __rcu *failover_dev;
+ struct failover_ops __rcu *ops;
+};
+
+struct failover *failover_register(struct net_device *dev,
+ struct failover_ops *ops);
+void failover_unregister(struct failover *failover);
+int failover_slave_unregister(struct net_device *slave_dev);
+
+#endif /* _FAILOVER_H */
diff --git a/net/Kconfig b/net/Kconfig
index df8d45ef47d8..d581c4f0f1c4 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -430,6 +430,19 @@ config MAY_USE_DEVLINK
config PAGE_POOL
bool
+config FAILOVER
+ tristate "Generic failover module"
+ help
+ The failover module provides a generic interface for paravirtual
+ drivers to register a netdev and a set of ops with a failover
+ instance. The ops are used as event handlers that get called to
+ handle netdev register/unregister/link change/name change events
+ on slave pci ethernet devices with the same mac address as the
+ failover netdev. This enables paravirtual drivers to use a
+ VF as an accelerated low latency datapath. It also allows live
+ migration of VMs with direct attached VFs by failing over to the
+ paravirtual datapath when the VF is unplugged.
+
endif # if NET
# Used by archs to tell that they support BPF JIT compiler plus which flavour.
diff --git a/net/core/Makefile b/net/core/Makefile
index 7080417f8bc8..80175e6a2eb8 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -31,3 +31,4 @@ obj-$(CONFIG_DST_CACHE) += dst_cache.o
obj-$(CONFIG_HWBM) += hwbm.o
obj-$(CONFIG_NET_DEVLINK) += devlink.o
obj-$(CONFIG_GRO_CELLS) += gro_cells.o
+obj-$(CONFIG_FAILOVER) += failover.o
diff --git a/net/core/failover.c b/net/core/failover.c
new file mode 100644
index 000000000000..48ee713a1ef4
--- /dev/null
+++ b/net/core/failover.c
@@ -0,0 +1,273 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Intel Corporation. */
+
+/* A common module to handle registrations and notifications for paravirtual
+ * drivers to enable accelerated datapath and support VF live migration.
+ *
+ * The notifier and event handling code is based on netvsc driver.
+ */
+
+#include <linux/module.h>
+#include <linux/etherdevice.h>
+#include <uapi/linux/if_arp.h>
+#include <linux/rtnetlink.h>
+#include <net/failover.h>
+
+static LIST_HEAD(failover_list);
+static DEFINE_SPINLOCK(failover_lock);
+
+static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops)
+{
+ struct net_device *failover_dev;
+ struct failover *failover;
+
+ spin_lock(&failover_lock);
+ list_for_each_entry(failover, &failover_list, list) {
+ failover_dev = rtnl_dereference(failover->failover_dev);
+ if (ether_addr_equal(failover_dev->perm_addr, mac)) {
+ *ops = rtnl_dereference(failover->ops);
+ spin_unlock(&failover_lock);
+ return failover_dev;
+ }
+ }
+ spin_unlock(&failover_lock);
+ return NULL;
+}
+
+/**
+ * failover_slave_register - Register a slave netdev
+ *
+ * @slave_dev: slave netdev that is being registered
+ *
+ * Registers a slave device to a failover instance. Only ethernet devices
+ * are supported.
+ */
+static int failover_slave_register(struct net_device *slave_dev)
+{
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+
+ if (slave_dev->type != ARPHRD_ETHER)
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ if (fops && fops->slave_register)
+ return fops->slave_register(slave_dev, failover_dev);
+
+done:
+ return NOTIFY_DONE;
+}
+
+/**
+ * failover_slave_unregister - Unregister a slave netdev
+ *
+ * @slave_dev: slave netdev that is being unregistered
+ *
+ * Unregisters a slave device from a failover instance.
+ */
+int failover_slave_unregister(struct net_device *slave_dev)
+{
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+
+ if (!netif_is_failover_slave(slave_dev))
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ if (fops && fops->slave_unregister)
+ return fops->slave_unregister(slave_dev, failover_dev);
+
+done:
+ return NOTIFY_DONE;
+}
+EXPORT_SYMBOL_GPL(failover_slave_unregister);
+
+static int failover_slave_link_change(struct net_device *slave_dev)
+{
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+
+ if (!netif_is_failover_slave(slave_dev))
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ if (!netif_running(failover_dev))
+ goto done;
+
+ if (fops && fops->slave_link_change)
+ return fops->slave_link_change(slave_dev, failover_dev);
+
+done:
+ return NOTIFY_DONE;
+}
+
+static int failover_slave_name_change(struct net_device *slave_dev)
+{
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+
+ if (!netif_is_failover_slave(slave_dev))
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ if (!netif_running(failover_dev))
+ goto done;
+
+ if (fops && fops->slave_name_change)
+ return fops->slave_name_change(slave_dev, failover_dev);
+
+done:
+ return NOTIFY_DONE;
+}
+
+static int
+failover_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+
+ /* Skip parent events */
+ if (netif_is_failover(event_dev))
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ return failover_slave_register(event_dev);
+ case NETDEV_UNREGISTER:
+ return failover_slave_unregister(event_dev);
+ case NETDEV_UP:
+ case NETDEV_DOWN:
+ case NETDEV_CHANGE:
+ return failover_slave_link_change(event_dev);
+ case NETDEV_CHANGENAME:
+ return failover_slave_name_change(event_dev);
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
+static struct notifier_block failover_notifier = {
+ .notifier_call = failover_event,
+};
+
+static void
+failover_existing_slave_register(struct net_device *failover_dev)
+{
+ struct net *net = dev_net(failover_dev);
+ struct net_device *dev;
+
+ rtnl_lock();
+ for_each_netdev(net, dev) {
+ if (netif_is_failover(dev))
+ continue;
+ if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr))
+ failover_slave_register(dev);
+ }
+ rtnl_unlock();
+}
+
+/**
+ * failover_register - Register a failover instance
+ *
+ * @dev: failover netdev
+ * @ops: failover ops
+ *
+ * Allocate and register a failover instance for a failover netdev. ops
+ * provides handlers for slave device register/unregister/link change/
+ * name change events.
+ *
+ * Return: pointer to failover instance
+ */
+struct failover *failover_register(struct net_device *dev,
+ struct failover_ops *ops)
+{
+ struct failover *failover;
+
+ if (dev->type != ARPHRD_ETHER)
+ return ERR_PTR(-EINVAL);
+
+ failover = kzalloc(sizeof(*failover), GFP_KERNEL);
+ if (!failover)
+ return ERR_PTR(-ENOMEM);
+
+ rcu_assign_pointer(failover->ops, ops);
+ dev_hold(dev);
+ dev->priv_flags |= IFF_FAILOVER;
+ rcu_assign_pointer(failover->failover_dev, dev);
+
+ spin_lock(&failover_lock);
+ list_add_tail(&failover->list, &failover_list);
+ spin_unlock(&failover_lock);
+
+ netdev_info(dev, "failover master:%s registered\n", dev->name);
+
+ failover_existing_slave_register(dev);
+
+ return failover;
+}
+EXPORT_SYMBOL_GPL(failover_register);
+
+/**
+ * failover_unregister - Unregister a failover instance
+ *
+ * @failover: pointer to failover instance
+ *
+ * Unregisters and frees a failover instance.
+ */
+void failover_unregister(struct failover *failover)
+{
+ struct net_device *failover_dev;
+
+ failover_dev = rcu_dereference(failover->failover_dev);
+
+ netdev_info(failover_dev, "failover master:%s unregistered\n",
+ failover_dev->name);
+
+ failover_dev->priv_flags &= ~IFF_FAILOVER;
+ dev_put(failover_dev);
+
+ spin_lock(&failover_lock);
+ list_del(&failover->list);
+ spin_unlock(&failover_lock);
+
+ kfree(failover);
+}
+EXPORT_SYMBOL_GPL(failover_unregister);
+
+static __init int
+failover_init(void)
+{
+ register_netdevice_notifier(&failover_notifier);
+
+ return 0;
+}
+module_init(failover_init);
+
+static __exit
+void failover_exit(void)
+{
+ unregister_netdevice_notifier(&failover_notifier);
+}
+module_exit(failover_exit);
+
+MODULE_DESCRIPTION("Generic failover infrastructure/interface");
+MODULE_LICENSE("GPL v2");
--
2.14.3
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox