Netdev List

Netdev List
 help / color / mirror / Atom feed

* [RFC V3 PATCH 6/8] virtio: introduce packed ring defines
From: Jason Wang @ 2018-04-23  5:34 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, virtualization, netdev, linux-kernel, tiwei.bie, jfreimann,
	wexu
In-Reply-To: <1524461700-5469-1-git-send-email-jasowang@redhat.com>

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 include/uapi/linux/virtio_config.h |  9 +++++++++
 include/uapi/linux/virtio_ring.h   | 13 +++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
index 308e209..5903d51 100644
--- a/include/uapi/linux/virtio_config.h
+++ b/include/uapi/linux/virtio_config.h
@@ -71,4 +71,13 @@
  * this is for compatibility with legacy systems.
  */
 #define VIRTIO_F_IOMMU_PLATFORM		33
+
+#define VIRTIO_F_RING_PACKED		34
+
+/*
+ * This feature indicates that all buffers are used by the device in
+ * the same order in which they have been made available.
+ */
+#define VIRTIO_F_IN_ORDER		35
+
 #endif /* _UAPI_LINUX_VIRTIO_CONFIG_H */
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index 6d5d5fa..e297580 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -43,6 +43,8 @@
 #define VRING_DESC_F_WRITE	2
 /* This means the buffer contains a list of buffer descriptors. */
 #define VRING_DESC_F_INDIRECT	4
+#define VRING_DESC_F_AVAIL      7
+#define VRING_DESC_F_USED	15
 
 /* The Host uses this in used->flags to advise the Guest: don't kick me when
  * you add a buffer.  It's unreliable, so it's simply an optimization.  Guest
@@ -62,6 +64,17 @@
  * at the end of the used ring. Guest should ignore the used->flags field. */
 #define VIRTIO_RING_F_EVENT_IDX		29
 
+struct vring_desc_packed {
+	/* Buffer Address. */
+	__virtio64 addr;
+	/* Buffer Length. */
+	__virtio32 len;
+	/* Buffer ID. */
+	__virtio16 id;
+	/* The flags depending on descriptor type. */
+	__virtio16 flags;
+};
+
 /* Virtio ring descriptors: 16 bytes.  These can chain together via "next". */
 struct vring_desc {
 	/* Address (guest-physical). */
-- 
2.7.4

^ permalink raw reply related

* [RFC V3 PATCH 5/8] vhost: vhost_put_user() can accept metadata type
From: Jason Wang @ 2018-04-23  5:34 UTC (permalink / raw)
  To: mst, jasowang; +Cc: kvm, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <1524461700-5469-1-git-send-email-jasowang@redhat.com>

In the past we assumed that the used ring update was the only user of
vhost_put_user(). This may not be the case for the incoming packed ring,
which may update the descriptor ring for used buffers. So introduce a
new type parameter.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 5cc1cdb..82a7b73 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -846,7 +846,7 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
 	return __vhost_get_user_slow(vq, addr, size, type);
 }
 
-#define vhost_put_user(vq, x, ptr)		\
+#define vhost_put_user(vq, x, ptr, type)		\
 ({ \
 	int ret = -EFAULT; \
 	if (!vq->iotlb) { \
@@ -854,7 +854,7 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
 	} else { \
 		__typeof__(ptr) to = \
 			(__typeof__(ptr)) __vhost_get_user(vq, ptr,	\
-					  sizeof(*ptr), VHOST_ADDR_USED); \
+					  sizeof(*ptr), type); \
 		if (to != NULL) \
 			ret = __put_user(x, to); \
 		else \
@@ -1715,7 +1715,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq)
 {
 	void __user *used;
 	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
-			   &vq->used->flags) < 0)
+			   &vq->used->flags, VHOST_ADDR_USED) < 0)
 		return -EFAULT;
 	if (unlikely(vq->log_used)) {
 		/* Make sure the flag is seen before log. */
@@ -1734,7 +1734,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq)
 static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
 {
 	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
-			   vhost_avail_event(vq)))
+			   vhost_avail_event(vq), VHOST_ADDR_USED))
 		return -EFAULT;
 	if (unlikely(vq->log_used)) {
 		void __user *used;
@@ -2217,12 +2217,12 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
 	used = vq->used->ring + start;
 	for (i = 0; i < count; i++) {
 		if (unlikely(vhost_put_user(vq, heads[i].elem.id,
-					    &used[i].id))) {
+					    &used[i].id, VHOST_ADDR_USED))) {
 			vq_err(vq, "Failed to write used id");
 			return -EFAULT;
 		}
 		if (unlikely(vhost_put_user(vq, heads[i].elem.len,
-					    &used[i].len))) {
+					    &used[i].len, VHOST_ADDR_USED))) {
 			vq_err(vq, "Failed to write used len");
 			return -EFAULT;
 		}
@@ -2268,7 +2268,7 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vhost_used_elem *heads,
 	/* Make sure buffer is written before we update index. */
 	smp_wmb();
 	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
-			   &vq->used->idx)) {
+			   &vq->used->idx, VHOST_ADDR_USED)) {
 		vq_err(vq, "Failed to increment used idx");
 		return -EFAULT;
 	}
-- 
2.7.4

^ permalink raw reply related

* [RFC V3 PATCH 4/8] vhost_net: do not explicitly manipulate vhost_used_elem
From: Jason Wang @ 2018-04-23  5:34 UTC (permalink / raw)
  To: mst, jasowang; +Cc: kvm, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <1524461700-5469-1-git-send-email-jasowang@redhat.com>

Two helpers for setting/getting the used len are introduced to avoid
explicitly manipulating vhost_used_elem in the zerocopy code. This will
be used to hide the used_elem internals and simplify the packed ring
implementation.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   | 11 +++++------
 drivers/vhost/vhost.c | 12 ++++++++++--
 drivers/vhost/vhost.h |  5 +++++
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 3826f1f..30273ad 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -341,9 +341,10 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net,
 	int j = 0;
 
 	for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
-		if (vq->heads[i].elem.len == VHOST_DMA_FAILED_LEN)
+		if (vhost_get_used_len(vq, &vq->heads[i]) ==
+		    VHOST_DMA_FAILED_LEN)
 			vhost_net_tx_err(net);
-		if (VHOST_DMA_IS_DONE(vq->heads[i].elem.len)) {
+		if (VHOST_DMA_IS_DONE(vhost_get_used_len(vq, &vq->heads[i]))) {
 			vq->heads[i].elem.len = VHOST_DMA_CLEAR_LEN;
 			++j;
 		} else
@@ -542,10 +543,8 @@ static void handle_tx(struct vhost_net *net)
 			struct ubuf_info *ubuf;
 			ubuf = nvq->ubuf_info + nvq->upend_idx;
 
-			vq->heads[nvq->upend_idx].elem.id =
-				cpu_to_vhost32(vq, used.elem.id);
-			vq->heads[nvq->upend_idx].elem.len =
-				VHOST_DMA_IN_PROGRESS;
+			vhost_set_used_len(vq, &used, VHOST_DMA_IN_PROGRESS);
+			vq->heads[nvq->upend_idx] = used;
 			ubuf->callback = vhost_zerocopy_callback;
 			ubuf->ctx = nvq->ubufs;
 			ubuf->desc = nvq->upend_idx;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 24f3013..5cc1cdb 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2099,11 +2099,19 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 }
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
-static void vhost_set_used_len(struct vhost_virtqueue *vq,
-			       struct vhost_used_elem *used, int len)
+void vhost_set_used_len(struct vhost_virtqueue *vq,
+			struct vhost_used_elem *used, int len)
 {
 	used->elem.len = cpu_to_vhost32(vq, len);
 }
+EXPORT_SYMBOL_GPL(vhost_set_used_len);
+
+int vhost_get_used_len(struct vhost_virtqueue *vq,
+		       struct vhost_used_elem *used)
+{
+	return vhost32_to_cpu(vq, used->elem.len);
+}
+EXPORT_SYMBOL_GPL(vhost_get_used_len);
 
 /* This is a multi-buffer version of vhost_get_desc, that works if
  *	vq has read descriptors only.
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 8dea44b..604821b 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -198,6 +198,11 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
 		   unsigned *log_num,
 		   unsigned int quota,
 		   s16 *count);
+void vhost_set_used_len(struct vhost_virtqueue *vq,
+			struct vhost_used_elem *used,
+			int len);
+int vhost_get_used_len(struct vhost_virtqueue *vq,
+		       struct vhost_used_elem *used);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
 int vhost_vq_init_access(struct vhost_virtqueue *);
-- 
2.7.4

^ permalink raw reply related

* [RFC V3 PATCH 3/8] vhost: do not use vring_used_elem
From: Jason Wang @ 2018-04-23  5:34 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, virtualization, netdev, linux-kernel, tiwei.bie, jfreimann,
	wexu
In-Reply-To: <1524461700-5469-1-git-send-email-jasowang@redhat.com>

Instead of depending on the exported vring_used_elem, this patch
switches to a new internal structure, vhost_used_elem, which embeds
vring_used_elem. This can be used to let vhost record extra metadata
for the incoming packed ring layout.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   | 19 ++++++++++---------
 drivers/vhost/scsi.c  | 10 +++++-----
 drivers/vhost/vhost.c | 33 ++++++++++++++++-----------------
 drivers/vhost/vhost.h | 18 +++++++++++-------
 drivers/vhost/vsock.c |  6 +++---
 5 files changed, 45 insertions(+), 41 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 826489c..3826f1f 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -341,10 +341,10 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net,
 	int j = 0;
 
 	for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
-		if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
+		if (vq->heads[i].elem.len == VHOST_DMA_FAILED_LEN)
 			vhost_net_tx_err(net);
-		if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
-			vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
+		if (VHOST_DMA_IS_DONE(vq->heads[i].elem.len)) {
+			vq->heads[i].elem.len = VHOST_DMA_CLEAR_LEN;
 			++j;
 		} else
 			break;
@@ -367,7 +367,7 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
 	rcu_read_lock_bh();
 
 	/* set len to mark this desc buffers done DMA */
-	vq->heads[ubuf->desc].len = success ?
+	vq->heads[ubuf->desc].elem.len = success ?
 		VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
 	cnt = vhost_net_ubuf_put(ubufs);
 
@@ -426,7 +426,7 @@ static int vhost_net_enable_vq(struct vhost_net *n,
 
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 				    struct vhost_virtqueue *vq,
-				    struct vring_used_elem *used_elem,
+				    struct vhost_used_elem *used_elem,
 				    struct iovec iov[], unsigned int iov_size,
 				    unsigned int *out_num, unsigned int *in_num)
 {
@@ -477,7 +477,7 @@ static void handle_tx(struct vhost_net *net)
 	size_t hdr_size;
 	struct socket *sock;
 	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
-	struct vring_used_elem used;
+	struct vhost_used_elem used;
 	bool zcopy, zcopy_used;
 	int sent_pkts = 0;
 
@@ -542,9 +542,10 @@ static void handle_tx(struct vhost_net *net)
 			struct ubuf_info *ubuf;
 			ubuf = nvq->ubuf_info + nvq->upend_idx;
 
-			vq->heads[nvq->upend_idx].id =
-				cpu_to_vhost32(vq, used.id);
-			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
+			vq->heads[nvq->upend_idx].elem.id =
+				cpu_to_vhost32(vq, used.elem.id);
+			vq->heads[nvq->upend_idx].elem.len =
+				VHOST_DMA_IN_PROGRESS;
 			ubuf->callback = vhost_zerocopy_callback;
 			ubuf->ctx = nvq->ubufs;
 			ubuf->desc = nvq->upend_idx;
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 654c71f..ac11412 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -67,7 +67,7 @@ struct vhost_scsi_inflight {
 
 struct vhost_scsi_cmd {
 	/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
-	struct vring_used_elem tvc_vq_used;
+	struct vhost_used_elem tvc_vq_used;
 	/* virtio-scsi initiator task attribute */
 	int tvc_task_attr;
 	/* virtio-scsi response incoming iovecs */
@@ -441,7 +441,7 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
 	struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
 	struct virtio_scsi_event *event = &evt->event;
 	struct virtio_scsi_event __user *eventp;
-	struct vring_used_elem used;
+	struct vhost_used_elem used;
 	unsigned out, in;
 	int ret;
 
@@ -785,7 +785,7 @@ static void vhost_scsi_submission_work(struct work_struct *work)
 static void
 vhost_scsi_send_bad_target(struct vhost_scsi *vs,
 			   struct vhost_virtqueue *vq,
-			   struct vring_used_elem *used, unsigned out)
+			   struct vhost_used_elem *used, unsigned out)
 {
 	struct virtio_scsi_cmd_resp __user *resp;
 	struct virtio_scsi_cmd_resp rsp;
@@ -808,7 +808,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 	struct virtio_scsi_cmd_req v_req;
 	struct virtio_scsi_cmd_req_pi v_req_pi;
 	struct vhost_scsi_cmd *cmd;
-	struct vring_used_elem used;
+	struct vhost_used_elem used;
 	struct iov_iter out_iter, in_iter, prot_iter, data_iter;
 	u64 tag;
 	u32 exp_data_len, data_direction;
@@ -837,7 +837,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 					ARRAY_SIZE(vq->iov), &out, &in,
 					NULL, NULL);
 		pr_debug("vhost_get_vq_desc: head: %d, out: %u in: %u\n",
-			 used.id, out, in);
+			 used.elem.id, out, in);
 		/* Nothing new?  Wait for eventfd to tell us they refilled. */
 		if (ret == -ENOSPC) {
 			if (unlikely(vhost_enable_notify(&vs->dev, vq))) {
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index e069adc..24f3013 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1955,7 +1955,7 @@ static int get_indirect(struct vhost_virtqueue *vq,
  * never a valid descriptor number) if none was found.  A negative code is
  * returned on error. */
 int vhost_get_vq_desc(struct vhost_virtqueue *vq,
-		      struct vring_used_elem *used,
+		      struct vhost_used_elem *used,
 		      struct iovec iov[], unsigned int iov_size,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num)
@@ -2006,7 +2006,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 		return -EFAULT;
 	}
 
-	used->id = ring_head;
+	used->elem.id = ring_head;
 	head = vhost16_to_cpu(vq, ring_head);
 
 	/* If their number is silly, that's an error. */
@@ -2100,9 +2100,9 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
 static void vhost_set_used_len(struct vhost_virtqueue *vq,
-			       struct vring_used_elem *used, int len)
+			       struct vhost_used_elem *used, int len)
 {
-	used->len = cpu_to_vhost32(vq, len);
+	used->elem.len = cpu_to_vhost32(vq, len);
 }
 
 /* This is a multi-buffer version of vhost_get_desc, that works if
@@ -2116,7 +2116,7 @@ static void vhost_set_used_len(struct vhost_virtqueue *vq,
  *	returns number of buffer heads allocated, negative on error
  */
 int vhost_get_bufs(struct vhost_virtqueue *vq,
-		   struct vring_used_elem *heads,
+		   struct vhost_used_elem *heads,
 		   int datalen,
 		   unsigned *iovcount,
 		   struct vhost_log *log,
@@ -2189,7 +2189,7 @@ EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
 
 /* After we've used one of their buffers, we tell them about it.  We'll then
  * want to notify the guest, using eventfd. */
-int vhost_add_used(struct vhost_virtqueue *vq, struct vring_used_elem *used,
+int vhost_add_used(struct vhost_virtqueue *vq, struct vhost_used_elem *used,
 		   int len)
 {
 	vhost_set_used_len(vq, used, len);
@@ -2198,27 +2198,26 @@ int vhost_add_used(struct vhost_virtqueue *vq, struct vring_used_elem *used,
 EXPORT_SYMBOL_GPL(vhost_add_used);
 
 static int __vhost_add_used_n(struct vhost_virtqueue *vq,
-			    struct vring_used_elem *heads,
+			    struct vhost_used_elem *heads,
 			    unsigned count)
 {
 	struct vring_used_elem __user *used;
 	u16 old, new;
-	int start;
+	int start, i;
 
 	start = vq->last_used_idx & (vq->num - 1);
 	used = vq->used->ring + start;
-	if (count == 1) {
-		if (vhost_put_user(vq, heads[0].id, &used->id)) {
+	for (i = 0; i < count; i++) {
+		if (unlikely(vhost_put_user(vq, heads[i].elem.id,
+					    &used[i].id))) {
 			vq_err(vq, "Failed to write used id");
 			return -EFAULT;
 		}
-		if (vhost_put_user(vq, heads[0].len, &used->len)) {
+		if (unlikely(vhost_put_user(vq, heads[i].elem.len,
+					    &used[i].len))) {
 			vq_err(vq, "Failed to write used len");
 			return -EFAULT;
 		}
-	} else if (vhost_copy_to_user(vq, used, heads, count * sizeof *used)) {
-		vq_err(vq, "Failed to write used");
-		return -EFAULT;
 	}
 	if (unlikely(vq->log_used)) {
 		/* Make sure data is seen before log. */
@@ -2242,7 +2241,7 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
 
 /* After we've used one of their buffers, we tell them about it.  We'll then
  * want to notify the guest, using eventfd. */
-int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
+int vhost_add_used_n(struct vhost_virtqueue *vq, struct vhost_used_elem *heads,
 		     unsigned count)
 {
 	int start, n, r;
@@ -2326,7 +2325,7 @@ EXPORT_SYMBOL_GPL(vhost_signal);
 /* And here's the combo meal deal.  Supersize me! */
 void vhost_add_used_and_signal(struct vhost_dev *dev,
 			       struct vhost_virtqueue *vq,
-			       struct vring_used_elem *used, int len)
+			       struct vhost_used_elem *used, int len)
 {
 	vhost_add_used(vq, used, len);
 	vhost_signal(dev, vq);
@@ -2336,7 +2335,7 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
 /* multi-buffer version of vhost_add_used_and_signal */
 void vhost_add_used_and_signal_n(struct vhost_dev *dev,
 				 struct vhost_virtqueue *vq,
-				 struct vring_used_elem *heads, unsigned count)
+				 struct vhost_used_elem *heads, unsigned count)
 {
 	vhost_add_used_n(vq, heads, count);
 	vhost_signal(dev, vq);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index a7cc7e7..8dea44b 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -34,6 +34,10 @@ struct vhost_poll {
 	struct vhost_dev	 *dev;
 };
 
+struct vhost_used_elem {
+	struct vring_used_elem elem;
+};
+
 void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
 void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
 bool vhost_has_work(struct vhost_dev *dev);
@@ -126,7 +130,7 @@ struct vhost_virtqueue {
 	struct iovec iov[UIO_MAXIOV];
 	struct iovec iotlb_iov[64];
 	struct iovec *indirect;
-	struct vring_used_elem *heads;
+	struct vhost_used_elem *heads;
 	/* Protected by virtqueue mutex. */
 	struct vhost_umem *umem;
 	struct vhost_umem *iotlb;
@@ -182,12 +186,12 @@ bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
 bool vhost_log_access_ok(struct vhost_dev *);
 
 int vhost_get_vq_desc(struct vhost_virtqueue *,
-		      struct vring_used_elem *used_elem,
+		      struct vhost_used_elem *used_elem,
 		      struct iovec iov[], unsigned int iov_count,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num);
 int vhost_get_bufs(struct vhost_virtqueue *vq,
-		   struct vring_used_elem *heads,
+		   struct vhost_used_elem *heads,
 		   int datalen,
 		   unsigned *iovcount,
 		   struct vhost_log *log,
@@ -198,13 +202,13 @@ void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
 int vhost_vq_init_access(struct vhost_virtqueue *);
 int vhost_add_used(struct vhost_virtqueue *vq,
-		   struct vring_used_elem *elem, int len);
-int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
+		   struct vhost_used_elem *elem, int len);
+int vhost_add_used_n(struct vhost_virtqueue *vq, struct vhost_used_elem *heads,
 		     unsigned count);
 void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
-			       struct vring_used_elem *, int len);
+			       struct vhost_used_elem *, int len);
 void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
-			       struct vring_used_elem *heads, unsigned count);
+			       struct vhost_used_elem *heads, unsigned count);
 void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
 void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *);
 bool vhost_vq_avail_empty(struct vhost_dev *, struct vhost_virtqueue *);
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 59a01cd..695694f 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -98,7 +98,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 
 	for (;;) {
 		struct virtio_vsock_pkt *pkt;
-		struct vring_used_elem used;
+		struct vhost_used_elem used;
 		struct iov_iter iov_iter;
 		unsigned out, in;
 		size_t nbytes;
@@ -146,7 +146,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			break;
 		}
 
-		len = vhost32_to_cpu(vq, used.len);
+		len = vhost32_to_cpu(vq, used.elem.len);
 		iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len);
 
 		nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
@@ -346,7 +346,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
 	struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
 						 dev);
 	struct virtio_vsock_pkt *pkt;
-	struct vring_used_elem used;
+	struct vhost_used_elem used;
 	int ret;
 	unsigned int out, in;
 	bool added = false;
-- 
2.7.4

^ permalink raw reply related

* [RFC V3 PATCH 2/8] vhost: hide used ring layout from device
From: Jason Wang @ 2018-04-23  5:34 UTC (permalink / raw)
  To: mst, jasowang; +Cc: kvm, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <1524461700-5469-1-git-send-email-jasowang@redhat.com>

We used to return the descriptor head from vhost_get_vq_desc() to the
device and pass it back to vhost_add_used() and its friends. This exposes
the internal used ring layout to the device, which makes it hard to extend
for e.g. the packed ring layout.

So this patch tries to hide the used ring layout by

- letting vhost_get_vq_desc() return the head through a pointer to
  struct vring_used_elem
- accepting a pointer to struct vring_used_elem in vhost_add_used() and
  vhost_add_used_and_signal()

This could help to hide used ring layout and make it easier to
implement packed ring on top.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   | 46 +++++++++++++++++++++-----------------
 drivers/vhost/scsi.c  | 62 +++++++++++++++++++++++++++------------------------
 drivers/vhost/vhost.c | 52 +++++++++++++++++++++---------------------
 drivers/vhost/vhost.h |  9 +++++---
 drivers/vhost/vsock.c | 42 +++++++++++++++++-----------------
 5 files changed, 112 insertions(+), 99 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 762aa81..826489c 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -426,22 +426,24 @@ static int vhost_net_enable_vq(struct vhost_net *n,
 
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 				    struct vhost_virtqueue *vq,
+				    struct vring_used_elem *used_elem,
 				    struct iovec iov[], unsigned int iov_size,
 				    unsigned int *out_num, unsigned int *in_num)
 {
 	unsigned long uninitialized_var(endtime);
-	int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
+	int r = vhost_get_vq_desc(vq, used_elem, vq->iov, ARRAY_SIZE(vq->iov),
 				  out_num, in_num, NULL, NULL);
 
-	if (r == vq->num && vq->busyloop_timeout) {
+	if (r == -ENOSPC && vq->busyloop_timeout) {
 		preempt_disable();
 		endtime = busy_clock() + vq->busyloop_timeout;
 		while (vhost_can_busy_poll(vq->dev, endtime) &&
 		       vhost_vq_avail_empty(vq->dev, vq))
 			cpu_relax();
 		preempt_enable();
-		r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
-				      out_num, in_num, NULL, NULL);
+		r = vhost_get_vq_desc(vq, used_elem, vq->iov,
+				      ARRAY_SIZE(vq->iov), out_num, in_num,
+				      NULL, NULL);
 	}
 
 	return r;
@@ -463,7 +465,6 @@ static void handle_tx(struct vhost_net *net)
 	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
 	struct vhost_virtqueue *vq = &nvq->vq;
 	unsigned out, in;
-	int head;
 	struct msghdr msg = {
 		.msg_name = NULL,
 		.msg_namelen = 0,
@@ -476,6 +477,7 @@ static void handle_tx(struct vhost_net *net)
 	size_t hdr_size;
 	struct socket *sock;
 	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
+	struct vring_used_elem used;
 	bool zcopy, zcopy_used;
 	int sent_pkts = 0;
 
@@ -499,20 +501,20 @@ static void handle_tx(struct vhost_net *net)
 			vhost_zerocopy_signal_used(net, vq);
 
 
-		head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
-						ARRAY_SIZE(vq->iov),
-						&out, &in);
-		/* On error, stop handling until the next kick. */
-		if (unlikely(head < 0))
-			break;
+		err = vhost_net_tx_get_vq_desc(net, vq, &used, vq->iov,
+					       ARRAY_SIZE(vq->iov),
+					       &out, &in);
 		/* Nothing new?  Wait for eventfd to tell us they refilled. */
-		if (head == vq->num) {
+		if (err == -ENOSPC) {
 			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
 				vhost_disable_notify(&net->dev, vq);
 				continue;
 			}
 			break;
 		}
+		/* On error, stop handling until the next kick. */
+		if (unlikely(err < 0))
+			break;
 		if (in) {
 			vq_err(vq, "Unexpected descriptor format for TX: "
 			       "out %d, int %d\n", out, in);
@@ -540,7 +542,8 @@ static void handle_tx(struct vhost_net *net)
 			struct ubuf_info *ubuf;
 			ubuf = nvq->ubuf_info + nvq->upend_idx;
 
-			vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
+			vq->heads[nvq->upend_idx].id =
+				cpu_to_vhost32(vq, used.id);
 			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
 			ubuf->callback = vhost_zerocopy_callback;
 			ubuf->ctx = nvq->ubufs;
@@ -581,7 +584,7 @@ static void handle_tx(struct vhost_net *net)
 			pr_debug("Truncated TX packet: "
 				 " len %d != %zd\n", err, len);
 		if (!zcopy_used)
-			vhost_add_used_and_signal(&net->dev, vq, head, 0);
+			vhost_add_used_and_signal(&net->dev, vq, &used, 0);
 		else
 			vhost_zerocopy_signal_used(net, vq);
 		vhost_net_tx_packet(net);
@@ -713,14 +716,12 @@ static void handle_rx(struct vhost_net *net)
 	while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
 		sock_len += sock_hlen;
 		vhost_len = sock_len + vhost_hlen;
-		headcount = vhost_get_bufs(vq, vq->heads + nheads, vhost_len,
-					   &in, vq_log, &log,
-					   likely(mergeable) ? UIO_MAXIOV : 1);
-		/* On error, stop handling until the next kick. */
-		if (unlikely(headcount < 0))
-			goto out;
+		err = vhost_get_bufs(vq, vq->heads + nheads, vhost_len,
+				     &in, vq_log, &log,
+				     likely(mergeable) ? UIO_MAXIOV : 1,
+				     &headcount);
 		/* OK, now we need to know about added descriptors. */
-		if (!headcount) {
+		if (err == -ENOSPC) {
 			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
 				/* They have slipped one in as we were
 				 * doing that: check again. */
@@ -731,6 +732,9 @@ static void handle_rx(struct vhost_net *net)
 			 * they refilled. */
 			goto out;
 		}
+		/* On error, stop handling until the next kick. */
+		if (unlikely(err < 0))
+			goto out;
 		if (nvq->rx_ring)
 			msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
 		/* On overrun, truncate and discard */
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 7ad5709..654c71f 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -67,7 +67,7 @@ struct vhost_scsi_inflight {
 
 struct vhost_scsi_cmd {
 	/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
-	int tvc_vq_desc;
+	struct vring_used_elem tvc_vq_used;
 	/* virtio-scsi initiator task attribute */
 	int tvc_task_attr;
 	/* virtio-scsi response incoming iovecs */
@@ -441,8 +441,9 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
 	struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
 	struct virtio_scsi_event *event = &evt->event;
 	struct virtio_scsi_event __user *eventp;
+	struct vring_used_elem used;
 	unsigned out, in;
-	int head, ret;
+	int ret;
 
 	if (!vq->private_data) {
 		vs->vs_events_missed = true;
@@ -451,16 +452,16 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
 
 again:
 	vhost_disable_notify(&vs->dev, vq);
-	head = vhost_get_vq_desc(vq, vq->iov,
+	ret = vhost_get_vq_desc(vq, &used, vq->iov,
 			ARRAY_SIZE(vq->iov), &out, &in,
 			NULL, NULL);
-	if (head < 0) {
+	if (ret == -ENOSPC) {
+		if (vhost_enable_notify(&vs->dev, vq))
+			goto again;
 		vs->vs_events_missed = true;
 		return;
 	}
-	if (head == vq->num) {
-		if (vhost_enable_notify(&vs->dev, vq))
-			goto again;
+	if (ret < 0) {
 		vs->vs_events_missed = true;
 		return;
 	}
@@ -480,7 +481,7 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
 	eventp = vq->iov[out].iov_base;
 	ret = __copy_to_user(eventp, event, sizeof(*event));
 	if (!ret)
-		vhost_add_used_and_signal(&vs->dev, vq, head, 0);
+		vhost_add_used_and_signal(&vs->dev, vq, &used, 0);
 	else
 		vq_err(vq, "Faulted on vhost_scsi_send_event\n");
 }
@@ -541,7 +542,7 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
 		ret = copy_to_iter(&v_rsp, sizeof(v_rsp), &iov_iter);
 		if (likely(ret == sizeof(v_rsp))) {
 			struct vhost_scsi_virtqueue *q;
-			vhost_add_used(cmd->tvc_vq, cmd->tvc_vq_desc, 0);
+			vhost_add_used(cmd->tvc_vq, &cmd->tvc_vq_used, 0);
 			q = container_of(cmd->tvc_vq, struct vhost_scsi_virtqueue, vq);
 			vq = q - vs->vqs;
 			__set_bit(vq, signal);
@@ -784,7 +785,7 @@ static void vhost_scsi_submission_work(struct work_struct *work)
 static void
 vhost_scsi_send_bad_target(struct vhost_scsi *vs,
 			   struct vhost_virtqueue *vq,
-			   int head, unsigned out)
+			   struct vring_used_elem *used, unsigned out)
 {
 	struct virtio_scsi_cmd_resp __user *resp;
 	struct virtio_scsi_cmd_resp rsp;
@@ -795,7 +796,7 @@ vhost_scsi_send_bad_target(struct vhost_scsi *vs,
 	resp = vq->iov[out].iov_base;
 	ret = __copy_to_user(resp, &rsp, sizeof(rsp));
 	if (!ret)
-		vhost_add_used_and_signal(&vs->dev, vq, head, 0);
+		vhost_add_used_and_signal(&vs->dev, vq, used, 0);
 	else
 		pr_err("Faulted on virtio_scsi_cmd_resp\n");
 }
@@ -807,11 +808,12 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 	struct virtio_scsi_cmd_req v_req;
 	struct virtio_scsi_cmd_req_pi v_req_pi;
 	struct vhost_scsi_cmd *cmd;
+	struct vring_used_elem used;
 	struct iov_iter out_iter, in_iter, prot_iter, data_iter;
 	u64 tag;
 	u32 exp_data_len, data_direction;
 	unsigned int out = 0, in = 0;
-	int head, ret, prot_bytes;
+	int ret, prot_bytes;
 	size_t req_size, rsp_size = sizeof(struct virtio_scsi_cmd_resp);
 	size_t out_size, in_size;
 	u16 lun;
@@ -831,22 +833,22 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 	vhost_disable_notify(&vs->dev, vq);
 
 	for (;;) {
-		head = vhost_get_vq_desc(vq, vq->iov,
-					 ARRAY_SIZE(vq->iov), &out, &in,
-					 NULL, NULL);
+		ret = vhost_get_vq_desc(vq, &used, vq->iov,
+					ARRAY_SIZE(vq->iov), &out, &in,
+					NULL, NULL);
 		pr_debug("vhost_get_vq_desc: head: %d, out: %u in: %u\n",
-			 head, out, in);
-		/* On error, stop handling until the next kick. */
-		if (unlikely(head < 0))
-			break;
+			 used.id, out, in);
 		/* Nothing new?  Wait for eventfd to tell us they refilled. */
-		if (head == vq->num) {
+		if (ret == -ENOSPC) {
 			if (unlikely(vhost_enable_notify(&vs->dev, vq))) {
 				vhost_disable_notify(&vs->dev, vq);
 				continue;
 			}
 			break;
 		}
+		/* On error, stop handling until the next kick. */
+		if (unlikely(ret < 0))
+			break;
 		/*
 		 * Check for a sane response buffer so we can report early
 		 * errors back to the guest.
@@ -891,20 +893,20 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 
 		if (unlikely(!copy_from_iter_full(req, req_size, &out_iter))) {
 			vq_err(vq, "Faulted on copy_from_iter\n");
-			vhost_scsi_send_bad_target(vs, vq, head, out);
+			vhost_scsi_send_bad_target(vs, vq, &used, out);
 			continue;
 		}
 		/* virtio-scsi spec requires byte 0 of the lun to be 1 */
 		if (unlikely(*lunp != 1)) {
 			vq_err(vq, "Illegal virtio-scsi lun: %u\n", *lunp);
-			vhost_scsi_send_bad_target(vs, vq, head, out);
+			vhost_scsi_send_bad_target(vs, vq, &used, out);
 			continue;
 		}
 
 		tpg = READ_ONCE(vs_tpg[*target]);
 		if (unlikely(!tpg)) {
 			/* Target does not exist, fail the request */
-			vhost_scsi_send_bad_target(vs, vq, head, out);
+			vhost_scsi_send_bad_target(vs, vq, &used, out);
 			continue;
 		}
 		/*
@@ -950,7 +952,8 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 				if (data_direction != DMA_TO_DEVICE) {
 					vq_err(vq, "Received non zero pi_bytesout,"
 						" but wrong data_direction\n");
-					vhost_scsi_send_bad_target(vs, vq, head, out);
+					vhost_scsi_send_bad_target(vs, vq,
+								   &used, out);
 					continue;
 				}
 				prot_bytes = vhost32_to_cpu(vq, v_req_pi.pi_bytesout);
@@ -958,7 +961,8 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 				if (data_direction != DMA_FROM_DEVICE) {
 					vq_err(vq, "Received non zero pi_bytesin,"
 						" but wrong data_direction\n");
-					vhost_scsi_send_bad_target(vs, vq, head, out);
+					vhost_scsi_send_bad_target(vs, vq,
+								   &used, out);
 					continue;
 				}
 				prot_bytes = vhost32_to_cpu(vq, v_req_pi.pi_bytesin);
@@ -996,7 +1000,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 			vq_err(vq, "Received SCSI CDB with command_size: %d that"
 				" exceeds SCSI_MAX_VARLEN_CDB_SIZE: %d\n",
 				scsi_command_size(cdb), VHOST_SCSI_MAX_CDB_SIZE);
-			vhost_scsi_send_bad_target(vs, vq, head, out);
+			vhost_scsi_send_bad_target(vs, vq, &used, out);
 			continue;
 		}
 		cmd = vhost_scsi_get_tag(vq, tpg, cdb, tag, lun, task_attr,
@@ -1005,7 +1009,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 		if (IS_ERR(cmd)) {
 			vq_err(vq, "vhost_scsi_get_tag failed %ld\n",
 			       PTR_ERR(cmd));
-			vhost_scsi_send_bad_target(vs, vq, head, out);
+			vhost_scsi_send_bad_target(vs, vq, &used, out);
 			continue;
 		}
 		cmd->tvc_vhost = vs;
@@ -1025,7 +1029,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 			if (unlikely(ret)) {
 				vq_err(vq, "Failed to map iov to sgl\n");
 				vhost_scsi_release_cmd(&cmd->tvc_se_cmd);
-				vhost_scsi_send_bad_target(vs, vq, head, out);
+				vhost_scsi_send_bad_target(vs, vq, &used, out);
 				continue;
 			}
 		}
@@ -1034,7 +1038,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 		 * complete the virtio-scsi request in TCM callback context via
 		 * vhost_scsi_queue_data_in() and vhost_scsi_queue_status()
 		 */
-		cmd->tvc_vq_desc = head;
+		cmd->tvc_vq_used = used;
 		/*
 		 * Dispatch cmd descriptor for cmwq execution in process
 		 * context provided by vhost_scsi_workqueue.  This also ensures
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 6b455f6..e069adc 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1955,6 +1955,7 @@ static int get_indirect(struct vhost_virtqueue *vq,
  * never a valid descriptor number) if none was found.  A negative code is
  * returned on error. */
 int vhost_get_vq_desc(struct vhost_virtqueue *vq,
+		      struct vring_used_elem *used,
 		      struct iovec iov[], unsigned int iov_size,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num)
@@ -1987,7 +1988,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 		 * invalid.
 		 */
 		if (vq->avail_idx == last_avail_idx)
-			return vq->num;
+			return -ENOSPC;
 
 		/* Only get avail ring entries after they have been
 		 * exposed by guest.
@@ -2005,6 +2006,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 		return -EFAULT;
 	}
 
+	used->id = ring_head;
 	head = vhost16_to_cpu(vq, ring_head);
 
 	/* If their number is silly, that's an error. */
@@ -2093,10 +2095,16 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 	/* Assume notifications from guest are disabled at this point,
 	 * if they aren't we would need to update avail_event index. */
 	BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
-	return head;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
+static void vhost_set_used_len(struct vhost_virtqueue *vq,
+			       struct vring_used_elem *used, int len)
+{
+	used->len = cpu_to_vhost32(vq, len);
+}
+
 /* This is a multi-buffer version of vhost_get_desc, that works if
  *	vq has read descriptors only.
  * @vq		- the relevant virtqueue
@@ -2113,13 +2121,13 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
 		   unsigned *iovcount,
 		   struct vhost_log *log,
 		   unsigned *log_num,
-		   unsigned int quota)
+		   unsigned int quota,
+		   s16 *count)
 {
 	unsigned int out, in;
 	int seg = 0;
 	int headcount = 0;
-	unsigned d;
-	int r, nlogs = 0;
+	int r = 0, nlogs = 0;
 	/* len is always initialized before use since we are always called with
 	 * datalen > 0.
 	 */
@@ -2130,17 +2138,12 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
 			r = -ENOBUFS;
 			goto err;
 		}
-		r = vhost_get_vq_desc(vq, vq->iov + seg,
+		r = vhost_get_vq_desc(vq, &heads[headcount], vq->iov + seg,
 				      ARRAY_SIZE(vq->iov) - seg, &out,
 				      &in, log, log_num);
 		if (unlikely(r < 0))
 			goto err;
 
-		d = r;
-		if (d == vq->num) {
-			r = 0;
-			goto err;
-		}
 		if (unlikely(out || in <= 0)) {
 			vq_err(vq, "unexpected descriptor format for RX: "
 				"out %d, in %d\n", out, in);
@@ -2151,24 +2154,26 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
 			nlogs += *log_num;
 			log += *log_num;
 		}
-		heads[headcount].id = cpu_to_vhost32(vq, d);
+
 		len = iov_length(vq->iov + seg, in);
-		heads[headcount].len = cpu_to_vhost32(vq, len);
+		vhost_set_used_len(vq, &heads[headcount], len);
 		datalen -= len;
 		++headcount;
 		seg += in;
 	}
-	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
+	vhost_set_used_len(vq, &heads[headcount - 1], len + datalen);
 	*iovcount = seg;
 	if (unlikely(log))
 		*log_num = nlogs;
 
 	/* Detect overrun */
 	if (unlikely(datalen > 0)) {
-		r = UIO_MAXIOV + 1;
+		headcount = UIO_MAXIOV + 1;
 		goto err;
 	}
-	return headcount;
+
+	*count = headcount;
+	return 0;
 err:
 	vhost_discard_vq_desc(vq, headcount);
 	return r;
@@ -2184,14 +2189,11 @@ EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
 
 /* After we've used one of their buffers, we tell them about it.  We'll then
  * want to notify the guest, using eventfd. */
-int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
+int vhost_add_used(struct vhost_virtqueue *vq, struct vring_used_elem *used,
+		   int len)
 {
-	struct vring_used_elem heads = {
-		cpu_to_vhost32(vq, head),
-		cpu_to_vhost32(vq, len)
-	};
-
-	return vhost_add_used_n(vq, &heads, 1);
+	vhost_set_used_len(vq, used, len);
+	return vhost_add_used_n(vq, used, 1);
 }
 EXPORT_SYMBOL_GPL(vhost_add_used);
 
@@ -2324,9 +2326,9 @@ EXPORT_SYMBOL_GPL(vhost_signal);
 /* And here's the combo meal deal.  Supersize me! */
 void vhost_add_used_and_signal(struct vhost_dev *dev,
 			       struct vhost_virtqueue *vq,
-			       unsigned int head, int len)
+			       struct vring_used_elem *used, int len)
 {
-	vhost_add_used(vq, head, len);
+	vhost_add_used(vq, used, len);
 	vhost_signal(dev, vq);
 }
 EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 52edd242..a7cc7e7 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -182,6 +182,7 @@ bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
 bool vhost_log_access_ok(struct vhost_dev *);
 
 int vhost_get_vq_desc(struct vhost_virtqueue *,
+		      struct vring_used_elem *used_elem,
 		      struct iovec iov[], unsigned int iov_count,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num);
@@ -191,15 +192,17 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
 		   unsigned *iovcount,
 		   struct vhost_log *log,
 		   unsigned *log_num,
-		   unsigned int quota);
+		   unsigned int quota,
+		   s16 *count);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
 int vhost_vq_init_access(struct vhost_virtqueue *);
-int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
+int vhost_add_used(struct vhost_virtqueue *vq,
+		   struct vring_used_elem *elem, int len);
 int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
 		     unsigned count);
 void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
-			       unsigned int id, int len);
+			       struct vring_used_elem *, int len);
 void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
 			       struct vring_used_elem *heads, unsigned count);
 void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 34bc3ab..59a01cd 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -98,11 +98,12 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 
 	for (;;) {
 		struct virtio_vsock_pkt *pkt;
+		struct vring_used_elem used;
 		struct iov_iter iov_iter;
 		unsigned out, in;
 		size_t nbytes;
 		size_t len;
-		int head;
+		int ret;
 
 		spin_lock_bh(&vsock->send_pkt_list_lock);
 		if (list_empty(&vsock->send_pkt_list)) {
@@ -116,16 +117,9 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 		list_del_init(&pkt->list);
 		spin_unlock_bh(&vsock->send_pkt_list_lock);
 
-		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
-					 &out, &in, NULL, NULL);
-		if (head < 0) {
-			spin_lock_bh(&vsock->send_pkt_list_lock);
-			list_add(&pkt->list, &vsock->send_pkt_list);
-			spin_unlock_bh(&vsock->send_pkt_list_lock);
-			break;
-		}
-
-		if (head == vq->num) {
+		ret = vhost_get_vq_desc(vq, &used, vq->iov, ARRAY_SIZE(vq->iov),
+					&out, &in, NULL, NULL);
+		if (ret == -ENOSPC) {
 			spin_lock_bh(&vsock->send_pkt_list_lock);
 			list_add(&pkt->list, &vsock->send_pkt_list);
 			spin_unlock_bh(&vsock->send_pkt_list_lock);
@@ -139,6 +133,12 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			}
 			break;
 		}
+		if (ret < 0) {
+			spin_lock_bh(&vsock->send_pkt_list_lock);
+			list_add(&pkt->list, &vsock->send_pkt_list);
+			spin_unlock_bh(&vsock->send_pkt_list_lock);
+			break;
+		}
 
 		if (out) {
 			virtio_transport_free_pkt(pkt);
@@ -146,7 +146,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			break;
 		}
 
-		len = iov_length(&vq->iov[out], in);
+		len = vhost32_to_cpu(vq, used.len);
 		iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len);
 
 		nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
@@ -163,7 +163,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			break;
 		}
 
-		vhost_add_used(vq, head, sizeof(pkt->hdr) + pkt->len);
+		vhost_add_used(vq, &used, sizeof(pkt->hdr) + pkt->len);
 		added = true;
 
 		if (pkt->reply) {
@@ -346,7 +346,8 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
 	struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
 						 dev);
 	struct virtio_vsock_pkt *pkt;
-	int head;
+	struct vring_used_elem used;
+	int ret;
 	unsigned int out, in;
 	bool added = false;
 
@@ -367,18 +368,17 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
 			goto no_more_replies;
 		}
 
-		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
-					 &out, &in, NULL, NULL);
-		if (head < 0)
-			break;
-
-		if (head == vq->num) {
+		ret = vhost_get_vq_desc(vq, &used, vq->iov, ARRAY_SIZE(vq->iov),
+					&out, &in, NULL, NULL);
+		if (ret == -ENOSPC) {
 			if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
 				vhost_disable_notify(&vsock->dev, vq);
 				continue;
 			}
 			break;
 		}
+		if (ret < 0)
+			break;
 
 		pkt = vhost_vsock_alloc_pkt(vq, out, in);
 		if (!pkt) {
@@ -397,7 +397,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
 		else
 			virtio_transport_free_pkt(pkt);
 
-		vhost_add_used(vq, head, sizeof(pkt->hdr) + len);
+		vhost_add_used(vq, &used, sizeof(pkt->hdr) + len);
 		added = true;
 	}
 
-- 
2.7.4

^ permalink raw reply related

* [RFC V3 PATCH 1/8] vhost: move get_rx_bufs to vhost.c
From: Jason Wang @ 2018-04-23  5:34 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, virtualization, netdev, linux-kernel, tiwei.bie, jfreimann,
	wexu
In-Reply-To: <1524461700-5469-1-git-send-email-jasowang@redhat.com>

Move get_rx_bufs() to vhost.c and rename it to
vhost_get_bufs(). This helps to hide the vring internal layout from
specific device implementations. The packed ring implementation will
benefit from this.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   | 83 ++-------------------------------------------------
 drivers/vhost/vhost.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/vhost/vhost.h |  7 +++++
 3 files changed, 88 insertions(+), 80 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 986058a..762aa81 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -664,83 +664,6 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
 	return len;
 }
 
-/* This is a multi-buffer version of vhost_get_desc, that works if
- *	vq has read descriptors only.
- * @vq		- the relevant virtqueue
- * @datalen	- data length we'll be reading
- * @iovcount	- returned count of io vectors we fill
- * @log		- vhost log
- * @log_num	- log offset
- * @quota       - headcount quota, 1 for big buffer
- *	returns number of buffer heads allocated, negative on error
- */
-static int get_rx_bufs(struct vhost_virtqueue *vq,
-		       struct vring_used_elem *heads,
-		       int datalen,
-		       unsigned *iovcount,
-		       struct vhost_log *log,
-		       unsigned *log_num,
-		       unsigned int quota)
-{
-	unsigned int out, in;
-	int seg = 0;
-	int headcount = 0;
-	unsigned d;
-	int r, nlogs = 0;
-	/* len is always initialized before use since we are always called with
-	 * datalen > 0.
-	 */
-	u32 uninitialized_var(len);
-
-	while (datalen > 0 && headcount < quota) {
-		if (unlikely(seg >= UIO_MAXIOV)) {
-			r = -ENOBUFS;
-			goto err;
-		}
-		r = vhost_get_vq_desc(vq, vq->iov + seg,
-				      ARRAY_SIZE(vq->iov) - seg, &out,
-				      &in, log, log_num);
-		if (unlikely(r < 0))
-			goto err;
-
-		d = r;
-		if (d == vq->num) {
-			r = 0;
-			goto err;
-		}
-		if (unlikely(out || in <= 0)) {
-			vq_err(vq, "unexpected descriptor format for RX: "
-				"out %d, in %d\n", out, in);
-			r = -EINVAL;
-			goto err;
-		}
-		if (unlikely(log)) {
-			nlogs += *log_num;
-			log += *log_num;
-		}
-		heads[headcount].id = cpu_to_vhost32(vq, d);
-		len = iov_length(vq->iov + seg, in);
-		heads[headcount].len = cpu_to_vhost32(vq, len);
-		datalen -= len;
-		++headcount;
-		seg += in;
-	}
-	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
-	*iovcount = seg;
-	if (unlikely(log))
-		*log_num = nlogs;
-
-	/* Detect overrun */
-	if (unlikely(datalen > 0)) {
-		r = UIO_MAXIOV + 1;
-		goto err;
-	}
-	return headcount;
-err:
-	vhost_discard_vq_desc(vq, headcount);
-	return r;
-}
-
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_rx(struct vhost_net *net)
@@ -790,9 +713,9 @@ static void handle_rx(struct vhost_net *net)
 	while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
 		sock_len += sock_hlen;
 		vhost_len = sock_len + vhost_hlen;
-		headcount = get_rx_bufs(vq, vq->heads + nheads, vhost_len,
-					&in, vq_log, &log,
-					likely(mergeable) ? UIO_MAXIOV : 1);
+		headcount = vhost_get_bufs(vq, vq->heads + nheads, vhost_len,
+					   &in, vq_log, &log,
+					   likely(mergeable) ? UIO_MAXIOV : 1);
 		/* On error, stop handling until the next kick. */
 		if (unlikely(headcount < 0))
 			goto out;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index f3bd8e9..6b455f6 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2097,6 +2097,84 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 }
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
+/* This is a multi-buffer version of vhost_get_desc, that works if
+ *	vq has read descriptors only.
+ * @vq		- the relevant virtqueue
+ * @datalen	- data length we'll be reading
+ * @iovcount	- returned count of io vectors we fill
+ * @log		- vhost log
+ * @log_num	- log offset
+ * @quota       - headcount quota, 1 for big buffer
+ *	returns number of buffer heads allocated, negative on error
+ */
+int vhost_get_bufs(struct vhost_virtqueue *vq,
+		   struct vring_used_elem *heads,
+		   int datalen,
+		   unsigned *iovcount,
+		   struct vhost_log *log,
+		   unsigned *log_num,
+		   unsigned int quota)
+{
+	unsigned int out, in;
+	int seg = 0;
+	int headcount = 0;
+	unsigned d;
+	int r, nlogs = 0;
+	/* len is always initialized before use since we are always called with
+	 * datalen > 0.
+	 */
+	u32 uninitialized_var(len);
+
+	while (datalen > 0 && headcount < quota) {
+		if (unlikely(seg >= UIO_MAXIOV)) {
+			r = -ENOBUFS;
+			goto err;
+		}
+		r = vhost_get_vq_desc(vq, vq->iov + seg,
+				      ARRAY_SIZE(vq->iov) - seg, &out,
+				      &in, log, log_num);
+		if (unlikely(r < 0))
+			goto err;
+
+		d = r;
+		if (d == vq->num) {
+			r = 0;
+			goto err;
+		}
+		if (unlikely(out || in <= 0)) {
+			vq_err(vq, "unexpected descriptor format for RX: "
+				"out %d, in %d\n", out, in);
+			r = -EINVAL;
+			goto err;
+		}
+		if (unlikely(log)) {
+			nlogs += *log_num;
+			log += *log_num;
+		}
+		heads[headcount].id = cpu_to_vhost32(vq, d);
+		len = iov_length(vq->iov + seg, in);
+		heads[headcount].len = cpu_to_vhost32(vq, len);
+		datalen -= len;
+		++headcount;
+		seg += in;
+	}
+	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
+	*iovcount = seg;
+	if (unlikely(log))
+		*log_num = nlogs;
+
+	/* Detect overrun */
+	if (unlikely(datalen > 0)) {
+		r = UIO_MAXIOV + 1;
+		goto err;
+	}
+	return headcount;
+err:
+	vhost_discard_vq_desc(vq, headcount);
+	return r;
+}
+EXPORT_SYMBOL_GPL(vhost_get_bufs);
+
 /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
 void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
 {
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 6c844b9..52edd242 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -185,6 +185,13 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
 		      struct iovec iov[], unsigned int iov_count,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num);
+int vhost_get_bufs(struct vhost_virtqueue *vq,
+		   struct vring_used_elem *heads,
+		   int datalen,
+		   unsigned *iovcount,
+		   struct vhost_log *log,
+		   unsigned *log_num,
+		   unsigned int quota);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
 int vhost_vq_init_access(struct vhost_virtqueue *);
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next] lan78xx: Lan7801 Support for Fixed PHY
From: Raghuram Chary J @ 2018-04-23  4:46 UTC (permalink / raw)
  To: davem; +Cc: netdev, unglinuxdriver, woojung.huh, raghuramchary.jallipalli

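Add fixed PHY support to the lan78xx driver: when no PHY is found on a
LAN7801, register a fixed PHY (1000 Mbps, full duplex) instead of
failing with "no PHY found".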

Signed-off-by: Raghuram Chary J <raghuramchary.jallipalli@microchip.com>
---
 drivers/net/usb/Kconfig   |  1 +
 drivers/net/usb/lan78xx.c | 43 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/drivers/net/usb/Kconfig b/drivers/net/usb/Kconfig
index f28bd74ac275..418b0904cecb 100644
--- a/drivers/net/usb/Kconfig
+++ b/drivers/net/usb/Kconfig
@@ -111,6 +111,7 @@ config USB_LAN78XX
 	select MII
 	select PHYLIB
 	select MICROCHIP_PHY
+	select FIXED_PHY
 	help
 	  This option adds support for Microchip LAN78XX based USB 2
 	  & USB 3 10/100/1000 Ethernet adapters.
diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index 207a3e18c08f..0d52f37c6cf4 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -36,13 +36,13 @@
 #include <linux/irq.h>
 #include <linux/irqchip/chained_irq.h>
 #include <linux/microchipphy.h>
-#include <linux/phy.h>
+#include <linux/phy_fixed.h>
 #include "lan78xx.h"
 
 #define DRIVER_AUTHOR	"WOOJUNG HUH <woojung.huh@microchip.com>"
 #define DRIVER_DESC	"LAN78XX USB 3.0 Gigabit Ethernet Devices"
 #define DRIVER_NAME	"lan78xx"
-#define DRIVER_VERSION	"1.0.6"
+#define DRIVER_VERSION	"1.0.7"
 
 #define TX_TIMEOUT_JIFFIES		(5 * HZ)
 #define THROTTLE_JIFFIES		(HZ / 8)
@@ -426,6 +426,7 @@ struct lan78xx_net {
 	struct statstage	stats;
 
 	struct irq_domain_data	domain_data;
+	struct phy_device	*fixedphy;
 };
 
 /* define external phy id */
@@ -2062,11 +2063,39 @@ static int lan78xx_phy_init(struct lan78xx_net *dev)
 	int ret;
 	u32 mii_adv;
 	struct phy_device *phydev;
+	struct fixed_phy_status fphy_status = {
+		.link = 1,
+		.speed = SPEED_1000,
+		.duplex = DUPLEX_FULL,
+	};
 
 	phydev = phy_find_first(dev->mdiobus);
 	if (!phydev) {
-		netdev_err(dev->net, "no PHY found\n");
-		return -EIO;
+		if (dev->chipid == ID_REV_CHIP_ID_7801_) {
+			u32 buf;
+
+			netdev_info(dev->net, "PHY Not Found!! Registering Fixed PHY\n");
+			phydev = fixed_phy_register(PHY_POLL, &fphy_status, -1,
+						    NULL);
+			if (IS_ERR(phydev)) {
+				netdev_err(dev->net, "No PHY/fixed_PHY found\n");
+				return -ENODEV;
+			}
+			netdev_info(dev->net, "Registered FIXED PHY\n");
+			dev->interface = PHY_INTERFACE_MODE_RGMII;
+			dev->fixedphy = phydev;
+			ret = lan78xx_write_reg(dev, MAC_RGMII_ID,
+						MAC_RGMII_ID_TXC_DELAY_EN_);
+			ret = lan78xx_write_reg(dev, RGMII_TX_BYP_DLL, 0x3D00);
+			ret = lan78xx_read_reg(dev, HW_CFG, &buf);
+			buf |= HW_CFG_CLK125_EN_;
+			buf |= HW_CFG_REFCLK25_EN_;
+			ret = lan78xx_write_reg(dev, HW_CFG, buf);
+			goto phyinit;
+		} else {
+			netdev_err(dev->net, "no PHY found\n");
+			return -EIO;
+		}
 	}
 
 	if ((dev->chipid == ID_REV_CHIP_ID_7800_) ||
@@ -2105,6 +2134,7 @@ static int lan78xx_phy_init(struct lan78xx_net *dev)
 		goto error;
 	}
 
+phyinit:
 	/* if phyirq is not set, use polling mode in phylib */
 	if (dev->domain_data.phyirq > 0)
 		phydev->irq = dev->domain_data.phyirq;
@@ -3555,6 +3585,11 @@ static void lan78xx_disconnect(struct usb_interface *intf)
 
 	phy_disconnect(net->phydev);
 
+	if (dev->fixedphy) {
+		fixed_phy_unregister(dev->fixedphy);
+		dev->fixedphy = NULL;
+	}
+
 	unregister_netdev(net);
 
 	cancel_delayed_work_sync(&dev->wq);
-- 
2.16.2

^ permalink raw reply related

* Re: [PATCH bpf-next v3 4/9] bpf/verifier: improve register value range tracking with ARSH
From: Alexei Starovoitov @ 2018-04-23  4:40 UTC (permalink / raw)
  To: Yonghong Song; +Cc: ast, daniel, netdev, kernel-team
In-Reply-To: <8a76b492-e01a-d79e-3dbe-5a1e6b0e60ce@fb.com>

On Sun, Apr 22, 2018 at 09:31:19PM -0700, Yonghong Song wrote:
> 
> 
> On 4/22/18 9:19 PM, Alexei Starovoitov wrote:
> > On Sun, Apr 22, 2018 at 07:49:13PM -0700, Yonghong Song wrote:
> > > 
> > > 
> > > On 4/22/18 5:16 PM, Alexei Starovoitov wrote:
> > > > On Fri, Apr 20, 2018 at 03:18:37PM -0700, Yonghong Song wrote:
> > > > > When a helper like bpf_get_stack returns an int value that is
> > > > > later used in arithmetic computation, the LSH and ARSH
> > > > > operations are often required to get proper sign extension into
> > > > > 64-bit. For example, without this patch:
> > > > >       54: R0=inv(id=0,umax_value=800)
> > > > >       54: (bf) r8 = r0
> > > > >       55: R0=inv(id=0,umax_value=800) R8_w=inv(id=0,umax_value=800)
> > > > >       55: (67) r8 <<= 32
> > > > >       56: R8_w=inv(id=0,umax_value=3435973836800,var_off=(0x0; 0x3ff00000000))
> > > > >       56: (c7) r8 s>>= 32
> > > > >       57: R8=inv(id=0)
> > > > > With this patch:
> > > > >       54: R0=inv(id=0,umax_value=800)
> > > > >       54: (bf) r8 = r0
> > > > >       55: R0=inv(id=0,umax_value=800) R8_w=inv(id=0,umax_value=800)
> > > > >       55: (67) r8 <<= 32
> > > > >       56: R8_w=inv(id=0,umax_value=3435973836800,var_off=(0x0; 0x3ff00000000))
> > > > >       56: (c7) r8 s>>= 32
> > > > >       57: R8=inv(id=0, umax_value=800,var_off=(0x0; 0x3ff))
> > > > > With better range of "R8", later on when "R8" is added to other register,
> > > > > e.g., a map pointer or scalar-value register, the better register
> > > > > range can be derived and verifier failure may be avoided.
> > > > > 
> > > > > In our later example,
> > > > >       ......
> > > > >       usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
> > > > >       if (usize < 0)
> > > > >           return 0;
> > > > >       ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0);
> > > > >       ......
> > > > > Without improving ARSH value range tracking, the register representing
> > > > > "max_len - usize" will have smin_value equal to S64_MIN and will be
> > > > > rejected by verifier.
> > > > > 
> > > > > Signed-off-by: Yonghong Song <yhs@fb.com>
> > > > > ---
> > > > >    kernel/bpf/verifier.c | 26 ++++++++++++++++++++++++++
> > > > >    1 file changed, 26 insertions(+)
> > > > > 
> > > > > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > > > > index 3c8bb92..01c215d 100644
> > > > > --- a/kernel/bpf/verifier.c
> > > > > +++ b/kernel/bpf/verifier.c
> > > > > @@ -2975,6 +2975,32 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
> > > > >    		/* We may learn something more from the var_off */
> > > > >    		__update_reg_bounds(dst_reg);
> > > > >    		break;
> > > > > +	case BPF_ARSH:
> > > > > +		if (umax_val >= insn_bitness) {
> > > > > +			/* Shifts greater than 31 or 63 are undefined.
> > > > > +			 * This includes shifts by a negative number.
> > > > > +			 */
> > > > > +			mark_reg_unknown(env, regs, insn->dst_reg);
> > > > > +			break;
> > > > > +		}
> > > > > +		if (dst_reg->smin_value < 0)
> > > > > +			dst_reg->smin_value >>= umin_val;
> > > > > +		else
> > > > > +			dst_reg->smin_value >>= umax_val;
> > > > > +		if (dst_reg->smax_value < 0)
> > > > > +			dst_reg->smax_value >>= umax_val;
> > > > > +		else
> > > > > +			dst_reg->smax_value >>= umin_val;
> > > > > +		if (src_known)
> > > > > +			dst_reg->var_off = tnum_rshift(dst_reg->var_off,
> > > > > +						       umin_val);
> > > > > +		else
> > > > > +			dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val);
> > > > > +		dst_reg->umin_value >>= umax_val;
> > > > > +		dst_reg->umax_value >>= umin_val;
> > > > > +		/* We may learn something more from the var_off */
> > > > > +		__update_reg_bounds(dst_reg);
> > > > 
> > > > I'm struggling to understand how these bounds are computed.
> > > > Could you add examples in the comments?
> > > 
> > > Okay, let me try to add some comments for better understanding.
> > > 
> > > > In particular if dst_reg is unknown (tnum.mask == -1)
> > > > the above tnum_rshift() will clear upper bits and will make it
> > > > 64-bit positive, but that doesn't seem correct.
> > > > What am I missing?
> > > 
> > > Considering this is arith shift, we probably should just have
> > > dst_reg->var_off = tnum_unknown to be conservative.
> > > 
> > > I could miss something here as well. Let me try to write more
> > > detailed explanation, hopefully to cover all corner cases.
> > 
> > Is there a use case for !src_known ?
> 
> For typical bpf programs, the shift amount should always be known...
> If src_known is false, it must be dealing with custom packets or custom
> data structures in tracing, etc.

In the example it was <<= 32 and s>>= 32 only because newly
introduced helper returns signed 32-bit integer that is later
used in the math. I have a hard time imagining useful C code that
needs arithmetic shift with a variable.

^ permalink raw reply

* Re: [PATCH bpf-next v3 4/9] bpf/verifier: improve register value range tracking with ARSH
From: Yonghong Song @ 2018-04-23  4:31 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: ast, daniel, netdev, kernel-team
In-Reply-To: <20180423041901.44xlyekpw3kehh7v@ast-mbp>



On 4/22/18 9:19 PM, Alexei Starovoitov wrote:
> On Sun, Apr 22, 2018 at 07:49:13PM -0700, Yonghong Song wrote:
>>
>>
>> On 4/22/18 5:16 PM, Alexei Starovoitov wrote:
>>> On Fri, Apr 20, 2018 at 03:18:37PM -0700, Yonghong Song wrote:
>>>> When a helper like bpf_get_stack returns an int value that is
>>>> later used in arithmetic computation, the LSH and ARSH
>>>> operations are often required to get proper sign extension into
>>>> 64-bit. For example, without this patch:
>>>>       54: R0=inv(id=0,umax_value=800)
>>>>       54: (bf) r8 = r0
>>>>       55: R0=inv(id=0,umax_value=800) R8_w=inv(id=0,umax_value=800)
>>>>       55: (67) r8 <<= 32
>>>>       56: R8_w=inv(id=0,umax_value=3435973836800,var_off=(0x0; 0x3ff00000000))
>>>>       56: (c7) r8 s>>= 32
>>>>       57: R8=inv(id=0)
>>>> With this patch:
>>>>       54: R0=inv(id=0,umax_value=800)
>>>>       54: (bf) r8 = r0
>>>>       55: R0=inv(id=0,umax_value=800) R8_w=inv(id=0,umax_value=800)
>>>>       55: (67) r8 <<= 32
>>>>       56: R8_w=inv(id=0,umax_value=3435973836800,var_off=(0x0; 0x3ff00000000))
>>>>       56: (c7) r8 s>>= 32
>>>>       57: R8=inv(id=0, umax_value=800,var_off=(0x0; 0x3ff))
>>>> With better range of "R8", later on when "R8" is added to other register,
>>>> e.g., a map pointer or scalar-value register, the better register
>>>> range can be derived and verifier failure may be avoided.
>>>>
>>>> In our later example,
>>>>       ......
>>>>       usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
>>>>       if (usize < 0)
>>>>           return 0;
>>>>       ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0);
>>>>       ......
>>>> Without improving ARSH value range tracking, the register representing
>>>> "max_len - usize" will have smin_value equal to S64_MIN and will be
>>>> rejected by verifier.
>>>>
>>>> Signed-off-by: Yonghong Song <yhs@fb.com>
>>>> ---
>>>>    kernel/bpf/verifier.c | 26 ++++++++++++++++++++++++++
>>>>    1 file changed, 26 insertions(+)
>>>>
>>>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>>>> index 3c8bb92..01c215d 100644
>>>> --- a/kernel/bpf/verifier.c
>>>> +++ b/kernel/bpf/verifier.c
>>>> @@ -2975,6 +2975,32 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
>>>>    		/* We may learn something more from the var_off */
>>>>    		__update_reg_bounds(dst_reg);
>>>>    		break;
>>>> +	case BPF_ARSH:
>>>> +		if (umax_val >= insn_bitness) {
>>>> +			/* Shifts greater than 31 or 63 are undefined.
>>>> +			 * This includes shifts by a negative number.
>>>> +			 */
>>>> +			mark_reg_unknown(env, regs, insn->dst_reg);
>>>> +			break;
>>>> +		}
>>>> +		if (dst_reg->smin_value < 0)
>>>> +			dst_reg->smin_value >>= umin_val;
>>>> +		else
>>>> +			dst_reg->smin_value >>= umax_val;
>>>> +		if (dst_reg->smax_value < 0)
>>>> +			dst_reg->smax_value >>= umax_val;
>>>> +		else
>>>> +			dst_reg->smax_value >>= umin_val;
>>>> +		if (src_known)
>>>> +			dst_reg->var_off = tnum_rshift(dst_reg->var_off,
>>>> +						       umin_val);
>>>> +		else
>>>> +			dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val);
>>>> +		dst_reg->umin_value >>= umax_val;
>>>> +		dst_reg->umax_value >>= umin_val;
>>>> +		/* We may learn something more from the var_off */
>>>> +		__update_reg_bounds(dst_reg);
>>>
>>> I'm struggling to understand how these bounds are computed.
>>> Could you add examples in the comments?
>>
>> Okay, let me try to add some comments for better understanding.
>>
>>> In particular if dst_reg is unknown (tnum.mask == -1)
>>> the above tnum_rshift() will clear upper bits and will make it
>>> 64-bit positive, but that doesn't seem correct.
>>> What am I missing?
>>
>> Considering this is arith shift, we probably should just have
>> dst_reg->var_off = tnum_unknown to be conservative.
>>
>> I could miss something here as well. Let me try to write more
>> detailed explanation, hopefully to cover all corner cases.
> 
> Is there a use case for !src_known ?

For typical bpf programs, the shift amount should always be known...
If src_known is not true, the program must be dealing with custom packets
or custom data structures in tracing, etc.


> I think test_verifier should have 100% line coverage of verifier.c
> and every 'if' condition in the verifier needs to have real use case
> behind it.
> It's still on my todo list to get rid of [su][min|max]_value tracking
> that was introduced without solid justification.
> 

^ permalink raw reply

* Re: [PATCH net] team: check team dev npinfo when adding a port only
From: kbuild test robot @ 2018-04-23  4:20 UTC (permalink / raw)
  To: Xin Long; +Cc: kbuild-all, network dev, davem, Jiri Pirko, stephen hemminger
In-Reply-To: <ac96d2737077b41d1e7cd68164d881faa18f413f.1524395280.git.lucien.xin@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 6041 bytes --]

Hi Xin,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on net/master]

url:    https://github.com/0day-ci/linux/commits/Xin-Long/team-check-team-dev-npinfo-when-adding-a-port-only/20180423-114310
config: i386-randconfig-x071-201816 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

   drivers/net/team/team.c: In function 'team_port_add':
>> drivers/net/team/team.c:1221:15: error: 'struct net_device' has no member named 'npinfo'
     if (team->dev->npinfo) {
                  ^~

vim +1221 drivers/net/team/team.c

  1136	
  1137	static void __team_port_change_port_added(struct team_port *port, bool linkup);
  1138	static int team_dev_type_check_change(struct net_device *dev,
  1139					      struct net_device *port_dev);
  1140	
  1141	static int team_port_add(struct team *team, struct net_device *port_dev,
  1142				 struct netlink_ext_ack *extack)
  1143	{
  1144		struct net_device *dev = team->dev;
  1145		struct team_port *port;
  1146		char *portname = port_dev->name;
  1147		int err;
  1148	
  1149		if (port_dev->flags & IFF_LOOPBACK) {
  1150			NL_SET_ERR_MSG(extack, "Loopback device can't be added as a team port");
  1151			netdev_err(dev, "Device %s is loopback device. Loopback devices can't be added as a team port\n",
  1152				   portname);
  1153			return -EINVAL;
  1154		}
  1155	
  1156		if (team_port_exists(port_dev)) {
  1157			NL_SET_ERR_MSG(extack, "Device is already a port of a team device");
  1158			netdev_err(dev, "Device %s is already a port "
  1159					"of a team device\n", portname);
  1160			return -EBUSY;
  1161		}
  1162	
  1163		if (port_dev->features & NETIF_F_VLAN_CHALLENGED &&
  1164		    vlan_uses_dev(dev)) {
  1165			NL_SET_ERR_MSG(extack, "Device is VLAN challenged and team device has VLAN set up");
  1166			netdev_err(dev, "Device %s is VLAN challenged and team device has VLAN set up\n",
  1167				   portname);
  1168			return -EPERM;
  1169		}
  1170	
  1171		err = team_dev_type_check_change(dev, port_dev);
  1172		if (err)
  1173			return err;
  1174	
  1175		if (port_dev->flags & IFF_UP) {
  1176			NL_SET_ERR_MSG(extack, "Device is up. Set it down before adding it as a team port");
  1177			netdev_err(dev, "Device %s is up. Set it down before adding it as a team port\n",
  1178				   portname);
  1179			return -EBUSY;
  1180		}
  1181	
  1182		port = kzalloc(sizeof(struct team_port) + team->mode->port_priv_size,
  1183			       GFP_KERNEL);
  1184		if (!port)
  1185			return -ENOMEM;
  1186	
  1187		port->dev = port_dev;
  1188		port->team = team;
  1189		INIT_LIST_HEAD(&port->qom_list);
  1190	
  1191		port->orig.mtu = port_dev->mtu;
  1192		err = dev_set_mtu(port_dev, dev->mtu);
  1193		if (err) {
  1194			netdev_dbg(dev, "Error %d calling dev_set_mtu\n", err);
  1195			goto err_set_mtu;
  1196		}
  1197	
  1198		memcpy(port->orig.dev_addr, port_dev->dev_addr, port_dev->addr_len);
  1199	
  1200		err = team_port_enter(team, port);
  1201		if (err) {
  1202			netdev_err(dev, "Device %s failed to enter team mode\n",
  1203				   portname);
  1204			goto err_port_enter;
  1205		}
  1206	
  1207		err = dev_open(port_dev);
  1208		if (err) {
  1209			netdev_dbg(dev, "Device %s opening failed\n",
  1210				   portname);
  1211			goto err_dev_open;
  1212		}
  1213	
  1214		err = vlan_vids_add_by_dev(port_dev, dev);
  1215		if (err) {
  1216			netdev_err(dev, "Failed to add vlan ids to device %s\n",
  1217					portname);
  1218			goto err_vids_add;
  1219		}
  1220	
> 1221		if (team->dev->npinfo) {
  1222			err = team_port_enable_netpoll(team, port);
  1223			if (err) {
  1224				netdev_err(dev, "Failed to enable netpoll on device %s\n",
  1225					   portname);
  1226				goto err_enable_netpoll;
  1227			}
  1228		}
  1229	
  1230		if (!(dev->features & NETIF_F_LRO))
  1231			dev_disable_lro(port_dev);
  1232	
  1233		err = netdev_rx_handler_register(port_dev, team_handle_frame,
  1234						 port);
  1235		if (err) {
  1236			netdev_err(dev, "Device %s failed to register rx_handler\n",
  1237				   portname);
  1238			goto err_handler_register;
  1239		}
  1240	
  1241		err = team_upper_dev_link(team, port, extack);
  1242		if (err) {
  1243			netdev_err(dev, "Device %s failed to set upper link\n",
  1244				   portname);
  1245			goto err_set_upper_link;
  1246		}
  1247	
  1248		err = __team_option_inst_add_port(team, port);
  1249		if (err) {
  1250			netdev_err(dev, "Device %s failed to add per-port options\n",
  1251				   portname);
  1252			goto err_option_port_add;
  1253		}
  1254	
  1255		netif_addr_lock_bh(dev);
  1256		dev_uc_sync_multiple(port_dev, dev);
  1257		dev_mc_sync_multiple(port_dev, dev);
  1258		netif_addr_unlock_bh(dev);
  1259	
  1260		port->index = -1;
  1261		list_add_tail_rcu(&port->list, &team->port_list);
  1262		team_port_enable(team, port);
  1263		__team_compute_features(team);
  1264		__team_port_change_port_added(port, !!netif_carrier_ok(port_dev));
  1265		__team_options_change_check(team);
  1266	
  1267		netdev_info(dev, "Port device %s added\n", portname);
  1268	
  1269		return 0;
  1270	
  1271	err_option_port_add:
  1272		team_upper_dev_unlink(team, port);
  1273	
  1274	err_set_upper_link:
  1275		netdev_rx_handler_unregister(port_dev);
  1276	
  1277	err_handler_register:
  1278		team_port_disable_netpoll(port);
  1279	
  1280	err_enable_netpoll:
  1281		vlan_vids_del_by_dev(port_dev, dev);
  1282	
  1283	err_vids_add:
  1284		dev_close(port_dev);
  1285	
  1286	err_dev_open:
  1287		team_port_leave(team, port);
  1288		team_port_set_orig_dev_addr(port);
  1289	
  1290	err_port_enter:
  1291		dev_set_mtu(port_dev, port->orig.mtu);
  1292	
  1293	err_set_mtu:
  1294		kfree(port);
  1295	
  1296		return err;
  1297	}
  1298	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 27063 bytes --]

^ permalink raw reply

* Re: [PATCH bpf-next v3 4/9] bpf/verifier: improve register value range tracking with ARSH
From: Alexei Starovoitov @ 2018-04-23  4:19 UTC (permalink / raw)
  To: Yonghong Song; +Cc: ast, daniel, netdev, kernel-team
In-Reply-To: <e42ee666-dfbc-23e1-63c4-d7c548a2d3ee@fb.com>

On Sun, Apr 22, 2018 at 07:49:13PM -0700, Yonghong Song wrote:
> 
> 
> On 4/22/18 5:16 PM, Alexei Starovoitov wrote:
> > On Fri, Apr 20, 2018 at 03:18:37PM -0700, Yonghong Song wrote:
> > > When helpers like bpf_get_stack returns an int value
> > > and later on used for arithmetic computation, the LSH and ARSH
> > > operations are often required to get proper sign extension into
> > > 64-bit. For example, without this patch:
> > >      54: R0=inv(id=0,umax_value=800)
> > >      54: (bf) r8 = r0
> > >      55: R0=inv(id=0,umax_value=800) R8_w=inv(id=0,umax_value=800)
> > >      55: (67) r8 <<= 32
> > >      56: R8_w=inv(id=0,umax_value=3435973836800,var_off=(0x0; 0x3ff00000000))
> > >      56: (c7) r8 s>>= 32
> > >      57: R8=inv(id=0)
> > > With this patch:
> > >      54: R0=inv(id=0,umax_value=800)
> > >      54: (bf) r8 = r0
> > >      55: R0=inv(id=0,umax_value=800) R8_w=inv(id=0,umax_value=800)
> > >      55: (67) r8 <<= 32
> > >      56: R8_w=inv(id=0,umax_value=3435973836800,var_off=(0x0; 0x3ff00000000))
> > >      56: (c7) r8 s>>= 32
> > >      57: R8=inv(id=0, umax_value=800,var_off=(0x0; 0x3ff))
> > > With better range of "R8", later on when "R8" is added to other register,
> > > e.g., a map pointer or scalar-value register, the better register
> > > range can be derived and verifier failure may be avoided.
> > > 
> > > In our later example,
> > >      ......
> > >      usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
> > >      if (usize < 0)
> > >          return 0;
> > >      ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0);
> > >      ......
> > > Without improving ARSH value range tracking, the register representing
> > > "max_len - usize" will have smin_value equal to S64_MIN and will be
> > > rejected by verifier.
> > > 
> > > Signed-off-by: Yonghong Song <yhs@fb.com>
> > > ---
> > >   kernel/bpf/verifier.c | 26 ++++++++++++++++++++++++++
> > >   1 file changed, 26 insertions(+)
> > > 
> > > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > > index 3c8bb92..01c215d 100644
> > > --- a/kernel/bpf/verifier.c
> > > +++ b/kernel/bpf/verifier.c
> > > @@ -2975,6 +2975,32 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
> > >   		/* We may learn something more from the var_off */
> > >   		__update_reg_bounds(dst_reg);
> > >   		break;
> > > +	case BPF_ARSH:
> > > +		if (umax_val >= insn_bitness) {
> > > +			/* Shifts greater than 31 or 63 are undefined.
> > > +			 * This includes shifts by a negative number.
> > > +			 */
> > > +			mark_reg_unknown(env, regs, insn->dst_reg);
> > > +			break;
> > > +		}
> > > +		if (dst_reg->smin_value < 0)
> > > +			dst_reg->smin_value >>= umin_val;
> > > +		else
> > > +			dst_reg->smin_value >>= umax_val;
> > > +		if (dst_reg->smax_value < 0)
> > > +			dst_reg->smax_value >>= umax_val;
> > > +		else
> > > +			dst_reg->smax_value >>= umin_val;
> > > +		if (src_known)
> > > +			dst_reg->var_off = tnum_rshift(dst_reg->var_off,
> > > +						       umin_val);
> > > +		else
> > > +			dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val);
> > > +		dst_reg->umin_value >>= umax_val;
> > > +		dst_reg->umax_value >>= umin_val;
> > > +		/* We may learn something more from the var_off */
> > > +		__update_reg_bounds(dst_reg);
> > 
> > I'm struggling to understand how these bounds are computed.
> > Could you add examples in the comments?
> 
> Okay, let me try to add some comments for better understanding.
> 
> > In particular if dst_reg is unknown (tnum.mask == -1)
> > the above tnum_rshift() will clear upper bits and will make it
> > 64-bit positive, but that doesn't seem correct.
> > What am I missing?
> 
> Considering this is arith shift, we probably should just have
> dst_reg->var_off = tnum_unknown to be conservative.
> 
> I could miss something here as well. Let me try to write more
> detailed explanation, hopefully to cover all corner cases.

Is there a use case for !src_known ?
I think test_verifier should have 100% line coverage of verifier.c
and every 'if' condition in the verifier needs to have real use case
behind it.
It's still on my todo list to get rid of [su][min|max]_value tracking
that was introduced without solid justification.

^ permalink raw reply

* Re: [PATCH net] team: check team dev npinfo when adding a port only
From: kbuild test robot @ 2018-04-23  4:17 UTC (permalink / raw)
  To: Xin Long; +Cc: kbuild-all, network dev, davem, Jiri Pirko, stephen hemminger
In-Reply-To: <ac96d2737077b41d1e7cd68164d881faa18f413f.1524395280.git.lucien.xin@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 6108 bytes --]

Hi Xin,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on net/master]

url:    https://github.com/0day-ci/linux/commits/Xin-Long/team-check-team-dev-npinfo-when-adding-a-port-only/20180423-114310
config: x86_64-randconfig-x011-201816 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All errors (new ones prefixed by >>):

   drivers/net/team/team.c: In function 'team_port_add':
>> drivers/net/team/team.c:1221:17: error: 'struct net_device' has no member named 'npinfo'; did you mean 'vlan_info'?
     if (team->dev->npinfo) {
                    ^~~~~~
                    vlan_info

vim +1221 drivers/net/team/team.c

  1136	
  1137	static void __team_port_change_port_added(struct team_port *port, bool linkup);
  1138	static int team_dev_type_check_change(struct net_device *dev,
  1139					      struct net_device *port_dev);
  1140	
  1141	static int team_port_add(struct team *team, struct net_device *port_dev,
  1142				 struct netlink_ext_ack *extack)
  1143	{
  1144		struct net_device *dev = team->dev;
  1145		struct team_port *port;
  1146		char *portname = port_dev->name;
  1147		int err;
  1148	
  1149		if (port_dev->flags & IFF_LOOPBACK) {
  1150			NL_SET_ERR_MSG(extack, "Loopback device can't be added as a team port");
  1151			netdev_err(dev, "Device %s is loopback device. Loopback devices can't be added as a team port\n",
  1152				   portname);
  1153			return -EINVAL;
  1154		}
  1155	
  1156		if (team_port_exists(port_dev)) {
  1157			NL_SET_ERR_MSG(extack, "Device is already a port of a team device");
  1158			netdev_err(dev, "Device %s is already a port "
  1159					"of a team device\n", portname);
  1160			return -EBUSY;
  1161		}
  1162	
  1163		if (port_dev->features & NETIF_F_VLAN_CHALLENGED &&
  1164		    vlan_uses_dev(dev)) {
  1165			NL_SET_ERR_MSG(extack, "Device is VLAN challenged and team device has VLAN set up");
  1166			netdev_err(dev, "Device %s is VLAN challenged and team device has VLAN set up\n",
  1167				   portname);
  1168			return -EPERM;
  1169		}
  1170	
  1171		err = team_dev_type_check_change(dev, port_dev);
  1172		if (err)
  1173			return err;
  1174	
  1175		if (port_dev->flags & IFF_UP) {
  1176			NL_SET_ERR_MSG(extack, "Device is up. Set it down before adding it as a team port");
  1177			netdev_err(dev, "Device %s is up. Set it down before adding it as a team port\n",
  1178				   portname);
  1179			return -EBUSY;
  1180		}
  1181	
  1182		port = kzalloc(sizeof(struct team_port) + team->mode->port_priv_size,
  1183			       GFP_KERNEL);
  1184		if (!port)
  1185			return -ENOMEM;
  1186	
  1187		port->dev = port_dev;
  1188		port->team = team;
  1189		INIT_LIST_HEAD(&port->qom_list);
  1190	
  1191		port->orig.mtu = port_dev->mtu;
  1192		err = dev_set_mtu(port_dev, dev->mtu);
  1193		if (err) {
  1194			netdev_dbg(dev, "Error %d calling dev_set_mtu\n", err);
  1195			goto err_set_mtu;
  1196		}
  1197	
  1198		memcpy(port->orig.dev_addr, port_dev->dev_addr, port_dev->addr_len);
  1199	
  1200		err = team_port_enter(team, port);
  1201		if (err) {
  1202			netdev_err(dev, "Device %s failed to enter team mode\n",
  1203				   portname);
  1204			goto err_port_enter;
  1205		}
  1206	
  1207		err = dev_open(port_dev);
  1208		if (err) {
  1209			netdev_dbg(dev, "Device %s opening failed\n",
  1210				   portname);
  1211			goto err_dev_open;
  1212		}
  1213	
  1214		err = vlan_vids_add_by_dev(port_dev, dev);
  1215		if (err) {
  1216			netdev_err(dev, "Failed to add vlan ids to device %s\n",
  1217					portname);
  1218			goto err_vids_add;
  1219		}
  1220	
> 1221		if (team->dev->npinfo) {
  1222			err = team_port_enable_netpoll(team, port);
  1223			if (err) {
  1224				netdev_err(dev, "Failed to enable netpoll on device %s\n",
  1225					   portname);
  1226				goto err_enable_netpoll;
  1227			}
  1228		}
  1229	
  1230		if (!(dev->features & NETIF_F_LRO))
  1231			dev_disable_lro(port_dev);
  1232	
  1233		err = netdev_rx_handler_register(port_dev, team_handle_frame,
  1234						 port);
  1235		if (err) {
  1236			netdev_err(dev, "Device %s failed to register rx_handler\n",
  1237				   portname);
  1238			goto err_handler_register;
  1239		}
  1240	
  1241		err = team_upper_dev_link(team, port, extack);
  1242		if (err) {
  1243			netdev_err(dev, "Device %s failed to set upper link\n",
  1244				   portname);
  1245			goto err_set_upper_link;
  1246		}
  1247	
  1248		err = __team_option_inst_add_port(team, port);
  1249		if (err) {
  1250			netdev_err(dev, "Device %s failed to add per-port options\n",
  1251				   portname);
  1252			goto err_option_port_add;
  1253		}
  1254	
  1255		netif_addr_lock_bh(dev);
  1256		dev_uc_sync_multiple(port_dev, dev);
  1257		dev_mc_sync_multiple(port_dev, dev);
  1258		netif_addr_unlock_bh(dev);
  1259	
  1260		port->index = -1;
  1261		list_add_tail_rcu(&port->list, &team->port_list);
  1262		team_port_enable(team, port);
  1263		__team_compute_features(team);
  1264		__team_port_change_port_added(port, !!netif_carrier_ok(port_dev));
  1265		__team_options_change_check(team);
  1266	
  1267		netdev_info(dev, "Port device %s added\n", portname);
  1268	
  1269		return 0;
  1270	
  1271	err_option_port_add:
  1272		team_upper_dev_unlink(team, port);
  1273	
  1274	err_set_upper_link:
  1275		netdev_rx_handler_unregister(port_dev);
  1276	
  1277	err_handler_register:
  1278		team_port_disable_netpoll(port);
  1279	
  1280	err_enable_netpoll:
  1281		vlan_vids_del_by_dev(port_dev, dev);
  1282	
  1283	err_vids_add:
  1284		dev_close(port_dev);
  1285	
  1286	err_dev_open:
  1287		team_port_leave(team, port);
  1288		team_port_set_orig_dev_addr(port);
  1289	
  1290	err_port_enter:
  1291		dev_set_mtu(port_dev, port->orig.mtu);
  1292	
  1293	err_set_mtu:
  1294		kfree(port);
  1295	
  1296		return err;
  1297	}
  1298	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 30326 bytes --]

^ permalink raw reply

* [PATCH bpf-next] bpf: fix virtio-net's length calc for XDP_PASS
From: Nikita V. Shirokov @ 2018-04-23  4:16 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Michael S. Tsirkin,
	Jason Wang
  Cc: netdev, David Ahern, Nikita V. Shirokov

In commit 6870de435b90 ("bpf: make virtio compatible w/
bpf_xdp_adjust_tail") I didn't account for vi->hdr_len when calculating
the new packet's length after bpf_prog_run in receive_mergeable.
Because of this, all packets that were passed to the kernel
were truncated by 12 bytes.

Fixes: 6870de435b90 ("bpf: make virtio compatible w/ bpf_xdp_adjust_tail")
Reported-by: David Ahern <dsahern@gmail.com>

Signed-off-by: Nikita V. Shirokov <tehnerd@tehnerd.com>
---

Notes:
    Unfortunately, it looks like xdp_tx is still broken because the fix
    by Jason (introduced in the "XDP_TX for virtio_net not working in
    recent kernel?" thread) hasn't landed yet.

 drivers/net/virtio_net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 779a4f798522..08ac2cc986aa 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -761,7 +761,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			/* recalculate len if xdp.data or xdp.data_end were
 			 * adjusted
 			 */
-			len = xdp.data_end - xdp.data;
+			len = xdp.data_end - xdp.data + vi->hdr_len;
 			/* We can only create skb based on xdp_page. */
 			if (unlikely(xdp_page != page)) {
 				rcu_read_unlock();
-- 
2.15.1

^ permalink raw reply related

* Re: [PATCH bpf-next v3 9/9] tools/bpf: add a test for bpf_get_stack with tracepoint prog
From: Yonghong Song @ 2018-04-23  2:58 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: ast, daniel, netdev, kernel-team
In-Reply-To: <20180423002732.6fw45mevsz3bixkq@ast-mbp>



On 4/22/18 5:27 PM, Alexei Starovoitov wrote:
> On Fri, Apr 20, 2018 at 03:18:42PM -0700, Yonghong Song wrote:
>> The test_stacktrace_map and test_stacktrace_build_id are
>> enhanced to call bpf_get_stack in the helper to get the
>> stack trace as well.  The stack traces from bpf_get_stack
>> and bpf_get_stackid are compared to ensure that for the
>> same stack as represented as the same hash, their ip addresses
>> or build id's must be the same.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   tools/testing/selftests/bpf/test_progs.c           | 63 +++++++++++++++++++---
>>   .../selftests/bpf/test_stacktrace_build_id.c       | 20 ++++++-
>>   tools/testing/selftests/bpf/test_stacktrace_map.c  | 20 +++++--
>>   3 files changed, 92 insertions(+), 11 deletions(-)
>>
>> diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
>> index dad4c3f..06b922a 100644
>> --- a/tools/testing/selftests/bpf/test_progs.c
>> +++ b/tools/testing/selftests/bpf/test_progs.c
>> @@ -897,11 +897,40 @@ static int compare_map_keys(int map1_fd, int map2_fd)
>>   	return 0;
>>   }
>>   
>> +static int compare_stack_ips(int smap_fd, int amap_fd, int stack_trace_len)
>> +{
>> +	__u32 key, next_key, *cur_key_p, *next_key_p;
>> +	char val_buf1[stack_trace_len], val_buf2[stack_trace_len];
> 
> the kernel is trying to get rid of VLAs.
> test_progs.c already uses them, but if possible let's not
> add more uses of them.

Okay, I will try to get rid of these two VLAs.

> Other than that looks great.

^ permalink raw reply

* Re: [PATCH bpf-next v3 8/9] tools/bpf: add a test for bpf_get_stack with raw tracepoint prog
From: Yonghong Song @ 2018-04-23  2:57 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: ast, daniel, netdev, kernel-team
In-Reply-To: <20180423002301.lnhdn6oueweqztvb@ast-mbp>



On 4/22/18 5:23 PM, Alexei Starovoitov wrote:
> On Fri, Apr 20, 2018 at 03:18:41PM -0700, Yonghong Song wrote:
>> The test attached a raw_tracepoint program to sched/sched_switch.
>> It tested to get stack for user space, kernel space and user
>> space with build_id request. It also tested to get user
>> and kernel stack into the same buffer with back-to-back
>> bpf_get_stack helper calls.
>>
>> Whenever the kernel stack is available, the user space
>> application will check to ensure that the kernel function
>> for raw_tracepoint ___bpf_prog_run is part of the stack.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   tools/testing/selftests/bpf/Makefile               |   3 +-
>>   tools/testing/selftests/bpf/test_get_stack_rawtp.c | 102 ++++++++++++++++++
>>   tools/testing/selftests/bpf/test_progs.c           | 115 +++++++++++++++++++++
>>   3 files changed, 219 insertions(+), 1 deletion(-)
>>   create mode 100644 tools/testing/selftests/bpf/test_get_stack_rawtp.c
>>
>> diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
>> index 0b72cc7..54e9e74 100644
>> --- a/tools/testing/selftests/bpf/Makefile
>> +++ b/tools/testing/selftests/bpf/Makefile
>> @@ -32,7 +32,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
>>   	test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \
>>   	sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \
>>   	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
>> -	test_btf_haskv.o test_btf_nokv.o
>> +	test_btf_haskv.o test_btf_nokv.o test_get_stack_rawtp.o
>>   
>>   # Order correspond to 'make run_tests' order
>>   TEST_PROGS := test_kmod.sh \
>> @@ -56,6 +56,7 @@ $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/libbpf.a
>>   $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c
>>   $(OUTPUT)/test_sock: cgroup_helpers.c
>>   $(OUTPUT)/test_sock_addr: cgroup_helpers.c
>> +$(OUTPUT)/test_progs: trace_helpers.c
>>   
>>   .PHONY: force
>>   
>> diff --git a/tools/testing/selftests/bpf/test_get_stack_rawtp.c b/tools/testing/selftests/bpf/test_get_stack_rawtp.c
>> new file mode 100644
>> index 0000000..ba1dcf9
>> --- /dev/null
>> +++ b/tools/testing/selftests/bpf/test_get_stack_rawtp.c
>> @@ -0,0 +1,102 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +#include <linux/bpf.h>
>> +#include "bpf_helpers.h"
>> +
>> +/* Permit pretty deep stack traces */
>> +#define MAX_STACK_RAWTP 100
>> +struct stack_trace_t {
>> +	int pid;
>> +	int kern_stack_size;
>> +	int user_stack_size;
>> +	int user_stack_buildid_size;
>> +	__u64 kern_stack[MAX_STACK_RAWTP];
>> +	__u64 user_stack[MAX_STACK_RAWTP];
>> +	struct bpf_stack_build_id user_stack_buildid[MAX_STACK_RAWTP];
>> +};
>> +
>> +struct bpf_map_def SEC("maps") perfmap = {
>> +	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
>> +	.key_size = sizeof(int),
>> +	.value_size = sizeof(__u32),
>> +	.max_entries = 2,
>> +};
>> +
>> +struct bpf_map_def SEC("maps") stackdata_map = {
>> +	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
>> +	.key_size = sizeof(__u32),
>> +	.value_size = sizeof(struct stack_trace_t),
>> +	.max_entries = 1,
>> +};
>> +
>> +/* Allocate per-cpu space twice the needed. For the code below
>> + *   usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
>> + *   if (usize < 0)
>> + *     return 0;
>> + *   ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0);
>> + *
>> + * If we have value_size = MAX_STACK_RAWTP * sizeof(__u64),
>> + * verifier will complain that access "raw_data + usize"
>> + * with size "max_len - usize" may be out of bound.
>> + * The maximum "raw_data + usize" is "raw_data + max_len"
>> + * and the maximum "max_len - usize" is "max_len", verifier
>> + * concludes that the maximum buffer access range is
>> + * "raw_data[0...max_len * 2 - 1]" and hence reject the program.
>> + *
>> + * Doubling the to-be-used max buffer size can fix this verifier
>> + * issue and avoid complicated C programming massaging.
>> + * This is an acceptable workaround since there is one entry here.
>> + */
>> +struct bpf_map_def SEC("maps") rawdata_map = {
>> +	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
>> +	.key_size = sizeof(__u32),
>> +	.value_size = MAX_STACK_RAWTP * sizeof(__u64) * 2,
>> +	.max_entries = 1,
>> +};
>> +
>> +SEC("tracepoint/sched/sched_switch")
>> +int bpf_prog1(void *ctx)
>> +{
>> +	int max_len, max_buildid_len, usize, ksize, total_size;
>> +	struct stack_trace_t *data;
>> +	void *raw_data;
>> +	__u32 key = 0;
>> +
>> +	data = bpf_map_lookup_elem(&stackdata_map, &key);
>> +	if (!data)
>> +		return 0;
>> +
>> +	max_len = MAX_STACK_RAWTP * sizeof(__u64);
>> +	max_buildid_len = MAX_STACK_RAWTP * sizeof(struct bpf_stack_build_id);
>> +	data->pid = bpf_get_current_pid_tgid();
>> +	data->kern_stack_size = bpf_get_stack(ctx, data->kern_stack,
>> +					      max_len, 0);
>> +	data->user_stack_size = bpf_get_stack(ctx, data->user_stack, max_len,
>> +					    BPF_F_USER_STACK);
>> +	data->user_stack_buildid_size = bpf_get_stack(
>> +		ctx, data->user_stack_buildid, max_buildid_len,
>> +		BPF_F_USER_STACK | BPF_F_USER_BUILD_ID);
>> +	bpf_perf_event_output(ctx, &perfmap, 0, data, sizeof(*data));
>> +
>> +	/* write both kernel and user stacks to the same buffer */
>> +	raw_data = bpf_map_lookup_elem(&rawdata_map, &key);
>> +	if (!raw_data)
>> +		return 0;
>> +
>> +	usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
>> +	if (usize < 0)
>> +		return 0;
>> +
>> +	ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0);
>> +	if (ksize < 0)
> 
> maybe instead of teaching the verifier about ARSH (which doesn't look
> straightforward) such a use case can be done as:
> u32 max_len, usize, ksize;
> ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0);
> if ((int)ksize < 0)

Just tried, it does not work. The compiler generates code like:

47: (b7) r9 = 800
48: (bf) r1 = r6
49: (bf) r2 = r7
50: (b7) r3 = 800
51: (b7) r4 = 256
52: (85) call bpf_get_stack#66
  R0=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R1_w=ctx(id=0,off=0,imm=0) 
R2_w=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R3_w=inv800 R4_w=inv256 
R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0) 
R9_w=inv800 R10=fp0,call_-1
53: (b7) r1 = 0
54: (bf) r8 = r0
55: (67) r8 <<= 32
56: (c7) r8 s>>= 32
57: (6d) if r1 s> r8 goto pc+27
  R0=inv(id=0,umax_value=800) R1=inv0 R6=ctx(id=0,off=0,imm=0) 
R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0) 
R8=inv(id=0,umax_value=9223372036854775807,var_off=(0x0; 
0x7fffffffffffffff)) R9=inv800 R10=fp0,call_-1
58: (1f) r9 -= r8
59: (bf) r1 = r8
60: (67) r1 <<= 32
61: (77) r1 >>= 32
62: (bf) r2 = r7
63: (0f) r2 += r1
64: (bf) r1 = r6
65: (bf) r3 = r9
66: (b7) r4 = 0
67: (85) call bpf_get_stack#66
R3 min value is negative, either use unsigned or 'var &= const'

Basically, the compiler does lsh/arsh on the "int" value to do the
comparison, and then does lsh/rsh on that same value.
So it looks like we still need arsh.

> 
> That's certainly suboptimal and very much non obvious to program
> developers, but at least it can unblock the bpf_get_stack part
> landing and proper ARSH support can be added later?
> Just a thought.
> 
>> +		return 0;
>> +
>> +	total_size = usize + ksize;
>> +	if (total_size > 0 && total_size <= max_len)
>> +		bpf_perf_event_output(ctx, &perfmap, 0, raw_data, total_size);
>> +
>> +	return 0;
>> +}
> 
> the rest of the test looks great. Thank you for adding such exhaustive test.
> 

^ permalink raw reply

* bpf: samples/bpf/sockex2:  Assertion `setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd, sizeof(prog_fd[0])) == 0' failed.
From: Wang Sheng-Hui @ 2018-04-23  2:56 UTC (permalink / raw)
  To: ast, daniel, netdev

Sorry to trouble you!

I ran samples/bpf/sockex2 and it failed with
"Assertion `setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd, sizeof(prog_fd[0])) == 0' failed."

Then I ran "strace ./sockex2" and got:
...
bind(3, {sa_family=AF_PACKET, sll_protocol=htons(ETH_P_ALL), sll_ifindex=if_nametoindex("lo"), sll_hatype=ARPHRD_NETROM, sll_pkttype=PACKET_HOST, sll_halen=0}, 20) = 0
setsockopt(3, SOL_SOCKET, SO_ATTACH_BPF, [0], 4) = -1 EINVAL (Invalid argument)
write(2, "sockex2: /root/linux/samples/bpf"..., 156sockex2: /root/linux/samples/bpf/sockex2_user.c:35: main: Assertion `setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd, sizeof(prog_fd[0])) == 0' failed.
) = 156
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fb8ec4bf000
rt_sigprocmask(SIG_UNBLOCK, [ABRT], NULL, 8) = 0
rt_sigprocmask(SIG_BLOCK, ~[RTMIN RT_1], [], 8) = 0
getpid()                                = 3513
gettid()                                = 3513
tgkill(3513, 3513, SIGABRT)             = 0
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
--- SIGABRT {si_signo=SIGABRT, si_code=SI_TKILL, si_pid=3513, si_uid=0} ---
+++ killed by SIGABRT +++
Aborted

I tried to find the root cause but failed:
sockex2 is similar to sockex1, except that it runs with large bpf code and a hash map.
Both pass the program load phase and open a raw socket, so I think there is nothing
wrong with memcharge and the other checks.
While sockex1 works, sockex2 fails when attaching the BPF program to the socket.
I read through the sock attach code path and didn't find any specific check that would report a failure.

Can someone help to fix the failure, please?

Thanks,
shenghui

^ permalink raw reply

* Re: [PATCH bpf-next v3 4/9] bpf/verifier: improve register value range tracking with ARSH
From: Yonghong Song @ 2018-04-23  2:49 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: ast, daniel, netdev, kernel-team
In-Reply-To: <20180423001615.wlxnlp6xdquzrntt@ast-mbp>



On 4/22/18 5:16 PM, Alexei Starovoitov wrote:
> On Fri, Apr 20, 2018 at 03:18:37PM -0700, Yonghong Song wrote:
>> When helpers like bpf_get_stack returns an int value
>> and later on used for arithmetic computation, the LSH and ARSH
>> operations are often required to get proper sign extension into
>> 64-bit. For example, without this patch:
>>      54: R0=inv(id=0,umax_value=800)
>>      54: (bf) r8 = r0
>>      55: R0=inv(id=0,umax_value=800) R8_w=inv(id=0,umax_value=800)
>>      55: (67) r8 <<= 32
>>      56: R8_w=inv(id=0,umax_value=3435973836800,var_off=(0x0; 0x3ff00000000))
>>      56: (c7) r8 s>>= 32
>>      57: R8=inv(id=0)
>> With this patch:
>>      54: R0=inv(id=0,umax_value=800)
>>      54: (bf) r8 = r0
>>      55: R0=inv(id=0,umax_value=800) R8_w=inv(id=0,umax_value=800)
>>      55: (67) r8 <<= 32
>>      56: R8_w=inv(id=0,umax_value=3435973836800,var_off=(0x0; 0x3ff00000000))
>>      56: (c7) r8 s>>= 32
>>      57: R8=inv(id=0, umax_value=800,var_off=(0x0; 0x3ff))
>> With better range of "R8", later on when "R8" is added to other register,
>> e.g., a map pointer or scalar-value register, the better register
>> range can be derived and verifier failure may be avoided.
>>
>> In our later example,
>>      ......
>>      usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
>>      if (usize < 0)
>>          return 0;
>>      ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0);
>>      ......
>> Without improving ARSH value range tracking, the register representing
>> "max_len - usize" will have smin_value equal to S64_MIN and will be
>> rejected by verifier.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   kernel/bpf/verifier.c | 26 ++++++++++++++++++++++++++
>>   1 file changed, 26 insertions(+)
>>
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 3c8bb92..01c215d 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -2975,6 +2975,32 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
>>   		/* We may learn something more from the var_off */
>>   		__update_reg_bounds(dst_reg);
>>   		break;
>> +	case BPF_ARSH:
>> +		if (umax_val >= insn_bitness) {
>> +			/* Shifts greater than 31 or 63 are undefined.
>> +			 * This includes shifts by a negative number.
>> +			 */
>> +			mark_reg_unknown(env, regs, insn->dst_reg);
>> +			break;
>> +		}
>> +		if (dst_reg->smin_value < 0)
>> +			dst_reg->smin_value >>= umin_val;
>> +		else
>> +			dst_reg->smin_value >>= umax_val;
>> +		if (dst_reg->smax_value < 0)
>> +			dst_reg->smax_value >>= umax_val;
>> +		else
>> +			dst_reg->smax_value >>= umin_val;
>> +		if (src_known)
>> +			dst_reg->var_off = tnum_rshift(dst_reg->var_off,
>> +						       umin_val);
>> +		else
>> +			dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val);
>> +		dst_reg->umin_value >>= umax_val;
>> +		dst_reg->umax_value >>= umin_val;
>> +		/* We may learn something more from the var_off */
>> +		__update_reg_bounds(dst_reg);
> 
> I'm struggling to understand how these bounds are computed.
> Could you add examples in the comments?

Okay, let me try to add some comments for better understanding.

> In particular if dst_reg is unknown (tnum.mask == -1)
> the above tnum_rshift() will clear upper bits and will make it
> 64-bit positive, but that doesn't seem correct.
> What am I missing?

Considering this is an arithmetic shift, we probably should just have
dst_reg->var_off = tnum_unknown to be conservative.

I could be missing something here as well. Let me try to write a more
detailed explanation, hopefully covering all corner cases.

^ permalink raw reply

* Re: [PATCH bpf-next v3 3/9] bpf/verifier: refine retval R0 state for bpf_get_stack helper
From: Yonghong Song @ 2018-04-23  2:46 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: ast, daniel, netdev, kernel-team
In-Reply-To: <20180422235538.5tqayfahfeqanfou@ast-mbp>



On 4/22/18 4:55 PM, Alexei Starovoitov wrote:
> On Fri, Apr 20, 2018 at 03:18:36PM -0700, Yonghong Song wrote:
>> The special property of return values for helpers bpf_get_stack
>> and bpf_probe_read_str are captured in verifier.
>> Both helpers return a negative error code or
>> a length, which is equal to or smaller than the buffer
>> size argument. This additional information in the
>> verifier can avoid the condition such as "retval > bufsize"
>> in the bpf program. For example, for the code below,
>>      usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
>>      if (usize < 0 || usize > max_len)
>>          return 0;
>> The verifier may have the following errors:
>>      52: (85) call bpf_get_stack#65
>>       R0=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R1_w=ctx(id=0,off=0,imm=0)
>>       R2_w=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R3_w=inv800 R4_w=inv256
>>       R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
>>       R9_w=inv800 R10=fp0,call_-1
>>      53: (bf) r8 = r0
>>      54: (bf) r1 = r8
>>      55: (67) r1 <<= 32
>>      56: (bf) r2 = r1
>>      57: (77) r2 >>= 32
>>      58: (25) if r2 > 0x31f goto pc+33
>>       R0=inv(id=0) R1=inv(id=0,smax_value=9223372032559808512,
>>                           umax_value=18446744069414584320,
>>                           var_off=(0x0; 0xffffffff00000000))
>>       R2=inv(id=0,umax_value=799,var_off=(0x0; 0x3ff))
>>       R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
>>       R8=inv(id=0) R9=inv800 R10=fp0,call_-1
>>      59: (1f) r9 -= r8
>>      60: (c7) r1 s>>= 32
>>      61: (bf) r2 = r7
>>      62: (0f) r2 += r1
>>      math between map_value pointer and register with unbounded
>>      min value is not allowed
>> The failure is due to llvm compiler optimization where register "r2",
>> which is a copy of "r1", is tested for condition while later on "r1"
>> is used for map_ptr operation. The verifier is not able to track such
>> inst sequence effectively.
>>
>> Without the "usize > max_len" condition, there is no llvm optimization
>> and the below generated code passed verifier:
>>      52: (85) call bpf_get_stack#65
>>       R0=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R1_w=ctx(id=0,off=0,imm=0)
>>       R2_w=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R3_w=inv800 R4_w=inv256
>>       R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
>>       R9_w=inv800 R10=fp0,call_-1
>>      53: (b7) r1 = 0
>>      54: (bf) r8 = r0
>>      55: (67) r8 <<= 32
>>      56: (c7) r8 s>>= 32
>>      57: (6d) if r1 s> r8 goto pc+24
>>       R0=inv(id=0,umax_value=800) R1=inv0 R6=ctx(id=0,off=0,imm=0)
>>       R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
>>       R8=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff)) R9=inv800
>>       R10=fp0,call_-1
>>      58: (bf) r2 = r7
>>      59: (0f) r2 += r8
>>      60: (1f) r9 -= r8
>>      61: (bf) r1 = r6
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   kernel/bpf/verifier.c | 27 +++++++++++++++++++++++++++
>>   1 file changed, 27 insertions(+)
>>
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index aba9425..3c8bb92 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -164,6 +164,8 @@ struct bpf_call_arg_meta {
>>   	bool pkt_access;
>>   	int regno;
>>   	int access_size;
>> +	s64 msize_smax_value;
>> +	u64 msize_umax_value;
>>   };
>>   
>>   static DEFINE_MUTEX(bpf_verifier_lock);
>> @@ -2027,6 +2029,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
>>   		err = check_helper_mem_access(env, regno - 1,
>>   					      reg->umax_value,
>>   					      zero_size_allowed, meta);
>> +
>> +		if (!err && !!meta) {
> 
> Please drop !! in the above.
> 
> Also what happens when
> if (!tnum_is_const(reg->var_off))
>    meta = NULL;
> ?
> it seems two new fields of meta will stay zero initialized
> that later do_refine_retval_range() will set R0->umax_value = 0
> which seems incorrect.

Thanks for catching this. In do_refine_retval_range(), if meta is NULL,
the function should just return. Otherwise, a page fault will happen.

> 
>> +			/* remember the mem_size which may be used later
>> +			 * to refine return values.
>> +			 */
>> +			meta->msize_smax_value = reg->smax_value;
>> +			meta->msize_umax_value = reg->umax_value;
>> +		}
>>   	}
>>   
>>   	return err;
>> @@ -2333,6 +2343,21 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
>>   	return 0;
>>   }
>>   
>> +static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
>> +				   int func_id,
>> +				   struct bpf_call_arg_meta *meta)
>> +{
>> +	struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
>> +
>> +	if (ret_type != RET_INTEGER ||
>> +	    (func_id != BPF_FUNC_get_stack &&
>> +	     func_id != BPF_FUNC_probe_read_str))
>> +		return;
>> +
>> +	ret_reg->smax_value = meta->msize_smax_value;
>> +	ret_reg->umax_value = meta->msize_umax_value;
>> +}
>> +
>>   static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
>>   {
>>   	const struct bpf_func_proto *fn = NULL;
>> @@ -2456,6 +2481,8 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
>>   		return -EINVAL;
>>   	}
>>   
>> +	do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
>> +
>>   	err = check_map_func_compatibility(env, meta.map_ptr, func_id);
>>   	if (err)
>>   		return err;
>> -- 
>> 2.9.5
>>

^ permalink raw reply

* Re: [PATCH net-next 2/2] netns: isolate seqnums to use per-netns locks
From: kbuild test robot @ 2018-04-23  2:39 UTC (permalink / raw)
  To: Christian Brauner
  Cc: kbuild-all, ebiederm, davem, netdev, linux-kernel, avagin, ktkhai,
	serge, gregkh, Christian Brauner
In-Reply-To: <20180418152106.18519-3-christian.brauner@ubuntu.com>

[-- Attachment #1: Type: text/plain, Size: 966 bytes --]

Hi Christian,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Christian-Brauner/netns-uevent-performance-tweaks/20180420-013717
config: alpha-alldefconfig (attached as .config)
compiler: alpha-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=alpha 

All errors (new ones prefixed by >>):

   kernel/ksysfs.o: In function `uevent_seqnum_show':
>> (.text+0x18c): undefined reference to `get_ns_uevent_seqnum_by_vpid'
   (.text+0x19c): undefined reference to `get_ns_uevent_seqnum_by_vpid'

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 6435 bytes --]

^ permalink raw reply

* Re: [PATCH 1/6] rhashtable: remove outdated comments about grow_decision etc
From: NeilBrown @ 2018-04-23  1:41 UTC (permalink / raw)
  To: David Miller; +Cc: herbert, tgraf, netdev, linux-kernel
In-Reply-To: <20180418.214737.1146004592657647139.davem@davemloft.net>

[-- Attachment #1: Type: text/plain, Size: 1134 bytes --]

On Wed, Apr 18 2018, David Miller wrote:

> From: NeilBrown <neilb@suse.com>
> Date: Thu, 19 Apr 2018 09:09:05 +1000
>
>> On Wed, Apr 18 2018, Herbert Xu wrote:
>> 
>>> On Wed, Apr 18, 2018 at 04:47:01PM +1000, NeilBrown wrote:
>>>> grow_decision and shrink_decision no longer exist, so remove
>>>> the remaining references to them.
>>>> 
>>>> Signed-off-by: NeilBrown <neilb@suse.com>
>>>
>>> Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
>> 
>> Thanks.  Is that Ack sufficient for this patch to go upstream, or is
>> there something else that I need to do?
>
> One patch being ACK'd does not release the whole series to be applied
> and the whole series will be treated as a complete unit for that
> purpose.
>
> So if discussion is holding up one patch in the series, it holds up
> the entire series.
>
> So get the entire series in acceptable condition, or submit only one
> change at a time individually and wait for that one to be accepted
> before you submit and ask for feedback on the next one.
>
> I hope that makes things clear for you.

Yes, nice and clear.  Thanks a lot!

NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* Re: [PATCH 2/6] rhashtable: remove incorrect comment on r{hl, hash}table_walk_enter()
From: NeilBrown @ 2018-04-23  1:39 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Thomas Graf, netdev, linux-kernel
In-Reply-To: <20180419032237.yjvjuh6n7n6tggtr@gondor.apana.org.au>

[-- Attachment #1: Type: text/plain, Size: 3166 bytes --]

On Thu, Apr 19 2018, Herbert Xu wrote:

> On Thu, Apr 19, 2018 at 08:56:28AM +1000, NeilBrown wrote:
>>
>> I don't want to do that - I just want the documentation to be correct
>> (or at least, not be blatantly incorrect).  The function does not sleep,
>> and is safe to call with spin locks held.
>> Do we need to spell out when it can be called?  If so, maybe:
>> 
>>    This function may be called from any process context, including
>>    non-preemptable context, but cannot be called from interrupts.
>
> Just to make it perfectly clear, how about "cannot be called from
> softirq or hardirq context"? Previously the not able to sleep part
> completely ruled out any ambiguity but the new wording could confuse
> people into thinking that this can be called from softirq context
> where it would be unsafe if mixed with process context usage.
>

Sounds fair.  Could you Ack the following?  Then I'll resend all the
patches that have an ack.
I've realised that the "further improve stability of rhashtable_walk"
patch isn't actually complete, so I'll withdraw that for now.

Thanks,
NeilBrown

From e16c037398b6c057787437f3a0aaa7fd44c3bee3 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Mon, 16 Apr 2018 11:05:39 +1000
Subject: [PATCH] rhashtable: Revise incorrect comment on
 r{hl,hash}table_walk_enter()

Neither rhashtable_walk_enter() nor rhltable_walk_enter() sleeps, though
they do take a spinlock without irq protection.
So revise the comments to accurately state the contexts in which
these functions can be called.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable.h | 5 +++--
 lib/rhashtable.c           | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 87d443a5b11d..4e1f535c2034 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -1268,8 +1268,9 @@ static inline int rhashtable_walk_init(struct rhashtable *ht,
  * For a completely stable walk you should construct your own data
  * structure outside the hash table.
  *
- * This function may sleep so you must not call it from interrupt
- * context or with spin locks held.
+ * This function may be called from any process context, including
+ * non-preemptable context, but cannot be called from softirq or
+ * hardirq context.
  *
  * You must call rhashtable_walk_exit after this function returns.
  */
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 2b2b79974b61..6d490f51174e 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -668,8 +668,9 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
  * For a completely stable walk you should construct your own data
  * structure outside the hash table.
  *
- * This function may sleep so you must not call it from interrupt
- * context or with spin locks held.
+ * This function may be called from any process context, including
+ * non-preemptable context, but cannot be called from softirq or
+ * hardirq context.
  *
  * You must call rhashtable_walk_exit after this function returns.
  */
-- 
2.14.0.rc0.dirty


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply related

* [PATCH net] ipv6: add RTA_TABLE and RTA_PREFSRC to rtm_ipv6_policy
From: Eric Dumazet @ 2018-04-23  1:29 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Eric Dumazet

KMSAN reported use of uninit-value that I tracked to lack
of proper size check on RTA_TABLE attribute.

I also believe RTA_PREFSRC lacks a similar check.

Fixes: 86872cb57925 ("[IPv6] route: FIB6 configuration using struct fib6_config")
Fixes: c3968a857a6b ("ipv6: RTA_PREFSRC support for ipv6 route source address selection")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
---
 net/ipv6/route.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 49b954d6d0fa44ea0c4427e2918b3ab9c1610fe0..cde7d8251377c1a115e02c46843d361d3c0b4313 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3975,6 +3975,7 @@ void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
 
 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
+	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
 	[RTA_OIF]               = { .type = NLA_U32 },
 	[RTA_IIF]		= { .type = NLA_U32 },
 	[RTA_PRIORITY]          = { .type = NLA_U32 },
@@ -3986,6 +3987,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
 	[RTA_EXPIRES]		= { .type = NLA_U32 },
 	[RTA_UID]		= { .type = NLA_U32 },
 	[RTA_MARK]		= { .type = NLA_U32 },
+	[RTA_TABLE]		= { .type = NLA_U32 },
 };
 
 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
-- 
2.17.0.484.g0c8726318c-goog

^ permalink raw reply related

* Re: pull-request: bpf 2018-04-21
From: David Miller @ 2018-04-23  1:16 UTC (permalink / raw)
  To: daniel; +Cc: ast, netdev
In-Reply-To: <20180421002224.3881-1-daniel@iogearbox.net>

From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 21 Apr 2018 02:22:24 +0200

> The following pull-request contains BPF updates for your *net* tree.
> 
> The main changes are:
> 
> 1) Fix a deadlock between mm->mmap_sem and bpf_event_mutex when
>    one task is detaching a BPF prog via perf_event_detach_bpf_prog()
>    and another one dumping through bpf_prog_array_copy_info(). For
>    the latter we move the copy_to_user() out of the bpf_event_mutex
>    lock to fix it, from Yonghong.
> 
> 2) Fix test_sock and test_sock_addr.sh failures. The former was
>    hitting rlimit issues and the latter required ping to specify
>    the address family, from Yonghong.
> 
> 3) Remove a dead check in sockmap's sock_map_alloc(), from Jann.
> 
> 4) Add generated files to BPF kselftests gitignore that were previously
>    missed, from Anders.
> 
> Please consider pulling these changes from:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git

Pulled, thanks Daniel.

^ permalink raw reply

* Re: [PATCH net] ibmvnic: Clean actual number of RX or TX pools
From: David Miller @ 2018-04-23  1:13 UTC (permalink / raw)
  To: tlfalcon; +Cc: netdev, nfont, jallen, linuxppc-dev
In-Reply-To: <1524252332-10272-1-git-send-email-tlfalcon@linux.vnet.ibm.com>

From: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Date: Fri, 20 Apr 2018 14:25:32 -0500

> Avoid using value stored in the login response buffer when
> cleaning TX and RX buffer pools since these could be inconsistent
> depending on the device state. Instead use the field in the driver's
> private data that tracks the number of active pools.
> 
> Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>

Applied.

^ permalink raw reply

* Re: [PATCHv4 net 0/3] net: sched: ife: malformed ife packet fixes
From: David Miller @ 2018-04-23  1:12 UTC (permalink / raw)
  To: aring; +Cc: yotam.gi, jhs, xiyou.wangcong, jiri, yuvalm, netdev, kernel
In-Reply-To: <20180420191505.27633-1-aring@mojatatu.com>

From: Alexander Aring <aring@mojatatu.com>
Date: Fri, 20 Apr 2018 15:15:02 -0400

> As promised at netdev 2.2 tc workshop I am working on adding scapy support for
> tdc testing. It is still work in progress. I will submit the patches to tdc
> later (they are not in good shape yet). The good news is I have been able to
> find bugs which normal packet testing would not be able to find.
> With fuzzy testing I was able to craft certain malformed packets that IFE
> action was not able to deal with. This patch set fixes those bugs.

Series applied and queued up for -stable.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox