Netdev List
 help / color / mirror / Atom feed
* [RFC][PATCH 12/13] new helper: iov_iter_kvec()
From: Al Viro @ 2014-12-04 20:23 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, linux-fsdevel, netdev
In-Reply-To: <20141204202011.GO29748@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

initialization of kvec-backed iov_iter

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/uio.h |  2 ++
 mm/iov_iter.c       | 13 +++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 28ed2d9..c567655 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -87,6 +87,8 @@ size_t iov_iter_zero(size_t bytes, struct iov_iter *);
 unsigned long iov_iter_alignment(const struct iov_iter *i);
 void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov,
 			unsigned long nr_segs, size_t count);
+void iov_iter_kvec(struct iov_iter *i, int direction, const struct kvec *iov,
+			unsigned long nr_segs, size_t count);
 ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
 			size_t maxsize, unsigned maxpages, size_t *start);
 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index 5a3b4a8..7c04051 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -479,6 +479,19 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i)
 }
 EXPORT_SYMBOL(iov_iter_single_seg_count);
 
+void iov_iter_kvec(struct iov_iter *i, int direction,
+			const struct kvec *iov, unsigned long nr_segs,
+			size_t count)
+{
+	BUG_ON(!(direction & ITER_KVEC));
+	i->type = direction;
+	i->kvec = (struct kvec *)iov;
+	i->nr_segs = nr_segs;
+	i->iov_offset = 0;
+	i->count = count;
+}
+EXPORT_SYMBOL(iov_iter_kvec);
+
 unsigned long iov_iter_alignment(const struct iov_iter *i)
 {
 	unsigned long res = 0;
-- 
2.1.3

^ permalink raw reply related

* [RFC][PATCH 11/13] csum_and_copy_..._iter()
From: Al Viro @ 2014-12-04 20:23 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, linux-fsdevel, netdev
In-Reply-To: <20141204202011.GO29748@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/uio.h |  2 ++
 mm/iov_iter.c       | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+)

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 6e16945..28ed2d9 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -124,6 +124,8 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count)
 {
 	i->count = count;
 }
+size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
+size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
 
 int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len);
 int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len);
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index d74de6d..5a3b4a8 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -3,6 +3,7 @@
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
+#include <net/checksum.h>
 
 #define iterate_iovec(i, n, __v, __p, skip, STEP) {	\
 	size_t left;					\
@@ -605,6 +606,94 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
 }
 EXPORT_SYMBOL(iov_iter_get_pages_alloc);
 
+size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
+			       struct iov_iter *i)
+{
+	char *to = addr;
+	__wsum sum, next;
+	size_t off = 0;
+	if (unlikely(bytes > i->count))
+		bytes = i->count;
+
+	if (unlikely(!bytes))
+		return 0;
+
+	sum = *csum;
+	iterate_and_advance(i, bytes, v, ({
+		int err = 0;
+		next = csum_and_copy_from_user(v.iov_base, 
+					       (to += v.iov_len) - v.iov_len,
+					       v.iov_len, 0, &err);
+		if (!err) {
+			sum = csum_block_add(sum, next, off);
+			off += v.iov_len;
+		}
+		err ? v.iov_len : 0;
+	}), ({
+		char *p = kmap_atomic(v.bv_page);
+		next = csum_partial_copy_nocheck(p + v.bv_offset,
+						 (to += v.bv_len) - v.bv_len,
+						 v.bv_len, 0);
+		kunmap_atomic(p);
+		sum = csum_block_add(sum, next, off);
+		off += v.bv_len;
+	}),({
+		next = csum_partial_copy_nocheck(v.iov_base,
+						 (to += v.iov_len) - v.iov_len,
+						 v.iov_len, 0);
+		sum = csum_block_add(sum, next, off);
+		off += v.iov_len;
+	})
+	)
+	*csum = sum;
+	return bytes;
+}
+EXPORT_SYMBOL(csum_and_copy_from_iter);
+
+size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum,
+			     struct iov_iter *i)
+{
+	char *from = addr;
+	__wsum sum, next;
+	size_t off = 0;
+	if (unlikely(bytes > i->count))
+		bytes = i->count;
+
+	if (unlikely(!bytes))
+		return 0;
+
+	sum = *csum;
+	iterate_and_advance(i, bytes, v, ({
+		int err = 0;
+		next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len,
+					     v.iov_base, 
+					     v.iov_len, 0, &err);
+		if (!err) {
+			sum = csum_block_add(sum, next, off);
+			off += v.iov_len;
+		}
+		err ? v.iov_len : 0;
+	}), ({
+		char *p = kmap_atomic(v.bv_page);
+		next = csum_partial_copy_nocheck((from += v.bv_len) - v.bv_len,
+						 p + v.bv_offset,
+						 v.bv_len, 0);
+		kunmap_atomic(p);
+		sum = csum_block_add(sum, next, off);
+		off += v.bv_len;
+	}),({
+		next = csum_partial_copy_nocheck((from += v.iov_len) - v.iov_len,
+						 v.iov_base,
+						 v.iov_len, 0);
+		sum = csum_block_add(sum, next, off);
+		off += v.iov_len;
+	})
+	)
+	*csum = sum;
+	return bytes;
+}
+EXPORT_SYMBOL(csum_and_copy_to_iter);
+
 int iov_iter_npages(const struct iov_iter *i, int maxpages)
 {
 	size_t size = i->count;
-- 
2.1.3

^ permalink raw reply related

* [RFC][PATCH 10/13] iov_iter.c: handle ITER_KVEC directly
From: Al Viro @ 2014-12-04 20:23 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, linux-fsdevel, netdev
In-Reply-To: <20141204202011.GO29748@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

... without bothering with copy_..._user()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/uio.h |   1 +
 mm/iov_iter.c       | 101 +++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 89 insertions(+), 13 deletions(-)

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 9b15814..6e16945 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -31,6 +31,7 @@ struct iov_iter {
 	size_t count;
 	union {
 		const struct iovec *iov;
+		const struct kvec *kvec;
 		const struct bio_vec *bvec;
 	};
 	unsigned long nr_segs;
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index 66665449..d74de6d 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -32,6 +32,29 @@
 	n = wanted - n;					\
 }
 
+#define iterate_kvec(i, n, __v, __p, skip, STEP) {	\
+	size_t wanted = n;				\
+	__p = i->kvec;					\
+	__v.iov_len = min(n, __p->iov_len - skip);	\
+	if (likely(__v.iov_len)) {			\
+		__v.iov_base = __p->iov_base + skip;	\
+		(void)(STEP);				\
+		skip += __v.iov_len;			\
+		n -= __v.iov_len;			\
+	}						\
+	while (unlikely(n)) {				\
+		__p++;					\
+		__v.iov_len = min(n, __p->iov_len);	\
+		if (unlikely(!__v.iov_len))		\
+			continue;			\
+		__v.iov_base = __p->iov_base;		\
+		(void)(STEP);				\
+		skip = __v.iov_len;			\
+		n -= __v.iov_len;			\
+	}						\
+	n = wanted;					\
+}
+
 #define iterate_bvec(i, n, __v, __p, skip, STEP) {	\
 	size_t wanted = n;				\
 	__p = i->bvec;					\
@@ -57,12 +80,16 @@
 	n = wanted;					\
 }
 
-#define iterate_all_kinds(i, n, v, I, B) {			\
+#define iterate_all_kinds(i, n, v, I, B, K) {			\
 	size_t skip = i->iov_offset;				\
 	if (unlikely(i->type & ITER_BVEC)) {			\
 		const struct bio_vec *bvec;			\
 		struct bio_vec v;				\
 		iterate_bvec(i, n, v, bvec, skip, (B))		\
+	} else if (unlikely(i->type & ITER_KVEC)) {		\
+		const struct kvec *kvec;			\
+		struct kvec v;					\
+		iterate_kvec(i, n, v, kvec, skip, (K))		\
 	} else {						\
 		const struct iovec *iov;			\
 		struct iovec v;					\
@@ -70,7 +97,7 @@
 	}							\
 }
 
-#define iterate_and_advance(i, n, v, I, B) {			\
+#define iterate_and_advance(i, n, v, I, B, K) {			\
 	size_t skip = i->iov_offset;				\
 	if (unlikely(i->type & ITER_BVEC)) {			\
 		const struct bio_vec *bvec;			\
@@ -82,6 +109,16 @@
 		}						\
 		i->nr_segs -= bvec - i->bvec;			\
 		i->bvec = bvec;					\
+	} else if (unlikely(i->type & ITER_KVEC)) {		\
+		const struct kvec *kvec;			\
+		struct kvec v;					\
+		iterate_kvec(i, n, v, kvec, skip, (K))		\
+		if (skip == kvec->iov_len) {			\
+			kvec++;					\
+			skip = 0;				\
+		}						\
+		i->nr_segs -= kvec - i->kvec;			\
+		i->kvec = kvec;					\
 	} else {						\
 		const struct iovec *iov;			\
 		struct iovec v;					\
@@ -270,7 +307,7 @@ done:
  */
 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
 {
-	if (!(i->type & ITER_BVEC)) {
+	if (!(i->type & (ITER_BVEC|ITER_KVEC))) {
 		char __user *buf = i->iov->iov_base + i->iov_offset;
 		bytes = min(bytes, i->iov->iov_len - i->iov_offset);
 		return fault_in_pages_readable(buf, bytes);
@@ -284,10 +321,14 @@ void iov_iter_init(struct iov_iter *i, int direction,
 			size_t count)
 {
 	/* It will get better.  Eventually... */
-	if (segment_eq(get_fs(), KERNEL_DS))
+	if (segment_eq(get_fs(), KERNEL_DS)) {
 		direction |= ITER_KVEC;
-	i->type = direction;
-	i->iov = iov;
+		i->type = direction;
+		i->kvec = (struct kvec *)iov;
+	} else {
+		i->type = direction;
+		i->iov = iov;
+	}
 	i->nr_segs = nr_segs;
 	i->iov_offset = 0;
 	i->count = count;
@@ -328,7 +369,8 @@ size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i)
 		__copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len,
 			       v.iov_len),
 		memcpy_to_page(v.bv_page, v.bv_offset,
-			       (from += v.bv_len) - v.bv_len, v.bv_len)
+			       (from += v.bv_len) - v.bv_len, v.bv_len),
+		memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len)
 	)
 
 	return bytes;
@@ -348,7 +390,8 @@ size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 		__copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base,
 				 v.iov_len),
 		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
-				 v.bv_offset, v.bv_len)
+				 v.bv_offset, v.bv_len),
+		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
 	)
 
 	return bytes;
@@ -371,7 +414,7 @@ EXPORT_SYMBOL(copy_page_to_iter);
 size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
 			 struct iov_iter *i)
 {
-	if (i->type & ITER_BVEC) {
+	if (i->type & (ITER_BVEC|ITER_KVEC)) {
 		void *kaddr = kmap_atomic(page);
 		size_t wanted = copy_from_iter(kaddr + offset, bytes, i);
 		kunmap_atomic(kaddr);
@@ -391,7 +434,8 @@ size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
 
 	iterate_and_advance(i, bytes, v,
 		__clear_user(v.iov_base, v.iov_len),
-		memzero_page(v.bv_page, v.bv_offset, v.bv_len)
+		memzero_page(v.bv_page, v.bv_offset, v.bv_len),
+		memset(v.iov_base, 0, v.iov_len)
 	)
 
 	return bytes;
@@ -406,7 +450,8 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
 		__copy_from_user_inatomic((p += v.iov_len) - v.iov_len,
 					  v.iov_base, v.iov_len),
 		memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
-				 v.bv_offset, v.bv_len)
+				 v.bv_offset, v.bv_len),
+		memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
 	)
 	kunmap_atomic(kaddr);
 	return bytes;
@@ -415,7 +460,7 @@ EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
 
 void iov_iter_advance(struct iov_iter *i, size_t size)
 {
-	iterate_and_advance(i, size, v, 0, 0)
+	iterate_and_advance(i, size, v, 0, 0, 0)
 }
 EXPORT_SYMBOL(iov_iter_advance);
 
@@ -443,7 +488,8 @@ unsigned long iov_iter_alignment(const struct iov_iter *i)
 
 	iterate_all_kinds(i, size, v,
 		(res |= (unsigned long)v.iov_base | v.iov_len, 0),
-		res |= v.bv_offset | v.bv_len
+		res |= v.bv_offset | v.bv_len,
+		res |= (unsigned long)v.iov_base | v.iov_len
 	)
 	return res;
 }
@@ -478,6 +524,16 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
 		*start = v.bv_offset;
 		get_page(*pages = v.bv_page);
 		return v.bv_len;
+	}),({
+		unsigned long addr = (unsigned long)v.iov_base, end;
+		size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
+
+		if (len > maxpages * PAGE_SIZE)
+			len = maxpages * PAGE_SIZE;
+		addr &= ~(PAGE_SIZE - 1);
+		for (end = addr + len; addr < end; addr += PAGE_SIZE)
+			get_page(*pages++ = virt_to_page(addr));
+		return len - *start;
 	})
 	)
 	return 0;
@@ -530,6 +586,19 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
 			return -ENOMEM;
 		get_page(*p = v.bv_page);
 		return v.bv_len;
+	}),({
+		unsigned long addr = (unsigned long)v.iov_base, end;
+		size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
+		int n;
+
+		addr &= ~(PAGE_SIZE - 1);
+		n = DIV_ROUND_UP(len, PAGE_SIZE);
+		*pages = p = get_pages_array(n);
+		if (!p)
+			return -ENOMEM;
+		for (end = addr + len; addr < end; addr += PAGE_SIZE)
+			get_page(*p++ = virt_to_page(addr));
+		return len - *start;
 	})
 	)
 	return 0;
@@ -554,6 +623,12 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages)
 		npages++;
 		if (npages >= maxpages)
 			return maxpages;
+	}),({
+		unsigned long p = (unsigned long)v.iov_base;
+		npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
+			- p / PAGE_SIZE;
+		if (npages >= maxpages)
+			return maxpages;
 	})
 	)
 	return npages;
-- 
2.1.3

^ permalink raw reply related

* [RFC][PATCH 09/13] iov_iter.c: convert copy_to_iter() to iterate_and_advance
From: Al Viro @ 2014-12-04 20:23 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, linux-fsdevel, netdev
In-Reply-To: <20141204202011.GO29748@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/iov_iter.c | 91 ++++++-----------------------------------------------------
 1 file changed, 9 insertions(+), 82 deletions(-)

diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index 791429d..66665449 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -97,51 +97,6 @@
 	i->iov_offset = skip;					\
 }
 
-static size_t copy_to_iter_iovec(void *from, size_t bytes, struct iov_iter *i)
-{
-	size_t skip, copy, left, wanted;
-	const struct iovec *iov;
-	char __user *buf;
-
-	if (unlikely(bytes > i->count))
-		bytes = i->count;
-
-	if (unlikely(!bytes))
-		return 0;
-
-	wanted = bytes;
-	iov = i->iov;
-	skip = i->iov_offset;
-	buf = iov->iov_base + skip;
-	copy = min(bytes, iov->iov_len - skip);
-
-	left = __copy_to_user(buf, from, copy);
-	copy -= left;
-	skip += copy;
-	from += copy;
-	bytes -= copy;
-	while (unlikely(!left && bytes)) {
-		iov++;
-		buf = iov->iov_base;
-		copy = min(bytes, iov->iov_len);
-		left = __copy_to_user(buf, from, copy);
-		copy -= left;
-		skip = copy;
-		from += copy;
-		bytes -= copy;
-	}
-
-	if (skip == iov->iov_len) {
-		iov++;
-		skip = 0;
-	}
-	i->count -= wanted - bytes;
-	i->nr_segs -= iov - i->iov;
-	i->iov = iov;
-	i->iov_offset = skip;
-	return wanted - bytes;
-}
-
 static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
 			 struct iov_iter *i)
 {
@@ -360,51 +315,23 @@ static void memzero_page(struct page *page, size_t offset, size_t len)
 	kunmap_atomic(addr);
 }
 
-static size_t copy_to_iter_bvec(void *from, size_t bytes, struct iov_iter *i)
+size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i)
 {
-	size_t skip, copy, wanted;
-	const struct bio_vec *bvec;
-
+	char *from = addr;
 	if (unlikely(bytes > i->count))
 		bytes = i->count;
 
 	if (unlikely(!bytes))
 		return 0;
 
-	wanted = bytes;
-	bvec = i->bvec;
-	skip = i->iov_offset;
-	copy = min_t(size_t, bytes, bvec->bv_len - skip);
-
-	memcpy_to_page(bvec->bv_page, skip + bvec->bv_offset, from, copy);
-	skip += copy;
-	from += copy;
-	bytes -= copy;
-	while (bytes) {
-		bvec++;
-		copy = min(bytes, (size_t)bvec->bv_len);
-		memcpy_to_page(bvec->bv_page, bvec->bv_offset, from, copy);
-		skip = copy;
-		from += copy;
-		bytes -= copy;
-	}
-	if (skip == bvec->bv_len) {
-		bvec++;
-		skip = 0;
-	}
-	i->count -= wanted - bytes;
-	i->nr_segs -= bvec - i->bvec;
-	i->bvec = bvec;
-	i->iov_offset = skip;
-	return wanted - bytes;
-}
+	iterate_and_advance(i, bytes, v,
+		__copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len,
+			       v.iov_len),
+		memcpy_to_page(v.bv_page, v.bv_offset,
+			       (from += v.bv_len) - v.bv_len, v.bv_len)
+	)
 
-size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i)
-{
-	if (i->type & ITER_BVEC)
-		return copy_to_iter_bvec(addr, bytes, i);
-	else
-		return copy_to_iter_iovec(addr, bytes, i);
+	return bytes;
 }
 EXPORT_SYMBOL(copy_to_iter);
 
-- 
2.1.3

^ permalink raw reply related

* [RFC][PATCH 08/13] iov_iter.c: convert copy_from_iter() to iterate_and_advance
From: Al Viro @ 2014-12-04 20:23 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, linux-fsdevel, netdev
In-Reply-To: <20141204202011.GO29748@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/iov_iter.c | 106 +++++++++-------------------------------------------------
 1 file changed, 15 insertions(+), 91 deletions(-)

diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index 17b7144..791429d 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -142,51 +142,6 @@ static size_t copy_to_iter_iovec(void *from, size_t bytes, struct iov_iter *i)
 	return wanted - bytes;
 }
 
-static size_t copy_from_iter_iovec(void *to, size_t bytes, struct iov_iter *i)
-{
-	size_t skip, copy, left, wanted;
-	const struct iovec *iov;
-	char __user *buf;
-
-	if (unlikely(bytes > i->count))
-		bytes = i->count;
-
-	if (unlikely(!bytes))
-		return 0;
-
-	wanted = bytes;
-	iov = i->iov;
-	skip = i->iov_offset;
-	buf = iov->iov_base + skip;
-	copy = min(bytes, iov->iov_len - skip);
-
-	left = __copy_from_user(to, buf, copy);
-	copy -= left;
-	skip += copy;
-	to += copy;
-	bytes -= copy;
-	while (unlikely(!left && bytes)) {
-		iov++;
-		buf = iov->iov_base;
-		copy = min(bytes, iov->iov_len);
-		left = __copy_from_user(to, buf, copy);
-		copy -= left;
-		skip = copy;
-		to += copy;
-		bytes -= copy;
-	}
-
-	if (skip == iov->iov_len) {
-		iov++;
-		skip = 0;
-	}
-	i->count -= wanted - bytes;
-	i->nr_segs -= iov - i->iov;
-	i->iov = iov;
-	i->iov_offset = skip;
-	return wanted - bytes;
-}
-
 static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
 			 struct iov_iter *i)
 {
@@ -444,48 +399,6 @@ static size_t copy_to_iter_bvec(void *from, size_t bytes, struct iov_iter *i)
 	return wanted - bytes;
 }
 
-static size_t copy_from_iter_bvec(void *to, size_t bytes, struct iov_iter *i)
-{
-	size_t skip, copy, wanted;
-	const struct bio_vec *bvec;
-
-	if (unlikely(bytes > i->count))
-		bytes = i->count;
-
-	if (unlikely(!bytes))
-		return 0;
-
-	wanted = bytes;
-	bvec = i->bvec;
-	skip = i->iov_offset;
-
-	copy = min(bytes, bvec->bv_len - skip);
-
-	memcpy_from_page(to, bvec->bv_page, bvec->bv_offset + skip, copy);
-
-	to += copy;
-	skip += copy;
-	bytes -= copy;
-
-	while (bytes) {
-		bvec++;
-		copy = min(bytes, (size_t)bvec->bv_len);
-		memcpy_from_page(to, bvec->bv_page, bvec->bv_offset, copy);
-		skip = copy;
-		to += copy;
-		bytes -= copy;
-	}
-	if (skip == bvec->bv_len) {
-		bvec++;
-		skip = 0;
-	}
-	i->count -= wanted;
-	i->nr_segs -= bvec - i->bvec;
-	i->bvec = bvec;
-	i->iov_offset = skip;
-	return wanted;
-}
-
 size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i)
 {
 	if (i->type & ITER_BVEC)
@@ -497,10 +410,21 @@ EXPORT_SYMBOL(copy_to_iter);
 
 size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 {
-	if (i->type & ITER_BVEC)
-		return copy_from_iter_bvec(addr, bytes, i);
-	else
-		return copy_from_iter_iovec(addr, bytes, i);
+	char *to = addr;
+	if (unlikely(bytes > i->count))
+		bytes = i->count;
+
+	if (unlikely(!bytes))
+		return 0;
+
+	iterate_and_advance(i, bytes, v,
+		__copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base,
+				 v.iov_len),
+		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
+				 v.bv_offset, v.bv_len)
+	)
+
+	return bytes;
 }
 EXPORT_SYMBOL(copy_from_iter);
 
-- 
2.1.3

^ permalink raw reply related

* [RFC][PATCH 07/13] iov_iter.c: get rid of bvec_copy_page_{to,from}_iter()
From: Al Viro @ 2014-12-04 20:23 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, linux-fsdevel, netdev
In-Reply-To: <20141204202011.GO29748@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

Just have copy_page_{to,from}_iter() fall back to kmap_atomic +
copy_{to,from}_iter() + kunmap_atomic() in ITER_BVEC case.  As
the matter of fact, that's what we want to do for any iov_iter
kind that isn't blocking - e.g. ITER_KVEC will also go that way
once we recognize it on iov_iter.c primitives level

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/iov_iter.c | 60 ++++++++++++++++++++++++-----------------------------------
 1 file changed, 24 insertions(+), 36 deletions(-)

diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index 39ad713..17b7144 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -486,30 +486,33 @@ static size_t copy_from_iter_bvec(void *to, size_t bytes, struct iov_iter *i)
 	return wanted;
 }
 
-static size_t copy_page_to_iter_bvec(struct page *page, size_t offset,
-					size_t bytes, struct iov_iter *i)
+size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i)
 {
-	void *kaddr = kmap_atomic(page);
-	size_t wanted = copy_to_iter_bvec(kaddr + offset, bytes, i);
-	kunmap_atomic(kaddr);
-	return wanted;
+	if (i->type & ITER_BVEC)
+		return copy_to_iter_bvec(addr, bytes, i);
+	else
+		return copy_to_iter_iovec(addr, bytes, i);
 }
+EXPORT_SYMBOL(copy_to_iter);
 
-static size_t copy_page_from_iter_bvec(struct page *page, size_t offset,
-					size_t bytes, struct iov_iter *i)
+size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 {
-	void *kaddr = kmap_atomic(page);
-	size_t wanted = copy_from_iter_bvec(kaddr + offset, bytes, i);
-	kunmap_atomic(kaddr);
-	return wanted;
+	if (i->type & ITER_BVEC)
+		return copy_from_iter_bvec(addr, bytes, i);
+	else
+		return copy_from_iter_iovec(addr, bytes, i);
 }
+EXPORT_SYMBOL(copy_from_iter);
 
 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
 			 struct iov_iter *i)
 {
-	if (i->type & ITER_BVEC)
-		return copy_page_to_iter_bvec(page, offset, bytes, i);
-	else
+	if (i->type & (ITER_BVEC|ITER_KVEC)) {
+		void *kaddr = kmap_atomic(page);
+		size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
+		kunmap_atomic(kaddr);
+		return wanted;
+	} else
 		return copy_page_to_iter_iovec(page, offset, bytes, i);
 }
 EXPORT_SYMBOL(copy_page_to_iter);
@@ -517,31 +520,16 @@ EXPORT_SYMBOL(copy_page_to_iter);
 size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
 			 struct iov_iter *i)
 {
-	if (i->type & ITER_BVEC)
-		return copy_page_from_iter_bvec(page, offset, bytes, i);
-	else
+	if (i->type & ITER_BVEC) {
+		void *kaddr = kmap_atomic(page);
+		size_t wanted = copy_from_iter(kaddr + offset, bytes, i);
+		kunmap_atomic(kaddr);
+		return wanted;
+	} else
 		return copy_page_from_iter_iovec(page, offset, bytes, i);
 }
 EXPORT_SYMBOL(copy_page_from_iter);
 
-size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i)
-{
-	if (i->type & ITER_BVEC)
-		return copy_to_iter_bvec(addr, bytes, i);
-	else
-		return copy_to_iter_iovec(addr, bytes, i);
-}
-EXPORT_SYMBOL(copy_to_iter);
-
-size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
-{
-	if (i->type & ITER_BVEC)
-		return copy_from_iter_bvec(addr, bytes, i);
-	else
-		return copy_from_iter_iovec(addr, bytes, i);
-}
-EXPORT_SYMBOL(copy_from_iter);
-
 size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
 {
 	if (unlikely(bytes > i->count))
-- 
2.1.3

^ permalink raw reply related

* [RFC][PATCH 06/13] iov_iter.c: convert iov_iter_zero() to iterate_and_advance
From: Al Viro @ 2014-12-04 20:23 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, linux-fsdevel, netdev
In-Reply-To: <20141204202011.GO29748@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/iov_iter.c | 98 ++++++++---------------------------------------------------
 1 file changed, 12 insertions(+), 86 deletions(-)

diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index 3214b9b..39ad713 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -349,50 +349,6 @@ done:
 	return wanted - bytes;
 }
 
-static size_t zero_iovec(size_t bytes, struct iov_iter *i)
-{
-	size_t skip, copy, left, wanted;
-	const struct iovec *iov;
-	char __user *buf;
-
-	if (unlikely(bytes > i->count))
-		bytes = i->count;
-
-	if (unlikely(!bytes))
-		return 0;
-
-	wanted = bytes;
-	iov = i->iov;
-	skip = i->iov_offset;
-	buf = iov->iov_base + skip;
-	copy = min(bytes, iov->iov_len - skip);
-
-	left = __clear_user(buf, copy);
-	copy -= left;
-	skip += copy;
-	bytes -= copy;
-
-	while (unlikely(!left && bytes)) {
-		iov++;
-		buf = iov->iov_base;
-		copy = min(bytes, iov->iov_len);
-		left = __clear_user(buf, copy);
-		copy -= left;
-		skip = copy;
-		bytes -= copy;
-	}
-
-	if (skip == iov->iov_len) {
-		iov++;
-		skip = 0;
-	}
-	i->count -= wanted - bytes;
-	i->nr_segs -= iov - i->iov;
-	i->iov = iov;
-	i->iov_offset = skip;
-	return wanted - bytes;
-}
-
 /*
  * Fault in the first iovec of the given iov_iter, to a maximum length
  * of bytes. Returns 0 on success, or non-zero if the memory could not be
@@ -548,43 +504,6 @@ static size_t copy_page_from_iter_bvec(struct page *page, size_t offset,
 	return wanted;
 }
 
-static size_t zero_bvec(size_t bytes, struct iov_iter *i)
-{
-	size_t skip, copy, wanted;
-	const struct bio_vec *bvec;
-
-	if (unlikely(bytes > i->count))
-		bytes = i->count;
-
-	if (unlikely(!bytes))
-		return 0;
-
-	wanted = bytes;
-	bvec = i->bvec;
-	skip = i->iov_offset;
-	copy = min_t(size_t, bytes, bvec->bv_len - skip);
-
-	memzero_page(bvec->bv_page, skip + bvec->bv_offset, copy);
-	skip += copy;
-	bytes -= copy;
-	while (bytes) {
-		bvec++;
-		copy = min(bytes, (size_t)bvec->bv_len);
-		memzero_page(bvec->bv_page, bvec->bv_offset, copy);
-		skip = copy;
-		bytes -= copy;
-	}
-	if (skip == bvec->bv_len) {
-		bvec++;
-		skip = 0;
-	}
-	i->count -= wanted - bytes;
-	i->nr_segs -= bvec - i->bvec;
-	i->bvec = bvec;
-	i->iov_offset = skip;
-	return wanted - bytes;
-}
-
 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
 			 struct iov_iter *i)
 {
@@ -625,11 +544,18 @@ EXPORT_SYMBOL(copy_from_iter);
 
 size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
 {
-	if (i->type & ITER_BVEC) {
-		return zero_bvec(bytes, i);
-	} else {
-		return zero_iovec(bytes, i);
-	}
+	if (unlikely(bytes > i->count))
+		bytes = i->count;
+
+	if (unlikely(!bytes))
+		return 0;
+
+	iterate_and_advance(i, bytes, v,
+		__clear_user(v.iov_base, v.iov_len),
+		memzero_page(v.bv_page, v.bv_offset, v.bv_len)
+	)
+
+	return bytes;
 }
 EXPORT_SYMBOL(iov_iter_zero);
 
-- 
2.1.3

^ permalink raw reply related

* [RFC][PATCH 05/13] iov_iter.c: convert iov_iter_get_pages_alloc() to iterate_all_kinds
From: Al Viro @ 2014-12-04 20:23 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, linux-fsdevel, netdev
In-Reply-To: <20141204202011.GO29748@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/iov_iter.c | 107 ++++++++++++++++++++++++----------------------------------
 1 file changed, 45 insertions(+), 62 deletions(-)

diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index 75e29ef..3214b9b 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -428,43 +428,6 @@ void iov_iter_init(struct iov_iter *i, int direction,
 }
 EXPORT_SYMBOL(iov_iter_init);
 
-static ssize_t get_pages_alloc_iovec(struct iov_iter *i,
-		   struct page ***pages, size_t maxsize,
-		   size_t *start)
-{
-	size_t offset = i->iov_offset;
-	const struct iovec *iov = i->iov;
-	size_t len;
-	unsigned long addr;
-	void *p;
-	int n;
-	int res;
-
-	len = iov->iov_len - offset;
-	if (len > i->count)
-		len = i->count;
-	if (len > maxsize)
-		len = maxsize;
-	addr = (unsigned long)iov->iov_base + offset;
-	len += *start = addr & (PAGE_SIZE - 1);
-	addr &= ~(PAGE_SIZE - 1);
-	n = (len + PAGE_SIZE - 1) / PAGE_SIZE;
-	
-	p = kmalloc(n * sizeof(struct page *), GFP_KERNEL);
-	if (!p)
-		p = vmalloc(n * sizeof(struct page *));
-	if (!p)
-		return -ENOMEM;
-
-	res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p);
-	if (unlikely(res < 0)) {
-		kvfree(p);
-		return res;
-	}
-	*pages = p;
-	return (res == n ? len : res * PAGE_SIZE) - *start;
-}
-
 static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len)
 {
 	char *from = kmap_atomic(page);
@@ -622,27 +585,6 @@ static size_t zero_bvec(size_t bytes, struct iov_iter *i)
 	return wanted - bytes;
 }
 
-static ssize_t get_pages_alloc_bvec(struct iov_iter *i,
-		   struct page ***pages, size_t maxsize,
-		   size_t *start)
-{
-	const struct bio_vec *bvec = i->bvec;
-	size_t len = bvec->bv_len - i->iov_offset;
-	if (len > i->count)
-		len = i->count;
-	if (len > maxsize)
-		len = maxsize;
-	*start = bvec->bv_offset + i->iov_offset;
-
-	*pages = kmalloc(sizeof(struct page *), GFP_KERNEL);
-	if (!*pages)
-		return -ENOMEM;
-
-	get_page(**pages = bvec->bv_page);
-
-	return len;
-}
-
 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
 			 struct iov_iter *i)
 {
@@ -777,14 +719,55 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
 }
 EXPORT_SYMBOL(iov_iter_get_pages);
 
+static struct page **get_pages_array(size_t n)
+{
+	struct page **p = kmalloc(n * sizeof(struct page *), GFP_KERNEL);
+	if (!p)
+		p = vmalloc(n * sizeof(struct page *));
+	return p;
+}
+
 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
 		   struct page ***pages, size_t maxsize,
 		   size_t *start)
 {
-	if (i->type & ITER_BVEC)
-		return get_pages_alloc_bvec(i, pages, maxsize, start);
-	else
-		return get_pages_alloc_iovec(i, pages, maxsize, start);
+	struct page **p;
+
+	if (maxsize > i->count)
+		maxsize = i->count;
+
+	if (!maxsize)
+		return 0;
+
+	iterate_all_kinds(i, maxsize, v, ({
+		unsigned long addr = (unsigned long)v.iov_base;
+		size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
+		int n;
+		int res;
+
+		addr &= ~(PAGE_SIZE - 1);
+		n = DIV_ROUND_UP(len, PAGE_SIZE);
+		p = get_pages_array(n);
+		if (!p)
+			return -ENOMEM;
+		res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p);
+		if (unlikely(res < 0)) {
+			kvfree(p);
+			return res;
+		}
+		*pages = p;
+		return (res == n ? len : res * PAGE_SIZE) - *start;
+	0;}),({
+		/* can't be more than PAGE_SIZE */
+		*start = v.bv_offset;
+		*pages = p = get_pages_array(1);
+		if (!p)
+			return -ENOMEM;
+		get_page(*p = v.bv_page);
+		return v.bv_len;
+	})
+	)
+	return 0;
 }
 EXPORT_SYMBOL(iov_iter_get_pages_alloc);
 
-- 
2.1.3

^ permalink raw reply related

* [RFC][PATCH 04/13] iov_iter.c: convert iov_iter_get_pages() to iterate_all_kinds
From: Al Viro @ 2014-12-04 20:23 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, linux-fsdevel, netdev
In-Reply-To: <20141204202011.GO29748@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/iov_iter.c | 78 +++++++++++++++++++++--------------------------------------
 1 file changed, 28 insertions(+), 50 deletions(-)

diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index bc666e7..75e29ef 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -428,34 +428,6 @@ void iov_iter_init(struct iov_iter *i, int direction,
 }
 EXPORT_SYMBOL(iov_iter_init);
 
-static ssize_t get_pages_iovec(struct iov_iter *i,
-		   struct page **pages, size_t maxsize, unsigned maxpages,
-		   size_t *start)
-{
-	size_t offset = i->iov_offset;
-	const struct iovec *iov = i->iov;
-	size_t len;
-	unsigned long addr;
-	int n;
-	int res;
-
-	len = iov->iov_len - offset;
-	if (len > i->count)
-		len = i->count;
-	if (len > maxsize)
-		len = maxsize;
-	addr = (unsigned long)iov->iov_base + offset;
-	len += *start = addr & (PAGE_SIZE - 1);
-	if (len > maxpages * PAGE_SIZE)
-		len = maxpages * PAGE_SIZE;
-	addr &= ~(PAGE_SIZE - 1);
-	n = (len + PAGE_SIZE - 1) / PAGE_SIZE;
-	res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages);
-	if (unlikely(res < 0))
-		return res;
-	return (res == n ? len : res * PAGE_SIZE) - *start;
-}
-
 static ssize_t get_pages_alloc_iovec(struct iov_iter *i,
 		   struct page ***pages, size_t maxsize,
 		   size_t *start)
@@ -650,24 +622,6 @@ static size_t zero_bvec(size_t bytes, struct iov_iter *i)
 	return wanted - bytes;
 }
 
-static ssize_t get_pages_bvec(struct iov_iter *i,
-		   struct page **pages, size_t maxsize, unsigned maxpages,
-		   size_t *start)
-{
-	const struct bio_vec *bvec = i->bvec;
-	size_t len = bvec->bv_len - i->iov_offset;
-	if (len > i->count)
-		len = i->count;
-	if (len > maxsize)
-		len = maxsize;
-	/* can't be more than PAGE_SIZE */
-	*start = bvec->bv_offset + i->iov_offset;
-
-	get_page(*pages = bvec->bv_page);
-
-	return len;
-}
-
 static ssize_t get_pages_alloc_bvec(struct iov_iter *i,
 		   struct page ***pages, size_t maxsize,
 		   size_t *start)
@@ -792,10 +746,34 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
 		   struct page **pages, size_t maxsize, unsigned maxpages,
 		   size_t *start)
 {
-	if (i->type & ITER_BVEC)
-		return get_pages_bvec(i, pages, maxsize, maxpages, start);
-	else
-		return get_pages_iovec(i, pages, maxsize, maxpages, start);
+	if (maxsize > i->count)
+		maxsize = i->count;
+
+	if (!maxsize)
+		return 0;
+
+	iterate_all_kinds(i, maxsize, v, ({
+		unsigned long addr = (unsigned long)v.iov_base;
+		size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
+		int n;
+		int res;
+
+		if (len > maxpages * PAGE_SIZE)
+			len = maxpages * PAGE_SIZE;
+		addr &= ~(PAGE_SIZE - 1);
+		n = DIV_ROUND_UP(len, PAGE_SIZE);
+		res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages);
+		if (unlikely(res < 0))
+			return res;
+		return (res == n ? len : res * PAGE_SIZE) - *start;
+	0;}),({
+		/* can't be more than PAGE_SIZE */
+		*start = v.bv_offset;
+		get_page(*pages = v.bv_page);
+		return v.bv_len;
+	})
+	)
+	return 0;
 }
 EXPORT_SYMBOL(iov_iter_get_pages);
 
-- 
2.1.3

^ permalink raw reply related

* [RFC][PATCH 03/13] iov_iter.c: convert iov_iter_npages() to iterate_all_kinds
From: Al Viro @ 2014-12-04 20:23 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, linux-fsdevel, netdev
In-Reply-To: <20141204202011.GO29748@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/iov_iter.c | 73 ++++++++++++++++-------------------------------------------
 1 file changed, 19 insertions(+), 54 deletions(-)

diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index e91bf0a..bc666e7 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -493,32 +493,6 @@ static ssize_t get_pages_alloc_iovec(struct iov_iter *i,
 	return (res == n ? len : res * PAGE_SIZE) - *start;
 }
 
-static int iov_iter_npages_iovec(const struct iov_iter *i, int maxpages)
-{
-	size_t offset = i->iov_offset;
-	size_t size = i->count;
-	const struct iovec *iov = i->iov;
-	int npages = 0;
-	int n;
-
-	for (n = 0; size && n < i->nr_segs; n++, iov++) {
-		unsigned long addr = (unsigned long)iov->iov_base + offset;
-		size_t len = iov->iov_len - offset;
-		offset = 0;
-		if (unlikely(!len))	/* empty segment */
-			continue;
-		if (len > size)
-			len = size;
-		npages += (addr + len + PAGE_SIZE - 1) / PAGE_SIZE
-			  - addr / PAGE_SIZE;
-		if (npages >= maxpages)	/* don't bother going further */
-			return maxpages;
-		size -= len;
-		offset = 0;
-	}
-	return min(npages, maxpages);
-}
-
 static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len)
 {
 	char *from = kmap_atomic(page);
@@ -715,30 +689,6 @@ static ssize_t get_pages_alloc_bvec(struct iov_iter *i,
 	return len;
 }
 
-static int iov_iter_npages_bvec(const struct iov_iter *i, int maxpages)
-{
-	size_t offset = i->iov_offset;
-	size_t size = i->count;
-	const struct bio_vec *bvec = i->bvec;
-	int npages = 0;
-	int n;
-
-	for (n = 0; size && n < i->nr_segs; n++, bvec++) {
-		size_t len = bvec->bv_len - offset;
-		offset = 0;
-		if (unlikely(!len))	/* empty segment */
-			continue;
-		if (len > size)
-			len = size;
-		npages++;
-		if (npages >= maxpages)	/* don't bother going further */
-			return maxpages;
-		size -= len;
-		offset = 0;
-	}
-	return min(npages, maxpages);
-}
-
 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
 			 struct iov_iter *i)
 {
@@ -862,9 +812,24 @@ EXPORT_SYMBOL(iov_iter_get_pages_alloc);
 
 int iov_iter_npages(const struct iov_iter *i, int maxpages)
 {
-	if (i->type & ITER_BVEC)
-		return iov_iter_npages_bvec(i, maxpages);
-	else
-		return iov_iter_npages_iovec(i, maxpages);
+	size_t size = i->count;
+	int npages = 0;
+
+	if (!size)
+		return 0;
+
+	iterate_all_kinds(i, size, v, ({
+		unsigned long p = (unsigned long)v.iov_base;
+		npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
+			- p / PAGE_SIZE;
+		if (npages >= maxpages)
+			return maxpages;
+	0;}),({
+		npages++;
+		if (npages >= maxpages)
+			return maxpages;
+	})
+	)
+	return npages;
 }
 EXPORT_SYMBOL(iov_iter_npages);
-- 
2.1.3

^ permalink raw reply related

* [RFC][PATCH 02/13] iov_iter.c: iterate_and_advance
From: Al Viro @ 2014-12-04 20:23 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, linux-fsdevel, netdev
In-Reply-To: <20141204202011.GO29748@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

same as iterate_all_kinds, but iterator is moved to the position past
the last byte we'd handled.

iov_iter_advance() converted to it

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/iov_iter.c | 104 ++++++++++++++++------------------------------------------
 1 file changed, 28 insertions(+), 76 deletions(-)

diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index 798fcb4..e91bf0a 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -70,6 +70,33 @@
 	}							\
 }
 
+#define iterate_and_advance(i, n, v, I, B) {			\
+	size_t skip = i->iov_offset;				\
+	if (unlikely(i->type & ITER_BVEC)) {			\
+		const struct bio_vec *bvec;			\
+		struct bio_vec v;				\
+		iterate_bvec(i, n, v, bvec, skip, (B))		\
+		if (skip == bvec->bv_len) {			\
+			bvec++;					\
+			skip = 0;				\
+		}						\
+		i->nr_segs -= bvec - i->bvec;			\
+		i->bvec = bvec;					\
+	} else {						\
+		const struct iovec *iov;			\
+		struct iovec v;					\
+		iterate_iovec(i, n, v, iov, skip, (I))		\
+		if (skip == iov->iov_len) {			\
+			iov++;					\
+			skip = 0;				\
+		}						\
+		i->nr_segs -= iov - i->iov;			\
+		i->iov = iov;					\
+	}							\
+	i->count -= n;						\
+	i->iov_offset = skip;					\
+}
+
 static size_t copy_to_iter_iovec(void *from, size_t bytes, struct iov_iter *i)
 {
 	size_t skip, copy, left, wanted;
@@ -366,42 +393,6 @@ static size_t zero_iovec(size_t bytes, struct iov_iter *i)
 	return wanted - bytes;
 }
 
-static void advance_iovec(struct iov_iter *i, size_t bytes)
-{
-	BUG_ON(i->count < bytes);
-
-	if (likely(i->nr_segs == 1)) {
-		i->iov_offset += bytes;
-		i->count -= bytes;
-	} else {
-		const struct iovec *iov = i->iov;
-		size_t base = i->iov_offset;
-		unsigned long nr_segs = i->nr_segs;
-
-		/*
-		 * The !iov->iov_len check ensures we skip over unlikely
-		 * zero-length segments (without overruning the iovec).
-		 */
-		while (bytes || unlikely(i->count && !iov->iov_len)) {
-			int copy;
-
-			copy = min(bytes, iov->iov_len - base);
-			BUG_ON(!i->count || i->count < copy);
-			i->count -= copy;
-			bytes -= copy;
-			base += copy;
-			if (iov->iov_len == base) {
-				iov++;
-				nr_segs--;
-				base = 0;
-			}
-		}
-		i->iov = iov;
-		i->iov_offset = base;
-		i->nr_segs = nr_segs;
-	}
-}
-
 /*
  * Fault in the first iovec of the given iov_iter, to a maximum length
  * of bytes. Returns 0 on success, or non-zero if the memory could not be
@@ -685,42 +676,6 @@ static size_t zero_bvec(size_t bytes, struct iov_iter *i)
 	return wanted - bytes;
 }
 
-static void advance_bvec(struct iov_iter *i, size_t bytes)
-{
-	BUG_ON(i->count < bytes);
-
-	if (likely(i->nr_segs == 1)) {
-		i->iov_offset += bytes;
-		i->count -= bytes;
-	} else {
-		const struct bio_vec *bvec = i->bvec;
-		size_t base = i->iov_offset;
-		unsigned long nr_segs = i->nr_segs;
-
-		/*
-		 * The !iov->iov_len check ensures we skip over unlikely
-		 * zero-length segments (without overruning the iovec).
-		 */
-		while (bytes || unlikely(i->count && !bvec->bv_len)) {
-			int copy;
-
-			copy = min(bytes, bvec->bv_len - base);
-			BUG_ON(!i->count || i->count < copy);
-			i->count -= copy;
-			bytes -= copy;
-			base += copy;
-			if (bvec->bv_len == base) {
-				bvec++;
-				nr_segs--;
-				base = 0;
-			}
-		}
-		i->bvec = bvec;
-		i->iov_offset = base;
-		i->nr_segs = nr_segs;
-	}
-}
-
 static ssize_t get_pages_bvec(struct iov_iter *i,
 		   struct page **pages, size_t maxsize, unsigned maxpages,
 		   size_t *start)
@@ -849,10 +804,7 @@ EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
 
 void iov_iter_advance(struct iov_iter *i, size_t size)
 {
-	if (i->type & ITER_BVEC)
-		advance_bvec(i, size);
-	else
-		advance_iovec(i, size);
+	iterate_and_advance(i, size, v, 0, 0)
 }
 EXPORT_SYMBOL(iov_iter_advance);
 
-- 
2.1.3

^ permalink raw reply related

* [RFC][PATCH 01/13] iov_iter.c: macros for iterating over iov_iter
From: Al Viro @ 2014-12-04 20:23 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, linux-fsdevel, netdev
In-Reply-To: <20141204202011.GO29748@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

iterate_all_kinds(iter, size, ident, step_iovec, step_bvec)
iterates through the ranges covered by iter (up to size bytes total),
repeating step_iovec or step_bvec for each of those.  ident is
declared in expansion of that thing, either as struct iovec or
struct bvec, and it contains the range we are currently looking
at.  step_bvec should be a void expression, step_iovec - a size_t
one, with non-zero meaning "stop here, that many bytes from this
range left".  In the end, the amount actually handled is stored
in size.

iov_iter_copy_from_user_atomic() and iov_iter_alignment() converted
to it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/iov_iter.c | 212 ++++++++++++++++++++++++----------------------------------
 1 file changed, 86 insertions(+), 126 deletions(-)

diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index e34a3cb..798fcb4 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -4,6 +4,72 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 
+#define iterate_iovec(i, n, __v, __p, skip, STEP) {	\
+	size_t left;					\
+	size_t wanted = n;				\
+	__p = i->iov;					\
+	__v.iov_len = min(n, __p->iov_len - skip);	\
+	if (likely(__v.iov_len)) {			\
+		__v.iov_base = __p->iov_base + skip;	\
+		left = (STEP);				\
+		__v.iov_len -= left;			\
+		skip += __v.iov_len;			\
+		n -= __v.iov_len;			\
+	} else {					\
+		left = 0;				\
+	}						\
+	while (unlikely(!left && n)) {			\
+		__p++;					\
+		__v.iov_len = min(n, __p->iov_len);	\
+		if (unlikely(!__v.iov_len))		\
+			continue;			\
+		__v.iov_base = __p->iov_base;		\
+		left = (STEP);				\
+		__v.iov_len -= left;			\
+		skip = __v.iov_len;			\
+		n -= __v.iov_len;			\
+	}						\
+	n = wanted - n;					\
+}
+
+#define iterate_bvec(i, n, __v, __p, skip, STEP) {	\
+	size_t wanted = n;				\
+	__p = i->bvec;					\
+	__v.bv_len = min_t(size_t, n, __p->bv_len - skip);	\
+	if (likely(__v.bv_len)) {			\
+		__v.bv_page = __p->bv_page;		\
+		__v.bv_offset = __p->bv_offset + skip; 	\
+		(void)(STEP);				\
+		skip += __v.bv_len;			\
+		n -= __v.bv_len;			\
+	}						\
+	while (unlikely(n)) {				\
+		__p++;					\
+		__v.bv_len = min_t(size_t, n, __p->bv_len);	\
+		if (unlikely(!__v.bv_len))		\
+			continue;			\
+		__v.bv_page = __p->bv_page;		\
+		__v.bv_offset = __p->bv_offset;		\
+		(void)(STEP);				\
+		skip = __v.bv_len;			\
+		n -= __v.bv_len;			\
+	}						\
+	n = wanted;					\
+}
+
+#define iterate_all_kinds(i, n, v, I, B) {			\
+	size_t skip = i->iov_offset;				\
+	if (unlikely(i->type & ITER_BVEC)) {			\
+		const struct bio_vec *bvec;			\
+		struct bio_vec v;				\
+		iterate_bvec(i, n, v, bvec, skip, (B))		\
+	} else {						\
+		const struct iovec *iov;			\
+		struct iovec v;					\
+		iterate_iovec(i, n, v, iov, skip, (I))		\
+	}							\
+}
+
 static size_t copy_to_iter_iovec(void *from, size_t bytes, struct iov_iter *i)
 {
 	size_t skip, copy, left, wanted;
@@ -300,54 +366,6 @@ static size_t zero_iovec(size_t bytes, struct iov_iter *i)
 	return wanted - bytes;
 }
 
-static size_t __iovec_copy_from_user_inatomic(char *vaddr,
-			const struct iovec *iov, size_t base, size_t bytes)
-{
-	size_t copied = 0, left = 0;
-
-	while (bytes) {
-		char __user *buf = iov->iov_base + base;
-		int copy = min(bytes, iov->iov_len - base);
-
-		base = 0;
-		left = __copy_from_user_inatomic(vaddr, buf, copy);
-		copied += copy;
-		bytes -= copy;
-		vaddr += copy;
-		iov++;
-
-		if (unlikely(left))
-			break;
-	}
-	return copied - left;
-}
-
-/*
- * Copy as much as we can into the page and return the number of bytes which
- * were successfully copied.  If a fault is encountered then return the number of
- * bytes which were copied.
- */
-static size_t copy_from_user_atomic_iovec(struct page *page,
-		struct iov_iter *i, unsigned long offset, size_t bytes)
-{
-	char *kaddr;
-	size_t copied;
-
-	kaddr = kmap_atomic(page);
-	if (likely(i->nr_segs == 1)) {
-		int left;
-		char __user *buf = i->iov->iov_base + i->iov_offset;
-		left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
-		copied = bytes - left;
-	} else {
-		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
-						i->iov, i->iov_offset, bytes);
-	}
-	kunmap_atomic(kaddr);
-
-	return copied;
-}
-
 static void advance_iovec(struct iov_iter *i, size_t bytes)
 {
 	BUG_ON(i->count < bytes);
@@ -404,30 +422,6 @@ int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
 }
 EXPORT_SYMBOL(iov_iter_fault_in_readable);
 
-static unsigned long alignment_iovec(const struct iov_iter *i)
-{
-	const struct iovec *iov = i->iov;
-	unsigned long res;
-	size_t size = i->count;
-	size_t n;
-
-	if (!size)
-		return 0;
-
-	res = (unsigned long)iov->iov_base + i->iov_offset;
-	n = iov->iov_len - i->iov_offset;
-	if (n >= size)
-		return res | size;
-	size -= n;
-	res |= n;
-	while (size > (++iov)->iov_len) {
-		res |= (unsigned long)iov->iov_base | iov->iov_len;
-		size -= iov->iov_len;
-	}
-	res |= (unsigned long)iov->iov_base | size;
-	return res;
-}
-
 void iov_iter_init(struct iov_iter *i, int direction,
 			const struct iovec *iov, unsigned long nr_segs,
 			size_t count)
@@ -691,28 +685,6 @@ static size_t zero_bvec(size_t bytes, struct iov_iter *i)
 	return wanted - bytes;
 }
 
-static size_t copy_from_user_bvec(struct page *page,
-		struct iov_iter *i, unsigned long offset, size_t bytes)
-{
-	char *kaddr;
-	size_t left;
-	const struct bio_vec *bvec;
-	size_t base = i->iov_offset;
-
-	kaddr = kmap_atomic(page);
-	for (left = bytes, bvec = i->bvec; left; bvec++, base = 0) {
-		size_t copy = min(left, bvec->bv_len - base);
-		if (!bvec->bv_len)
-			continue;
-		memcpy_from_page(kaddr + offset, bvec->bv_page,
-				 bvec->bv_offset + base, copy);
-		offset += copy;
-		left -= copy;
-	}
-	kunmap_atomic(kaddr);
-	return bytes;
-}
-
 static void advance_bvec(struct iov_iter *i, size_t bytes)
 {
 	BUG_ON(i->count < bytes);
@@ -749,30 +721,6 @@ static void advance_bvec(struct iov_iter *i, size_t bytes)
 	}
 }
 
-static unsigned long alignment_bvec(const struct iov_iter *i)
-{
-	const struct bio_vec *bvec = i->bvec;
-	unsigned long res;
-	size_t size = i->count;
-	size_t n;
-
-	if (!size)
-		return 0;
-
-	res = bvec->bv_offset + i->iov_offset;
-	n = bvec->bv_len - i->iov_offset;
-	if (n >= size)
-		return res | size;
-	size -= n;
-	res |= n;
-	while (size > (++bvec)->bv_len) {
-		res |= bvec->bv_offset | bvec->bv_len;
-		size -= bvec->bv_len;
-	}
-	res |= bvec->bv_offset | size;
-	return res;
-}
-
 static ssize_t get_pages_bvec(struct iov_iter *i,
 		   struct page **pages, size_t maxsize, unsigned maxpages,
 		   size_t *start)
@@ -887,10 +835,15 @@ EXPORT_SYMBOL(iov_iter_zero);
 size_t iov_iter_copy_from_user_atomic(struct page *page,
 		struct iov_iter *i, unsigned long offset, size_t bytes)
 {
-	if (i->type & ITER_BVEC)
-		return copy_from_user_bvec(page, i, offset, bytes);
-	else
-		return copy_from_user_atomic_iovec(page, i, offset, bytes);
+	char *kaddr = kmap_atomic(page), *p = kaddr + offset;
+	iterate_all_kinds(i, bytes, v,
+		__copy_from_user_inatomic((p += v.iov_len) - v.iov_len,
+					  v.iov_base, v.iov_len),
+		memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
+				 v.bv_offset, v.bv_len)
+	)
+	kunmap_atomic(kaddr);
+	return bytes;
 }
 EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
 
@@ -919,10 +872,17 @@ EXPORT_SYMBOL(iov_iter_single_seg_count);
 
 unsigned long iov_iter_alignment(const struct iov_iter *i)
 {
-	if (i->type & ITER_BVEC)
-		return alignment_bvec(i);
-	else
-		return alignment_iovec(i);
+	unsigned long res = 0;
+	size_t size = i->count;
+
+	if (!size)
+		return 0;
+
+	iterate_all_kinds(i, size, v,
+		(res |= (unsigned long)v.iov_base | v.iov_len, 0),
+		res |= v.bv_offset | v.bv_len
+	)
+	return res;
 }
 EXPORT_SYMBOL(iov_iter_alignment);
 
-- 
2.1.3

^ permalink raw reply related

* [RFC][PATCHES] iov_iter.c rewrite
From: Al Viro @ 2014-12-04 20:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, linux-fsdevel, netdev

	First of all, I want to apologize for the nastiness of preprocessor
use in this series.  Seeing that the whole "macros that look like new kinds
of C statements" thing (including list_for_each_...(), etc) is very much not
to my liking, I really don't trust my taste on finer details and I'd very
much like some feedback.

	The reason for doing that kind of tricks is that iov_iter.c keeps
growing more and more boilerplate code.  For iov_iter-net series we need
	* csum_and_copy_from_iter()
	* csum_and_copy_to_iter()
	* copy_from_iter_nocache()
That's 3 new primitives, each in 2 variants (iovec and bvec).
	* ITER_KVEC handled without going through uaccess.h stuff (and
independent of set_fs() state).
And *that* means 3 variants intstead of 2 for most of the existing primitives.
That's far too much, and the amount of copies of the same logics would pretty
much guarantee that it will be a breeding ground for hard-to-kill bugs.

	The following series (also in vfs.git#iov_iter) actually manages to
do all of the above *and* shrink the damn thing quite a bit.  The generated
code appears to be no worse than before.  The price is a couple of iterator
macros - iterate_all_kinds() and iterate_and_advance().  They are given an
iov_iter, size (i.e. the amount of data in iov_iter beginning we want to go
through), name of the loop variable and 3 variants of loop body - for iovec,
bvec and kvec resp.  Loop variable is declared *inside* the expansion of those
suckers according to the kind of iov_iter - it's struct iovec, struct bio_vec
or struct kvec, covering the current range to deal with.
	The difference between those two is that iterate_and_advance() will
advance the iov_iter by the amount it has handled and iterate_all_kinds()
will leave iov_iter unchanged.

	Unless I hear anybody yelling, it goes into vfs.git#for-next today,
so if you have objections, suggestions, etc., give those *now*.

Al Viro (13):
      iov_iter.c: macros for iterating over iov_iter
      iov_iter.c: iterate_and_advance
      iov_iter.c: convert iov_iter_npages() to iterate_all_kinds
      iov_iter.c: convert iov_iter_get_pages() to iterate_all_kinds
      iov_iter.c: convert iov_iter_get_pages_alloc() to iterate_all_kinds
      iov_iter.c: convert iov_iter_zero() to iterate_and_advance
      iov_iter.c: get rid of bvec_copy_page_{to,from}_iter()
      iov_iter.c: convert copy_from_iter() to iterate_and_advance
      iov_iter.c: convert copy_to_iter() to iterate_and_advance
      iov_iter.c: handle ITER_KVEC directly
      csum_and_copy_..._iter()
      new helper: iov_iter_kvec()
      copy_from_iter_nocache()

Diffstat:
 include/linux/uio.h |    6 +
 mm/iov_iter.c       | 1077 +++++++++++++++++++++------------------------------
 2 files changed, 445 insertions(+), 638 deletions(-)

^ permalink raw reply

* Re: [PATCHv3 net] i40e: Implement ndo_gso_check()
From: Tom Herbert @ 2014-12-04 20:17 UTC (permalink / raw)
  To: Joe Stringer
  Cc: Linux Netdev List, LKML, Jesse Gross, Shannon Nelson,
	Brandeburg, Jesse, Jeff Kirsher, linux.nics
In-Reply-To: <1417718366-14310-1-git-send-email-joestringer@nicira.com>

On Thu, Dec 4, 2014 at 10:39 AM, Joe Stringer <joestringer@nicira.com> wrote:
> ndo_gso_check() was recently introduced to allow NICs to report the
> offloading support that they have on a per-skb basis. Add an
> implementation for this driver which checks for IPIP, GRE, UDP tunnels.
>
> Signed-off-by: Joe Stringer <joestringer@nicira.com>
> ---
> v3: Drop IPIP and GRE (no driver support even though hw supports it).
>     Check for UDP outer protocol for UDP tunnels.
> v2: Expand to include IP in IP and IPv4/IPv6 inside GRE/UDP tunnels.
>     Add MAX_INNER_LENGTH (as 80).
> ---
>  drivers/net/ethernet/intel/i40e/i40e_main.c |   26 ++++++++++++++++++++++++++
>  1 file changed, 26 insertions(+)
>
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
> index c3a7f4a..0d6493a 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_main.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
> @@ -7447,6 +7447,31 @@ static int i40e_ndo_fdb_dump(struct sk_buff *skb,
>
>  #endif /* USE_DEFAULT_FDB_DEL_DUMP */
>  #endif /* HAVE_FDB_OPS */
> +static bool i40e_gso_check(struct sk_buff *skb, struct net_device *dev)
> +{
> +       if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) {
> +               unsigned char *ihdr;
> +
> +               if (skb->protocol != IPPROTO_UDP ||
> +                   skb->inner_protocol_type != ENCAP_TYPE_ETHER)
> +                       return false;
> +
> +               if (skb->inner_protocol == htons(ETH_P_TEB))
> +                       ihdr = skb_inner_mac_header(skb);
> +               else if (skb->inner_protocol == htons(ETH_P_IP) ||
> +                        skb->inner_protocol == htons(ETH_P_IPV6))
> +                       ihdr = skb_inner_network_header(skb);
> +               else
> +                       return false;
> +

Wow, this is getting complicated! :-( It's not clear that the protocol
specific checks are needed here since it looks like the header length
is being passed to the device later on. Also, I think we need
skb_inner_mac_header(skb) - skb_transport_header(skb) to always work
to give the length of the encapsulation headers (in case there is no
real inner mac header, then that offset should be for the network
header).

So would a simple check like this work:

if (skb->encapsulation &&
    (skb_inner_mac_header(skb) - skb_transport_header(skb)) >
MAX_TUNNEL_HDR_LEN)
       return false;

> +#define MAX_TUNNEL_HDR_LEN     80
> +               if (ihdr - skb_transport_header(skb) > MAX_TUNNEL_HDR_LEN)
> +                       return false;
> +       }
> +
> +       return true;
> +}
> +
>  static const struct net_device_ops i40e_netdev_ops = {
>         .ndo_open               = i40e_open,
>         .ndo_stop               = i40e_close,
> @@ -7487,6 +7512,7 @@ static const struct net_device_ops i40e_netdev_ops = {
>         .ndo_fdb_dump           = i40e_ndo_fdb_dump,
>  #endif
>  #endif
> +       .ndo_gso_check          = i40e_gso_check,
>  };
>
>  /**
> --
> 1.7.10.4
>

^ permalink raw reply

* Re: [patch iproute2 1/6] iproute2: ipa: show switch id
From: Eric W. Biederman @ 2014-12-04 20:06 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, nhorman, andy, tgraf, dborkman, ogerlitz, jesse,
	pshelar, azhou, ben, stephen, jeffrey.t.kirsher, vyasevic,
	xiyou.wangcong, john.r.fastabend, edumazet, jhs, sfeldma,
	f.fainelli, roopa, linville, jasowang, nicolas.dichtel,
	ryazanov.s.a, buytenh, aviadr, nbd, alexei.starovoitov,
	Neil.Jerram, ronye, simon.horman, alexander.h.duyck, john.ronciak,
	mleitner, shrijeet, gospo, bcrl, hemal
In-Reply-To: <87wq672p49.fsf@x220.int.ebiederm.org>

ebiederm@xmission.com (Eric W. Biederman) writes:

> Jiri Pirko <jiri@resnulli.us> writes:
>
>>>So this id needs to be globally unique?
>>
>> No. It is enough to be unique within a single system. It serves for no
>> more than to find out 2 ids are same or not, no other info value.
>>
>> So when the drivers uses sane ids (like mac for example, or in case of
>> rocker an id which is passed by qemu command line), the chances of
>> collision are very very close to none (never say never).

Thinking about what you said a little more.

Two different sources of persistent numbers picking numbers by
completely different algorithms can give you no assurance that you don't
produce conflicts.

The switch id as desisgned can not work.

There are expected to be between 2**36 to 2**40 devices in this world.
Your first switch id is a 64it number.  At the very best by the birthday
pardox predicts there will be a conflict ever 2**32 devices or between
2**4 and 2**8 devices in the world with conflicts.  If the ids are not
randomly distributed (which they won't be) things could easily be much
much worse.

That is just good enough the code could get out there and run for years
before you have the nightmare of having to fix all of userspace.   That
is a nightmare no one needs.

So please remove this broken code, and this broken concept from the
kernel and go back to the drawing board.

Eric

^ permalink raw reply

* Re: [patch iproute2 1/6] iproute2: ipa: show switch id
From: Jiri Pirko @ 2014-12-04 19:54 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: netdev, davem, nhorman, andy, tgraf, dborkman, ogerlitz, jesse,
	pshelar, azhou, ben, stephen, jeffrey.t.kirsher, vyasevic,
	xiyou.wangcong, john.r.fastabend, edumazet, jhs, sfeldma,
	f.fainelli, roopa, linville, jasowang, nicolas.dichtel,
	ryazanov.s.a, buytenh, aviadr, nbd, alexei.starovoitov,
	Neil.Jerram, ronye, simon.horman, alexander.h.duyck, john.ronciak,
	mleitner, shrijeet, gospo, bcrl, hemal
In-Reply-To: <87wq672p49.fsf@x220.int.ebiederm.org>

Thu, Dec 04, 2014 at 08:26:14PM CET, ebiederm@xmission.com wrote:
>Jiri Pirko <jiri@resnulli.us> writes:
>
>>>So this id needs to be globally unique?
>>
>> No. It is enough to be unique within a single system. It serves for no
>> more than to find out 2 ids are same or not, no other info value.
>>
>> So when the drivers uses sane ids (like mac for example, or in case of
>> rocker an id which is passed by qemu command line), the chances of
>> collision are very very close to none (never say never).
>
>So the switch id isn't necessarily even as good as a manufacturers
>serial number that changes between different models, and anyone who uses
>this id must look at the driver to get uniqueness.
>
>Yes what is needed in the comparison to get uniqueness in a single
>system badly needs to be documented because that switch id is very much
>not enough on it's own.
>
>phys_port_id on the other hand is globally unique.  So please stop
>saying that this is anything like phys_port_id.

I'm sorry. But is is the same. And the purpose is very similar. In case
of phys port id to find out if 2 netdevices share the same physical port.
In case of phys switch id to find out if 2 netdevices share the same
switch parent.

^ permalink raw reply

* Re: [PATCH] net: tulip: Remove private "strncmp"
From: Grant Grundler @ 2014-12-04 19:35 UTC (permalink / raw)
  To: Rasmus Villemoes
  Cc: Grant Grundler, open list:TULIP NETWORK DRI..., open list
In-Reply-To: <1417689040-14958-1-git-send-email-linux@rasmusvillemoes.dk>

On Thu, Dec 4, 2014 at 2:30 AM, Rasmus Villemoes
<linux@rasmusvillemoes.dk> wrote:
> The comment says that the built-in strncmp didn't work. That is not
> surprising, as apparently "str" semantics are not really what is
> wanted (hint: de4x5_strncmp only stops when two different bytes are
> encountered or the end is reached; not if either byte happens to be
> 0). de4x5_strncmp is actually a memcmp (except for the signature and
> that bytes are not necessarily treated as unsigned char); since only
> the boolean value of the result is used we can just replace
> de4x5_strncmp with memcmp.
>
> Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
> ---
>
> Notes:
>     I don't know if the comment meant to say 3 bytes, or if the code
>     compares meaningful chunks of memory (the first three bytes of
>     &lp->srom span 1.5 fields, and the three bytes from (char*)&lp->srom +
>     0x10 are &lp->srom.{id_block_crc,reserved2,version} - it seems odd
>     that these chunks should ever be equal to each other and to the
>     enet_det[i]). Whether or not the current code works, this patch
>     shouldn't change the semantics, and I'd like to get rid of
>     de4x5_strncmp since it is not, in fact, a strncmp.

+1 I think your analysis is correct. The function appears to be
checking against a black list of MAC addresses that has two "broken"
devices MAC addresses.

Acked-by: Grant Grundler <grundler@parisc-linux.org>

thanks,
grant

ps. I don't like how de4x5_bad_srom() is structured but I can't test
these devices either.  Specifically, the offset of the MAC address
should be known and we should only be testing that offset to see if
it's "that vendor".

>  drivers/net/ethernet/dec/tulip/de4x5.c | 20 +++-----------------
>  1 file changed, 3 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/net/ethernet/dec/tulip/de4x5.c b/drivers/net/ethernet/dec/tulip/de4x5.c
> index cf8b6ff..badff18 100644
> --- a/drivers/net/ethernet/dec/tulip/de4x5.c
> +++ b/drivers/net/ethernet/dec/tulip/de4x5.c
> @@ -995,7 +995,6 @@ static void    de4x5_dbg_mii(struct net_device *dev, int k);
>  static void    de4x5_dbg_media(struct net_device *dev);
>  static void    de4x5_dbg_srom(struct de4x5_srom *p);
>  static void    de4x5_dbg_rx(struct sk_buff *skb, int len);
> -static int     de4x5_strncmp(char *a, char *b, int n);
>  static int     dc21041_infoleaf(struct net_device *dev);
>  static int     dc21140_infoleaf(struct net_device *dev);
>  static int     dc21142_infoleaf(struct net_device *dev);
> @@ -4102,8 +4101,7 @@ get_hw_addr(struct net_device *dev)
>  }
>
>  /*
> -** Test for enet addresses in the first 32 bytes. The built-in strncmp
> -** didn't seem to work here...?
> +** Test for enet addresses in the first 32 bytes.
>  */
>  static int
>  de4x5_bad_srom(struct de4x5_private *lp)
> @@ -4111,8 +4109,8 @@ de4x5_bad_srom(struct de4x5_private *lp)
>      int i, status = 0;
>
>      for (i = 0; i < ARRAY_SIZE(enet_det); i++) {
> -       if (!de4x5_strncmp((char *)&lp->srom, (char *)&enet_det[i], 3) &&
> -           !de4x5_strncmp((char *)&lp->srom+0x10, (char *)&enet_det[i], 3)) {
> +       if (!memcmp(&lp->srom, &enet_det[i], 3) &&
> +           !memcmp((char *)&lp->srom+0x10, &enet_det[i], 3)) {
>             if (i == 0) {
>                 status = SMC;
>             } else if (i == 1) {
> @@ -4125,18 +4123,6 @@ de4x5_bad_srom(struct de4x5_private *lp)
>      return status;
>  }
>
> -static int
> -de4x5_strncmp(char *a, char *b, int n)
> -{
> -    int ret=0;
> -
> -    for (;n && !ret; n--) {
> -       ret = *a++ - *b++;
> -    }
> -
> -    return ret;
> -}
> -
>  static void
>  srom_repair(struct net_device *dev, int card)
>  {
> --
> 2.0.4
>

^ permalink raw reply

* Re: [PATCH net-next] arch_fast_hash: avoid indirect function calls and implement hash in asm
From: Hannes Frederic Sowa @ 2014-12-04 19:32 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: netdev, Herbert Xu, Thomas Graf, Daniel Borkmann, Eric Dumazet
In-Reply-To: <15333.1417721231@famine>

Hi Jay,

On Do, 2014-12-04 at 11:27 -0800, Jay Vosburgh wrote:
> Hannes Frederic Sowa <hannes@stressinduktion.org> wrote:
> 
> >By default the arch_fast_hash hashing function pointers are initialized
> >to jhash(2). If during boot-up a CPU with SSE4.2 is detected they get
> >updated to the CRC32 ones. This dispatching scheme incurs a function
> >pointer lookup and indirect call for every hashing operation.
> >
> >To keep the number of clobbered registers short the hashing primitives
> >are implemented in assembler. This makes it easier to do the dispatch
> >by alternative_call.
> 
> 	I have tested this on the same system that panicked with the
> original (now reverted) implementation (commit e5a2c8999576 "fast_hash:
> avoid indirect function calls"), and it functions correctly and does not
> panic.
> 
> 	I looked at the disassembly, and, as a data point, on a
> non-SSE4.2 system, the code generated is not as efficient as Hannes'
> original test patch, found here:
> 
> http://comments.gmane.org/gmane.linux.network/338430
> 
> 	which produced code as follows:
> 
> 0xffffffffa00b6bb9 <ovs_flow_tbl_insert+0xb9>:  mov    %r15,0x348(%r14)
> 0xffffffffa00b6bc0 <ovs_flow_tbl_insert+0xc0>:  movzwl 0x28(%r15),%ecx
> 0xffffffffa00b6bc5 <ovs_flow_tbl_insert+0xc5>:  movzwl 0x2a(%r15),%esi
> 0xffffffffa00b6bca <ovs_flow_tbl_insert+0xca>:  movzwl %cx,%eax
> 0xffffffffa00b6bcd <ovs_flow_tbl_insert+0xcd>:  sub    %ecx,%esi
> 0xffffffffa00b6bcf <ovs_flow_tbl_insert+0xcf>:  lea    0x38(%r14,%rax,1),%rdi
> 0xffffffffa00b6bd4 <ovs_flow_tbl_insert+0xd4>:  sar    $0x2,%esi
> 0xffffffffa00b6bd7 <ovs_flow_tbl_insert+0xd7>:  callq  0xffffffff813a7810 <__jhash2>
> 0xffffffffa00b6bdc <ovs_flow_tbl_insert+0xdc>:  mov    %eax,0x30(%r14)
> 0xffffffffa00b6be0 <ovs_flow_tbl_insert+0xe0>:  mov    (%rbx),%r13
> 0xffffffffa00b6be3 <ovs_flow_tbl_insert+0xe3>:  mov    %r14,%rsi
> 0xffffffffa00b6be6 <ovs_flow_tbl_insert+0xe6>:  mov    %r13,%rdi
> 0xffffffffa00b6be9 <ovs_flow_tbl_insert+0xe9>:  callq  0xffffffffa00b61a0 <table_instance_insert>
> 
> 	This patch's code ends up as follows:
> 
> 0xffffffffa01b5a57 <ovs_flow_tbl_insert+0xb7>:	mov    %r15,0x348(%rcx)
> 0xffffffffa01b5a5e <ovs_flow_tbl_insert+0xbe>:	movzwl 0x28(%r15),%eax
> 0xffffffffa01b5a63 <ovs_flow_tbl_insert+0xc3>:	movzwl 0x2a(%r15),%esi
> 0xffffffffa01b5a68 <ovs_flow_tbl_insert+0xc8>:	movzwl %ax,%edx
> 0xffffffffa01b5a6b <ovs_flow_tbl_insert+0xcb>:	sub    %eax,%esi
> 0xffffffffa01b5a6d <ovs_flow_tbl_insert+0xcd>:	lea    0x38(%rcx,%rdx,1),%rdi
> 0xffffffffa01b5a72 <ovs_flow_tbl_insert+0xd2>:	xor    %edx,%edx
> 0xffffffffa01b5a74 <ovs_flow_tbl_insert+0xd4>:	sar    $0x2,%esi
> 0xffffffffa01b5a77 <ovs_flow_tbl_insert+0xd7>:	callq  0xffffffff813ae9f0 <__jhash_trampoline>
> 0xffffffffa01b5a7c <ovs_flow_tbl_insert+0xdc>:	mov    %eax,0x30(%rcx)
> 0xffffffffa01b5a7f <ovs_flow_tbl_insert+0xdf>:	mov    (%rbx),%r13
> 0xffffffffa01b5a82 <ovs_flow_tbl_insert+0xe2>:	mov    %rcx,%rsi
> 0xffffffffa01b5a85 <ovs_flow_tbl_insert+0xe5>:	mov    %r13,%rdi
> 0xffffffffa01b5a88 <ovs_flow_tbl_insert+0xe8>:	callq  0xffffffffa01b5030 <table_instance_insert>
> 
> 0xffffffff813ae9f0 <__jhash_trampoline>:	push   %rcx
> 0xffffffff813ae9f1 <__jhash_trampoline+0x1>:	push   %r8
> 0xffffffff813ae9f3 <__jhash_trampoline+0x3>:	push   %r9
> 0xffffffff813ae9f5 <__jhash_trampoline+0x5>:	push   %r10
> 0xffffffff813ae9f7 <__jhash_trampoline+0x7>:	push   %r11
> 0xffffffff813ae9f9 <__jhash_trampoline+0x9>:	callq  0xffffffff813ae8a0 <__jhash>
> 0xffffffff813ae9fe <__jhash_trampoline+0xe>:	pop    %r11
> 0xffffffff813aea00 <__jhash_trampoline+0x10>:	pop    %r10
> 0xffffffff813aea02 <__jhash_trampoline+0x12>:	pop    %r9
> 0xffffffff813aea04 <__jhash_trampoline+0x14>:	pop    %r8
> 0xffffffff813aea06 <__jhash_trampoline+0x16>:	pop    %rcx
> 0xffffffff813aea07 <__jhash_trampoline+0x17>:	retq   
> 
> 	In any event, this new patch does work correctly in my test that
> originally failed, and it's debatable how much optimizing for old
> systems is worthwhile.

Yes, that is expected. I also don't have a good idea on how to improve
the hashing on non-SSE4.2 systems in a reasonable amount of time.

> 	I only tested the non-SSE4.2 (i.e., old system) portion on
> x86_64.

I tried every possible setup this time, especially with openvswitch. I
covered ia32 with and without SSE4.2 as well as x86_64 and it always
behaved correctly. Last time the problem was that the static inline
didn't become a function in OVS, but during the testing with rhashtable
it got synthesized into a normal C call because of the indirect
reference.

Thanks a lot,
Hannes

^ permalink raw reply

* Re: [patch iproute2 1/6] iproute2: ipa: show switch id
From: Eric W. Biederman @ 2014-12-04 19:26 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, nhorman, andy, tgraf, dborkman, ogerlitz, jesse,
	pshelar, azhou, ben, stephen, jeffrey.t.kirsher, vyasevic,
	xiyou.wangcong, john.r.fastabend, edumazet, jhs, sfeldma,
	f.fainelli, roopa, linville, jasowang, nicolas.dichtel,
	ryazanov.s.a, buytenh, aviadr, nbd, alexei.starovoitov,
	Neil.Jerram, ronye, simon.horman, alexander.h.duyck, john.ronciak,
	mleitner, shrijeet, gospo, bcrl, hemal
In-Reply-To: <20141204191926.GK1861@nanopsycho.orion>

Jiri Pirko <jiri@resnulli.us> writes:

>>So this id needs to be globally unique?
>
> No. It is enough to be unique within a single system. It serves for no
> more than to find out 2 ids are same or not, no other info value.
>
> So when the drivers uses sane ids (like mac for example, or in case of
> rocker an id which is passed by qemu command line), the chances of
> collision are very very close to none (never say never).

So the switch id isn't necessarily even as good as a manufacturers
serial number that changes between different models, and anyone who uses
this id must look at the driver to get uniqueness.

Yes what is needed in the comparison to get uniqueness in a single
system badly needs to be documented because that switch id is very much
not enough on it's own.

phys_port_id on the other hand is globally unique.  So please stop
saying that this is anything like phys_port_id.

Eric

^ permalink raw reply

* Re: [PATCH net-next] arch_fast_hash: avoid indirect function calls and implement hash in asm
From: Jay Vosburgh @ 2014-12-04 19:27 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: netdev, Herbert Xu, Thomas Graf, Daniel Borkmann, Eric Dumazet
In-Reply-To: <e77237174e1b8d743ef5171e05abde82bc54af37.1417696901.git.hannes@stressinduktion.org>

Hannes Frederic Sowa <hannes@stressinduktion.org> wrote:

>By default the arch_fast_hash hashing function pointers are initialized
>to jhash(2). If during boot-up a CPU with SSE4.2 is detected they get
>updated to the CRC32 ones. This dispatching scheme incurs a function
>pointer lookup and indirect call for every hashing operation.
>
>To keep the number of clobbered registers short the hashing primitives
>are implemented in assembler. This makes it easier to do the dispatch
>by alternative_call.

	I have tested this on the same system that panicked with the
original (now reverted) implementation (commit e5a2c8999576 "fast_hash:
avoid indirect function calls"), and it functions correctly and does not
panic.

	I looked at the disassembly, and, as a data point, on a
non-SSE4.2 system, the code generated is not as efficient as Hannes'
original test patch, found here:

http://comments.gmane.org/gmane.linux.network/338430

	which produced code as follows:

0xffffffffa00b6bb9 <ovs_flow_tbl_insert+0xb9>:  mov    %r15,0x348(%r14)
0xffffffffa00b6bc0 <ovs_flow_tbl_insert+0xc0>:  movzwl 0x28(%r15),%ecx
0xffffffffa00b6bc5 <ovs_flow_tbl_insert+0xc5>:  movzwl 0x2a(%r15),%esi
0xffffffffa00b6bca <ovs_flow_tbl_insert+0xca>:  movzwl %cx,%eax
0xffffffffa00b6bcd <ovs_flow_tbl_insert+0xcd>:  sub    %ecx,%esi
0xffffffffa00b6bcf <ovs_flow_tbl_insert+0xcf>:  lea    0x38(%r14,%rax,1),%rdi
0xffffffffa00b6bd4 <ovs_flow_tbl_insert+0xd4>:  sar    $0x2,%esi
0xffffffffa00b6bd7 <ovs_flow_tbl_insert+0xd7>:  callq  0xffffffff813a7810 <__jhash2>
0xffffffffa00b6bdc <ovs_flow_tbl_insert+0xdc>:  mov    %eax,0x30(%r14)
0xffffffffa00b6be0 <ovs_flow_tbl_insert+0xe0>:  mov    (%rbx),%r13
0xffffffffa00b6be3 <ovs_flow_tbl_insert+0xe3>:  mov    %r14,%rsi
0xffffffffa00b6be6 <ovs_flow_tbl_insert+0xe6>:  mov    %r13,%rdi
0xffffffffa00b6be9 <ovs_flow_tbl_insert+0xe9>:  callq  0xffffffffa00b61a0 <table_instance_insert>

	This patch's code ends up as follows:

0xffffffffa01b5a57 <ovs_flow_tbl_insert+0xb7>:	mov    %r15,0x348(%rcx)
0xffffffffa01b5a5e <ovs_flow_tbl_insert+0xbe>:	movzwl 0x28(%r15),%eax
0xffffffffa01b5a63 <ovs_flow_tbl_insert+0xc3>:	movzwl 0x2a(%r15),%esi
0xffffffffa01b5a68 <ovs_flow_tbl_insert+0xc8>:	movzwl %ax,%edx
0xffffffffa01b5a6b <ovs_flow_tbl_insert+0xcb>:	sub    %eax,%esi
0xffffffffa01b5a6d <ovs_flow_tbl_insert+0xcd>:	lea    0x38(%rcx,%rdx,1),%rdi
0xffffffffa01b5a72 <ovs_flow_tbl_insert+0xd2>:	xor    %edx,%edx
0xffffffffa01b5a74 <ovs_flow_tbl_insert+0xd4>:	sar    $0x2,%esi
0xffffffffa01b5a77 <ovs_flow_tbl_insert+0xd7>:	callq  0xffffffff813ae9f0 <__jhash_trampoline>
0xffffffffa01b5a7c <ovs_flow_tbl_insert+0xdc>:	mov    %eax,0x30(%rcx)
0xffffffffa01b5a7f <ovs_flow_tbl_insert+0xdf>:	mov    (%rbx),%r13
0xffffffffa01b5a82 <ovs_flow_tbl_insert+0xe2>:	mov    %rcx,%rsi
0xffffffffa01b5a85 <ovs_flow_tbl_insert+0xe5>:	mov    %r13,%rdi
0xffffffffa01b5a88 <ovs_flow_tbl_insert+0xe8>:	callq  0xffffffffa01b5030 <table_instance_insert>

0xffffffff813ae9f0 <__jhash_trampoline>:	push   %rcx
0xffffffff813ae9f1 <__jhash_trampoline+0x1>:	push   %r8
0xffffffff813ae9f3 <__jhash_trampoline+0x3>:	push   %r9
0xffffffff813ae9f5 <__jhash_trampoline+0x5>:	push   %r10
0xffffffff813ae9f7 <__jhash_trampoline+0x7>:	push   %r11
0xffffffff813ae9f9 <__jhash_trampoline+0x9>:	callq  0xffffffff813ae8a0 <__jhash>
0xffffffff813ae9fe <__jhash_trampoline+0xe>:	pop    %r11
0xffffffff813aea00 <__jhash_trampoline+0x10>:	pop    %r10
0xffffffff813aea02 <__jhash_trampoline+0x12>:	pop    %r9
0xffffffff813aea04 <__jhash_trampoline+0x14>:	pop    %r8
0xffffffff813aea06 <__jhash_trampoline+0x16>:	pop    %rcx
0xffffffff813aea07 <__jhash_trampoline+0x17>:	retq   

	In any event, this new patch does work correctly in my test that
originally failed, and it's debatable how much optimizing for old
systems is worthwhile.

	I only tested the non-SSE4.2 (i.e., old system) portion on
x86_64.

Tested-by: Jay Vosburgh <jay.vosburgh@canonical.com>

	-J

>Cc: Herbert Xu <herbert@gondor.apana.org.au>
>Cc: Jay Vosburgh <jay.vosburgh@canonical.com>
>Cc: Thomas Graf <tgraf@suug.ch>
>Cc: Daniel Borkmann <dborkman@redhat.com>
>Cc: Eric Dumazet <eric.dumazet@gmail.com>
>Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
>---
> arch/x86/include/asm/hash.h      |  53 ++++++++++-
> arch/x86/kernel/i386_ksyms_32.c  |   6 ++
> arch/x86/kernel/x8664_ksyms_64.c |   6 ++
> arch/x86/lib/Makefile            |   2 +-
> arch/x86/lib/arch_hash.S         | 192 +++++++++++++++++++++++++++++++++++++++
> arch/x86/lib/hash.c              |  92 -------------------
> arch/x86/lib/jhash.c             |   6 ++
> include/asm-generic/hash.h       |  18 +++-
> include/linux/hash.h             |  34 -------
> lib/Makefile                     |   2 +-
> lib/hash.c                       |  39 --------
> net/openvswitch/flow_table.c     |   2 +-
> 12 files changed, 280 insertions(+), 172 deletions(-)
> create mode 100644 arch/x86/lib/arch_hash.S
> delete mode 100644 arch/x86/lib/hash.c
> create mode 100644 arch/x86/lib/jhash.c
> delete mode 100644 lib/hash.c
>
>diff --git a/arch/x86/include/asm/hash.h b/arch/x86/include/asm/hash.h
>index e8c58f8..620081b 100644
>--- a/arch/x86/include/asm/hash.h
>+++ b/arch/x86/include/asm/hash.h
>@@ -1,7 +1,56 @@
> #ifndef _ASM_X86_HASH_H
> #define _ASM_X86_HASH_H
> 
>-struct fast_hash_ops;
>-extern void setup_arch_fast_hash(struct fast_hash_ops *ops);
>+#include <linux/cpufeature.h>
>+#include <asm/alternative.h>
>+
>+#include <linux/jhash.h>
>+
>+#ifdef CONFIG_AS_CRC32
>+
>+u32 __jhash_trampoline(const void *data, u32 len, u32 seed);
>+u32 __sse42_crc32(const void *data, u32 len, u32 seed);
>+
>+#ifdef CONFIG_X86_64
>+
>+static inline u32 arch_fast_hash(const void *data, u32 len, u32 seed)
>+{
>+	u32 hash;
>+
>+	alternative_call(__jhash_trampoline, __sse42_crc32, X86_FEATURE_XMM4_2,
>+			 ASM_OUTPUT2("=a" (hash), "=D" (data), "=S" (len),
>+				     "=d" (seed)),
>+			 "1" (data), "2" (len), "3" (seed)
>+			 : "memory", "cc");
>+
>+	return hash;
>+}
>+
>+#else /* CONFIG_X86_64 */
>+
>+static inline u32 arch_fast_hash(const void *data, u32 len, u32 seed)
>+{
>+	u32 hash;
>+
>+	alternative_call(__jhash_trampoline, __sse42_crc32, X86_FEATURE_XMM4_2,
>+			 ASM_OUTPUT2("=a" (hash), "=d" (len), "=c" (seed)),
>+			 "0" (data), "1" (len), "2" (seed)
>+			 : "memory", "cc");
>+
>+	return hash;
>+}
>+
>+#endif /* CONFIG_x86_64 */
>+
>+#else /* CONFIG_AS_CRC32 */
>+
>+u32 __jhash(const void *data, u32 len, u32 seed);
>+
>+static inline u32 arch_fast_hash(const void *data, u32 len, u32 seed)
>+{
>+	return __jhash(data, len, seed);
>+}
>+
>+#endif
> 
> #endif /* _ASM_X86_HASH_H */
>diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
>index 05fd74f..afb98da 100644
>--- a/arch/x86/kernel/i386_ksyms_32.c
>+++ b/arch/x86/kernel/i386_ksyms_32.c
>@@ -1,4 +1,5 @@
> #include <linux/module.h>
>+#include <linux/hash.h>
> 
> #include <asm/checksum.h>
> #include <asm/pgtable.h>
>@@ -38,6 +39,11 @@ EXPORT_SYMBOL(strstr);
> EXPORT_SYMBOL(csum_partial);
> EXPORT_SYMBOL(empty_zero_page);
> 
>+#ifdef CONFIG_AS_CRC32
>+EXPORT_SYMBOL(__sse42_crc32);
>+EXPORT_SYMBOL(__jhash_trampoline);
>+#endif
>+
> #ifdef CONFIG_PREEMPT
> EXPORT_SYMBOL(___preempt_schedule);
> #ifdef CONFIG_CONTEXT_TRACKING
>diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
>index 0406819..1094c13 100644
>--- a/arch/x86/kernel/x8664_ksyms_64.c
>+++ b/arch/x86/kernel/x8664_ksyms_64.c
>@@ -3,6 +3,7 @@
> 
> #include <linux/module.h>
> #include <linux/smp.h>
>+#include <linux/hash.h>
> 
> #include <net/checksum.h>
> 
>@@ -42,6 +43,11 @@ EXPORT_SYMBOL(clear_page);
> 
> EXPORT_SYMBOL(csum_partial);
> 
>+#ifdef CONFIG_AS_CRC32
>+EXPORT_SYMBOL(__sse42_crc32);
>+EXPORT_SYMBOL(__jhash_trampoline);
>+#endif
>+
> /*
>  * Export string functions. We normally rely on gcc builtin for most of these,
>  * but gcc sometimes decides not to inline them.
>diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
>index db92793..168bbef 100644
>--- a/arch/x86/lib/Makefile
>+++ b/arch/x86/lib/Makefile
>@@ -23,7 +23,7 @@ lib-y += memcpy_$(BITS).o
> lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
> lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
> 
>-obj-y += msr.o msr-reg.o msr-reg-export.o hash.o
>+obj-y += msr.o msr-reg.o msr-reg-export.o jhash.o arch_hash.o
> 
> ifeq ($(CONFIG_X86_32),y)
>         obj-y += atomic64_32.o
>diff --git a/arch/x86/lib/arch_hash.S b/arch/x86/lib/arch_hash.S
>new file mode 100644
>index 0000000..ff526a4
>--- /dev/null
>+++ b/arch/x86/lib/arch_hash.S
>@@ -0,0 +1,192 @@
>+#include <linux/linkage.h>
>+#include <asm/dwarf2.h>
>+#include <asm/calling.h>
>+
>+#ifdef CONFIG_AS_CRC32
>+
>+#ifdef CONFIG_X86_64
>+
>+ENTRY(__jhash_trampoline)
>+	CFI_STARTPROC
>+
>+	pushq_cfi %rcx
>+	pushq_cfi %r8
>+	pushq_cfi %r9
>+	pushq_cfi %r10
>+	pushq_cfi %r11
>+
>+	call __jhash
>+
>+	popq_cfi %r11
>+	popq_cfi %r10
>+	popq_cfi %r9
>+	popq_cfi %r8
>+	popq_cfi %rcx
>+
>+	retq
>+
>+	CFI_ENDPROC
>+ENDPROC(__jhash_trampoline)
>+
>+ENTRY(__sse42_crc32)
>+	CFI_STARTPROC
>+
>+	movq %rdx, %rax
>+	cmpq $0x40, %rsi
>+	jb .Lcrc_32bytes
>+	subq $0x40, %rsi
>+
>+.Lcrc_64bytes:
>+	subq $0x40, %rsi
>+	crc32q 0*8(%rdi), %rax
>+	crc32q 1*8(%rdi), %rax
>+	crc32q 2*8(%rdi), %rax
>+	crc32q 3*8(%rdi), %rax
>+	crc32q 4*8(%rdi), %rax
>+	crc32q 5*8(%rdi), %rax
>+	crc32q 6*8(%rdi), %rax
>+	crc32q 7*8(%rdi), %rax
>+	leaq   8*8(%rdi), %rdi
>+	jae .Lcrc_64bytes
>+	addq $0x40, %rsi
>+
>+.Lcrc_32bytes:
>+	cmpq $0x20, %rsi
>+	jb .Lcrc_16bytes
>+
>+	subq $0x20, %rsi
>+	crc32q 0*8(%rdi), %rax
>+	crc32q 1*8(%rdi), %rax
>+	crc32q 2*8(%rdi), %rax
>+	crc32q 3*8(%rdi), %rax
>+	leaq   4*8(%rdi), %rdi
>+
>+.Lcrc_16bytes:
>+	cmpq $0x10, %rsi
>+	jb .Lcrc_8bytes
>+
>+	subq $0x10, %rsi
>+	crc32q 0*8(%rdi), %rax
>+	crc32q 1*8(%rdi), %rax
>+	leaq   2*8(%rdi), %rdi
>+
>+.Lcrc_8bytes:
>+	cmpq $0x8, %rsi
>+	jb .Lcrc_4bytes
>+
>+	subq $0x8, %rsi
>+	crc32q (%rdi), %rax
>+	leaq 1*8(%rdi), %rdi
>+
>+.Lcrc_4bytes:
>+	cmpq $0x4, %rsi
>+	jb .Lcrc_2bytes
>+
>+	subq $0x4, %rsi
>+	crc32l (%rdi), %eax
>+	leaq   1*4(%rdi), %rdi
>+
>+.Lcrc_2bytes:
>+	cmpq $0x2, %rsi
>+	jb .Lcrc_1bytes
>+
>+	subq $0x2, %rsi
>+	crc32w (%rdi), %eax
>+	leaq 1*2(%rdi), %rdi
>+
>+.Lcrc_1bytes:
>+	cmpq $0x1, %rsi
>+	jb .Lend
>+
>+	crc32b (%rdi), %eax
>+.Lend:
>+	retq
>+	CFI_ENDPROC
>+ENDPROC(__sse42_crc32)
>+
>+#else /* CONFIG_X86_32 */
>+
>+ENTRY(__jhash_trampoline)
>+	CFI_STARTPROC
>+
>+	call __jhash
>+
>+	retl
>+	CFI_ENDPROC
>+ENDPROC(__jhash_trampoline)
>+
>+ENTRY(__sse42_crc32)
>+	CFI_STARTPROC
>+
>+	xchgl %eax,%ecx
>+	xchgl %edx,%ecx
>+
>+	cmpl $0x20, %ecx
>+	jb .Lcrc_16bytes
>+	subl $0x20, %ecx
>+
>+.Lcrc_32bytes:
>+	subl $0x20, %ecx
>+	crc32l 0*4(%edx), %eax
>+	crc32l 1*4(%edx), %eax
>+	crc32l 2*4(%edx), %eax
>+	crc32l 3*4(%edx), %eax
>+	crc32l 4*4(%edx), %eax
>+	crc32l 5*4(%edx), %eax
>+	crc32l 6*4(%edx), %eax
>+	crc32l 7*4(%edx), %eax
>+	leal   8*4(%edx), %edx
>+	jae .Lcrc_32bytes
>+	addl $0x20, %ecx
>+
>+.Lcrc_16bytes:
>+	cmpl $0x10, %ecx
>+	jb .Lcrc_8bytes
>+
>+	subl $0x10, %ecx
>+	crc32l 0*4(%edx), %eax
>+	crc32l 1*4(%edx), %eax
>+	crc32l 2*4(%edx), %eax
>+	crc32l 3*4(%edx), %eax
>+	leal   4*4(%edx), %edx
>+
>+.Lcrc_8bytes:
>+	cmpl $0x8, %ecx
>+	jb .Lcrc_4bytes
>+
>+	subl $0x8, %ecx
>+	crc32l 0*4(%edx), %eax
>+	crc32l 1*4(%edx), %eax
>+	leal   2*4(%edx), %edx
>+
>+.Lcrc_4bytes:
>+	cmpl $0x4, %ecx
>+	jb .Lcrc_2bytes
>+
>+	subl $0x4, %ecx
>+	crc32l 0*4(%edx), %eax
>+	leal   1*4(%edx), %edx
>+
>+.Lcrc_2bytes:
>+	cmpl $0x2, %ecx
>+	jb .Lcrc_1bytes
>+
>+	subl $0x2, %ecx
>+	crc32w (%edx), %eax
>+	leal   1*2(%edx), %edx
>+
>+.Lcrc_1bytes:
>+	cmpl $0x1, %ecx
>+	jb .Lend
>+
>+	crc32b (%edx), %eax
>+
>+.Lend:
>+	retl
>+
>+	CFI_ENDPROC
>+ENDPROC(__sse42_crc32)
>+
>+#endif
>+
>+#endif /* CONFIG_AS_CRC32 */
>diff --git a/arch/x86/lib/hash.c b/arch/x86/lib/hash.c
>deleted file mode 100644
>index ff4fa51..0000000
>--- a/arch/x86/lib/hash.c
>+++ /dev/null
>@@ -1,92 +0,0 @@
>-/*
>- * Some portions derived from code covered by the following notice:
>- *
>- * Copyright (c) 2010-2013 Intel Corporation. All rights reserved.
>- * All rights reserved.
>- *
>- * Redistribution and use in source and binary forms, with or without
>- * modification, are permitted provided that the following conditions
>- * are met:
>- *
>- *   * Redistributions of source code must retain the above copyright
>- *     notice, this list of conditions and the following disclaimer.
>- *   * Redistributions in binary form must reproduce the above copyright
>- *     notice, this list of conditions and the following disclaimer in
>- *     the documentation and/or other materials provided with the
>- *     distribution.
>- *   * Neither the name of Intel Corporation nor the names of its
>- *     contributors may be used to endorse or promote products derived
>- *     from this software without specific prior written permission.
>- *
>- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
>- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
>- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
>- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
>- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
>- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
>- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
>- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
>- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
>- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
>- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
>- */
>-
>-#include <linux/hash.h>
>-#include <linux/init.h>
>-
>-#include <asm/processor.h>
>-#include <asm/cpufeature.h>
>-#include <asm/hash.h>
>-
>-static inline u32 crc32_u32(u32 crc, u32 val)
>-{
>-#ifdef CONFIG_AS_CRC32
>-	asm ("crc32l %1,%0\n" : "+r" (crc) : "rm" (val));
>-#else
>-	asm (".byte 0xf2, 0x0f, 0x38, 0xf1, 0xc1" : "+a" (crc) : "c" (val));
>-#endif
>-	return crc;
>-}
>-
>-static u32 intel_crc4_2_hash(const void *data, u32 len, u32 seed)
>-{
>-	const u32 *p32 = (const u32 *) data;
>-	u32 i, tmp = 0;
>-
>-	for (i = 0; i < len / 4; i++)
>-		seed = crc32_u32(seed, *p32++);
>-
>-	switch (len & 3) {
>-	case 3:
>-		tmp |= *((const u8 *) p32 + 2) << 16;
>-		/* fallthrough */
>-	case 2:
>-		tmp |= *((const u8 *) p32 + 1) << 8;
>-		/* fallthrough */
>-	case 1:
>-		tmp |= *((const u8 *) p32);
>-		seed = crc32_u32(seed, tmp);
>-		break;
>-	}
>-
>-	return seed;
>-}
>-
>-static u32 intel_crc4_2_hash2(const u32 *data, u32 len, u32 seed)
>-{
>-	const u32 *p32 = (const u32 *) data;
>-	u32 i;
>-
>-	for (i = 0; i < len; i++)
>-		seed = crc32_u32(seed, *p32++);
>-
>-	return seed;
>-}
>-
>-void __init setup_arch_fast_hash(struct fast_hash_ops *ops)
>-{
>-	if (cpu_has_xmm4_2) {
>-		ops->hash  = intel_crc4_2_hash;
>-		ops->hash2 = intel_crc4_2_hash2;
>-	}
>-}
>diff --git a/arch/x86/lib/jhash.c b/arch/x86/lib/jhash.c
>new file mode 100644
>index 0000000..ab4b408
>--- /dev/null
>+++ b/arch/x86/lib/jhash.c
>@@ -0,0 +1,6 @@
>+#include <linux/jhash.h>
>+
>+u32 __jhash(const void *data, u32 len, u32 seed)
>+{
>+	return jhash(data, len, seed);
>+}
>diff --git a/include/asm-generic/hash.h b/include/asm-generic/hash.h
>index b631284..07b8892 100644
>--- a/include/asm-generic/hash.h
>+++ b/include/asm-generic/hash.h
>@@ -1,9 +1,23 @@
> #ifndef __ASM_GENERIC_HASH_H
> #define __ASM_GENERIC_HASH_H
> 
>-struct fast_hash_ops;
>-static inline void setup_arch_fast_hash(struct fast_hash_ops *ops)
>+#include <linux/jhash.h>
>+
>+/**
>+ *	arch_fast_hash - Caclulates a hash over a given buffer that can have
>+ *			 arbitrary size. This function will eventually use an
>+ *			 architecture-optimized hashing implementation if
>+ *			 available, and trades off distribution for speed.
>+ *
>+ *	@data: buffer to hash
>+ *	@len: length of buffer in bytes
>+ *	@seed: start seed
>+ *
>+ *	Returns 32bit hash.
>+ */
>+u32 arch_fast_hash(const void *data, u32 len, u32 seed)
> {
>+	return jhash(data, len, seed);
> }
> 
> #endif /* __ASM_GENERIC_HASH_H */
>diff --git a/include/linux/hash.h b/include/linux/hash.h
>index d0494c3..6e8fb02 100644
>--- a/include/linux/hash.h
>+++ b/include/linux/hash.h
>@@ -84,38 +84,4 @@ static inline u32 hash32_ptr(const void *ptr)
> 	return (u32)val;
> }
> 
>-struct fast_hash_ops {
>-	u32 (*hash)(const void *data, u32 len, u32 seed);
>-	u32 (*hash2)(const u32 *data, u32 len, u32 seed);
>-};
>-
>-/**
>- *	arch_fast_hash - Caclulates a hash over a given buffer that can have
>- *			 arbitrary size. This function will eventually use an
>- *			 architecture-optimized hashing implementation if
>- *			 available, and trades off distribution for speed.
>- *
>- *	@data: buffer to hash
>- *	@len: length of buffer in bytes
>- *	@seed: start seed
>- *
>- *	Returns 32bit hash.
>- */
>-extern u32 arch_fast_hash(const void *data, u32 len, u32 seed);
>-
>-/**
>- *	arch_fast_hash2 - Caclulates a hash over a given buffer that has a
>- *			  size that is of a multiple of 32bit words. This
>- *			  function will eventually use an architecture-
>- *			  optimized hashing implementation if available,
>- *			  and trades off distribution for speed.
>- *
>- *	@data: buffer to hash (must be 32bit padded)
>- *	@len: number of 32bit words
>- *	@seed: start seed
>- *
>- *	Returns 32bit hash.
>- */
>-extern u32 arch_fast_hash2(const u32 *data, u32 len, u32 seed);
>-
> #endif /* _LINUX_HASH_H */
>diff --git a/lib/Makefile b/lib/Makefile
>index 0211d2b..4b9baa4 100644
>--- a/lib/Makefile
>+++ b/lib/Makefile
>@@ -26,7 +26,7 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
> 	 bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
> 	 gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o \
> 	 bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \
>-	 percpu-refcount.o percpu_ida.o hash.o rhashtable.o reciprocal_div.o
>+	 percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o
> obj-y += string_helpers.o
> obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
> obj-y += kstrtox.o
>diff --git a/lib/hash.c b/lib/hash.c
>deleted file mode 100644
>index fea973f..0000000
>--- a/lib/hash.c
>+++ /dev/null
>@@ -1,39 +0,0 @@
>-/* General purpose hashing library
>- *
>- * That's a start of a kernel hashing library, which can be extended
>- * with further algorithms in future. arch_fast_hash{2,}() will
>- * eventually resolve to an architecture optimized implementation.
>- *
>- * Copyright 2013 Francesco Fusco <ffusco@redhat.com>
>- * Copyright 2013 Daniel Borkmann <dborkman@redhat.com>
>- * Copyright 2013 Thomas Graf <tgraf@redhat.com>
>- * Licensed under the GNU General Public License, version 2.0 (GPLv2)
>- */
>-
>-#include <linux/jhash.h>
>-#include <linux/hash.h>
>-#include <linux/cache.h>
>-
>-static struct fast_hash_ops arch_hash_ops __read_mostly = {
>-	.hash  = jhash,
>-	.hash2 = jhash2,
>-};
>-
>-u32 arch_fast_hash(const void *data, u32 len, u32 seed)
>-{
>-	return arch_hash_ops.hash(data, len, seed);
>-}
>-EXPORT_SYMBOL_GPL(arch_fast_hash);
>-
>-u32 arch_fast_hash2(const u32 *data, u32 len, u32 seed)
>-{
>-	return arch_hash_ops.hash2(data, len, seed);
>-}
>-EXPORT_SYMBOL_GPL(arch_fast_hash2);
>-
>-static int __init hashlib_init(void)
>-{
>-	setup_arch_fast_hash(&arch_hash_ops);
>-	return 0;
>-}
>-early_initcall(hashlib_init);
>diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
>index e0a7fef..79bc65d 100644
>--- a/net/openvswitch/flow_table.c
>+++ b/net/openvswitch/flow_table.c
>@@ -366,7 +366,7 @@ static u32 flow_hash(const struct sw_flow_key *key, int key_start,
> 	/* Make sure number of hash bytes are multiple of u32. */
> 	BUILD_BUG_ON(sizeof(long) % sizeof(u32));
> 
>-	return arch_fast_hash2(hash_key, hash_u32s, 0);
>+	return arch_fast_hash(hash_key, hash_u32s, 0);
> }
> 
> static int flow_key_start(const struct sw_flow_key *key)
>-- 
>1.9.3

---
	-Jay Vosburgh, jay.vosburgh@canonical.com

^ permalink raw reply

* Re: [linux-nics] [PATCHv3 net] i40e: Implement ndo_gso_check()
From: Jeff Kirsher @ 2014-12-04 19:20 UTC (permalink / raw)
  To: Sergei Shtylyov
  Cc: Joe Stringer, netdev, linux.nics, jesse, linux-kernel, therbert
In-Reply-To: <5480B1F0.9090407@cogentembedded.com>

[-- Attachment #1: Type: text/plain, Size: 2491 bytes --]

On Thu, 2014-12-04 at 22:11 +0300, Sergei Shtylyov wrote:
> Hello.
> 
> On 12/04/2014 09:39 PM, Joe Stringer wrote:
> 
> > ndo_gso_check() was recently introduced to allow NICs to report the
> > offloading support that they have on a per-skb basis. Add an
> > implementation for this driver which checks for IPIP, GRE, UDP tunnels.
> 
> > Signed-off-by: Joe Stringer <joestringer@nicira.com>
> > ---
> > v3: Drop IPIP and GRE (no driver support even though hw supports it).
> >      Check for UDP outer protocol for UDP tunnels.
> > v2: Expand to include IP in IP and IPv4/IPv6 inside GRE/UDP tunnels.
> >      Add MAX_INNER_LENGTH (as 80).
> > ---
> >   drivers/net/ethernet/intel/i40e/i40e_main.c |   26 ++++++++++++++++++++++++++
> >   1 file changed, 26 insertions(+)
> 
> > diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
> > index c3a7f4a..0d6493a 100644
> > --- a/drivers/net/ethernet/intel/i40e/i40e_main.c
> > +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
> > @@ -7447,6 +7447,31 @@ static int i40e_ndo_fdb_dump(struct sk_buff *skb,
> >
> >   #endif /* USE_DEFAULT_FDB_DEL_DUMP */
> >   #endif /* HAVE_FDB_OPS */
> 
>     Need empty line here, I think.

Nope, look above the #endif's, that is the blank line that you are
wanting if the above #ifdefs are defined and if not, the blank line is
before the #ifdef's.  So if you were to add a blank line as you
requested, there is a possibility of 2 blank lines in a row.  So it is
not needed.

> 
> > +static bool i40e_gso_check(struct sk_buff *skb, struct net_device *dev)
> > +{
> > +	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) {
> > +		unsigned char *ihdr;
> > +
> > +		if (skb->protocol != IPPROTO_UDP ||
> > +		    skb->inner_protocol_type != ENCAP_TYPE_ETHER)
> > +			return false;
> > +
> > +		if (skb->inner_protocol == htons(ETH_P_TEB))
> > +			ihdr = skb_inner_mac_header(skb);
> > +		else if (skb->inner_protocol == htons(ETH_P_IP) ||
> > +			 skb->inner_protocol == htons(ETH_P_IPV6))
> > +			ihdr = skb_inner_network_header(skb);
> > +		else
> > +			return false;
> 
>     The above is asking to be a *switch* instead, no?
> 
> > +
> > +#define MAX_TUNNEL_HDR_LEN	80
> 
>     I'd #define this just above the function, if not at the start of the file...
> 
> [...]
> 
> WBR, Sergei
> 
> _______________________________________________
> Linux-nics mailing list
> Linux-nics@intel.com



[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* Re: [patch iproute2 1/6] iproute2: ipa: show switch id
From: Jiri Pirko @ 2014-12-04 19:19 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: netdev, davem, nhorman, andy, tgraf, dborkman, ogerlitz, jesse,
	pshelar, azhou, ben, stephen, jeffrey.t.kirsher, vyasevic,
	xiyou.wangcong, john.r.fastabend, edumazet, jhs, sfeldma,
	f.fainelli, roopa, linville, jasowang, nicolas.dichtel,
	ryazanov.s.a, buytenh, aviadr, nbd, alexei.starovoitov,
	Neil.Jerram, ronye, simon.horman, alexander.h.duyck, john.ronciak,
	mleitner, shrijeet, gospo, bcrl, hemal
In-Reply-To: <87k327450a.fsf@x220.int.ebiederm.org>

Thu, Dec 04, 2014 at 07:57:41PM CET, ebiederm@xmission.com wrote:
>Jiri Pirko <jiri@resnulli.us> writes:
>
>> Thu, Dec 04, 2014 at 06:52:49PM CET, ebiederm@xmission.com wrote:
>>>Jiri Pirko <jiri@resnulli.us> writes:
>>>
>>>> Thu, Dec 04, 2014 at 05:15:04PM CET, ebiederm@xmission.com wrote:
>>>>>Jiri Pirko <jiri@resnulli.us> writes:
>>>>>
>>>>>Would someone please explain to me what a switch id is?
>>>>>
>>>>>I looked in the kernel source, and I looked here and while I know
>>>>>switches I don't have a clue what a switch id is.
>>>>>
>>>>>My primary concern at this point is that you have introduced a global
>>>>>identifier that is isn't a hardware property (it certainly does not look
>>>>>like a mac address) and that is unique across network namespaces and
>>>>>thus breaks checkpoint/restart (aka CRIU).
>>>>
>>>> IFLA_PHYS_SWITCH_ID is very similar to IFLA_PHYS_PORT_ID. It is
>>>> generated by the driver and ensures that there is the same switch id for
>>>> all ports belonging to the same switch chip/asic. It is up to the driver
>>>> how to implement the id. I would like to point you to driver
>>>> implementing ndo_get_phys_port_id
>>>
>>>Looking at ndo_get_phys_port_id it is just the per port mac address.  Or
>>>guid in the case of infiniband.  Which really makes me wonder why we
>>>didn't use the same abstractions in the code for address types that we
>>>do for hardware addresses.
>>>
>>>Using mac address or other hardware addresses that are used for layer 2
>>>addressing makes sense to me.  There is a long tradition of that and as
>>>I recall protocols like STP actually requiring having a different mac
>>>address per port.
>>>
>>>When I asked the question I thought the switch id was going to be
>>>something like the ifindex, the software index of a network device.
>>>
>>>
>>>Finally having tracked down the rocker implementation of 
>>>rocker_port_switch_parent_id_get I see it you are reading some 64bit
>>>hardware register.
>>>
>>>Which leads me to ask what are the semantics of switch_id?
>>>
>>>Is the switch id an identifier with a prefix from IEEE and assigned by
>>>the manufacture so that it is guaranteed to the tolerances of the
>>>manufacturing process to be globally unique?
>>
>> It is up to the driver what to use. It can use mac addr. This is same as
>> for phys port id.
>
>My reading of the code says the phys port id is the layer 2 hardware
>address of the port.  Certainly that is the case for all of the
>implementations now and it would be insane for it to be anything else.
>
>>>Is the switch id a random number that is statistically likely to be
>>>globally unique because you have enough bits?   As I recall you need
>>>at least 128 bits to have a reasonable chance of a random number
>>>avoiding the birthday paradox.
>>>
>>>Do we need some kind of manufacturer id to tell one switch id from
>>>another?
>>>
>>>Is the switch id persistent across reboots?
>>
>> Yes it is (as for phys port id).
>
>>>>>Also what in the world does PHYS mean in IFLA_PHYS_SWITCH_ID?  Does that
>>>>>mean we can't have a purely software implementation of this interface?
>>>>>Given that we will want a software implementation at some point
>>>>>including PHYS in the name seems completely wrong.
>>>>
>>>> We can remove the "PHYS", no problem. I do not understand what you say
>>>> about "software implementation". The point is to allow hw switch/ish
>>>> chips to be supported.
>>>
>>>If we are talking about something typically stored in a eeprom like a
>>>mac address phys seems appropriate.
>>
>> Yes, we are.
>>
>>>
>>>Still having a definition of this switch id clean clear enough that
>>>net/bridge and drivers/net/macvlan can implement it seems important.
>>
>> I don't understand why would net/bridge or driver/net/macvlan want to
>> implement this. The purpose is to group ports/netdevs which are part of
>> the same hw switch.
>
>Certainly all of the macvlan devices are part of the same logical
>switch, and are all the same hardware.
>
>>>Even more important is having a definition of switch id clear enough
>>>that userspace can use the switch id to do something useful.
>>
>> Userspace threats this the same it treats phys port id.
>>
>> if two ports/netdevs has same switch id, they belong under same hw
>> switch. That's all.
>
>So this id needs to be globally unique?

No. It is enough to be unique within a single system. It serves for no
more than to find out 2 ids are same or not, no other info value.

So when the drivers uses sane ids (like mac for example, or in case of
rocker an id which is passed by qemu command line), the chances of
collision are very very close to none (never say never).

>
>How does the rocket driver achieve global uniquness?  Is the rocket
>driver using an IEEE assigned prefix based id like the ethernet mac
>address?
>
>This is an important detail as ensuring global uniqueness can take a lot
>of work so there needs to be a general agreement on how global
>uniqueness is achieved.
>
>Saying it is up to the driver how to achieve a globally unique id across
>every different switch driver is really insufficient.  If I have two
>drivers that each implement an id that is 64bit long and they use two
>completely different methods there is a real chance of collision.
>
>So either I need a driver id that I also use in the comparison between
>switch ids or how a driver generates a globally unique id need to be
>specified.
>
>>>Right now switch id looks like one of those weird one manufacturer
>>>properties that is fine to expose as a driver specific property
>>>but I don't yet see it being a generic property I that can be used
>>>usefully in userspace.
>>>
>>>So can we please get some clear semantics or failing that can we please
>>>not expose this to userspace as generic property.
>>
>> I thought that the semantics is clean. Looks like I will have to update
>> Documentation/networking/switchdev.txt adding some more info about
>> this.
>
>The semantics as described so far will not achieve a useful id, which
>makes them rubbish.
>
>Eric

^ permalink raw reply

* Re: [PATCHv3 net] i40e: Implement ndo_gso_check()
From: Sergei Shtylyov @ 2014-12-04 19:11 UTC (permalink / raw)
  To: Joe Stringer, netdev
  Cc: linux-kernel, jesse, shannon.nelson, jesse.brandeburg,
	jeffrey.t.kirsher, therbert, linux.nics
In-Reply-To: <1417718366-14310-1-git-send-email-joestringer@nicira.com>

Hello.

On 12/04/2014 09:39 PM, Joe Stringer wrote:

> ndo_gso_check() was recently introduced to allow NICs to report the
> offloading support that they have on a per-skb basis. Add an
> implementation for this driver which checks for IPIP, GRE, UDP tunnels.

> Signed-off-by: Joe Stringer <joestringer@nicira.com>
> ---
> v3: Drop IPIP and GRE (no driver support even though hw supports it).
>      Check for UDP outer protocol for UDP tunnels.
> v2: Expand to include IP in IP and IPv4/IPv6 inside GRE/UDP tunnels.
>      Add MAX_INNER_LENGTH (as 80).
> ---
>   drivers/net/ethernet/intel/i40e/i40e_main.c |   26 ++++++++++++++++++++++++++
>   1 file changed, 26 insertions(+)

> diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
> index c3a7f4a..0d6493a 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_main.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
> @@ -7447,6 +7447,31 @@ static int i40e_ndo_fdb_dump(struct sk_buff *skb,
>
>   #endif /* USE_DEFAULT_FDB_DEL_DUMP */
>   #endif /* HAVE_FDB_OPS */

    Need empty line here, I think.

> +static bool i40e_gso_check(struct sk_buff *skb, struct net_device *dev)
> +{
> +	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) {
> +		unsigned char *ihdr;
> +
> +		if (skb->protocol != IPPROTO_UDP ||
> +		    skb->inner_protocol_type != ENCAP_TYPE_ETHER)
> +			return false;
> +
> +		if (skb->inner_protocol == htons(ETH_P_TEB))
> +			ihdr = skb_inner_mac_header(skb);
> +		else if (skb->inner_protocol == htons(ETH_P_IP) ||
> +			 skb->inner_protocol == htons(ETH_P_IPV6))
> +			ihdr = skb_inner_network_header(skb);
> +		else
> +			return false;

    The above is asking to be a *switch* instead, no?

> +
> +#define MAX_TUNNEL_HDR_LEN	80

    I'd #define this just above the function, if not at the start of the file...

[...]

WBR, Sergei

^ permalink raw reply

* Re: [patch iproute2 1/6] iproute2: ipa: show switch id
From: Eric W. Biederman @ 2014-12-04 18:57 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, nhorman, andy, tgraf, dborkman, ogerlitz, jesse,
	pshelar, azhou, ben, stephen, jeffrey.t.kirsher, vyasevic,
	xiyou.wangcong, john.r.fastabend, edumazet, jhs, sfeldma,
	f.fainelli, roopa, linville, jasowang, nicolas.dichtel,
	ryazanov.s.a, buytenh, aviadr, nbd, alexei.starovoitov,
	Neil.Jerram, ronye, simon.horman, alexander.h.duyck, john.ronciak,
	mleitner, shrijeet, gospo, bcrl, hemal
In-Reply-To: <20141204182451.GI1861@nanopsycho.orion>

Jiri Pirko <jiri@resnulli.us> writes:

> Thu, Dec 04, 2014 at 06:52:49PM CET, ebiederm@xmission.com wrote:
>>Jiri Pirko <jiri@resnulli.us> writes:
>>
>>> Thu, Dec 04, 2014 at 05:15:04PM CET, ebiederm@xmission.com wrote:
>>>>Jiri Pirko <jiri@resnulli.us> writes:
>>>>
>>>>Would someone please explain to me what a switch id is?
>>>>
>>>>I looked in the kernel source, and I looked here and while I know
>>>>switches I don't have a clue what a switch id is.
>>>>
>>>>My primary concern at this point is that you have introduced a global
>>>>identifier that is isn't a hardware property (it certainly does not look
>>>>like a mac address) and that is unique across network namespaces and
>>>>thus breaks checkpoint/restart (aka CRIU).
>>>
>>> IFLA_PHYS_SWITCH_ID is very similar to IFLA_PHYS_PORT_ID. It is
>>> generated by the driver and ensures that there is the same switch id for
>>> all ports belonging to the same switch chip/asic. It is up to the driver
>>> how to implement the id. I would like to point you to driver
>>> implementing ndo_get_phys_port_id
>>
>>Looking at ndo_get_phys_port_id it is just the per port mac address.  Or
>>guid in the case of infiniband.  Which really makes me wonder why we
>>didn't use the same abstractions in the code for address types that we
>>do for hardware addresses.
>>
>>Using mac address or other hardware addresses that are used for layer 2
>>addressing makes sense to me.  There is a long tradition of that and as
>>I recall protocols like STP actually requiring having a different mac
>>address per port.
>>
>>When I asked the question I thought the switch id was going to be
>>something like the ifindex, the software index of a network device.
>>
>>
>>Finally having tracked down the rocker implementation of 
>>rocker_port_switch_parent_id_get I see it you are reading some 64bit
>>hardware register.
>>
>>Which leads me to ask what are the semantics of switch_id?
>>
>>Is the switch id an identifier with a prefix from IEEE and assigned by
>>the manufacture so that it is guaranteed to the tolerances of the
>>manufacturing process to be globally unique?
>
> It is up to the driver what to use. It can use mac addr. This is same as
> for phys port id.

My reading of the code says the phys port id is the layer 2 hardware
address of the port.  Certainly that is the case for all of the
implementations now and it would be insane for it to be anything else.

>>Is the switch id a random number that is statistically likely to be
>>globally unique because you have enough bits?   As I recall you need
>>at least 128 bits to have a reasonable chance of a random number
>>avoiding the birthday paradox.
>>
>>Do we need some kind of manufacturer id to tell one switch id from
>>another?
>>
>>Is the switch id persistent across reboots?
>
> Yes it is (as for phys port id).

>>>>Also what in the world does PHYS mean in IFLA_PHYS_SWITCH_ID?  Does that
>>>>mean we can't have a purely software implementation of this interface?
>>>>Given that we will want a software implementation at some point
>>>>including PHYS in the name seems completely wrong.
>>>
>>> We can remove the "PHYS", no problem. I do not understand what you say
>>> about "software implementation". The point is to allow hw switch/ish
>>> chips to be supported.
>>
>>If we are talking about something typically stored in a eeprom like a
>>mac address phys seems appropriate.
>
> Yes, we are.
>
>>
>>Still having a definition of this switch id clean clear enough that
>>net/bridge and drivers/net/macvlan can implement it seems important.
>
> I don't understand why would net/bridge or driver/net/macvlan want to
> implement this. The purpose is to group ports/netdevs which are part of
> the same hw switch.

Certainly all of the macvlan devices are part of the same logical
switch, and are all the same hardware.

>>Even more important is having a definition of switch id clear enough
>>that userspace can use the switch id to do something useful.
>
> Userspace threats this the same it treats phys port id.
>
> if two ports/netdevs has same switch id, they belong under same hw
> switch. That's all.

So this id needs to be globally unique?

How does the rocket driver achieve global uniquness?  Is the rocket
driver using an IEEE assigned prefix based id like the ethernet mac
address?

This is an important detail as ensuring global uniqueness can take a lot
of work so there needs to be a general agreement on how global
uniqueness is achieved.

Saying it is up to the driver how to achieve a globally unique id across
every different switch driver is really insufficient.  If I have two
drivers that each implement an id that is 64bit long and they use two
completely different methods there is a real chance of collision.

So either I need a driver id that I also use in the comparison between
switch ids or how a driver generates a globally unique id need to be
specified.

>>Right now switch id looks like one of those weird one manufacturer
>>properties that is fine to expose as a driver specific property
>>but I don't yet see it being a generic property I that can be used
>>usefully in userspace.
>>
>>So can we please get some clear semantics or failing that can we please
>>not expose this to userspace as generic property.
>
> I thought that the semantics is clean. Looks like I will have to update
> Documentation/networking/switchdev.txt adding some more info about
> this.

The semantics as described so far will not achieve a useful id, which
makes them rubbish.

Eric

^ permalink raw reply

* Re: [patch iproute2 5/6] link: add missing IFLA_BRPORT_PROXYARP
From: Stephen Hemminger @ 2014-12-04 18:53 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, nhorman, andy, tgraf, dborkman, ogerlitz, jesse,
	pshelar, azhou, ben, jeffrey.t.kirsher, vyasevic, xiyou.wangcong,
	john.r.fastabend, edumazet, jhs, sfeldma, f.fainelli, roopa,
	linville, jasowang, ebiederm, nicolas.dichtel, ryazanov.s.a,
	buytenh, aviadr, nbd, alexei.starovoitov, Neil.Jerram, ronye,
	simon.horman, alexander.h.duyck, john.ronciak, mleitner, shrijeet,
	gospo, bcrl, hemal
In-Reply-To: <1417683438-10935-6-git-send-email-jiri@resnulli.us>

On Thu,  4 Dec 2014 09:57:17 +0100
Jiri Pirko <jiri@resnulli.us> wrote:

> From: Scott Feldman <sfeldma@gmail.com>
> 
> Signed-off-by: Scott Feldman <sfeldma@gmail.com>
> Signed-off-by: Jiri Pirko <jiri@resnulli.us>
> ---
>  include/linux/if_link.h | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/include/linux/if_link.h b/include/linux/if_link.h
> index a6e2594..06efa2d 100644
> --- a/include/linux/if_link.h
> +++ b/include/linux/if_link.h
> @@ -242,6 +242,7 @@ enum {
>  	IFLA_BRPORT_FAST_LEAVE,	/* multicast fast leave    */
>  	IFLA_BRPORT_LEARNING,	/* mac learning */
>  	IFLA_BRPORT_UNICAST_FLOOD, /* flood unicast traffic */
> +	IFLA_BRPORT_PROXYARP,   /* proxy ARP */
>  	__IFLA_BRPORT_MAX
>  };
>  #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)

Unnecessary patch since I pick up headers from upstream.
Already on net-next branch.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox