* [PATCH net-next 1/5] net/socket: factor out msghdr manipulation helpers
2016-11-25 15:39 [PATCH net-next 0/5] net: add protocol level recvmmsg support Paolo Abeni
@ 2016-11-25 15:39 ` Paolo Abeni
2016-11-25 15:39 ` [PATCH net-next 2/5] net/socket: add per protocol mmsg support Paolo Abeni
` (5 subsequent siblings)
6 siblings, 0 replies; 19+ messages in thread
From: Paolo Abeni @ 2016-11-25 15:39 UTC (permalink / raw)
To: netdev
Cc: David S. Miller, Eric Dumazet, Jesper Dangaard Brouer,
Hannes Frederic Sowa, Sabrina Dubroca
So that they can later be used for the recvmmsg refactor.
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
include/net/sock.h | 18 ++++++++++
net/socket.c | 97 +++++++++++++++++++++++++++++-------------------------
2 files changed, 70 insertions(+), 45 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index 442cbb1..c92dc19 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1528,6 +1528,24 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
struct sockcm_cookie *sockc);
+static inline bool sock_recvmmsg_timeout(struct timespec *timeout,
+ struct timespec64 end_time)
+{
+ struct timespec64 timeout64;
+
+ if (!timeout)
+ return false;
+
+ ktime_get_ts64(&timeout64);
+ *timeout = timespec64_to_timespec(timespec64_sub(end_time, timeout64));
+ if (timeout->tv_sec < 0) {
+ timeout->tv_sec = timeout->tv_nsec = 0;
+ return true;
+ }
+
+ return timeout->tv_nsec == 0 && timeout->tv_sec == 0;
+}
+
/*
* Functions to fill in entries in struct proto_ops when a protocol
* does not implement a particular function.
diff --git a/net/socket.c b/net/socket.c
index e2584c5..9b5f360 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1903,6 +1903,21 @@ static int copy_msghdr_from_user(struct msghdr *kmsg,
UIO_FASTIOV, iov, &kmsg->msg_iter);
}
+static int copy_msghdr_from_user_gen(struct msghdr *msg_sys, unsigned int flags,
+ struct compat_msghdr __user *msg_compat,
+ struct user_msghdr __user *msg,
+ struct sockaddr __user **uaddr,
+ struct iovec **iov,
+ struct sockaddr_storage *addr)
+{
+ msg_sys->msg_name = addr;
+
+ if (MSG_CMSG_COMPAT & flags)
+ return get_compat_msghdr(msg_sys, msg_compat, uaddr, iov);
+ else
+ return copy_msghdr_from_user(msg_sys, msg, uaddr, iov);
+}
+
static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
struct msghdr *msg_sys, unsigned int flags,
struct used_address *used_address,
@@ -1919,12 +1934,8 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
int ctl_len;
ssize_t err;
- msg_sys->msg_name = &address;
-
- if (MSG_CMSG_COMPAT & flags)
- err = get_compat_msghdr(msg_sys, msg_compat, NULL, &iov);
- else
- err = copy_msghdr_from_user(msg_sys, msg, NULL, &iov);
+ err = copy_msghdr_from_user_gen(msg_sys, flags, msg_compat, msg, NULL,
+ &iov, &address);
if (err < 0)
return err;
@@ -2101,6 +2112,34 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
return __sys_sendmmsg(fd, mmsg, vlen, flags);
}
+static int copy_msghdr_to_user_gen(struct msghdr *msg_sys, int flags,
+ struct compat_msghdr __user *msg_compat,
+ struct user_msghdr __user *msg,
+ struct sockaddr __user *uaddr,
+ struct sockaddr_storage *addr,
+ unsigned long cmsgptr)
+{
+ int __user *uaddr_len = COMPAT_NAMELEN(msg);
+ int err;
+
+ if (uaddr) {
+ err = move_addr_to_user(addr, msg_sys->msg_namelen, uaddr,
+ uaddr_len);
+ if (err < 0)
+ return err;
+ }
+ err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT),
+ COMPAT_FLAGS(msg));
+ if (err)
+ return err;
+ if (MSG_CMSG_COMPAT & flags)
+ return __put_user((unsigned long)msg_sys->msg_control -
+ cmsgptr, &msg_compat->msg_controllen);
+ else
+ return __put_user((unsigned long)msg_sys->msg_control - cmsgptr,
+ &msg->msg_controllen);
+}
+
static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
struct msghdr *msg_sys, unsigned int flags, int nosec)
{
@@ -2117,14 +2156,9 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
/* user mode address pointers */
struct sockaddr __user *uaddr;
- int __user *uaddr_len = COMPAT_NAMELEN(msg);
- msg_sys->msg_name = &addr;
-
- if (MSG_CMSG_COMPAT & flags)
- err = get_compat_msghdr(msg_sys, msg_compat, &uaddr, &iov);
- else
- err = copy_msghdr_from_user(msg_sys, msg, &uaddr, &iov);
+ err = copy_msghdr_from_user_gen(msg_sys, flags, msg_compat, msg, &uaddr,
+ &iov, &addr);
if (err < 0)
return err;
@@ -2140,24 +2174,8 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
if (err < 0)
goto out_freeiov;
len = err;
-
- if (uaddr != NULL) {
- err = move_addr_to_user(&addr,
- msg_sys->msg_namelen, uaddr,
- uaddr_len);
- if (err < 0)
- goto out_freeiov;
- }
- err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT),
- COMPAT_FLAGS(msg));
- if (err)
- goto out_freeiov;
- if (MSG_CMSG_COMPAT & flags)
- err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
- &msg_compat->msg_controllen);
- else
- err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
- &msg->msg_controllen);
+ err = copy_msghdr_to_user_gen(msg_sys, flags, msg_compat, msg, uaddr,
+ &addr, cmsg_ptr);
if (err)
goto out_freeiov;
err = len;
@@ -2209,7 +2227,6 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
struct compat_mmsghdr __user *compat_entry;
struct msghdr msg_sys;
struct timespec64 end_time;
- struct timespec64 timeout64;
if (timeout &&
poll_select_set_timeout(&end_time, timeout->tv_sec,
@@ -2260,19 +2277,9 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
if (flags & MSG_WAITFORONE)
flags |= MSG_DONTWAIT;
- if (timeout) {
- ktime_get_ts64(&timeout64);
- *timeout = timespec64_to_timespec(
- timespec64_sub(end_time, timeout64));
- if (timeout->tv_sec < 0) {
- timeout->tv_sec = timeout->tv_nsec = 0;
- break;
- }
-
- /* Timeout, return less than vlen datagrams */
- if (timeout->tv_nsec == 0 && timeout->tv_sec == 0)
- break;
- }
+ /* Timeout, return less than vlen datagrams */
+ if (sock_recvmmsg_timeout(timeout, end_time))
+ break;
/* Out of band data, return right away */
if (msg_sys.msg_flags & MSG_OOB)
--
1.8.3.1
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PATCH net-next 2/5] net/socket: add per protocol mmsg support
2016-11-25 15:39 [PATCH net-next 0/5] net: add protocol level recvmmsg support Paolo Abeni
2016-11-25 15:39 ` [PATCH net-next 1/5] net/socket: factor out msghdr manipulation helpers Paolo Abeni
@ 2016-11-25 15:39 ` Paolo Abeni
2016-11-25 15:39 ` [PATCH net-next 3/5] net/udp: factor out main skb processing routine Paolo Abeni
` (4 subsequent siblings)
6 siblings, 0 replies; 19+ messages in thread
From: Paolo Abeni @ 2016-11-25 15:39 UTC (permalink / raw)
To: netdev
Cc: David S. Miller, Eric Dumazet, Jesper Dangaard Brouer,
Hannes Frederic Sowa, Sabrina Dubroca
proto->recvmmsg allows leveraging per protocol optimization,
amortizing the protocol/locking overhead on multiple packets.
This commit introduces the protocol level callbacks and changes
the generic implementation to use them.
We explicitly pass down to the protocol level both 'timeout'
and 'end_time' to avoid duplicating there the call to
poll_select_set_timeout().
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
include/linux/net.h | 5 +++++
include/net/inet_common.h | 3 +++
include/net/sock.h | 6 ++++++
net/ipv4/af_inet.c | 16 ++++++++++++++++
net/ipv6/af_inet6.c | 1 +
net/socket.c | 26 +++++++++++++++++++++++++-
6 files changed, 56 insertions(+), 1 deletion(-)
diff --git a/include/linux/net.h b/include/linux/net.h
index cd0c8bd..95f1bc5 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -182,6 +182,11 @@ struct proto_ops {
*/
int (*recvmsg) (struct socket *sock, struct msghdr *m,
size_t total_len, int flags);
+ int (*recvmmsg) (struct socket *sock,
+ struct mmsghdr __user *user_mmsg,
+ unsigned int *vlen, unsigned int flags,
+ struct timespec *timeout,
+ const struct timespec64 *end_time);
int (*mmap) (struct file *file, struct socket *sock,
struct vm_area_struct * vma);
ssize_t (*sendpage) (struct socket *sock, struct page *page,
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index 5d68342..c7d5875 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -26,6 +26,9 @@ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
size_t size, int flags);
int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int flags);
+int inet_recvmmsg(struct socket *sock, struct mmsghdr __user *user_mmsg,
+ unsigned int *vlen, unsigned int flags,
+ struct timespec *timeout, const struct timespec64 *end_time);
int inet_shutdown(struct socket *sock, int how);
int inet_listen(struct socket *sock, int backlog);
void inet_sock_destruct(struct sock *sk);
diff --git a/include/net/sock.h b/include/net/sock.h
index c92dc19..11126f4 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1010,6 +1010,12 @@ struct proto {
int (*recvmsg)(struct sock *sk, struct msghdr *msg,
size_t len, int noblock, int flags,
int *addr_len);
+ int (*recvmmsg)(struct sock *sk,
+ struct mmsghdr __user *user_mmsg,
+ unsigned int *nr,
+ unsigned int flags,
+ struct timespec *timeout,
+ const struct timespec64 *end_time);
int (*sendpage)(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
int (*bind)(struct sock *sk,
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 5ddf5cd..747558d 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -770,6 +770,21 @@ int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
}
EXPORT_SYMBOL(inet_recvmsg);
+int inet_recvmmsg(struct socket *sock, struct mmsghdr __user *ummsg,
+ unsigned int *vlen, unsigned int flags,
+ struct timespec *timeout, const struct timespec64 *end_time)
+{
+ struct sock *sk = sock->sk;
+
+ if (!sk->sk_prot->recvmmsg)
+ return -EOPNOTSUPP;
+
+ sock_rps_record_flow(sk);
+
+ return sk->sk_prot->recvmmsg(sk, ummsg, vlen, flags, timeout, end_time);
+}
+EXPORT_SYMBOL(inet_recvmmsg);
+
int inet_shutdown(struct socket *sock, int how)
{
struct sock *sk = sock->sk;
@@ -942,6 +957,7 @@ static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned lon
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
+ .recvmmsg = inet_recvmmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
.set_peek_off = sk_set_peek_off,
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d424f3a..f7ce6a2 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -571,6 +571,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
.getsockopt = sock_common_getsockopt, /* ok */
.sendmsg = inet_sendmsg, /* ok */
.recvmsg = inet_recvmsg, /* ok */
+ .recvmmsg = inet_recvmmsg, /* ok */
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
.set_peek_off = sk_set_peek_off,
diff --git a/net/socket.c b/net/socket.c
index 9b5f360..49e6cd6 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2140,6 +2140,8 @@ static int copy_msghdr_to_user_gen(struct msghdr *msg_sys, int flags,
&msg->msg_controllen);
}
+#define MSG_CMSG_MASK (MSG_CMSG_CLOEXEC | MSG_CMSG_COMPAT)
+
static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
struct msghdr *msg_sys, unsigned int flags, int nosec)
{
@@ -2163,7 +2165,7 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
return err;
cmsg_ptr = (unsigned long)msg_sys->msg_control;
- msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
+ msg_sys->msg_flags = flags & MSG_CMSG_MASK;
/* We assume all kernel code knows the size of sockaddr_storage */
msg_sys->msg_namelen = 0;
@@ -2218,6 +2220,21 @@ long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned flags)
* Linux recvmmsg interface
*/
+static int __proto_recvmmsg(struct socket *sock, struct mmsghdr __user *ummsg,
+ unsigned int *vlen, unsigned int flags,
+ struct timespec *timeout,
+ const struct timespec64 *end_time)
+{
+ if (!sock->ops->recvmmsg)
+ return -EOPNOTSUPP;
+
+ if (sock->file->f_flags & O_NONBLOCK)
+ flags |= MSG_DONTWAIT;
+
+ /* defer LSM check and mmsg parsing to the proto operation */
+ return sock->ops->recvmmsg(sock, ummsg, vlen, flags, timeout, end_time);
+}
+
int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
unsigned int flags, struct timespec *timeout)
{
@@ -2243,6 +2260,12 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
if (err)
goto out_put;
+ err = __proto_recvmmsg(sock, mmsg, &vlen, flags, timeout, &end_time);
+ if (err != -EOPNOTSUPP) {
+ datagrams = vlen;
+ goto chk_error;
+ }
+
entry = mmsg;
compat_entry = (struct compat_mmsghdr __user *)mmsg;
@@ -2287,6 +2310,7 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
cond_resched();
}
+chk_error:
if (err == 0)
goto out_put;
--
1.8.3.1
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PATCH net-next 3/5] net/udp: factor out main skb processing routine
2016-11-25 15:39 [PATCH net-next 0/5] net: add protocol level recvmmsg support Paolo Abeni
2016-11-25 15:39 ` [PATCH net-next 1/5] net/socket: factor out msghdr manipulation helpers Paolo Abeni
2016-11-25 15:39 ` [PATCH net-next 2/5] net/socket: add per protocol mmsg support Paolo Abeni
@ 2016-11-25 15:39 ` Paolo Abeni
2016-11-25 15:39 ` [PATCH net-next 4/5] net/socket: add helpers for recvmmsg Paolo Abeni
` (3 subsequent siblings)
6 siblings, 0 replies; 19+ messages in thread
From: Paolo Abeni @ 2016-11-25 15:39 UTC (permalink / raw)
To: netdev
Cc: David S. Miller, Eric Dumazet, Jesper Dangaard Brouer,
Hannes Frederic Sowa, Sabrina Dubroca
We will reuse it later for the mmsg implementation.
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
net/ipv4/udp.c | 67 ++++++++++++++++++++++++++++++++---------------------
net/ipv6/udp.c | 73 ++++++++++++++++++++++++++++++++++++----------------------
2 files changed, 86 insertions(+), 54 deletions(-)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index b3b6bc5..d99429d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1350,34 +1350,18 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
}
EXPORT_SYMBOL(udp_ioctl);
-/*
- * This should be easy, if there is something there we
- * return it, otherwise we block.
- */
-
-int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
- int flags, int *addr_len)
+static int udp_process_skb(struct sock *sk, struct sk_buff *skb,
+ struct msghdr *msg, size_t len, int flags,
+ int *addr_len, int off, int peeking, int peeked)
{
- struct inet_sock *inet = inet_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
- struct sk_buff *skb;
- unsigned int ulen, copied;
- int peeked, peeking, off;
- int err;
+ struct inet_sock *inet = inet_sk(sk);
int is_udplite = IS_UDPLITE(sk);
bool checksum_valid = false;
+ int ulen = skb->len;
+ int copied = len;
+ int err;
- if (flags & MSG_ERRQUEUE)
- return ip_recv_error(sk, msg, len, addr_len);
-
-try_again:
- peeking = off = sk_peek_offset(sk, flags);
- skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
- if (!skb)
- return err;
-
- ulen = skb->len;
- copied = len;
if (copied > ulen - off)
copied = ulen - off;
else if (copied < ulen)
@@ -1446,10 +1430,41 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
}
kfree_skb(skb);
- /* starting over for a new packet, but check if we need to yield */
- cond_resched();
msg->msg_flags &= ~MSG_TRUNC;
- goto try_again;
+ return -EINVAL;
+}
+
+/*
+ * This should be easy, if there is something there we
+ * return it, otherwise we block.
+ */
+int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
+ int flags, int *addr_len)
+{
+ struct sk_buff *skb;
+ int peeked, peeking, off;
+ int err;
+
+ if (flags & MSG_ERRQUEUE)
+ return ip_recv_error(sk, msg, len, addr_len);
+
+try_again:
+ peeking = off = sk_peek_offset(sk, flags);
+ skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
+ if (!skb)
+ return err;
+
+ err = udp_process_skb(sk, skb, msg, len, flags, addr_len, off, peeking,
+ peeked);
+ if (err < 0) {
+ if (err != -EINVAL)
+ return err;
+
+ /* restarting for a new packet, but check if we need to yield */
+ cond_resched();
+ goto try_again;
+ }
+ return err;
}
int __udp_disconnect(struct sock *sk, int flags)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index ba25ec2..3218c64 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -318,38 +318,19 @@ struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be
EXPORT_SYMBOL_GPL(udp6_lib_lookup);
#endif
-/*
- * This should be easy, if there is something there we
- * return it, otherwise we block.
- */
-
-int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
- int noblock, int flags, int *addr_len)
+static int udp6_process_skb(struct sock *sk, struct sk_buff *skb,
+ struct msghdr *msg, size_t len, int flags,
+ int *addr_len, int off, int peeking, int peeked)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct inet_sock *inet = inet_sk(sk);
- struct sk_buff *skb;
- unsigned int ulen, copied;
- int peeked, peeking, off;
- int err;
int is_udplite = IS_UDPLITE(sk);
bool checksum_valid = false;
+ int ulen = skb->len;
+ int copied = len;
int is_udp4;
+ int err;
- if (flags & MSG_ERRQUEUE)
- return ipv6_recv_error(sk, msg, len, addr_len);
-
- if (np->rxpmtu && np->rxopt.bits.rxpmtu)
- return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
-
-try_again:
- peeking = off = sk_peek_offset(sk, flags);
- skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
- if (!skb)
- return err;
-
- ulen = skb->len;
- copied = len;
if (copied > ulen - off)
copied = ulen - off;
else if (copied < ulen)
@@ -456,10 +437,46 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
}
kfree_skb(skb);
- /* starting over for a new packet, but check if we need to yield */
- cond_resched();
msg->msg_flags &= ~MSG_TRUNC;
- goto try_again;
+ return -EINVAL;
+}
+
+/*
+ * This should be easy, if there is something there we
+ * return it, otherwise we block.
+ */
+int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int noblock, int flags, int *addr_len)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ int peeked, peeking, off;
+ struct sk_buff *skb;
+ int err;
+
+ if (flags & MSG_ERRQUEUE)
+ return ipv6_recv_error(sk, msg, len, addr_len);
+
+ if (np->rxpmtu && np->rxopt.bits.rxpmtu)
+ return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
+
+try_again:
+ peeking = off = sk_peek_offset(sk, flags);
+ skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
+ if (!skb)
+ return err;
+
+ err = udp6_process_skb(sk, skb, msg, len, flags, addr_len, off, peeking,
+ peeked);
+ if (err < 0) {
+ if (err != -EINVAL)
+ return err;
+
+ /* restarting for a new packet, but check if we need to yield */
+ cond_resched();
+ goto try_again;
+ }
+ return err;
+
}
void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
--
1.8.3.1
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PATCH net-next 4/5] net/socket: add helpers for recvmmsg
2016-11-25 15:39 [PATCH net-next 0/5] net: add protocol level recvmmsg support Paolo Abeni
` (2 preceding siblings ...)
2016-11-25 15:39 ` [PATCH net-next 3/5] net/udp: factor out main skb processing routine Paolo Abeni
@ 2016-11-25 15:39 ` Paolo Abeni
2016-11-25 20:52 ` kbuild test robot
` (2 more replies)
2016-11-25 15:39 ` [PATCH net-next 5/5] udp: add recvmmsg implementation Paolo Abeni
` (2 subsequent siblings)
6 siblings, 3 replies; 19+ messages in thread
From: Paolo Abeni @ 2016-11-25 15:39 UTC (permalink / raw)
To: netdev
Cc: David S. Miller, Eric Dumazet, Jesper Dangaard Brouer,
Hannes Frederic Sowa, Sabrina Dubroca
__skb_try_recv_datagram_batch dequeues multiple skb's from the
socket's receive queue, and runs the bulk_destructor callback under
the receive queue lock.
recvmmsg_ctx_from_user retrieves msghdr information from userspace,
and sets up the kernelspace context for processing one datagram.
recvmmsg_ctx_to_user copies to userspace the results of processing one
datagram.
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
include/linux/skbuff.h | 20 ++++++++++++++++
include/net/sock.h | 19 +++++++++++++++
net/core/datagram.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++
net/socket.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 164 insertions(+)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9c535fb..5672045 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1598,6 +1598,20 @@ static inline void __skb_insert(struct sk_buff *newsk,
list->qlen++;
}
+static inline void __skb_queue_unsplice(struct sk_buff *first,
+ struct sk_buff *last,
+ unsigned int n,
+ struct sk_buff_head *queue)
+{
+ struct sk_buff *next = last->next, *prev = first->prev;
+
+ queue->qlen -= n;
+ last->next = NULL;
+ first->prev = NULL;
+ next->prev = prev;
+ prev->next = next;
+}
+
static inline void __skb_queue_splice(const struct sk_buff_head *list,
struct sk_buff *prev,
struct sk_buff *next)
@@ -3032,6 +3046,12 @@ static inline void skb_frag_list_init(struct sk_buff *skb)
int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
const struct sk_buff *skb);
+struct sk_buff *__skb_try_recv_datagram_batch(struct sock *sk,
+ unsigned int flags,
+ unsigned int batch,
+ void (*bulk_destructor)(
+ struct sock *sk, int size),
+ int *err);
struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned flags,
void (*destructor)(struct sock *sk,
struct sk_buff *skb),
diff --git a/include/net/sock.h b/include/net/sock.h
index 11126f4..3daf63a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1534,6 +1534,25 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
struct sockcm_cookie *sockc);
+struct recvmmsg_ctx {
+ struct iovec iovstack[UIO_FASTIOV];
+ struct msghdr msg_sys;
+ struct sockaddr __user *uaddr;
+ struct sockaddr_storage addr;
+ unsigned long cmsg_ptr;
+ struct iovec *iov;
+};
+
+int recvmmsg_ctx_from_user(struct sock *sk, struct mmsghdr __user *mmsg,
+ unsigned int flags, int nosec,
+ struct recvmmsg_ctx *ctx);
+int recvmmsg_ctx_to_user(struct mmsghdr __user **mmsg, int len,
+ unsigned int flags, struct recvmmsg_ctx *ctx);
+static inline void recvmmsg_ctx_free(struct recvmmsg_ctx *ctx)
+{
+ kfree(ctx->iov);
+}
+
static inline bool sock_recvmmsg_timeout(struct timespec *timeout,
struct timespec64 end_time)
{
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 49816af..90d1aa2 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -301,6 +301,71 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
}
EXPORT_SYMBOL(skb_recv_datagram);
+/**
+ * __skb_try_recv_datagram_batch - Receive a batch of datagram skbuff
+ * @sk: socket
+ * @flags: MSG_ flags
+ * @batch: maximum batch length
+ * @bulk_destructor: invoked under the receive lock on successful dequeue
+ * @err: error code returned
+ *
+ * like __skb_try_recv_datagram, but dequeue a full batch up to the specified
+ * max length. Returned skbs are linked and the list is NULL terminated.
+ * Peeking is not supported.
+ */
+struct sk_buff *__skb_try_recv_datagram_batch(struct sock *sk,
+ unsigned int flags,
+ unsigned int batch,
+ void (*bulk_destructor)(
+ struct sock *sk, int size),
+ int *err)
+{
+ unsigned int datagrams = 0, totalsize = 0;
+ struct sk_buff *skb, *last, *first;
+ struct sk_buff_head *queue;
+
+ *err = sock_error(sk);
+ if (*err)
+ return NULL;
+
+ queue = &sk->sk_receive_queue;
+ spin_lock_bh(&queue->lock);
+ for (;;) {
+ if (!skb_queue_empty(queue))
+ break;
+
+ spin_unlock_bh(&queue->lock);
+
+ if (!sk_can_busy_loop(sk) ||
+ !sk_busy_loop(sk, flags & MSG_DONTWAIT))
+ goto no_packets;
+
+ spin_lock_bh(&queue->lock);
+ }
+
+ last = (struct sk_buff *)queue;
+ first = (struct sk_buff *)queue->next;
+ skb_queue_walk(queue, skb) {
+ last = skb;
+ totalsize += skb->truesize;
+ if (++datagrams == batch)
+ break;
+ }
+ __skb_queue_unsplice(first, last, datagrams, queue);
+
+ if (bulk_destructor)
+ bulk_destructor(sk, totalsize);
+ spin_unlock_bh(&queue->lock);
+ return first;
+
+no_packets:
+ *err = -EAGAIN;
+ return NULL;
+}
+EXPORT_SYMBOL(__skb_try_recv_datagram_batch);
+
void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
consume_skb(skb);
diff --git a/net/socket.c b/net/socket.c
index 49e6cd6..ceb627b 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2220,6 +2220,66 @@ long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned flags)
* Linux recvmmsg interface
*/
+int recvmmsg_ctx_from_user(struct sock *sk, struct mmsghdr __user *mmsg,
+ unsigned int flags, int nosec,
+ struct recvmmsg_ctx *ctx)
+{
+ struct user_msghdr __user *msg = (struct user_msghdr __user *)mmsg;
+ struct compat_msghdr __user *msg_compat;
+ ssize_t err;
+
+ ctx->iov = ctx->iovstack;
+ msg_compat = (struct compat_msghdr __user *)mmsg;
+ err = copy_msghdr_from_user_gen(&ctx->msg_sys, flags, msg_compat, msg,
+ &ctx->uaddr, &ctx->iov, &ctx->addr);
+ if (err < 0) {
+ ctx->iov = NULL;
+ return err;
+ }
+
+ ctx->cmsg_ptr = (unsigned long)ctx->msg_sys.msg_control;
+ ctx->msg_sys.msg_flags = flags & MSG_CMSG_MASK;
+
+ /* We assume all kernel code knows the size of sockaddr_storage */
+ ctx->msg_sys.msg_namelen = 0;
+
+ if (nosec)
+ return 0;
+
+ return security_socket_recvmsg(sk->sk_socket, &ctx->msg_sys,
+ msg_data_left(&ctx->msg_sys), flags);
+}
+
+int recvmmsg_ctx_to_user(struct mmsghdr __user **mmsg_ptr, int len,
+ unsigned int flags, struct recvmmsg_ctx *ctx)
+{
+ struct compat_mmsghdr __user *mmsg_compat;
+ struct mmsghdr __user *mmsg = *mmsg_ptr;
+ int err;
+
+ mmsg_compat = (struct compat_mmsghdr __user *)mmsg;
+ err = copy_msghdr_to_user_gen(&ctx->msg_sys, flags,
+ &mmsg_compat->msg_hdr, &mmsg->msg_hdr,
+ ctx->uaddr, &ctx->addr, ctx->cmsg_ptr);
+ if (err)
+ return err;
+
+ if (MSG_CMSG_COMPAT & flags) {
+ err = __put_user(len, &mmsg_compat->msg_len);
+ if (err < 0)
+ return err;
+
+ *mmsg_ptr = (struct mmsghdr __user *)(mmsg_compat + 1);
+ } else {
+ err = put_user(len, &mmsg->msg_len);
+ if (err < 0)
+ return err;
+
+ *mmsg_ptr = mmsg + 1;
+ }
+ return err;
+}
+
static int __proto_recvmmsg(struct socket *sock, struct mmsghdr __user *ummsg,
unsigned int *vlen, unsigned int flags,
struct timespec *timeout,
--
1.8.3.1
^ permalink raw reply related [flat|nested] 19+ messages in thread
* Re: [PATCH net-next 4/5] net/socket: add helpers for recvmmsg
2016-11-25 15:39 ` [PATCH net-next 4/5] net/socket: add helpers for recvmmsg Paolo Abeni
@ 2016-11-25 20:52 ` kbuild test robot
2016-11-25 20:52 ` kbuild test robot
2016-11-25 22:30 ` Eric Dumazet
2 siblings, 0 replies; 19+ messages in thread
From: kbuild test robot @ 2016-11-25 20:52 UTC (permalink / raw)
To: Paolo Abeni
Cc: kbuild-all, netdev, David S. Miller, Eric Dumazet,
Jesper Dangaard Brouer, Hannes Frederic Sowa, Sabrina Dubroca
[-- Attachment #1: Type: text/plain, Size: 1928 bytes --]
Hi Paolo,
[auto build test WARNING on net-next/master]
url: https://github.com/0day-ci/linux/commits/Paolo-Abeni/net-add-protocol-level-recvmmsg-support/20161126-033729
config: i386-randconfig-i1-201647 (attached as .config)
compiler: gcc-4.8 (Debian 4.8.4-1) 4.8.4
reproduce:
# save the attached .config to linux build tree
make ARCH=i386
All warnings (new ones prefixed by >>):
net/socket.c: In function 'recvmmsg_ctx_to_user':
>> net/socket.c:2263:11: warning: passing argument 3 of 'copy_msghdr_to_user_gen' from incompatible pointer type [enabled by default]
ctx->uaddr, &ctx->addr, ctx->cmsg_ptr);
^
net/socket.c:2115:12: note: expected 'struct msghdr *' but argument is of type 'struct user_msghdr *'
static int copy_msghdr_to_user_gen(struct msghdr *msg_sys, int flags,
^
vim +/copy_msghdr_to_user_gen +2263 net/socket.c
2247 return 0;
2248
2249 return security_socket_recvmsg(sk->sk_socket, &ctx->msg_sys,
2250 msg_data_left(&ctx->msg_sys), flags);
2251 }
2252
2253 int recvmmsg_ctx_to_user(struct mmsghdr __user **mmsg_ptr, int len,
2254 unsigned int flags, struct recvmmsg_ctx *ctx)
2255 {
2256 struct compat_mmsghdr __user *mmsg_compat;
2257 struct mmsghdr __user *mmsg = *mmsg_ptr;
2258 int err;
2259
2260 mmsg_compat = (struct compat_mmsghdr __user *)mmsg;
2261 err = copy_msghdr_to_user_gen(&ctx->msg_sys, flags,
2262 &mmsg_compat->msg_hdr, &mmsg->msg_hdr,
> 2263 ctx->uaddr, &ctx->addr, ctx->cmsg_ptr);
2264 if (err)
2265 return err;
2266
2267 if (MSG_CMSG_COMPAT & flags) {
2268 err = __put_user(len, &mmsg_compat->msg_len);
2269 if (err < 0)
2270 return err;
2271
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 24598 bytes --]
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH net-next 4/5] net/socket: add helpers for recvmmsg
2016-11-25 15:39 ` [PATCH net-next 4/5] net/socket: add helpers for recvmmsg Paolo Abeni
2016-11-25 20:52 ` kbuild test robot
@ 2016-11-25 20:52 ` kbuild test robot
2016-11-25 22:30 ` Eric Dumazet
2 siblings, 0 replies; 19+ messages in thread
From: kbuild test robot @ 2016-11-25 20:52 UTC (permalink / raw)
To: Paolo Abeni
Cc: kbuild-all, netdev, David S. Miller, Eric Dumazet,
Jesper Dangaard Brouer, Hannes Frederic Sowa, Sabrina Dubroca
[-- Attachment #1: Type: text/plain, Size: 1549 bytes --]
Hi Paolo,
[auto build test ERROR on net-next/master]
url: https://github.com/0day-ci/linux/commits/Paolo-Abeni/net-add-protocol-level-recvmmsg-support/20161126-033729
config: i386-randconfig-s0-201647 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
# save the attached .config to linux build tree
make ARCH=i386
All errors (new ones prefixed by >>):
net/socket.c: In function 'recvmmsg_ctx_to_user':
>> net/socket.c:2262:11: error: passing argument 3 of 'copy_msghdr_to_user_gen' from incompatible pointer type [-Werror=incompatible-pointer-types]
&mmsg_compat->msg_hdr, &mmsg->msg_hdr,
^
net/socket.c:2115:12: note: expected 'struct msghdr *' but argument is of type 'struct user_msghdr *'
static int copy_msghdr_to_user_gen(struct msghdr *msg_sys, int flags,
^~~~~~~~~~~~~~~~~~~~~~~
cc1: some warnings being treated as errors
vim +/copy_msghdr_to_user_gen +2262 net/socket.c
2256 struct compat_mmsghdr __user *mmsg_compat;
2257 struct mmsghdr __user *mmsg = *mmsg_ptr;
2258 int err;
2259
2260 mmsg_compat = (struct compat_mmsghdr __user *)mmsg;
2261 err = copy_msghdr_to_user_gen(&ctx->msg_sys, flags,
> 2262 &mmsg_compat->msg_hdr, &mmsg->msg_hdr,
2263 ctx->uaddr, &ctx->addr, ctx->cmsg_ptr);
2264 if (err)
2265 return err;
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 22005 bytes --]
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH net-next 4/5] net/socket: add helpers for recvmmsg
2016-11-25 15:39 ` [PATCH net-next 4/5] net/socket: add helpers for recvmmsg Paolo Abeni
2016-11-25 20:52 ` kbuild test robot
2016-11-25 20:52 ` kbuild test robot
@ 2016-11-25 22:30 ` Eric Dumazet
2016-11-27 16:21 ` Paolo Abeni
2 siblings, 1 reply; 19+ messages in thread
From: Eric Dumazet @ 2016-11-25 22:30 UTC (permalink / raw)
To: Paolo Abeni
Cc: netdev, David S. Miller, Eric Dumazet, Jesper Dangaard Brouer,
Hannes Frederic Sowa, Sabrina Dubroca
On Fri, 2016-11-25 at 16:39 +0100, Paolo Abeni wrote:
> _skb_try_recv_datagram_batch dequeues multiple skb's from the
> socket's receive queue, and runs the bulk_destructor callback under
> the receive queue lock.
...
> + last = (struct sk_buff *)queue;
> + first = (struct sk_buff *)queue->next;
> + skb_queue_walk(queue, skb) {
> + last = skb;
> + totalsize += skb->truesize;
> + if (++datagrams == batch)
> + break;
> + }
This is absolutely not good.
Walking through a list, bringing 2 cache lines per skb, is not the
proper way to deal with bulking.
And I do not see where 'batch' value coming from user space is capped ?
Is it really vlen argument coming from recvmmsg() system call ???
This code runs with BH masked, so you do not want to give user a way to
make you loop there 1000 times
Bulking is nice, only if you do not compromise with system stability and
latency requirements from other users/applications.
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH net-next 4/5] net/socket: add helpers for recvmmsg
2016-11-25 22:30 ` Eric Dumazet
@ 2016-11-27 16:21 ` Paolo Abeni
0 siblings, 0 replies; 19+ messages in thread
From: Paolo Abeni @ 2016-11-27 16:21 UTC (permalink / raw)
To: Eric Dumazet
Cc: netdev, David S. Miller, Eric Dumazet, Jesper Dangaard Brouer,
Hannes Frederic Sowa, Sabrina Dubroca
Hi Eric,
On Fri, 2016-11-25 at 14:30 -0800, Eric Dumazet wrote:
> On Fri, 2016-11-25 at 16:39 +0100, Paolo Abeni wrote:
> > _skb_try_recv_datagram_batch dequeues multiple skb's from the
> > socket's receive queue, and runs the bulk_destructor callback under
> > the receive queue lock.
>
> ...
>
> > + last = (struct sk_buff *)queue;
> > + first = (struct sk_buff *)queue->next;
> > + skb_queue_walk(queue, skb) {
> > + last = skb;
> > + totalsize += skb->truesize;
> > + if (++datagrams == batch)
> > + break;
> > + }
>
> This is absolutely not good.
>
> Walking through a list, bringing 2 cache lines per skb, is not the
> proper way to deal with bulking.
>
> And I do not see where 'batch' value coming from user space is capped ?
>
> Is it really vlen argument coming from recvmmsg() system call ???
>
> This code runs with BH masked, so you do not want to give user a way to
> make you loop there 1000 times
>
> Bulking is nice, only if you do not compromise with system stability and
> latency requirements from other users/applications.
Thank you for reviewing this.
You are right, the cacheline miss while accessing skb->truesize has
measurable performance impact, and the max burst length comes in from
recvmmsg().
We can easily cap the burst to some max value (e.g. 64 or less) and we
can pass to the bulk destructor the skb list and burst length without
accessing skb truesize beforehand. If the burst is short, say 8 skbs or
less, the bulk destructor walks the list again and releases the memory;
otherwise it defers the release until after __skb_try_recv_datagram_batch()
completion: we walk the list without the lock held and we acquire it
later again to release all the memory.
Thank you again,
Paolo
^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH net-next 5/5] udp: add recvmmsg implementation
2016-11-25 15:39 [PATCH net-next 0/5] net: add protocol level recvmmsg support Paolo Abeni
` (3 preceding siblings ...)
2016-11-25 15:39 ` [PATCH net-next 4/5] net/socket: add helpers for recvmmsg Paolo Abeni
@ 2016-11-25 15:39 ` Paolo Abeni
2016-11-25 17:09 ` Hannes Frederic Sowa
2016-11-25 17:37 ` [PATCH net-next 0/5] net: add protocol level recvmmsg support Jesper Dangaard Brouer
2016-11-25 21:16 ` Eric Dumazet
6 siblings, 1 reply; 19+ messages in thread
From: Paolo Abeni @ 2016-11-25 15:39 UTC (permalink / raw)
To: netdev
Cc: David S. Miller, Eric Dumazet, Jesper Dangaard Brouer,
Hannes Frederic Sowa, Sabrina Dubroca
skbs are extracted from the receive queue in bursts, and a single
sk_rmem_alloc/forward allocated memory update is performed for
each burst.
MSG_PEEK and MSG_ERRQUEUE are not supported to keep the implementation
as simple as possible.
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
include/net/udp.h | 7 +++
net/ipv4/udp.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++++++
net/ipv4/udp_impl.h | 3 ++
net/ipv4/udplite.c | 1 +
net/ipv6/udp.c | 16 +++++++
net/ipv6/udp_impl.h | 3 ++
net/ipv6/udplite.c | 1 +
7 files changed, 152 insertions(+)
diff --git a/include/net/udp.h b/include/net/udp.h
index 1661791..2bd63c9 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -308,6 +308,13 @@ struct sock *__udp6_lib_lookup(struct net *net,
struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
__be16 sport, __be16 dport);
+int __udp_recvmmsg(struct sock *sk, struct mmsghdr __user *ummsg,
+ unsigned int *vlen, unsigned int flags,
+ struct timespec *timeout, const struct timespec64 *end_time,
+ int (*udp_process_msg)(struct sock *sk, struct sk_buff *skb,
+ struct msghdr *msg,
+ unsigned int flags));
+
/*
* SNMP statistics for UDP and UDP-Lite
*/
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index d99429d..44f1326 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1467,6 +1467,126 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
return err;
}
+static void udp_skb_bulk_destructor(struct sock *sk, int totalsize)
+{
+ udp_rmem_release(sk, totalsize, 1);
+}
+
+int __udp_recvmmsg(struct sock *sk, struct mmsghdr __user *mmsg,
+ unsigned int *nr, unsigned int flags,
+ struct timespec *timeout, const struct timespec64 *end_time,
+ int (*process_msg)(struct sock *sk, struct sk_buff *skb,
+ struct msghdr *msg,
+ unsigned int flags))
+{
+ long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+ int datagrams = 0, err = 0, ret = 0, vlen = *nr;
+ struct sk_buff *skb, *next, *last;
+
+ if (flags & (MSG_PEEK | MSG_ERRQUEUE))
+ return -EOPNOTSUPP;
+
+again:
+ for (;;) {
+ skb = __skb_try_recv_datagram_batch(sk, flags, vlen - datagrams,
+ udp_skb_bulk_destructor,
+ &err);
+ if (skb)
+ break;
+
+ if ((err != -EAGAIN) || !timeo || (flags & MSG_DONTWAIT))
+ goto out;
+
+ /* no packets, and we are supposed to wait for the next */
+ if (timeout) {
+ long expires;
+
+ if (sock_recvmmsg_timeout(timeout, *end_time))
+ goto out;
+ expires = timeout->tv_sec * HZ +
+ (timeout->tv_nsec >> 20);
+ if (expires + 1 < timeo)
+ timeo = expires + 1;
+ }
+
+ /* the queue was empty when tried to dequeue */
+ last = (struct sk_buff *)&sk->sk_receive_queue;
+ if (__skb_wait_for_more_packets(sk, &err, &timeo, last))
+ goto out;
+ }
+
+ for (; skb; skb = next) {
+ struct recvmmsg_ctx ctx;
+ int len;
+
+ next = skb->next;
+ err = recvmmsg_ctx_from_user(sk, mmsg, flags, datagrams, &ctx);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto free_ctx;
+ }
+
+ /* process skb's until we find a valid one */
+ for (;;) {
+ len = process_msg(sk, skb, &ctx.msg_sys, flags);
+ if (len >= 0)
+ break;
+
+ /* only non csum errors are propagated to the caller */
+ if (len != -EINVAL) {
+ err = len;
+ goto free_ctx;
+ }
+
+ if (!next)
+ goto free_ctx;
+ skb = next;
+ next = skb->next;
+ }
+
+ err = recvmmsg_ctx_to_user(&mmsg, len, flags, &ctx);
+ if (err < 0)
+ goto free_ctx;
+
+ /* now we're sure the skb is fully processed, we can count it */
+ datagrams++;
+
+free_ctx:
+ recvmmsg_ctx_free(&ctx);
+ if (err < 0)
+ ret = err;
+ }
+
+ /* only handle waitforone after processing a full batch. */
+ if (datagrams && (flags & MSG_WAITFORONE))
+ flags |= MSG_DONTWAIT;
+
+ if (!ret && (datagrams < vlen)) {
+ cond_resched();
+ goto again;
+ }
+
+out:
+ *nr = datagrams;
+ return ret < 0 ? ret : -EAGAIN;
+}
+EXPORT_SYMBOL_GPL(__udp_recvmmsg);
+
+static int udp_process_msg(struct sock *sk, struct sk_buff *skb,
+ struct msghdr *msg, unsigned int flags)
+{
+ return udp_process_skb(sk, skb, msg, msg_data_left(msg), flags,
+ &msg->msg_namelen, 0, 0, skb->peeked);
+}
+
+int udp_recvmmsg(struct sock *sk, struct mmsghdr __user *ummsg,
+ unsigned int *nr, unsigned int flags, struct timespec *timeout,
+ const struct timespec64 *end_time)
+{
+ return __udp_recvmmsg(sk, ummsg, nr, flags, timeout, end_time,
+ udp_process_msg);
+}
+
int __udp_disconnect(struct sock *sk, int flags)
{
struct inet_sock *inet = inet_sk(sk);
@@ -2329,6 +2449,7 @@ struct proto udp_prot = {
.getsockopt = udp_getsockopt,
.sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg,
+ .recvmmsg = udp_recvmmsg,
.sendpage = udp_sendpage,
.release_cb = ip4_datagram_release_cb,
.hash = udp_lib_hash,
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index 7e0fe4b..f11d608 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -23,6 +23,9 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname,
#endif
int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
int flags, int *addr_len);
+int udp_recvmmsg(struct sock *sk, struct mmsghdr __user *ummsg,
+ unsigned int *nr, unsigned int flags, struct timespec *timeout,
+ const struct timespec64 *end_time);
int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
int flags);
int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 59f10fe..a0e7fe9 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -49,6 +49,7 @@ struct proto udplite_prot = {
.getsockopt = udp_getsockopt,
.sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg,
+ .recvmmsg = udp_recvmmsg,
.sendpage = udp_sendpage,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 3218c64..2c034be 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -479,6 +479,21 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
}
+static int udp6_process_msg(struct sock *sk, struct sk_buff *skb,
+ struct msghdr *msg, unsigned int flags)
+{
+ return udp6_process_skb(sk, skb, msg, msg_data_left(msg), flags,
+ &msg->msg_namelen, 0, 0, skb->peeked);
+}
+
+int udpv6_recvmmsg(struct sock *sk, struct mmsghdr __user *ummsg,
+ unsigned int *nr, unsigned int flags,
+ struct timespec *timeout, const struct timespec64 *end_time)
+{
+ return __udp_recvmmsg(sk, ummsg, nr, flags, timeout, end_time,
+ udp6_process_msg);
+}
+
void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info,
struct udp_table *udptable)
@@ -1443,6 +1458,7 @@ struct proto udpv6_prot = {
.getsockopt = udpv6_getsockopt,
.sendmsg = udpv6_sendmsg,
.recvmsg = udpv6_recvmsg,
+ .recvmmsg = udpv6_recvmmsg,
.release_cb = ip6_datagram_release_cb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
index f6eb1ab..fe566db 100644
--- a/net/ipv6/udp_impl.h
+++ b/net/ipv6/udp_impl.h
@@ -26,6 +26,9 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
int flags, int *addr_len);
+int udpv6_recvmmsg(struct sock *sk, struct mmsghdr __user *ummsg,
+ unsigned int *nr, unsigned int flags,
+ struct timespec *timeout, const struct timespec64 *end_time);
int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
void udpv6_destroy_sock(struct sock *sk);
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 2784cc3..23d80ac 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -45,6 +45,7 @@ struct proto udplitev6_prot = {
.getsockopt = udpv6_getsockopt,
.sendmsg = udpv6_sendmsg,
.recvmsg = udpv6_recvmsg,
+ .recvmmsg = udpv6_recvmmsg,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
.get_port = udp_v6_get_port,
--
1.8.3.1
^ permalink raw reply related [flat|nested] 19+ messages in thread
* Re: [PATCH net-next 5/5] udp: add recvmmsg implementation
2016-11-25 15:39 ` [PATCH net-next 5/5] udp: add recvmmsg implementation Paolo Abeni
@ 2016-11-25 17:09 ` Hannes Frederic Sowa
2016-11-28 12:32 ` David Laight
2016-11-30 0:22 ` David Miller
0 siblings, 2 replies; 19+ messages in thread
From: Hannes Frederic Sowa @ 2016-11-25 17:09 UTC (permalink / raw)
To: Paolo Abeni, netdev
Cc: David S. Miller, Eric Dumazet, Jesper Dangaard Brouer,
Sabrina Dubroca
On 25.11.2016 16:39, Paolo Abeni wrote:
> skbs are extracted from the receive queue in bursts, and a single
> sk_rmem_alloc/forward allocated memory update is performed for
> each burst.
> MSG_PEEK and MSG_ERRQUEUE are not supported to keep the implementation
> as simple as possible.
>
> Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
During review we discussed on how to handle major errors in the kernel:
The old code and the new code still can report back success even though
the kernel got back an EFAULT while copying from kernel space to user
space (due to bad pointers).
I favor that we drop all packets (also the already received batches) in
this case and let the code report -EFAULT and increase sk_drops for all
dropped packets from the queue.
Currently sk_err is set so the next syscall would get an -EFAULT, which
seems very bad and can also be overwritten by incoming icmp packets, so
we never get a notification that we actually had a bad pointer somewhere
in the mmsghdr. Also delivering -EFAULT on the follow-up syscalls really
will make people confused that use strace.
If people would like to know the amount of packets dropped we can make
sk_drops readable by an getsockopt.
Thoughts?
Unfortunately the interface doesn't allow for better error handling.
^ permalink raw reply [flat|nested] 19+ messages in thread
* RE: [PATCH net-next 5/5] udp: add recvmmsg implementation
2016-11-25 17:09 ` Hannes Frederic Sowa
@ 2016-11-28 12:32 ` David Laight
2016-11-30 0:22 ` David Miller
1 sibling, 0 replies; 19+ messages in thread
From: David Laight @ 2016-11-28 12:32 UTC (permalink / raw)
To: 'Hannes Frederic Sowa', Paolo Abeni,
netdev@vger.kernel.org
Cc: David S. Miller, Eric Dumazet, Jesper Dangaard Brouer,
Sabrina Dubroca
From: Hannes Frederic Sowa
> Sent: 25 November 2016 17:09
...
> Currently sk_err is set so the next syscall would get an -EFAULT, which
> seems very bad and can also be overwritten by incoming icmp packets, so
> we never get a notification that we actually had a bad pointer somewhere
> in the mmsghdr. Also delivering -EFAULT on the follow-up syscalls really
> will make people confused that use strace.
Saving an error code like that seems completely wrong to me.
It is not unreasonable for there to be multiple system calls active
on a single socket at the same time - so any error has to be returned to
the system call that generated it.
(Current locking rules might impose restrictions, but they could change.)
A completely sticky error code might be useful if the only valid action
is close().
If copytouser() fails I'd guess that most system calls just return EFAULT
and discard any data that might have been copied to the start of the users
buffer.
Not unreasonable since it is likely to be a coding error.
David
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH net-next 5/5] udp: add recvmmsg implementation
2016-11-25 17:09 ` Hannes Frederic Sowa
2016-11-28 12:32 ` David Laight
@ 2016-11-30 0:22 ` David Miller
2016-11-30 3:47 ` Hannes Frederic Sowa
1 sibling, 1 reply; 19+ messages in thread
From: David Miller @ 2016-11-30 0:22 UTC (permalink / raw)
To: hannes; +Cc: pabeni, netdev, edumazet, brouer, sd
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Fri, 25 Nov 2016 18:09:00 +0100
> During review we discussed on how to handle major errors in the kernel:
>
> The old code and the new code still can report back success even though
> the kernel got back an EFAULT while copying from kernel space to user
> space (due to bad pointers).
>
> I favor that we drop all packets (also the already received batches) in
> this case and let the code report -EFAULT and increase sk_drops for all
> dropped packets from the queue.
>
> Currently sk_err is set so the next syscall would get an -EFAULT, which
> seems very bad and can also be overwritten by incoming icmp packets, so
> we never get a notification that we actually had a bad pointer somewhere
> in the mmsghdr. Also delivering -EFAULT on the follow-up syscalls really
> will make people confused that use strace.
>
> If people would like to know the amount of packets dropped we can make
> sk_drops readable by an getsockopt.
>
> Thoughts?
>
> Unfortunately the interface doesn't allow for better error handling.
I think this is a major problem.
If, as a side effect of batch dequeueing the SKBs from the socket,
you cannot stop properly mid-transfer if an error occurs, well then
you simply cannot batch like that.
You have to stop the exact byte where an error occurs mid-stream,
return the successful amount of bytes transferred, and then return
the error on the next recvmmsg call.
There is no other sane error reporting strategy.
If I get 4 frames, and the kernel can successfully copy the first
three and get an -EFAULT on the 4th. Dammit you better tell the
application this so it can properly process the first 3 packets and
then determine how it is going to error out and recover for the 4th
one.
If we need to add prioritized sk_err stuff, or another value like
"sk_app_err" to handle the ICMP vs. -EFAULT issue, so be it.
I know what you guys are thinking, in that you can't figure out a
way to avoid the transactional overhead if it is necessary to
"put back" some SKBs if one of them in the batch gets a fault.
That's too bad, we need a proper implementation and proper error
reporting. Those performance numbers are useless if we effectively
lose error notifications.
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH net-next 5/5] udp: add recvmmsg implementation
2016-11-30 0:22 ` David Miller
@ 2016-11-30 3:47 ` Hannes Frederic Sowa
0 siblings, 0 replies; 19+ messages in thread
From: Hannes Frederic Sowa @ 2016-11-30 3:47 UTC (permalink / raw)
To: David Miller; +Cc: pabeni, netdev, edumazet, brouer, sd
Hello,
On Wed, Nov 30, 2016, at 01:22, David Miller wrote:
> From: Hannes Frederic Sowa <hannes@stressinduktion.org>
> Date: Fri, 25 Nov 2016 18:09:00 +0100
>
> > During review we discussed on how to handle major errors in the kernel:
> >
> > The old code and the new code still can report back success even though
> > the kernel got back an EFAULT while copying from kernel space to user
> > space (due to bad pointers).
> >
> > I favor that we drop all packets (also the already received batches) in
> > this case and let the code report -EFAULT and increase sk_drops for all
> > dropped packets from the queue.
> >
> > Currently sk_err is set so the next syscall would get an -EFAULT, which
> > seems very bad and can also be overwritten by incoming icmp packets, so
> > we never get a notification that we actually had a bad pointer somewhere
> > in the mmsghdr. Also delivering -EFAULT on the follow-up syscalls really
> > will make people confused that use strace.
> >
> > If people would like to know the amount of packets dropped we can make
> > sk_drops readable by an getsockopt.
> >
> > Thoughts?
> >
> > Unfortunately the interface doesn't allow for better error handling.
>
> I think this is a major problem.
>
> If, as a side effect of batch dequeueing the SKBs from the socket,
> you cannot stop properly mid-transfer if an error occurs, well then
> you simply cannot batch like that.
>
> You have to stop the exact byte where an error occurs mid-stream,
> return the successful amount of bytes transferred, and then return
> the error on the next recvmmsg call.
>
> There is no other sane error reporting strategy.
Actually I think there is no sane error handling strategy at all.
SIGSEGV and EFAULT should be delivered reliable in my opinion and all
the details become very difficult suddenly.
E.g. if we recvmmsg with -EFAULT and we try to deliver the fault on the
following socket call, I am pretty certain most programs don't bother
with close() return values, so the application might simply ignore it.
Also -EFAULT is not in our repertoire of error codes to return.
In case of UDP we can simply drop the packets and I would be okay with
that (in some cases we actually guarantee correctly ordered packets,
even for UDP, so we can't simply queue those packets back).
Also I would very much prefer ptrace/gdb to show me the syscall where
the memory management fault happened and not the next one.
> If I get 4 frames, and the kernel can successfully copy the first
> three and get an -EFAULT on the 4th. Dammit you better tell the
> application this so it can properly process the first 3 packets and
> then determine how it is going to error out and recover for the 4th
> one.
>
> If we need to add prioritized sk_err stuff, or another value like
> "sk_app_err" to handle the ICMP vs. -EFAULT issue, so be it.
>
> I know what you guys are thinking, in that you can't figure out a
> way to avoid the transactional overhead if it is necessary to
> "put back" some SKBs if one of them in the batch gets a fault.
I prefer correctness over performance all the time. :)
> That's too bad, we need a proper implementation and proper error
> reporting. Those performance numbers are useless if we effectively
> lose error notifications.
We have those problems right now and besides deprecating the syscalls I
have no idea how to fix this reliably and would probably need a lot of
changes (besides the sk_app_err solution, which I don't really favor at
all).
The syscall should have been designed in a way that the struct mmsghdr
-> msg_len would be ssize_t, so we could return error codes per fragment
and test before starting the batch that we have proper memory, so we
don't fail in the management code. :(
Bye,
Hannes
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH net-next 0/5] net: add protocol level recvmmsg support
2016-11-25 15:39 [PATCH net-next 0/5] net: add protocol level recvmmsg support Paolo Abeni
` (4 preceding siblings ...)
2016-11-25 15:39 ` [PATCH net-next 5/5] udp: add recvmmsg implementation Paolo Abeni
@ 2016-11-25 17:37 ` Jesper Dangaard Brouer
2016-11-28 10:52 ` Paolo Abeni
2016-11-25 21:16 ` Eric Dumazet
6 siblings, 1 reply; 19+ messages in thread
From: Jesper Dangaard Brouer @ 2016-11-25 17:37 UTC (permalink / raw)
To: Paolo Abeni
Cc: netdev, David S. Miller, Eric Dumazet, Hannes Frederic Sowa,
Sabrina Dubroca, brouer
On Fri, 25 Nov 2016 16:39:51 +0100
Paolo Abeni <pabeni@redhat.com> wrote:
> The goal of recvmmsg() is to amortize the syscall overhead on a possible
> long messages batch, but for most networking protocols, e.g. udp the
> syscall overhead is negligible compared to the protocol specific operations
> like dequeuing.
Sounds good. I'm excited to see work in this area! :-)
[...]
> The udp version of recvmmsg() tries to bulk-dequeue skbs from the receive queue,
> each burst acquires the lock once to extract as many skbs from the receive
> queue as possible, up to the number needed to reach the specified maximum.
> rmem_alloc and fwd memory are touched once per burst.
Sounds good.
> When the protocol-level recvmmsg() is not available or it does not support the
> specified flags, the code falls-back to the current generic implementation.
>
> This series introduces some behavior changes for the recvmmsg() syscall (only
> for udp):
> - the timeout argument now works as expected
> - recvmmsg() does not stop anymore when getting the first error, instead
> it keeps processing the current burst and then handle the error code as
> in the generic implementation.
>
> The measured performance delta is as follow:
>
> before after
> (Kpps) (Kpps)
>
> udp flood[1] 570 1800(+215%)
> max tput[2] 1850 3500(+89%)
> single queue[3] 1850 1630(-11%)
>
> [1] line rate flood using multiple 64 bytes packets and multiple flows
Is [1] sending multiple flows into a single UDP-sink?
> [2] like [1], but using the minimum number of flows to saturate the user space
> sink, that is 1 flow for the old kernel and 3 for the patched one.
> the tput increases since the contention on the rx lock is low.
> [3] like [1] but using a single flow with both old and new kernel. All the
> packets land on the same rx queue and there is a single ksoftirqd instance
> running
It is important to know, if ksoftirqd and the UDP-sink runs on the same CPU?
> The regression in the single queue scenario is actually due to the improved
> performance of the recvmmsg() syscall: the user space process is now
> significantly faster than the ksoftirqd process so that the latter needs often
> to wake up the user space process.
When measuring these things, make sure that we/you measure both the packets
actually received in the userspace UDP-sink, and also measure packets
RX processed by ksoftirq (and I often also look at what HW got delivered).
Some times, when userspace is too slow, the kernel can/will drop packets.
It is actually quite easily verified with cmdline:
nstat > /dev/null && sleep 1 && nstat
For HW measurements I use the tool ethtool_stats.pl:
https://github.com/netoptimizer/network-testing/blob/master/bin/ethtool_stats.pl
> Since ksoftirqd is the bottleneck in such a scenario, overall this causes a
> tput reduction. In a real use case, where the udp sink is performing some
> actual processing of the received data, such regression is unlikely to really
> have an effect.
My experience is that the performance of RX UDP is affected by:
* if socket is connected or not (yes, RX side also)
* state of /proc/sys/net/ipv4/ip_early_demux
You don't need to run with all the combinations, but it would be nice
if you specify what config your have based your measurements on (and
keep them stable in your runs).
I've actually implemented the "--connect" option to my udp_sink
program[1] today, but I've not pushed it yet, if you are interested.
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
Author of http://www.iptv-analyzer.org
LinkedIn: http://www.linkedin.com/in/brouer
[1]
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH net-next 0/5] net: add protocol level recvmmsg support
2016-11-25 17:37 ` [PATCH net-next 0/5] net: add protocol level recvmmsg support Jesper Dangaard Brouer
@ 2016-11-28 10:52 ` Paolo Abeni
2016-11-28 12:21 ` Jesper Dangaard Brouer
0 siblings, 1 reply; 19+ messages in thread
From: Paolo Abeni @ 2016-11-28 10:52 UTC (permalink / raw)
To: Jesper Dangaard Brouer
Cc: netdev, David S. Miller, Eric Dumazet, Hannes Frederic Sowa,
Sabrina Dubroca
Hi Jesper,
On Fri, 2016-11-25 at 18:37 +0100, Jesper Dangaard Brouer wrote:
> > The measured performance delta is as follow:
> >
> > before after
> > (Kpps) (Kpps)
> >
> > udp flood[1] 570 1800(+215%)
> > max tput[2] 1850 3500(+89%)
> > single queue[3] 1850 1630(-11%)
> >
> > [1] line rate flood using multiple 64 bytes packets and multiple flows
>
> Is [1] sending multiple flow in the a single UDP-sink?
Yes, in the test scenario [1] there are multiple UDP flows using 16
different rx queues on the receiver host, and a single user space
reader.
> > [2] like [1], but using the minimum number of flows to saturate the user space
> > sink, that is 1 flow for the old kernel and 3 for the patched one.
> > the tput increases since the contention on the rx lock is low.
> > [3] like [1] but using a single flow with both old and new kernel. All the
> > packets land on the same rx queue and there is a single ksoftirqd instance
> > running
>
> It is important to know, if ksoftirqd and the UDP-sink runs on the same CPU?
No pinning is enforced. The scheduler moves the user space process to a
different cpu with respect to the ksoftirqd kernel thread.
> > The regression in the single queue scenario is actually due to the improved
> > performance of the recvmmsg() syscall: the user space process is now
> > significantly faster than the ksoftirqd process so that the latter needs often
> > to wake up the user space process.
>
> When measuring these things, make sure that we/you measure both the packets
> actually received in the userspace UDP-sink, and also measure packets
> RX processed by ksoftirq (and I often also look at what HW got delivered).
> Some times, when userspace is too slow, the kernel can/will drop packets.
>
> It is actually quite easily verified with cmdline:
>
> nstat > /dev/null && sleep 1 && nstat
>
> For HW measurements I use the tool ethtool_stats.pl:
> https://github.com/netoptimizer/network-testing/blob/master/bin/ethtool_stats.pl
We collected the UDP stats for all the three scenarios; we have lot of
drop in test[1] and little, by design, in test[2]. In test [3], with the
patched kernel, the drops are 0: ksoftirqd is way slower than the user
space sink.
> > Since ksoftirqd is the bottleneck in such a scenario, overall this causes a
> > tput reduction. In a real use case, where the udp sink is performing some
> > actual processing of the received data, such regression is unlikely to really
> > have an effect.
>
> My experience is that the performance of RX UDP is affected by:
> * if socket is connected or not (yes, RX side also)
> * state of /proc/sys/net/ipv4/ip_early_demux
>
> You don't need to run with all the combinations, but it would be nice
> if you specify what config your have based your measurements on (and
> keep them stable in your runs).
>
> I've actually implemented the "--connect" option to my udp_sink
> program[1] today, but I've not pushed it yet, if you are interested.
The reported numbers are all gathered with unconnected sockets and early
demux enabled.
We also used connected socket for test[3], with relative little
difference (the tput increased for both unpatched and patched kernel,
and the difference was roughly the same).
Paolo
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH net-next 0/5] net: add protocol level recvmmsg support
2016-11-28 10:52 ` Paolo Abeni
@ 2016-11-28 12:21 ` Jesper Dangaard Brouer
2016-11-28 13:52 ` Jesper Dangaard Brouer
0 siblings, 1 reply; 19+ messages in thread
From: Jesper Dangaard Brouer @ 2016-11-28 12:21 UTC (permalink / raw)
To: Paolo Abeni
Cc: netdev, David S. Miller, Eric Dumazet, Hannes Frederic Sowa,
Sabrina Dubroca, brouer
On Mon, 28 Nov 2016 11:52:38 +0100
Paolo Abeni <pabeni@redhat.com> wrote:
> Hi Jesper,
>
> On Fri, 2016-11-25 at 18:37 +0100, Jesper Dangaard Brouer wrote:
> > > The measured performance delta is as follow:
> > >
> > > before after
> > > (Kpps) (Kpps)
> > >
> > > udp flood[1] 570 1800(+215%)
> > > max tput[2] 1850 3500(+89%)
> > > single queue[3] 1850 1630(-11%)
> > >
> > > [1] line rate flood using multiple 64 bytes packets and multiple flows
> >
> > Is [1] sending multiple flows into a single UDP-sink?
>
> Yes, in the test scenario [1] there are multiple UDP flows using 16
> different rx queues on the receiver host, and a single user space
> reader.
>
> > > [2] like [1], but using the minimum number of flows to saturate the user space
> > > sink, that is 1 flow for the old kernel and 3 for the patched one.
> > > the tput increases since the contention on the rx lock is low.
> > > [3] like [1] but using a single flow with both old and new kernel. All the
> > > packets land on the same rx queue and there is a single ksoftirqd instance
> > > running
> >
> > It is important to know, if ksoftirqd and the UDP-sink runs on the same CPU?
>
> No pinning is enforced. The scheduler moves the user space process to a
> different cpu with respect to the ksoftirqd kernel thread.
This floating userspace process can cause a high variation between test
runs. On my system, the performance drops to approx 600Kpps when
ksoftirqd and udp_sink share the same CPU.
Quick run with your patches applied:
Sender: pktgen with big packets
./pktgen_sample03_burst_single_flow.sh -i mlx5p2 -d 198.18.50.1 \
-m 7c:fe:90:c7:b1:cf -t1 -b128 -s 1472
Forced CPU0 for both ksoftirq and udp_sink
# taskset -c 0 ./udp_sink --count $((10**7)) --port 9 --repeat 1
ns pps cycles
recvMmsg/32 run: 0 10000000 1667.93 599547.16 6685
recvmsg run: 0 10000000 1810.70 552273.39 7257
read run: 0 10000000 1634.72 611723.95 6552
recvfrom run: 0 10000000 1585.06 630891.39 6353
> > > The regression in the single queue scenario is actually due to the improved
> > > performance of the recvmmsg() syscall: the user space process is now
> > > significantly faster than the ksoftirqd process so that the latter needs often
> > > to wake up the user space process.
> >
> > When measuring these things, make sure that we/you measure both the packets
> > actually received in the userspace UDP-sink, and also measure packets
> > RX processed by ksoftirq (and I often also look at what HW got delivered).
> > Some times, when userspace is too slow, the kernel can/will drop packets.
> >
> > It is actually quite easily verified with cmdline:
> >
> > nstat > /dev/null && sleep 1 && nstat
> >
> > For HW measurements I use the tool ethtool_stats.pl:
> > https://github.com/netoptimizer/network-testing/blob/master/bin/ethtool_stats.pl
>
> We collected the UDP stats for all the three scenarios; we have lot of
> drop in test[1] and little, by design, in test[2]. In test [3], with the
> patched kernel, the drops are 0: ksoftirqd is way slower than the user
> space sink.
>
> > > Since ksoftirqd is the bottleneck in such a scenario, overall this causes a
> > > tput reduction. In a real use case, where the udp sink is performing some
> > > actual processing of the received data, such regression is unlikely to really
> > > have an effect.
> >
> > My experience is that the performance of RX UDP is affected by:
> > * if socket is connected or not (yes, RX side also)
> > * state of /proc/sys/net/ipv4/ip_early_demux
> >
> > You don't need to run with all the combinations, but it would be nice
> > if you specify what config your have based your measurements on (and
> > keep them stable in your runs).
> >
> > I've actually implemented the "--connect" option to my udp_sink
> > program[1] today, but I've not pushed it yet, if you are interested.
>
> The reported numbers are all gathered with unconnected sockets and early
> demux enabled.
>
> We also used connected socket for test[3], with relative little
> difference (the tput increased for both unpatched and patched kernel,
> and the difference was roughly the same).
When I use connected sockets (RX side) and ip_early_demux enabled, I do
see a performance boost for recvmmsg. With these patches applied,
forced ksoftirqd on CPU0 and udp_sink on CPU2, pktgen single flow
sending size 1472 bytes.
$ sysctl net/ipv4/ip_early_demux
net.ipv4.ip_early_demux = 1
$ grep -H . /proc/sys/net/core/{r,w}mem_max
/proc/sys/net/core/rmem_max:1048576
/proc/sys/net/core/wmem_max:1048576
# taskset -c 2 ./udp_sink --count $((10**7)) --port 9 --repeat 1
# ns pps cycles
recvMmsg/32 run: 0 10000000 462.51 2162095.23 1853
recvmsg run: 0 10000000 536.47 1864041.75 2150
read run: 0 10000000 492.01 2032460.71 1972
recvfrom run: 0 10000000 553.94 1805262.84 2220
# taskset -c 2 ./udp_sink --count $((10**7)) --port 9 --repeat 1 --connect
# ns pps cycles
recvMmsg/32 run: 0 10000000 405.15 2468225.03 1623
recvmsg run: 0 10000000 548.23 1824049.58 2197
read run: 0 10000000 489.76 2041825.27 1962
recvfrom run: 0 10000000 466.18 2145091.77 1868
My theory is that by enabling connect'ed RX sockets, the ksoftirqd gets
faster (no fib_lookup) and is no longer a bottleneck. This is
confirmed by the nstat output below.
Below: unconnected
$ nstat > /dev/null && sleep 1 && nstat
#kernel
IpInReceives 2143944 0.0
IpInDelivers 2143945 0.0
UdpInDatagrams 2143944 0.0
IpExtInOctets 3125889306 0.0
IpExtInNoECTPkts 2143956 0.0
Below: connected
$ nstat > /dev/null && sleep 1 && nstat
#kernel
IpInReceives 2925155 0.0
IpInDelivers 2925156 0.0
UdpInDatagrams 2440925 0.0
UdpInErrors 484230 0.0
UdpRcvbufErrors 484230 0.0
IpExtInOctets 4264896402 0.0
IpExtInNoECTPkts 2925170 0.0
This is a 50Gbit/s link, and IpInReceives corresponds to approx 35Gbit/s.
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
Author of http://www.iptv-analyzer.org
LinkedIn: http://www.linkedin.com/in/brouer
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH net-next 0/5] net: add protocol level recvmmsg support
2016-11-28 12:21 ` Jesper Dangaard Brouer
@ 2016-11-28 13:52 ` Jesper Dangaard Brouer
0 siblings, 0 replies; 19+ messages in thread
From: Jesper Dangaard Brouer @ 2016-11-28 13:52 UTC (permalink / raw)
To: Paolo Abeni
Cc: netdev, David S. Miller, Eric Dumazet, Hannes Frederic Sowa,
Sabrina Dubroca, brouer
On Mon, 28 Nov 2016 13:21:41 +0100 Jesper Dangaard Brouer <brouer@redhat.com> wrote:
> On Mon, 28 Nov 2016 11:52:38 +0100 Paolo Abeni <pabeni@redhat.com> wrote:
> >
> > > > [2] like [1], but using the minimum number of flows to saturate the user space
> > > > sink, that is 1 flow for the old kernel and 3 for the patched one.
> > > > the tput increases since the contention on the rx lock is low.
> > > > [3] like [1] but using a single flow with both old and new kernel. All the
> > > > packets land on the same rx queue and there is a single ksoftirqd instance
> > > > running
[...]
> >
> > We also used connected socket for test[3], with relative little
> > difference (the tput increased for both unpatched and patched kernel,
> > and the difference was roughly the same).
>
> When I use connected sockets (RX side) and ip_early_demux enabled, I do
> see a performance boost for recvmmsg. With these patches applied,
> forced ksoftirqd on CPU0 and udp_sink on CPU2, pktgen single flow
> sending size 1472 bytes.
>
> $ sysctl net/ipv4/ip_early_demux
> net.ipv4.ip_early_demux = 1
>
> $ grep -H . /proc/sys/net/core/{r,w}mem_max
> /proc/sys/net/core/rmem_max:1048576
> /proc/sys/net/core/wmem_max:1048576
>
> # taskset -c 2 ./udp_sink --count $((10**7)) --port 9 --repeat 1
> # ns pps cycles
> recvMmsg/32 run: 0 10000000 462.51 2162095.23 1853
> recvmsg run: 0 10000000 536.47 1864041.75 2150
> read run: 0 10000000 492.01 2032460.71 1972
> recvfrom run: 0 10000000 553.94 1805262.84 2220
>
> # taskset -c 2 ./udp_sink --count $((10**7)) --port 9 --repeat 1 --connect
> # ns pps cycles
> recvMmsg/32 run: 0 10000000 405.15 2468225.03 1623
> recvmsg run: 0 10000000 548.23 1824049.58 2197
> read run: 0 10000000 489.76 2041825.27 1962
> recvfrom run: 0 10000000 466.18 2145091.77 1868
>
> My theory is that by enabling connect'ed RX socket, the ksoftirqd gets
> faster (no fib_lookup) and is no-longer a bottleneck. This is
> confirmed by nstat.
Paolo asked me to do a test with small packets with pktgen, and I was
actually surprised by the result.
# taskset -c 2 ./udp_sink --count $((10**7)) --port 9 --repeat 1 --connect
recvMmsg/32 run: 0 10000000 426.61 2344076.59 1709 17098657328
recvmsg run: 0 10000000 533.49 1874449.82 2138 21382574965
read run: 0 10000000 470.22 2126651.13 1884 18846797802
recvfrom run: 0 10000000 513.74 1946499.83 2059 20591095477
Notice how recvMmsg/32, got slower with 124kpps (2468225 pps -> 2344076 pps).
I was expecting it to get faster, given we just established udp_sink
was the bottleneck, and smaller packet should mean less copy of bytes
to userspace (copy_user_enhanced_fast_string). (With nstat I observe
ksoftirq is again the bottleneck).
Looking at perf diff of CPU2 (baseline=64Bytes) we do see an increase
of copy_user_enhanced_fast_string. More interestingly we see a
decrease in the locking cost when using big packets (see ** below)
# Event 'cycles:ppp'
#
# Baseline Delta Shared Object Symbol
# ........ ....... ................ .........................................
#
15.09% +0.33% [kernel.vmlinux] [k] copy_msghdr_from_user
12.36% +21.89% [kernel.vmlinux] [k] copy_user_enhanced_fast_string
8.65% -0.63% [kernel.vmlinux] [k] udp_process_skb
7.33% -1.88% [kernel.vmlinux] [k] __skb_try_recv_datagram_batch
** 7.12% -6.66% [kernel.vmlinux] [k] udp_rmem_release **
** 6.71% -6.52% [kernel.vmlinux] [k] _raw_spin_lock_bh **
6.35% +1.36% [kernel.vmlinux] [k] __free_page_frag
4.39% +0.29% [kernel.vmlinux] [k] copy_msghdr_to_user_gen
2.87% -1.52% [kernel.vmlinux] [k] skb_release_data
2.60% +0.14% [kernel.vmlinux] [k] __put_user_4
2.27% -2.18% [kernel.vmlinux] [k] __sk_mem_reduce_allocated
2.11% +0.08% [kernel.vmlinux] [k] cmpxchg_double_slab.isra.68
1.90% +2.40% [kernel.vmlinux] [k] __slab_free
1.73% +0.20% [kernel.vmlinux] [k] __udp_recvmmsg
1.62% -1.62% [kernel.vmlinux] [k] intel_idle
1.52% +0.22% [kernel.vmlinux] [k] copy_to_iter
1.20% -0.03% [kernel.vmlinux] [k] import_iovec
1.14% +0.05% [kernel.vmlinux] [k] rw_copy_check_uvector
0.80% -0.04% [kernel.vmlinux] [k] recvmmsg_ctx_to_user
0.75% -0.69% [kernel.vmlinux] [k] __local_bh_enable_ip
0.71% +0.18% [kernel.vmlinux] [k] skb_copy_datagram_iter
0.70% -0.07% [kernel.vmlinux] [k] recvmmsg_ctx_from_user
0.67% +0.08% [kernel.vmlinux] [k] kmem_cache_free
0.56% +0.42% [kernel.vmlinux] [k] udp_process_msg
0.48% +0.05% [kernel.vmlinux] [k] skb_release_head_state
0.46% [kernel.vmlinux] [k] lapic_next_deadline
0.36% [kernel.vmlinux] [k] __switch_to
0.34% -0.03% [kernel.vmlinux] [k] consume_skb
0.32% -0.05% [kernel.vmlinux] [k] skb_consume_udp
The perf diff from CPU0, also show less lock congestion:
# Event 'cycles:ppp'
#
# Baseline Delta Shared Object Symbol
# ........ ....... ................ .........................................
#
11.04% -3.02% [kernel.vmlinux] [k] __udp_enqueue_schedule_skb
9.98% +2.16% [mlx5_core] [k] mlx5e_handle_rx_cqe
7.23% -1.85% [kernel.vmlinux] [k] udp_v4_early_demux
3.90% +0.73% [kernel.vmlinux] [k] build_skb
3.85% -1.77% [kernel.vmlinux] [k] udp_queue_rcv_skb
3.83% +0.02% [kernel.vmlinux] [k] sock_def_readable
** 3.26% -3.19% [kernel.vmlinux] [k] queued_spin_lock_slowpath **
2.99% +0.55% [kernel.vmlinux] [k] __build_skb
2.97% +0.11% [kernel.vmlinux] [k] __udp4_lib_rcv
** 2.87% -1.39% [kernel.vmlinux] [k] _raw_spin_lock **
2.67% +0.60% [kernel.vmlinux] [k] ip_rcv
2.65% +0.61% [kernel.vmlinux] [k] __netif_receive_skb_core
2.64% +0.79% [ip_tables] [k] ipt_do_table
2.37% +0.37% [kernel.vmlinux] [k] read_tsc
2.26% +0.52% [mlx5_core] [k] mlx5e_get_cqe
2.11% -1.15% [kernel.vmlinux] [k] __sk_mem_raise_allocated
2.10% +0.37% [kernel.vmlinux] [k] __rcu_read_unlock
2.04% +0.67% [mlx5_core] [k] mlx5e_alloc_rx_wqe
1.86% +0.40% [kernel.vmlinux] [k] inet_gro_receive
1.57% +0.11% [kernel.vmlinux] [k] kmem_cache_alloc
1.53% +0.28% [kernel.vmlinux] [k] _raw_read_lock
1.53% +0.25% [kernel.vmlinux] [k] dev_gro_receive
1.38% -0.18% [kernel.vmlinux] [k] udp_gro_receive
1.19% +0.37% [kernel.vmlinux] [k] __rcu_read_lock
1.14% +0.31% [kernel.vmlinux] [k] _raw_read_unlock
1.14% +0.12% [kernel.vmlinux] [k] ip_rcv_finish
1.13% +0.20% [kernel.vmlinux] [k] __udp4_lib_lookup
1.05% +0.16% [kernel.vmlinux] [k] ktime_get_with_offset
0.94% +0.38% [kernel.vmlinux] [k] ip_local_deliver_finish
0.91% +0.22% [kernel.vmlinux] [k] do_csum
0.86% -0.04% [kernel.vmlinux] [k] ipv4_pktinfo_prepare
0.84% +0.05% [kernel.vmlinux] [k] sk_filter_trim_cap
0.84% +0.20% [kernel.vmlinux] [k] ip_local_deliver
0.84% +0.19% [kernel.vmlinux] [k] udp4_gro_receive
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
Author of http://www.iptv-analyzer.org
LinkedIn: http://www.linkedin.com/in/brouer
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH net-next 0/5] net: add protocol level recvmmsg support
2016-11-25 15:39 [PATCH net-next 0/5] net: add protocol level recvmmsg support Paolo Abeni
` (5 preceding siblings ...)
2016-11-25 17:37 ` [PATCH net-next 0/5] net: add protocol level recvmmsg support Jesper Dangaard Brouer
@ 2016-11-25 21:16 ` Eric Dumazet
6 siblings, 0 replies; 19+ messages in thread
From: Eric Dumazet @ 2016-11-25 21:16 UTC (permalink / raw)
To: Paolo Abeni
Cc: netdev, David S. Miller, Eric Dumazet, Jesper Dangaard Brouer,
Hannes Frederic Sowa, Sabrina Dubroca
On Fri, 2016-11-25 at 16:39 +0100, Paolo Abeni wrote:
> The goal of recvmmsg() is to amortize the syscall overhead over a possibly
> long batch of messages, but for most networking protocols, e.g. udp, the
> syscall overhead is negligible compared to the protocol specific operations
> like dequeuing.
Problem of recvmmsg() is that it blows up L1/L2 cache of the cpu.
It gives false 'good results' until other threads sharing the same cache
hierarchy are competing with you. Then performance is actually lower
than regular recvmsg().
And I presume your tests did not really use the data once copied to user
space, like doing the typical operations a UDP server does on incoming
packets ?
I would rather try to optimize normal recvmsg(), instead of adding so
much code in the kernel for this horrible recvmmsg() super system call.
Looking at how buggy sendmmsg() was until commit 3023898b7d4aac6
("sock: fix sendmmsg for partial sendmsg"), I fear that these 'super'
system calls are way too complex.
How could we improve UDP ?
For example, we could easily have 2 queues to reduce false sharing and
lock contention.
1) One queue accessed by softirq to append packets.
2) One queue accessed by recvmsg(). Make sure these two queues do not
share a cache line.
When 2nd queue is empty, transfer whole first queue in one operation.
Look in net/core/dev.c , process_backlog() for an example of this
strategy.
Alternative would be to use a ring buffer, although the forward_alloc
stuff might be complex.
^ permalink raw reply [flat|nested] 19+ messages in thread