netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC v2] net: Introduce recvmmsg socket syscall
@ 2009-06-11  3:40 Arnaldo Carvalho de Melo
  2009-06-11  9:41 ` David Miller
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Arnaldo Carvalho de Melo @ 2009-06-11  3:40 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Chris Van Hoof, Clark Williams, Caitlin Bestler,
	Paul Moore, Steven Whitehouse, Rémi Denis-Courmont,
	Neil Horman, Nivedita Singhvi

[-- Attachment #1: Type: text/plain, Size: 14711 bytes --]

Meaning receive multiple messages, reducing the number of syscalls and
net stack entry/exit operations.

Next patches will introduce mechanisms where protocols that want to
optimize this operation will provide an unlocked_recvmsg operation.

This takes into account comments made by:

. Paul Moore: sock_recvmsg is called only for the first datagram,
  sock_recvmsg_nosec is used for the rest.

. Caitlin Bestler: recvmmsg now has a struct timespec timeout, that
  works in the same fashion as the ppoll one.

  If the underlying protocol returns a datagram with MSG_OOB set, this
  will make recvmmsg return right away with as many datagrams (+ the OOB
  one) it has received so far.

. Rémi Denis-Courmont & Steven Whitehouse: If we receive N < vlen
  datagrams and then recvmsg returns an error, recvmmsg will return
  the successfully received datagrams, store the error and return it
  in the next call.

I'll defer work on the subsequent optimization
(sk_prot->unlocked_recvmmsg) till we get the syscall API sorted out.

One thing to think about is if programs that provide a timeout for the
recvmmsg operation that is smaller than the one set (or the default) for
SO_RCVTIMEO should get their neck broken by inproper rope usage.

I thought about checking that and doing the equivalent to
sock_set_timeout(&sk->sk_rcvtimeo, recvmmsg_timeout), but felt like
recvmmsg was getting too many lines of code, opinions?

Also details such as adding the syscall to all the other archs syscall
tables and providing a socketcall interface for those that want it will
be addressed before final submission.

Attached also goes the updated recvmmsg canonical usage tool.

Thanks for all the comments so far and keep them coming! :-)

- Arnaldo

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index e590261..2a188e5 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -832,4 +832,5 @@ ia32_sys_call_table:
 	.quad compat_sys_pwritev
 	.quad compat_sys_rt_tgsigqueueinfo	/* 335 */
 	.quad sys_perf_counter_open
+	.quad compat_sys_recvmmsg
 ia32_syscall_end:
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 732a307..3e72cae 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -342,6 +342,7 @@
 #define __NR_pwritev		334
 #define __NR_rt_tgsigqueueinfo	335
 #define __NR_perf_counter_open	336
+#define __NR_recvmmsg		337
 
 #ifdef __KERNEL__
 
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 900e161..713a32a 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -661,6 +661,8 @@ __SYSCALL(__NR_pwritev, sys_pwritev)
 __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
 #define __NR_perf_counter_open			298
 __SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
+#define __NR_recvmmsg				299
+__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d51321d..4881b14 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -336,3 +336,4 @@ ENTRY(sys_call_table)
 	.long sys_pwritev
 	.long sys_rt_tgsigqueueinfo	/* 335 */
 	.long sys_perf_counter_open
+	.long sys_recvmmsg
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 421afb4..5aaa78a 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -65,6 +65,12 @@ struct msghdr {
 	unsigned	msg_flags;
 };
 
+/* For recvmmsg/sendmmsg */
+struct mmsghdr {
+	struct msghdr   msg_hdr;
+	unsigned        msg_len;
+};
+
 /*
  *	POSIX 1003.1g - ancillary data object information
  *	Ancillary data consits of a sequence of pairs of
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index c6c84ad..afefa61 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -25,6 +25,7 @@ struct linux_dirent64;
 struct list_head;
 struct msgbuf;
 struct msghdr;
+struct mmsghdr;
 struct msqid_ds;
 struct new_utsname;
 struct nfsctl_arg;
@@ -556,6 +557,9 @@ asmlinkage long sys_recv(int, void __user *, size_t, unsigned);
 asmlinkage long sys_recvfrom(int, void __user *, size_t, unsigned,
 				struct sockaddr __user *, int __user *);
 asmlinkage long sys_recvmsg(int fd, struct msghdr __user *msg, unsigned flags);
+asmlinkage long sys_recvmmsg(int fd, struct mmsghdr __user *msg,
+			     unsigned int vlen, unsigned flags,
+			     struct timespec __user *timeout);
 asmlinkage long sys_socket(int, int, int);
 asmlinkage long sys_socketpair(int, int, int, int __user *);
 asmlinkage long sys_socketcall(int call, unsigned long __user *args);
diff --git a/include/net/compat.h b/include/net/compat.h
index 5bbf8bf..bcde814 100644
--- a/include/net/compat.h
+++ b/include/net/compat.h
@@ -18,6 +18,11 @@ struct compat_msghdr {
 	compat_uint_t	msg_flags;
 };
 
+struct compat_mmsghdr {
+	struct compat_msghdr msg_hdr;
+	compat_uint_t        msg_len;
+};
+
 struct compat_cmsghdr {
 	compat_size_t	cmsg_len;
 	compat_int_t	cmsg_level;
@@ -35,6 +40,9 @@ extern int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *);
 extern int verify_compat_iovec(struct msghdr *, struct iovec *, struct sockaddr *, int);
 extern asmlinkage long compat_sys_sendmsg(int,struct compat_msghdr __user *,unsigned);
 extern asmlinkage long compat_sys_recvmsg(int,struct compat_msghdr __user *,unsigned);
+extern asmlinkage long compat_sys_recvmmsg(int, struct compat_mmsghdr __user *,
+					   unsigned, unsigned,
+					   struct compat_timespec __user *);
 extern asmlinkage long compat_sys_getsockopt(int, int, int, char __user *, int __user *);
 extern int put_cmsg_compat(struct msghdr*, int, int, int, void *);
 
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 68320f6..f581fb0 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -48,7 +48,9 @@ cond_syscall(sys_shutdown);
 cond_syscall(sys_sendmsg);
 cond_syscall(compat_sys_sendmsg);
 cond_syscall(sys_recvmsg);
+cond_syscall(sys_recvmmsg);
 cond_syscall(compat_sys_recvmsg);
+cond_syscall(compat_sys_recvmmsg);
 cond_syscall(sys_socketcall);
 cond_syscall(sys_futex);
 cond_syscall(compat_sys_futex);
diff --git a/net/compat.c b/net/compat.c
index 8d73905..4fab14c 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -743,6 +743,15 @@ asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, uns
 	return sys_recvmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT);
 }
 
+asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg,
+				    unsigned vlen, unsigned int flags,
+				    struct compat_timespec __user *timeout)
+{
+	return sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
+			    flags | MSG_CMSG_COMPAT,
+			    (struct timespec __user *)timeout);
+}
+
 asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
 {
 	int ret;
diff --git a/net/socket.c b/net/socket.c
index 791d71a..f9f1e20 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -702,6 +702,28 @@ int sock_recvmsg(struct socket *sock, struct msghdr *msg,
 	return ret;
 }
 
+static int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
+			      size_t size, int flags)
+{
+	struct kiocb iocb;
+	struct sock_iocb siocb;
+	int ret;
+
+	init_sync_kiocb(&iocb, NULL);
+	iocb.private = &siocb;
+
+	siocb.sock = sock;
+	siocb.scm = NULL;
+	siocb.msg = msg;
+	siocb.size = size;
+	siocb.flags = flags;
+
+	ret = sock->ops->recvmsg(&iocb, sock, msg, size, flags);
+	if (-EIOCBQUEUED == ret)
+		ret = wait_on_sync_kiocb(&iocb);
+	return ret;
+}
+
 int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
 		   struct kvec *vec, size_t num, size_t size, int flags)
 {
@@ -1965,22 +1987,15 @@ out:
 	return err;
 }
 
-/*
- *	BSD recvmsg interface
- */
-
-SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
-		unsigned int, flags)
+static int __sys_recvmsg(struct socket *sock, struct msghdr __user *msg,
+			 struct msghdr *msg_sys, unsigned flags, int nosec)
 {
 	struct compat_msghdr __user *msg_compat =
 	    (struct compat_msghdr __user *)msg;
-	struct socket *sock;
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
-	struct msghdr msg_sys;
 	unsigned long cmsg_ptr;
 	int err, iov_size, total_len, len;
-	int fput_needed;
 
 	/* kernel mode address */
 	struct sockaddr_storage addr;
@@ -1990,27 +2005,23 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
 	int __user *uaddr_len;
 
 	if (MSG_CMSG_COMPAT & flags) {
-		if (get_compat_msghdr(&msg_sys, msg_compat))
+		if (get_compat_msghdr(msg_sys, msg_compat))
 			return -EFAULT;
 	}
-	else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr)))
+	else if (copy_from_user(msg_sys, msg, sizeof(struct msghdr)))
 		return -EFAULT;
 
-	sock = sockfd_lookup_light(fd, &err, &fput_needed);
-	if (!sock)
-		goto out;
-
 	err = -EMSGSIZE;
-	if (msg_sys.msg_iovlen > UIO_MAXIOV)
-		goto out_put;
+	if (msg_sys->msg_iovlen > UIO_MAXIOV)
+		goto out;
 
 	/* Check whether to allocate the iovec area */
 	err = -ENOMEM;
-	iov_size = msg_sys.msg_iovlen * sizeof(struct iovec);
-	if (msg_sys.msg_iovlen > UIO_FASTIOV) {
+	iov_size = msg_sys->msg_iovlen * sizeof(struct iovec);
+	if (msg_sys->msg_iovlen > UIO_FASTIOV) {
 		iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL);
 		if (!iov)
-			goto out_put;
+			goto out;
 	}
 
 	/*
@@ -2018,46 +2029,47 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
 	 *      kernel msghdr to use the kernel address space)
 	 */
 
-	uaddr = (__force void __user *)msg_sys.msg_name;
+	uaddr = (__force void __user *)msg_sys->msg_name;
 	uaddr_len = COMPAT_NAMELEN(msg);
 	if (MSG_CMSG_COMPAT & flags) {
-		err = verify_compat_iovec(&msg_sys, iov,
+		err = verify_compat_iovec(msg_sys, iov,
 					  (struct sockaddr *)&addr,
 					  VERIFY_WRITE);
 	} else
-		err = verify_iovec(&msg_sys, iov,
+		err = verify_iovec(msg_sys, iov,
 				   (struct sockaddr *)&addr,
 				   VERIFY_WRITE);
 	if (err < 0)
 		goto out_freeiov;
 	total_len = err;
 
-	cmsg_ptr = (unsigned long)msg_sys.msg_control;
-	msg_sys.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
+	cmsg_ptr = (unsigned long)msg_sys->msg_control;
+	msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
 
 	if (sock->file->f_flags & O_NONBLOCK)
 		flags |= MSG_DONTWAIT;
-	err = sock_recvmsg(sock, &msg_sys, total_len, flags);
+	err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys,
+							  total_len, flags);
 	if (err < 0)
 		goto out_freeiov;
 	len = err;
 
 	if (uaddr != NULL) {
 		err = move_addr_to_user((struct sockaddr *)&addr,
-					msg_sys.msg_namelen, uaddr,
+					msg_sys->msg_namelen, uaddr,
 					uaddr_len);
 		if (err < 0)
 			goto out_freeiov;
 	}
-	err = __put_user((msg_sys.msg_flags & ~MSG_CMSG_COMPAT),
+	err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT),
 			 COMPAT_FLAGS(msg));
 	if (err)
 		goto out_freeiov;
 	if (MSG_CMSG_COMPAT & flags)
-		err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr,
+		err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
 				 &msg_compat->msg_controllen);
 	else
-		err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr,
+		err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
 				 &msg->msg_controllen);
 	if (err)
 		goto out_freeiov;
@@ -2066,12 +2078,141 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
 out_freeiov:
 	if (iov != iovstack)
 		sock_kfree_s(sock->sk, iov, iov_size);
-out_put:
+out:
+	return err;
+}
+
+/*
+ *	BSD recvmsg interface
+ */
+SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
+		unsigned int, flags)
+{
+	int fput_needed, err;
+	struct msghdr msg_sys;
+	struct socket *sock = sockfd_lookup_light(fd, &err, &fput_needed);
+
+	if (!sock)
+		goto out;
+
+	err = __sys_recvmsg(sock, msg, &msg_sys, flags, 0);
+
 	fput_light(sock->file, fput_needed);
 out:
 	return err;
 }
 
+/*
+ *     Linux recvmmsg interface
+ */
+SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg,
+		unsigned int, vlen, unsigned int, flags,
+		struct timespec __user *, timeout)
+{
+	int fput_needed, err, datagrams;
+	struct socket *sock;
+	struct mmsghdr __user *entry;
+	struct msghdr msg_sys;
+	struct timespec end_time, delta;
+	struct compat_timespec *timeout_compat =
+		(struct compat_timespec *)timeout;
+
+	if (timeout) {
+		/* Doesn't make much sense */
+		if (flags & MSG_DONTWAIT)
+			return -EINVAL;
+
+		if (flags & MSG_CMSG_COMPAT) {
+			if (get_user(delta.tv_sec, &timeout_compat->tv_sec) ||
+			    get_user(delta.tv_nsec, &timeout_compat->tv_nsec))
+				return -EFAULT;
+		} else if (get_user(delta.tv_sec, &timeout->tv_sec) ||
+			 get_user(delta.tv_nsec, &timeout->tv_nsec))
+				return -EFAULT;
+
+		if (poll_select_set_timeout(&end_time, delta.tv_sec,
+					    delta.tv_nsec))
+			return -EINVAL;
+	}
+
+	datagrams = 0;
+
+	sock = sockfd_lookup_light(fd, &err, &fput_needed);
+	if (!sock)
+		return err;
+
+	err = sock_error(sock->sk);
+	if (err)
+		goto out_put;
+
+	entry = mmsg;
+
+	while (datagrams < vlen) {
+		/*
+		 * No need to do ask LSM for more than the first datagram.
+		 */
+		err = __sys_recvmsg(sock, (struct msghdr __user *)entry,
+				    &msg_sys, flags, datagrams);
+		if (err < 0)
+			break;
+		err = put_user(err, &entry->msg_len);
+		if (err)
+			break;
+		++entry;
+		++datagrams;
+		
+		if (timeout) {
+			ktime_get_ts(&delta);
+			delta = timespec_sub(end_time, delta);
+			if (delta.tv_sec < 0)
+				delta.tv_sec = delta.tv_nsec = 0;
+
+			/* Timeout, return less than vlen datagrams */
+			if (delta.tv_sec == 0 && delta.tv_nsec == 0)
+				break;
+		}
+
+		/* Out of band data, return right away */
+		if (msg_sys.msg_flags & MSG_OOB)
+			break;
+	}
+
+	if (timeout) {
+		if (flags & MSG_CMSG_COMPAT) {
+			if (put_user(delta.tv_sec, &timeout_compat->tv_sec) ||
+			    put_user(delta.tv_nsec, &timeout_compat->tv_nsec))
+				err = -EFAULT;
+		} else if (put_user(delta.tv_sec, &timeout->tv_sec) ||
+			   put_user(delta.tv_nsec, &timeout->tv_nsec))
+				err = -EFAULT;
+	}
+out_put:
+	fput_light(sock->file, fput_needed);
+
+	if (err == 0)
+		return datagrams;
+
+	if (datagrams != 0) {
+		/*
+		 * We may return less entries than requested (vlen) if the
+		 * sock is non block and there aren't enough datagrams...
+		 */
+		if (err != -EAGAIN) {
+			/*
+			 * ... or  if recvmsg returns an error after we
+			 * received some datagrams, where we record the
+			 * error to return on the next call or if the
+			 * app asks about it using getsockopt(SO_ERROR).
+			 */
+			sock->sk->sk_err = -err;
+		}
+
+		return datagrams;
+	}
+
+	return err;
+}
+
 #ifdef __ARCH_WANT_SYS_SOCKETCALL
 
 /* Argument list sizes for sys_socketcall */

[-- Attachment #2: recvmmsg.c --]
[-- Type: text/plain, Size: 3619 bytes --]

#include <stdlib.h>
#include <syscall.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <poll.h>
#include <string.h>

struct mmsghdr {
	struct msghdr	msg_hdr;
	unsigned	msg_len;
};

#if defined(__x86_64__) || defined(__i386__)
#include "linux-2.6-tip/arch/x86/include/asm/unistd.h"
#endif

#ifndef NSEC_PER_MSEC
#define NSEC_PER_MSEC	1000000UL
#endif

static inline int recvmmsg(int fd, struct mmsghdr *mmsg,
			   unsigned vlen, unsigned flags,
			   struct timespec *timeout)
{
	return syscall(__NR_recvmmsg, fd, mmsg, vlen, flags, timeout);
}

static void print_stats_peer(struct mmsghdr *datagram, int count, int bytes)
{
	char peer[1024];
	int err = getnameinfo(datagram->msg_hdr.msg_name,
			      datagram->msg_hdr.msg_namelen,
			      peer, sizeof(peer), NULL, 0, 0);
	if (err != 0) {
		fprintf(stderr, "error using getnameinfo: %s\n",
			gai_strerror(err));
			return;
		}
	printf("    %d bytes received from %s in %d datagrams\n",
	       bytes, peer, count);
}

int main(int argc, char *argv[])
{
	struct addrinfo *host;
	struct addrinfo hints = {
		.ai_family   = AF_INET,
		.ai_socktype = SOCK_DGRAM,
		.ai_protocol = IPPROTO_UDP,
		.ai_flags    = AI_PASSIVE,
	};
	const char *port = "5001";
	int batch_size = 8;
	long timeout = 10 * NSEC_PER_MSEC;
	int err, fd;
	int i;

	if (argc > 1)
		port = argv[1];

	if (argc > 2)
		batch_size = atoi(argv[2]);

	if (argc > 3)
		timeout = atol(argv[3]) * NSEC_PER_MSEC;

	char buf[batch_size][256];
	struct iovec iovec[batch_size][1];
	struct sockaddr addr[batch_size];
	struct mmsghdr datagrams[batch_size];

	printf("usage: recvmmsg <port(def 5001)> <batch_size(def 8)> "
	       "<timeout(def 10ms>\n\nWaiting for datagrams...\n");

	err = getaddrinfo(NULL, port, &hints, &host);
	if (err != 0) {
		fprintf(stderr, "error using getaddrinfo: %s\n",
			gai_strerror(err));
		goto out;
	}
	
	fd = socket(host->ai_family, host->ai_socktype, host->ai_protocol);
	if (fd < 0) {
		perror("socket: ");
		goto out_freeaddrinfo;
	}

	if (bind(fd, host->ai_addr, host->ai_addrlen) < 0) {
		perror("bind: ");
		goto out_close_server;
	}

	for (i = 0; i < batch_size; ++i) {
		iovec[i][0].iov_base = buf[i];
		iovec[i][0].iov_len  = sizeof(buf[i]);
		datagrams[i].msg_hdr.msg_iov	 = iovec[i];
		datagrams[i].msg_hdr.msg_iovlen	 = 1;
		datagrams[i].msg_hdr.msg_name	 = &addr[i];
		datagrams[i].msg_hdr.msg_namelen = sizeof(addr[i]);
	}

	struct pollfd pfds[1] = {
		[0] = {
			.fd = fd,
			.events = POLLIN,
		},
	};

	while (1) {
		struct timespec timeout = { .tv_nsec = 10 * NSEC_PER_MSEC, };

		if (poll(pfds, 1, -1) < 0) {
			perror("poll: ");
			return EXIT_FAILURE;
		}

		int nr_datagrams = recvmmsg(fd, datagrams, batch_size,
					    0, &timeout);

		if (nr_datagrams == 0) {
			perror("recvmmsg: ");
			return EXIT_FAILURE;
		}

		printf("nr_datagrams received: %d, remaining: %luns\n",
		       nr_datagrams, timeout.tv_nsec);
		int peer_count = 1;
		int peer_bytes = datagrams[0].msg_len;
		for (i = 1; i < nr_datagrams; ++i) {
			if (memcmp(datagrams[i - 1].msg_hdr.msg_name,
				   datagrams[i].msg_hdr.msg_name,
				   datagrams[i].msg_hdr.msg_namelen) == 0) {
				++peer_count;
				peer_bytes += datagrams[i].msg_len;
				continue;
			}
			
			print_stats_peer(&datagrams[i - 1],
					 peer_count, peer_bytes);
			peer_bytes = datagrams[i].msg_len;
			peer_count = 1;
		}

		print_stats_peer(&datagrams[nr_datagrams - 1],
				 peer_count, peer_bytes);
	}
out_close_server:
	close(fd);
out_freeaddrinfo:
	freeaddrinfo(host);
out:
	return err;
}

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [RFC v2] net: Introduce recvmmsg socket syscall
  2009-06-11  3:40 [RFC v2] net: Introduce recvmmsg socket syscall Arnaldo Carvalho de Melo
@ 2009-06-11  9:41 ` David Miller
  2009-06-11 15:34   ` Arnaldo Carvalho de Melo
  2009-06-11 22:58   ` [RFC v3] " Arnaldo Carvalho de Melo
  2009-06-11 18:09 ` [RFC v2] " Paul Moore
  2009-06-12  7:26 ` Rémi Denis-Courmont
  2 siblings, 2 replies; 10+ messages in thread
From: David Miller @ 2009-06-11  9:41 UTC (permalink / raw)
  To: acme
  Cc: netdev, vanhoof, williams, caitlin.bestler, paul.moore, steve,
	remi.denis-courmont, nhorman, niv

From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 11 Jun 2009 00:40:22 -0300

> +	struct compat_timespec *timeout_compat =
> +		(struct compat_timespec *)timeout;

In order for this to build on non-compat platforms, or when
CONFIG_COMPAT is disabled, you'll need to avoid trying to
reference compat_timespec.

We get away with compat_msghdr because we define it as a NOP
in net/compat.h when necessary.

Doing the same, and cleanly, with this non-networking compat knob is
unlikely to be straightforward.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC v2] net: Introduce recvmmsg socket syscall
  2009-06-11  9:41 ` David Miller
@ 2009-06-11 15:34   ` Arnaldo Carvalho de Melo
  2009-06-12  0:00     ` David Miller
  2009-06-11 22:58   ` [RFC v3] " Arnaldo Carvalho de Melo
  1 sibling, 1 reply; 10+ messages in thread
From: Arnaldo Carvalho de Melo @ 2009-06-11 15:34 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, vanhoof, williams, caitlin.bestler, paul.moore, steve,
	remi.denis-courmont, nhorman, niv

Em Thu, Jun 11, 2009 at 02:41:00AM -0700, David Miller escreveu:
> From: Arnaldo Carvalho de Melo <acme@redhat.com>
> Date: Thu, 11 Jun 2009 00:40:22 -0300
> 
> > +	struct compat_timespec *timeout_compat =
> > +		(struct compat_timespec *)timeout;
> 
> In order for this to build on non-compat platforms, or when
> CONFIG_COMPAT is disabled, you'll need to avoid trying to
> reference compat_timespec.
> 
> We get away with compat_msghdr because we define it as a NOP
> in net/compat.h when necessary.
> 
> Doing the same, and cleanly, with this non-networking compat knob is
> unlikely to be straightforward.

Done, not like msghdr, but should work, testing now.

Thanks for quickly pointing out this brainfart!

- Arnaldo

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC v2] net: Introduce recvmmsg socket syscall
  2009-06-11  3:40 [RFC v2] net: Introduce recvmmsg socket syscall Arnaldo Carvalho de Melo
  2009-06-11  9:41 ` David Miller
@ 2009-06-11 18:09 ` Paul Moore
  2009-06-11 21:53   ` Arnaldo Carvalho de Melo
  2009-06-12  7:26 ` Rémi Denis-Courmont
  2 siblings, 1 reply; 10+ messages in thread
From: Paul Moore @ 2009-06-11 18:09 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: David Miller, netdev, Chris Van Hoof, Clark Williams,
	Caitlin Bestler, Steven Whitehouse,  Rémi Denis-Courmont,
	Neil Horman, Nivedita Singhvi

On Wednesday 10 June 2009 11:40:22 pm Arnaldo Carvalho de Melo wrote:
> diff --git a/net/socket.c b/net/socket.c
> index 791d71a..f9f1e20 100644
> --- a/net/socket.c
> +++ b/net/socket.c
> @@ -702,6 +702,28 @@ int sock_recvmsg(struct socket *sock, struct msghdr
> *msg, return ret;
>  }
>
> +static int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
> +			      size_t size, int flags)
> +{
> +	struct kiocb iocb;
> +	struct sock_iocb siocb;
> +	int ret;
> +
> +	init_sync_kiocb(&iocb, NULL);
> +	iocb.private = &siocb;
> +
> +	siocb.sock = sock;
> +	siocb.scm = NULL;
> +	siocb.msg = msg;
> +	siocb.size = size;
> +	siocb.flags = flags;
> +
> +	ret = sock->ops->recvmsg(&iocb, sock, msg, size, flags);
> +	if (-EIOCBQUEUED == ret)
> +		ret = wait_on_sync_kiocb(&iocb);
> +	return ret;
> +}

Hmmm, in an effort to reduce duplicated code how about updating 
__sock_recvmsg() to something like the following:

static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
                                 struct msghdr *msg, size_t size, int flags)
{
        int err;

        err = security_socket_recvmsg(...);
        if (err)
                return err;

        return sock_recvmsg_nosec(...);
}

The only real difference is that now the *_kiocb() functions get called and I 
have no clue if that is good or bad but it is different :)

>  	/*
> @@ -2018,46 +2029,47 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr
> __user *, msg, *      kernel msghdr to use the kernel address space)
>  	 */
>
> -	uaddr = (__force void __user *)msg_sys.msg_name;
> +	uaddr = (__force void __user *)msg_sys->msg_name;
>  	uaddr_len = COMPAT_NAMELEN(msg);
>  	if (MSG_CMSG_COMPAT & flags) {
> -		err = verify_compat_iovec(&msg_sys, iov,
> +		err = verify_compat_iovec(msg_sys, iov,
>  					  (struct sockaddr *)&addr,
>  					  VERIFY_WRITE);
>  	} else
> -		err = verify_iovec(&msg_sys, iov,
> +		err = verify_iovec(msg_sys, iov,
>  				   (struct sockaddr *)&addr,
>  				   VERIFY_WRITE);
>  	if (err < 0)
>  		goto out_freeiov;
>  	total_len = err;
>
> -	cmsg_ptr = (unsigned long)msg_sys.msg_control;
> -	msg_sys.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
> +	cmsg_ptr = (unsigned long)msg_sys->msg_control;
> +	msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
>
>  	if (sock->file->f_flags & O_NONBLOCK)
>  		flags |= MSG_DONTWAIT;
> -	err = sock_recvmsg(sock, &msg_sys, total_len, flags);
> +	err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys,
> +							  total_len, flags);

Perhaps I'm just being nit-picky here but why not this (it is much easier on 
my eyes at least <g>):

	if (nosec)
		err = sock_recvmsg_nosec(...);
	else
		err = sock_recvmsg(...);

-- 
paul moore
linux @ hp


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC v2] net: Introduce recvmmsg socket syscall
  2009-06-11 18:09 ` [RFC v2] " Paul Moore
@ 2009-06-11 21:53   ` Arnaldo Carvalho de Melo
  2009-06-16 20:36     ` Paul Moore
  0 siblings, 1 reply; 10+ messages in thread
From: Arnaldo Carvalho de Melo @ 2009-06-11 21:53 UTC (permalink / raw)
  To: Paul Moore
  Cc: David Miller, netdev, Chris Van Hoof, Clark Williams,
	Caitlin Bestler, Steven Whitehouse, Rémi Denis-Courmont,
	Neil Horman, Nivedita Singhvi

Em Thu, Jun 11, 2009 at 02:09:22PM -0400, Paul Moore escreveu:
> On Wednesday 10 June 2009 11:40:22 pm Arnaldo Carvalho de Melo wrote:
> > diff --git a/net/socket.c b/net/socket.c
> > index 791d71a..f9f1e20 100644
> > --- a/net/socket.c
> > +++ b/net/socket.c
> > @@ -702,6 +702,28 @@ int sock_recvmsg(struct socket *sock, struct msghdr
> > *msg, return ret;
> >  }
> >
> > +static int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
> > +			      size_t size, int flags)
> > +{
> > +	struct kiocb iocb;
> > +	struct sock_iocb siocb;
> > +	int ret;
> > +
> > +	init_sync_kiocb(&iocb, NULL);
> > +	iocb.private = &siocb;
> > +
> > +	siocb.sock = sock;
> > +	siocb.scm = NULL;
> > +	siocb.msg = msg;
> > +	siocb.size = size;
> > +	siocb.flags = flags;
> > +
> > +	ret = sock->ops->recvmsg(&iocb, sock, msg, size, flags);
> > +	if (-EIOCBQUEUED == ret)
> > +		ret = wait_on_sync_kiocb(&iocb);
> > +	return ret;
> > +}
> 
> Hmmm, in an effort to reduce duplicated code how about updating 
> __sock_recvmsg() to something like the following:
> 
> static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
>                                  struct msghdr *msg, size_t size, int flags)
> {
>         int err;
> 
>         err = security_socket_recvmsg(...);
>         if (err)
>                 return err;
> 
>         return sock_recvmsg_nosec(...);
> }
> 
> The only real difference is that now the *_kiocb() functions get called and I 
> have no clue if that is good or bad but it is different :)

Yeah, gets clearer, like this:

static inline int __sock_recvmsg_nosec(struct kiocb *iocb, struct socket *sock,
				       struct msghdr *msg, size_t size, int flags)
{
	struct sock_iocb *si = kiocb_to_siocb(iocb);

	si->sock = sock;
	si->scm = NULL;
	si->msg = msg;
	si->size = size;
	si->flags = flags;

	return sock->ops->recvmsg(iocb, sock, msg, size, flags);
}

static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
				 struct msghdr *msg, size_t size, int flags)
{
	int err = security_socket_recvmsg(sock, msg, size, flags);

	return err ?: __sock_recvmsg_nosec(iocb, sock, msg, size, flags);
}

static int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
			      size_t size, int flags)
{
	struct kiocb iocb;
	struct sock_iocb siocb;
	int ret;

	init_sync_kiocb(&iocb, NULL);
	iocb.private = &siocb;
	ret = __sock_recvmsg_nosec(&iocb, sock, msg, size, flags);
	if (-EIOCBQUEUED == ret)
		ret = wait_on_sync_kiocb(&iocb);
	return ret;
}

Better now? :-)

> >  	/*
> > @@ -2018,46 +2029,47 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr
> > __user *, msg, *      kernel msghdr to use the kernel address space)
> >  	 */
> >
> > -	uaddr = (__force void __user *)msg_sys.msg_name;
> > +	uaddr = (__force void __user *)msg_sys->msg_name;
> >  	uaddr_len = COMPAT_NAMELEN(msg);
> >  	if (MSG_CMSG_COMPAT & flags) {
> > -		err = verify_compat_iovec(&msg_sys, iov,
> > +		err = verify_compat_iovec(msg_sys, iov,
> >  					  (struct sockaddr *)&addr,
> >  					  VERIFY_WRITE);
> >  	} else
> > -		err = verify_iovec(&msg_sys, iov,
> > +		err = verify_iovec(msg_sys, iov,
> >  				   (struct sockaddr *)&addr,
> >  				   VERIFY_WRITE);
> >  	if (err < 0)
> >  		goto out_freeiov;
> >  	total_len = err;
> >
> > -	cmsg_ptr = (unsigned long)msg_sys.msg_control;
> > -	msg_sys.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
> > +	cmsg_ptr = (unsigned long)msg_sys->msg_control;
> > +	msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
> >
> >  	if (sock->file->f_flags & O_NONBLOCK)
> >  		flags |= MSG_DONTWAIT;
> > -	err = sock_recvmsg(sock, &msg_sys, total_len, flags);
> > +	err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys,
> > +							  total_len, flags);
> 
> Perhaps I'm just being nit-picky here but why not this (it is much easier on 
> my eyes at least <g>):
> 
> 	if (nosec)
> 		err = sock_recvmsg_nosec(...);
> 	else
> 		err = sock_recvmsg(...);

Well, its like "if (foo)" versus "if (foo != NULL)", I prefer to reduce
the number of source code lines and stress that the parameter list is
the same, anybody else feels confused by this?

- Arnaldo

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [RFC v3] net: Introduce recvmmsg socket syscall
  2009-06-11  9:41 ` David Miller
  2009-06-11 15:34   ` Arnaldo Carvalho de Melo
@ 2009-06-11 22:58   ` Arnaldo Carvalho de Melo
  1 sibling, 0 replies; 10+ messages in thread
From: Arnaldo Carvalho de Melo @ 2009-06-11 22:58 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Chris Van Hoof, Clark Williams, Caitlin Bestler,
	Paul Moore, Steven Whitehouse, Rémi Denis-Courmont,
	Neil Horman, Nivedita Singhvi

Em Thu, Jun 11, 2009 at 02:41:00AM -0700, David Miller escreveu:
> From: Arnaldo Carvalho de Melo <acme@redhat.com>
> Date: Thu, 11 Jun 2009 00:40:22 -0300
> 
> > +	struct compat_timespec *timeout_compat =
> > +		(struct compat_timespec *)timeout;
> 
> In order for this to build on non-compat platforms, or when
> CONFIG_COMPAT is disabled, you'll need to avoid trying to
> reference compat_timespec.
> 
> We get away with compat_msghdr because we define it as a NOP
> in net/compat.h when necessary.
> 
> Doing the same, and cleanly, with this non-networking compat knob is
> unlikely to be straightforward.

Better now?

Paul, this one also has the sock_recvmsg_nosec rewrite
following your suggestion.

I'll write a proper changeset log once we get the more easy stuff out of
the way and people are confortable with it.

Ah, here is how things are being distributed now in a test setup with
two clients and running recvmmsg like this:

[acme@emilia git]$ perf record ./recvmmsg 5001 256 5000
usage: recvmmsg <port(def 5001)> <batch_size(def 8)> <timeout(def 10ms>

Waiting for datagrams...
nr_datagrams received: 256, remaining: 4416588ns
    5376 bytes received from filo.ghostprotocols.net in 21 datagrams
    768 bytes received from doppio.ghostprotocols.net in 3 datagrams
    256 bytes received from filo.ghostprotocols.net in 1 datagrams
    2304 bytes received from doppio.ghostprotocols.net in 9 datagrams
    256 bytes received from filo.ghostprotocols.net in 1 datagrams
    2560 bytes received from doppio.ghostprotocols.net in 10 datagrams
    256 bytes received from filo.ghostprotocols.net in 1 datagrams
<SNIP>

[acme@emilia git]$ perf report --sort comm,dso,symbol | head -30

#
# (4532588 samples)
#
# Overhead            Command  Shared Object     Symbol
# ........ .........  .........................  ......
#
     4.85%  recvmmsg  [kernel]                   [k] copy_user_generic_string
     3.17%  recvmmsg  [kernel]                   [k] _spin_lock_irqsave
     2.35%  recvmmsg  [kernel]                   [k] _spin_lock
     2.18%  recvmmsg  [kernel]                   [k] udp_recvmsg
     2.04%  recvmmsg  /lib64/libnss_files-2.5.so [.] internal_getent
     1.52%  recvmmsg  [kernel]                   [k] n_tty_write
     1.49%  recvmmsg  [kernel]                   [k] _spin_lock_bh
     1.44%  recvmmsg  [kernel]                   [k] __rcu_read_lock
     1.30%  recvmmsg  [kernel]                   [k] __skb_recv_datagram
     1.23%  recvmmsg  [kernel]                   [k] __rcu_read_unlock
     1.14%  recvmmsg  [kernel]                   [k] __cache_free
     1.14%  recvmmsg  [kernel]                   [k] avc_has_perm_noaudit
     1.12%  recvmmsg  /lib64/libc-2.5.so         [.] _IO_vfprintf_internal
     1.10%  recvmmsg  [kernel]                   [k] __sys_recvmsg
     1.06%  recvmmsg  ./recvmmsg                 [.] main
     1.04%  recvmmsg  [kernel]                   [k] csum_partial
     1.02%  recvmmsg  [kernel]                   [k] _spin_unlock_irqrestore
     0.98%  recvmmsg  [kernel]                   [k] n_tty_receive_buf
     0.88%  recvmmsg  /lib64/libc-2.5.so         [.] __GI_inet_pton
     0.87%  recvmmsg  [kernel]                   [k] system_call
     0.86%  recvmmsg  [kernel]                   [k] ktime_get_ts
     0.83%  recvmmsg  /lib64/libc-2.5.so         [.] __memchr
     0.76%  recvmmsg  /lib64/libc-2.5.so         [.] _IO_file_xsputn_internal
[acme@emilia git]$ 

We'll see after sk_prot->unlocked_recvmsg gets reintroduced and I remove silly
stuff such as fprintf from the test app... :-)

Thanks!

- Arnaldo

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index e590261..2a188e5 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -832,4 +832,5 @@ ia32_sys_call_table:
 	.quad compat_sys_pwritev
 	.quad compat_sys_rt_tgsigqueueinfo	/* 335 */
 	.quad sys_perf_counter_open
+	.quad compat_sys_recvmmsg
 ia32_syscall_end:
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 732a307..3e72cae 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -342,6 +342,7 @@
 #define __NR_pwritev		334
 #define __NR_rt_tgsigqueueinfo	335
 #define __NR_perf_counter_open	336
+#define __NR_recvmmsg		337
 
 #ifdef __KERNEL__
 
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 900e161..713a32a 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -661,6 +661,8 @@ __SYSCALL(__NR_pwritev, sys_pwritev)
 __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
 #define __NR_perf_counter_open			298
 __SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
+#define __NR_recvmmsg				299
+__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d51321d..4881b14 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -336,3 +336,4 @@ ENTRY(sys_call_table)
 	.long sys_pwritev
 	.long sys_rt_tgsigqueueinfo	/* 335 */
 	.long sys_perf_counter_open
+	.long sys_recvmmsg
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 421afb4..5ed3644 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -65,6 +65,12 @@ struct msghdr {
 	unsigned	msg_flags;
 };
 
+/* For recvmmsg/sendmmsg */
+struct mmsghdr {
+	struct msghdr   msg_hdr;
+	unsigned        msg_len;
+};
+
 /*
  *	POSIX 1003.1g - ancillary data object information
  *	Ancillary data consits of a sequence of pairs of
@@ -322,6 +328,10 @@ extern int move_addr_to_user(struct sockaddr *kaddr, int klen, void __user *uadd
 extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *kaddr);
 extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
 
+struct timespec;
+
+extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
+			  unsigned int flags, struct timespec *timeout);
 #endif
 #endif /* not kernel and not glibc */
 #endif /* _LINUX_SOCKET_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index c6c84ad..afefa61 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -25,6 +25,7 @@ struct linux_dirent64;
 struct list_head;
 struct msgbuf;
 struct msghdr;
+struct mmsghdr;
 struct msqid_ds;
 struct new_utsname;
 struct nfsctl_arg;
@@ -556,6 +557,9 @@ asmlinkage long sys_recv(int, void __user *, size_t, unsigned);
 asmlinkage long sys_recvfrom(int, void __user *, size_t, unsigned,
 				struct sockaddr __user *, int __user *);
 asmlinkage long sys_recvmsg(int fd, struct msghdr __user *msg, unsigned flags);
+asmlinkage long sys_recvmmsg(int fd, struct mmsghdr __user *msg,
+			     unsigned int vlen, unsigned flags,
+			     struct timespec __user *timeout);
 asmlinkage long sys_socket(int, int, int);
 asmlinkage long sys_socketpair(int, int, int, int __user *);
 asmlinkage long sys_socketcall(int call, unsigned long __user *args);
diff --git a/include/net/compat.h b/include/net/compat.h
index 5bbf8bf..96c38d8 100644
--- a/include/net/compat.h
+++ b/include/net/compat.h
@@ -18,6 +18,11 @@ struct compat_msghdr {
 	compat_uint_t	msg_flags;
 };
 
+struct compat_mmsghdr {
+	struct compat_msghdr msg_hdr;
+	compat_uint_t        msg_len;
+};
+
 struct compat_cmsghdr {
 	compat_size_t	cmsg_len;
 	compat_int_t	cmsg_level;
@@ -35,6 +40,9 @@ extern int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *);
 extern int verify_compat_iovec(struct msghdr *, struct iovec *, struct sockaddr *, int);
 extern asmlinkage long compat_sys_sendmsg(int,struct compat_msghdr __user *,unsigned);
 extern asmlinkage long compat_sys_recvmsg(int,struct compat_msghdr __user *,unsigned);
+extern asmlinkage long compat_sys_recvmmsg(int, struct compat_mmsghdr __user *,
+					   unsigned, unsigned,
+					   struct timespec __user *);
 extern asmlinkage long compat_sys_getsockopt(int, int, int, char __user *, int __user *);
 extern int put_cmsg_compat(struct msghdr*, int, int, int, void *);
 
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 68320f6..f581fb0 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -48,7 +48,9 @@ cond_syscall(sys_shutdown);
 cond_syscall(sys_sendmsg);
 cond_syscall(compat_sys_sendmsg);
 cond_syscall(sys_recvmsg);
+cond_syscall(sys_recvmmsg);
 cond_syscall(compat_sys_recvmsg);
+cond_syscall(compat_sys_recvmmsg);
 cond_syscall(sys_socketcall);
 cond_syscall(sys_futex);
 cond_syscall(compat_sys_futex);
diff --git a/net/compat.c b/net/compat.c
index 8d73905..6f04f2d 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -743,6 +743,29 @@ asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, uns
 	return sys_recvmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT);
 }
 
+asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg,
+				    unsigned vlen, unsigned int flags,
+				    struct timespec __user *timeout)
+{
+	int datagrams;
+	struct timespec ktspec;
+	struct compat_timespec __user *utspec =
+			(struct compat_timespec __user *)timeout;
+
+	if (get_user(ktspec.tv_sec, &utspec->tv_sec) ||
+	    get_user(ktspec.tv_nsec, &utspec->tv_nsec))
+		return -EFAULT;
+
+	datagrams = __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
+				   flags | MSG_CMSG_COMPAT, &ktspec);
+	if (datagrams > 0 &&
+	    put_user(ktspec.tv_sec, &utspec->tv_sec) ||
+	    put_user(ktspec.tv_nsec, &utspec->tv_nsec))
+		datagrams = -EFAULT;
+
+	return datagrams;
+}
+
 asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
 {
 	int ret;
diff --git a/net/socket.c b/net/socket.c
index 791d71a..1809a5e 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -668,10 +668,9 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 
 EXPORT_SYMBOL_GPL(__sock_recv_timestamp);
 
-static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
-				 struct msghdr *msg, size_t size, int flags)
+static inline int __sock_recvmsg_nosec(struct kiocb *iocb, struct socket *sock,
+				       struct msghdr *msg, size_t size, int flags)
 {
-	int err;
 	struct sock_iocb *si = kiocb_to_siocb(iocb);
 
 	si->sock = sock;
@@ -680,13 +679,17 @@ static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 	si->size = size;
 	si->flags = flags;
 
-	err = security_socket_recvmsg(sock, msg, size, flags);
-	if (err)
-		return err;
-
 	return sock->ops->recvmsg(iocb, sock, msg, size, flags);
 }
 
+static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
+				 struct msghdr *msg, size_t size, int flags)
+{
+	int err = security_socket_recvmsg(sock, msg, size, flags);
+
+	return err ?: __sock_recvmsg_nosec(iocb, sock, msg, size, flags);
+}
+
 int sock_recvmsg(struct socket *sock, struct msghdr *msg,
 		 size_t size, int flags)
 {
@@ -702,6 +705,21 @@ int sock_recvmsg(struct socket *sock, struct msghdr *msg,
 	return ret;
 }
 
+static int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
+			      size_t size, int flags)
+{
+	struct kiocb iocb;
+	struct sock_iocb siocb;
+	int ret;
+
+	init_sync_kiocb(&iocb, NULL);
+	iocb.private = &siocb;
+	ret = __sock_recvmsg_nosec(&iocb, sock, msg, size, flags);
+	if (-EIOCBQUEUED == ret)
+		ret = wait_on_sync_kiocb(&iocb);
+	return ret;
+}
+
 int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
 		   struct kvec *vec, size_t num, size_t size, int flags)
 {
@@ -1965,22 +1983,15 @@ out:
 	return err;
 }
 
-/*
- *	BSD recvmsg interface
- */
-
-SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
-		unsigned int, flags)
+static int __sys_recvmsg(struct socket *sock, struct msghdr __user *msg,
+			 struct msghdr *msg_sys, unsigned flags, int nosec)
 {
 	struct compat_msghdr __user *msg_compat =
 	    (struct compat_msghdr __user *)msg;
-	struct socket *sock;
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
-	struct msghdr msg_sys;
 	unsigned long cmsg_ptr;
 	int err, iov_size, total_len, len;
-	int fput_needed;
 
 	/* kernel mode address */
 	struct sockaddr_storage addr;
@@ -1990,27 +2001,23 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
 	int __user *uaddr_len;
 
 	if (MSG_CMSG_COMPAT & flags) {
-		if (get_compat_msghdr(&msg_sys, msg_compat))
+		if (get_compat_msghdr(msg_sys, msg_compat))
 			return -EFAULT;
 	}
-	else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr)))
+	else if (copy_from_user(msg_sys, msg, sizeof(struct msghdr)))
 		return -EFAULT;
 
-	sock = sockfd_lookup_light(fd, &err, &fput_needed);
-	if (!sock)
-		goto out;
-
 	err = -EMSGSIZE;
-	if (msg_sys.msg_iovlen > UIO_MAXIOV)
-		goto out_put;
+	if (msg_sys->msg_iovlen > UIO_MAXIOV)
+		goto out;
 
 	/* Check whether to allocate the iovec area */
 	err = -ENOMEM;
-	iov_size = msg_sys.msg_iovlen * sizeof(struct iovec);
-	if (msg_sys.msg_iovlen > UIO_FASTIOV) {
+	iov_size = msg_sys->msg_iovlen * sizeof(struct iovec);
+	if (msg_sys->msg_iovlen > UIO_FASTIOV) {
 		iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL);
 		if (!iov)
-			goto out_put;
+			goto out;
 	}
 
 	/*
@@ -2018,46 +2025,47 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
 	 *      kernel msghdr to use the kernel address space)
 	 */
 
-	uaddr = (__force void __user *)msg_sys.msg_name;
+	uaddr = (__force void __user *)msg_sys->msg_name;
 	uaddr_len = COMPAT_NAMELEN(msg);
 	if (MSG_CMSG_COMPAT & flags) {
-		err = verify_compat_iovec(&msg_sys, iov,
+		err = verify_compat_iovec(msg_sys, iov,
 					  (struct sockaddr *)&addr,
 					  VERIFY_WRITE);
 	} else
-		err = verify_iovec(&msg_sys, iov,
+		err = verify_iovec(msg_sys, iov,
 				   (struct sockaddr *)&addr,
 				   VERIFY_WRITE);
 	if (err < 0)
 		goto out_freeiov;
 	total_len = err;
 
-	cmsg_ptr = (unsigned long)msg_sys.msg_control;
-	msg_sys.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
+	cmsg_ptr = (unsigned long)msg_sys->msg_control;
+	msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
 
 	if (sock->file->f_flags & O_NONBLOCK)
 		flags |= MSG_DONTWAIT;
-	err = sock_recvmsg(sock, &msg_sys, total_len, flags);
+	err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys,
+							  total_len, flags);
 	if (err < 0)
 		goto out_freeiov;
 	len = err;
 
 	if (uaddr != NULL) {
 		err = move_addr_to_user((struct sockaddr *)&addr,
-					msg_sys.msg_namelen, uaddr,
+					msg_sys->msg_namelen, uaddr,
 					uaddr_len);
 		if (err < 0)
 			goto out_freeiov;
 	}
-	err = __put_user((msg_sys.msg_flags & ~MSG_CMSG_COMPAT),
+	err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT),
 			 COMPAT_FLAGS(msg));
 	if (err)
 		goto out_freeiov;
 	if (MSG_CMSG_COMPAT & flags)
-		err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr,
+		err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
 				 &msg_compat->msg_controllen);
 	else
-		err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr,
+		err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
 				 &msg->msg_controllen);
 	if (err)
 		goto out_freeiov;
@@ -2066,12 +2074,147 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
 out_freeiov:
 	if (iov != iovstack)
 		sock_kfree_s(sock->sk, iov, iov_size);
-out_put:
+out:
+	return err;
+}
+
+/*
+ *	BSD recvmsg interface
+ */
+
+SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
+		unsigned int, flags)
+{
+	int fput_needed, err;
+	struct msghdr msg_sys;
+	struct socket *sock = sockfd_lookup_light(fd, &err, &fput_needed);
+
+	if (!sock)
+		goto out;
+
+	err = __sys_recvmsg(sock, msg, &msg_sys, flags, 0);
+
 	fput_light(sock->file, fput_needed);
 out:
 	return err;
 }
 
+/*
+ *     Linux recvmmsg interface
+ */
+
+int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
+		   unsigned int flags, struct timespec *timeout)
+{
+	int fput_needed, err, datagrams;
+	struct socket *sock;
+	struct mmsghdr __user *entry;
+	struct msghdr msg_sys;
+	struct timespec end_time;
+
+	if (timeout) {
+		/* Doesn't make much sense */
+		if (flags & MSG_DONTWAIT)
+			return -EINVAL;
+
+		if (poll_select_set_timeout(&end_time, timeout->tv_sec,
+					    timeout->tv_nsec))
+			return -EINVAL;
+	}
+
+	datagrams = 0;
+
+	sock = sockfd_lookup_light(fd, &err, &fput_needed);
+	if (!sock)
+		return err;
+
+	err = sock_error(sock->sk);
+	if (err)
+		goto out_put;
+
+	entry = mmsg;
+
+	while (datagrams < vlen) {
+		/*
+		 * No need to do ask LSM for more than the first datagram.
+		 */
+		err = __sys_recvmsg(sock, (struct msghdr __user *)entry,
+				    &msg_sys, flags, datagrams);
+		if (err < 0)
+			break;
+		err = put_user(err, &entry->msg_len);
+		if (err)
+			break;
+		++entry;
+		++datagrams;
+		
+		if (timeout) {
+			ktime_get_ts(timeout);
+			*timeout = timespec_sub(end_time, *timeout);
+			if (timeout->tv_sec < 0) {
+				timeout->tv_sec = timeout->tv_nsec = 0;
+				break;
+			}
+
+			/* Timeout, return less than vlen datagrams */
+			if (timeout->tv_nsec == 0 && timeout->tv_sec == 0)
+				break;
+		}
+
+		/* Out of band data, return right away */
+		if (msg_sys.msg_flags & MSG_OOB)
+			break;
+	}
+
+out_put:
+	fput_light(sock->file, fput_needed);
+
+	if (err == 0)
+		return datagrams;
+
+	if (datagrams != 0) {
+		/*
+		 * We may return less entries than requested (vlen) if the
+		 * sock is non block and there aren't enough datagrams...
+		 */
+		if (err != -EAGAIN) {
+			/*
+			 * ... or  if recvmsg returns an error after we
+			 * received some datagrams, where we record the
+			 * error to return on the next call or if the
+			 * app asks about it using getsockopt(SO_ERROR).
+			 */
+			sock->sk->sk_err = -err;
+		}
+
+		return datagrams;
+	}
+
+	return err;
+}
+
+SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg,
+		unsigned int, vlen, unsigned int, flags,
+		struct timespec __user *, timeout)
+{
+	int datagrams;
+	struct timespec timeout_sys;
+
+	if (!timeout)
+		return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL);
+
+	if (copy_from_user(&timeout_sys, timeout, sizeof(timeout_sys)))
+		return -EFAULT;
+
+	datagrams = __sys_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys);
+
+	if (datagrams > 0 && 
+	    copy_to_user(timeout, &timeout_sys, sizeof(timeout_sys)))
+		datagrams = -EFAULT;
+
+	return datagrams;
+}
+
 #ifdef __ARCH_WANT_SYS_SOCKETCALL
 
 /* Argument list sizes for sys_socketcall */

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [RFC v2] net: Introduce recvmmsg socket syscall
  2009-06-11 15:34   ` Arnaldo Carvalho de Melo
@ 2009-06-12  0:00     ` David Miller
  0 siblings, 0 replies; 10+ messages in thread
From: David Miller @ 2009-06-12  0:00 UTC (permalink / raw)
  To: acme
  Cc: netdev, vanhoof, williams, caitlin.bestler, paul.moore, steve,
	remi.denis-courmont, nhorman, niv

From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 11 Jun 2009 12:34:46 -0300

> Thanks for quickly pointing out this brainfart!

Couldn't everyone else smell it too?

Maybe I'm just super-sensitive.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC v2] net: Introduce recvmmsg socket syscall
  2009-06-11  3:40 [RFC v2] net: Introduce recvmmsg socket syscall Arnaldo Carvalho de Melo
  2009-06-11  9:41 ` David Miller
  2009-06-11 18:09 ` [RFC v2] " Paul Moore
@ 2009-06-12  7:26 ` Rémi Denis-Courmont
  2009-06-12 14:19   ` Arnaldo Carvalho de Melo
  2 siblings, 1 reply; 10+ messages in thread
From: Rémi Denis-Courmont @ 2009-06-12  7:26 UTC (permalink / raw)
  To: ext Arnaldo Carvalho de Melo
  Cc: David Miller, netdev@vger.kernel.org, Chris Van Hoof,
	Clark Williams, Caitlin Bestler, Paul Moore, Steven Whitehouse,
	Neil Horman, Nivedita Singhvi

On Thursday 11 June 2009 06:40:22 ext Arnaldo Carvalho de Melo wrote:
> diff --git a/arch/x86/include/asm/unistd_64.h
> b/arch/x86/include/asm/unistd_64.h index 900e161..713a32a 100644
> --- a/arch/x86/include/asm/unistd_64.h
> +++ b/arch/x86/include/asm/unistd_64.h
> @@ -661,6 +661,8 @@ __SYSCALL(__NR_pwritev, sys_pwritev)
>  __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
>  #define __NR_perf_counter_open                 298
>  __SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
> +#define __NR_recvmmsg                          299
> +__SYSCALL(__NR_recvmmsg, sys_recvmmsg)

I guess socketcall is deprecated in favor of full syscalls, then?
(sorry if this is a stupid question)

> +       if (timeout) {
> +               /* Doesn't make much sense */
> +               if (flags & MSG_DONTWAIT)
> +                       return -EINVAL;

An application could possibly hit this degenerated case at the end of a loop 
or whatever, and EINVAL makes it look like this is a bug. Why not EAGAIN?

-- 
Rémi Denis-Courmont
Nokia Devices R&D, Maemo Software, Helsinki


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC v2] net: Introduce recvmmsg socket syscall
  2009-06-12  7:26 ` Rémi Denis-Courmont
@ 2009-06-12 14:19   ` Arnaldo Carvalho de Melo
  0 siblings, 0 replies; 10+ messages in thread
From: Arnaldo Carvalho de Melo @ 2009-06-12 14:19 UTC (permalink / raw)
  To: Rémi Denis-Courmont
  Cc: David Miller, netdev, Chris Van Hoof, Clark Williams,
	Caitlin Bestler, Paul Moore, Steven Whitehouse, Neil Horman,
	Nivedita Singhvi

Em Fri, Jun 12, 2009 at 10:26:25AM +0300, Rémi Denis-Courmont escreveu:
> On Thursday 11 June 2009 06:40:22 ext Arnaldo Carvalho de Melo wrote:
> > diff --git a/arch/x86/include/asm/unistd_64.h
> > b/arch/x86/include/asm/unistd_64.h index 900e161..713a32a 100644
> > --- a/arch/x86/include/asm/unistd_64.h
> > +++ b/arch/x86/include/asm/unistd_64.h
> > @@ -661,6 +661,8 @@ __SYSCALL(__NR_pwritev, sys_pwritev)
> >  __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
> >  #define __NR_perf_counter_open                 298
> >  __SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
> > +#define __NR_recvmmsg                          299
> > +__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
> 
> I guess socketcall is deprecated in favor of full syscalls, then?
> (sorry if this is a stupid question)

In some architectures, like x86_64, yes, but some still need it, the
ones that set __ARCH_WANT_SYS_SOCKETCALL:

[acme@doppio linux-2.6-tip]$ echo $(find . -type f | xargs egrep
"define.+__ARCH_WANT_SYS_SOCKETCALL" | sed -r 's#./arch/(\w+)/.*#\1#g' |
sort)
arm cris frv h8300 m32r m68k microblaze mips mn10300 parisc powerpc s390
sh sh sparc x86 x86

So the alpha and ia64, for instance, doesn't want sys_socketcall:

[acme@doppio linux-2.6-tip]$ egrep 'sys_((recv|send)msg|[gs]etsockopt)'
arch/alpha/kernel/systbls.S
	.quad sys_setsockopt			/* 105 */
	.quad sys_recvmsg
	.quad sys_sendmsg
	.quad sys_getsockopt
[acme@doppio linux-2.6-tip]$ egrep sys_socketcall arch/alpha/kernel/systbls.S
[acme@doppio linux-2.6-tip]$

[acme@doppio linux-2.6-tip]$ egrep 'sys_((recv|send)msg|[gs]etsockopt)' arch/ia64/kernel/entry.S
	data8 sys_setsockopt
	data8 sys_getsockopt
	data8 sys_sendmsg			// 1205
	data8 sys_recvmsg
[acme@doppio linux-2.6-tip]$ egrep sys_socketcall
arch/ia64/kernel/entry.S
[acme@doppio linux-2.6-tip]$ 

But some define __ARCH_WANT_SYS_SOCKETCALL conditionally, like x86_64
and um:

[acme@doppio linux-2.6-tip]$ find arch -type f | xargs egrep 'define.+__NO_STUBS'
arch/x86/kernel/syscall_64.c:#define __NO_STUBS
arch/x86/kernel/asm-offsets_64.c:#define __NO_STUBS 1
arch/um/sys-x86_64/syscall_table.c:#define __NO_STUBS
[acme@doppio linux-2.6-tip]$ 

And others want socketcall if preserving old ABIs, like ARM:

arch/arm/include/asm/unistd.h

#if !defined(CONFIG_AEABI) || defined(CONFIG_OABI_COMPAT)
#define __ARCH_WANT_SYS_TIME
#define __ARCH_WANT_SYS_OLDUMOUNT
#define __ARCH_WANT_SYS_ALARM
#define __ARCH_WANT_SYS_UTIME
#define __ARCH_WANT_SYS_OLD_GETRLIMIT
#define __ARCH_WANT_OLD_READDIR
#define __ARCH_WANT_SYS_SOCKETCALL
#endif

So for the final revision indeed I need to provide a sys_socketcall
interface to sys_recvmmsg.
 
> > +       if (timeout) {
> > +               /* Doesn't make much sense */
> > +               if (flags & MSG_DONTWAIT)
> > +                       return -EINVAL;
> 
> An application could possibly hit this degenerated case at the end of a loop 
> or whatever, and EINVAL makes it look like this is a bug. Why not EAGAIN?

EAGAIN for me looks like something that when repeated would possibly
produce a valid result, but in this case it will never be that way.

But lemme look around, perhaps this assumption of mine is invalidated by
some strange standard...  EAGAIN = EWOULDBLOCK, and in the current
implementation it wouldn't block at all, because recvmsg would return
right away...

The only semantic we could associate to this would be to make recvmmsg
call, in a non blocking way recvmsg (as specified in the flags) and if
it returns EAGAIN go to sleep waiting for more packets, i.e. a mixed
blocking recvmmsg with a noblocking recvmsg, sounds of creeping
featuritis...

Or perhaps just remove this check and let recvmmsg return imediately,
the app could then use this degenerated case to measure how long it took
for the packets to be received, looks like the most liberal and removes
3 lines of source code and a branch :-\

Keep it simple?

- Arnaldo

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC v2] net: Introduce recvmmsg socket syscall
  2009-06-11 21:53   ` Arnaldo Carvalho de Melo
@ 2009-06-16 20:36     ` Paul Moore
  0 siblings, 0 replies; 10+ messages in thread
From: Paul Moore @ 2009-06-16 20:36 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: David Miller, netdev, Chris Van Hoof, Clark Williams,
	Caitlin Bestler, Steven Whitehouse,  Rémi Denis-Courmont,
	Neil Horman, Nivedita Singhvi

On Thursday 11 June 2009 05:53:42 pm Arnaldo Carvalho de Melo wrote:
> Em Thu, Jun 11, 2009 at 02:09:22PM -0400, Paul Moore escreveu:
> > On Wednesday 10 June 2009 11:40:22 pm Arnaldo Carvalho de Melo wrote:
> > > diff --git a/net/socket.c b/net/socket.c
> > > index 791d71a..f9f1e20 100644
> > > --- a/net/socket.c
> > > +++ b/net/socket.c
> > > @@ -702,6 +702,28 @@ int sock_recvmsg(struct socket *sock, struct
> > > msghdr *msg, return ret;
> > >  }
> > >
> > > +static int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
> > > +			      size_t size, int flags)
> > > +{
> > > +	struct kiocb iocb;
> > > +	struct sock_iocb siocb;
> > > +	int ret;
> > > +
> > > +	init_sync_kiocb(&iocb, NULL);
> > > +	iocb.private = &siocb;
> > > +
> > > +	siocb.sock = sock;
> > > +	siocb.scm = NULL;
> > > +	siocb.msg = msg;
> > > +	siocb.size = size;
> > > +	siocb.flags = flags;
> > > +
> > > +	ret = sock->ops->recvmsg(&iocb, sock, msg, size, flags);
> > > +	if (-EIOCBQUEUED == ret)
> > > +		ret = wait_on_sync_kiocb(&iocb);
> > > +	return ret;
> > > +}
> >
> > Hmmm, in an effort to reduce duplicated code how about updating
> > __sock_recvmsg() to something like the following:
> >
> > static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
> >                                  struct msghdr *msg, size_t size, int
> > flags) {
> >         int err;
> >
> >         err = security_socket_recvmsg(...);
> >         if (err)
> >                 return err;
> >
> >         return sock_recvmsg_nosec(...);
> > }
> >
> > The only real difference is that now the *_kiocb() functions get called
> > and I have no clue if that is good or bad but it is different :)
>
> Yeah, gets clearer, like this:
>
> static inline int __sock_recvmsg_nosec(struct kiocb *iocb, struct socket
> *sock, struct msghdr *msg, size_t size, int flags)
> {
> 	struct sock_iocb *si = kiocb_to_siocb(iocb);
>
> 	si->sock = sock;
> 	si->scm = NULL;
> 	si->msg = msg;
> 	si->size = size;
> 	si->flags = flags;
>
> 	return sock->ops->recvmsg(iocb, sock, msg, size, flags);
> }
>
> static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
> 				 struct msghdr *msg, size_t size, int flags)
> {
> 	int err = security_socket_recvmsg(sock, msg, size, flags);
>
> 	return err ?: __sock_recvmsg_nosec(iocb, sock, msg, size, flags);
> }
>
> static int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
> 			      size_t size, int flags)
> {
> 	struct kiocb iocb;
> 	struct sock_iocb siocb;
> 	int ret;
>
> 	init_sync_kiocb(&iocb, NULL);
> 	iocb.private = &siocb;
> 	ret = __sock_recvmsg_nosec(&iocb, sock, msg, size, flags);
> 	if (-EIOCBQUEUED == ret)
> 		ret = wait_on_sync_kiocb(&iocb);
> 	return ret;
> }
>
> Better now? :-)

Sorry for the delay, I was away for a few days and just got back to email this 
morning ... but yes, much better now, thanks :)

-- 
paul moore
linux @ hp


^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2009-06-16 20:36 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-06-11  3:40 [RFC v2] net: Introduce recvmmsg socket syscall Arnaldo Carvalho de Melo
2009-06-11  9:41 ` David Miller
2009-06-11 15:34   ` Arnaldo Carvalho de Melo
2009-06-12  0:00     ` David Miller
2009-06-11 22:58   ` [RFC v3] " Arnaldo Carvalho de Melo
2009-06-11 18:09 ` [RFC v2] " Paul Moore
2009-06-11 21:53   ` Arnaldo Carvalho de Melo
2009-06-16 20:36     ` Paul Moore
2009-06-12  7:26 ` Rémi Denis-Courmont
2009-06-12 14:19   ` Arnaldo Carvalho de Melo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).