netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* kiocb->private is too large for kiocb's on-stack
@ 2004-06-28  8:08 William Lee Irwin III
  2004-06-28  8:12 ` Andrew Morton
  0 siblings, 1 reply; 5+ messages in thread
From: William Lee Irwin III @ 2004-06-28  8:08 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-fsdevel, netdev, akpm, davem

sizeof(struct kiocb) is dangerously large for a structure commonly
allocated on-stack. This patch converts the 24*sizeof(long) field,
->private, to a void pointer for use by file_operations entrypoints.
A ->dtor() method is added to the kiocb in order to support the release
of dynamically allocated structures referred to by ->private.

The sole in-tree users of ->private are async network read/write,
which are not, in fact, async, and so need not handle preallocated
->private as they would need to if ->ki_retry were ever used. The sole
truly async operations are direct IO pread()/pwrite() which do not
now use ->ki_retry(). All they would need to do in that case is to
check for ->private already being allocated for async kiocbs.

This rips 88B off the stack on 32-bit in the common case.

vs. 2.6.7-mm3

Index: mm3-2.6.7/include/net/sock.h
===================================================================
--- mm3-2.6.7.orig/include/net/sock.h	2004-06-27 15:44:42.365168400 -0700
+++ mm3-2.6.7/include/net/sock.h	2004-06-27 15:50:40.651700568 -0700
@@ -617,17 +617,17 @@
 	struct scm_cookie	*scm;
 	struct msghdr		*msg, async_msg;
 	struct iovec		async_iov;
+	struct kiocb		*kiocb;
 };
 
 static inline struct sock_iocb *kiocb_to_siocb(struct kiocb *iocb)
 {
-	BUG_ON(sizeof(struct sock_iocb) > KIOCB_PRIVATE_SIZE);
 	return (struct sock_iocb *)iocb->private;
 }
 
 static inline struct kiocb *siocb_to_kiocb(struct sock_iocb *si)
 {
-	return container_of((void *)si, struct kiocb, private);
+	return si->kiocb;
 }
 
 struct socket_alloc {
Index: mm3-2.6.7/include/linux/aio.h
===================================================================
--- mm3-2.6.7.orig/include/linux/aio.h	2004-06-15 22:18:54.000000000 -0700
+++ mm3-2.6.7/include/linux/aio.h	2004-06-27 15:50:40.654700112 -0700
@@ -23,8 +23,6 @@
 
 #define KIOCB_SYNC_KEY		(~0U)
 
-#define KIOCB_PRIVATE_SIZE	(24 * sizeof(long))
-
 /* ki_flags bits */
 #define KIF_LOCKED		0
 #define KIF_KICKED		1
@@ -55,6 +53,7 @@
 	struct kioctx		*ki_ctx;	/* may be NULL for sync ops */
 	int			(*ki_cancel)(struct kiocb *, struct io_event *);
 	long			(*ki_retry)(struct kiocb *);
+	void			(*ki_dtor)(struct kiocb *);
 
 	struct list_head	ki_list;	/* the aio core uses this
 						 * for cancellation */
@@ -65,8 +64,7 @@
 	} ki_obj;
 	__u64			ki_user_data;	/* user's data for completion */
 	loff_t			ki_pos;
-
-	char			private[KIOCB_PRIVATE_SIZE];
+	void			*private;
 };
 
 #define is_sync_kiocb(iocb)	((iocb)->ki_key == KIOCB_SYNC_KEY)
@@ -79,6 +77,7 @@
 		(x)->ki_filp = (filp);			\
 		(x)->ki_ctx = &tsk->active_mm->default_kioctx;	\
 		(x)->ki_cancel = NULL;			\
+		(x)->ki_dtor = NULL;			\
 		(x)->ki_obj.tsk = tsk;			\
 	} while (0)
 
Index: mm3-2.6.7/net/socket.c
===================================================================
--- mm3-2.6.7.orig/net/socket.c	2004-06-15 22:19:13.000000000 -0700
+++ mm3-2.6.7/net/socket.c	2004-06-27 15:50:40.662698896 -0700
@@ -548,9 +548,11 @@
 int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 {
 	struct kiocb iocb;
+	struct sock_iocb siocb;
 	int ret;
 
 	init_sync_kiocb(&iocb, NULL);
+	iocb.private = &siocb;
 	ret = __sock_sendmsg(&iocb, sock, msg, size);
 	if (-EIOCBQUEUED == ret)
 		ret = wait_on_sync_kiocb(&iocb);
@@ -581,15 +583,22 @@
 		 size_t size, int flags)
 {
 	struct kiocb iocb;
+	struct sock_iocb siocb;
 	int ret;
 
         init_sync_kiocb(&iocb, NULL);
+	iocb.private = &siocb;
 	ret = __sock_recvmsg(&iocb, sock, msg, size, flags);
 	if (-EIOCBQUEUED == ret)
 		ret = wait_on_sync_kiocb(&iocb);
 	return ret;
 }
 
+static void sock_aio_dtor(struct kiocb *iocb)
+{
+	kfree(iocb->private);
+}
+
 /*
  *	Read data from a socket. ubuf is a user mode pointer. We make sure the user
  *	area ubuf...ubuf+size-1 is writable before asking the protocol.
@@ -598,7 +607,7 @@
 static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf,
 			 size_t size, loff_t pos)
 {
-	struct sock_iocb *x = kiocb_to_siocb(iocb);
+	struct sock_iocb *x, siocb;
 	struct socket *sock;
 	int flags;
 
@@ -607,6 +616,16 @@
 	if (size==0)		/* Match SYS5 behaviour */
 		return 0;
 
+	if (is_sync_kiocb(iocb))
+		x = &siocb;
+	else {
+		x = kmalloc(sizeof(struct sock_iocb), GFP_KERNEL);
+		if (!x)
+			return -ENOMEM;
+		iocb->ki_dtor = sock_aio_dtor;
+	}
+	iocb->private = x;
+	x->kiocb = iocb;
 	sock = SOCKET_I(iocb->ki_filp->f_dentry->d_inode); 
 
 	x->async_msg.msg_name = NULL;
@@ -631,7 +650,7 @@
 static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf,
 			  size_t size, loff_t pos)
 {
-	struct sock_iocb *x = kiocb_to_siocb(iocb);
+	struct sock_iocb *x, siocb;
 	struct socket *sock;
 	
 	if (pos != 0)
@@ -639,6 +658,16 @@
 	if(size==0)		/* Match SYS5 behaviour */
 		return 0;
 
+	if (is_sync_kiocb(iocb))
+		x = &siocb;
+	else {
+		x = kmalloc(sizeof(struct sock_iocb), GFP_KERNEL);
+		if (!x)
+			return -ENOMEM;
+		iocb->ki_dtor = sock_aio_dtor;
+	}
+	iocb->private = x;
+	x->kiocb = iocb;
 	sock = SOCKET_I(iocb->ki_filp->f_dentry->d_inode); 
 
 	x->async_msg.msg_name = NULL;
Index: mm3-2.6.7/fs/aio.c
===================================================================
--- mm3-2.6.7.orig/fs/aio.c	2004-06-27 15:44:54.278357320 -0700
+++ mm3-2.6.7/fs/aio.c	2004-06-27 15:50:40.667698136 -0700
@@ -396,6 +396,8 @@
 	req->ki_cancel = NULL;
 	req->ki_retry = NULL;
 	req->ki_obj.user = NULL;
+	req->ki_dtor = NULL;
+	req->private = NULL;
 
 	/* Check if the completion queue has enough free space to
 	 * accept an event from this io.
@@ -436,9 +438,13 @@
 
 static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
 {
+	if (req->ki_dtor)
+		req->ki_dtor(req);
 	req->ki_ctx = NULL;
 	req->ki_filp = NULL;
 	req->ki_obj.user = NULL;
+	req->ki_dtor = NULL;
+	req->private = NULL;
 	kmem_cache_free(kiocb_cachep, req);
 	ctx->reqs_active--;
 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: kiocb->private is too large for kiocb's on-stack
  2004-06-28  8:08 kiocb->private is too large for kiocb's on-stack William Lee Irwin III
@ 2004-06-28  8:12 ` Andrew Morton
  2004-06-28  8:20   ` William Lee Irwin III
  0 siblings, 1 reply; 5+ messages in thread
From: Andrew Morton @ 2004-06-28  8:12 UTC (permalink / raw)
  To: William Lee Irwin III; +Cc: linux-kernel, linux-fsdevel, netdev, davem

William Lee Irwin III <wli@holomorphy.com> wrote:
>
> sizeof(struct kiocb) is dangerously large for a structure commonly
> allocated on-stack. This patch converts the 24*sizeof(long) field,
> ->private, to a void pointer for use by file_operations entrypoints.
> A ->dtor() method is added to the kiocb in order to support the release
> of dynamically allocated structures referred to by ->private.
> 
> The sole in-tree users of ->private are async network read/write,
> which are not, in fact, async, and so need not handle preallocated
> ->private as they would need to if ->ki_retry were ever used. The sole
> truly async operations are direct IO pread()/pwrite() which do not
> now use ->ki_retry(). All they would need to do in that case is to
> check for ->private already being allocated for async kiocbs.
> 
> This rips 88B off the stack on 32-bit in the common case.
> 

>  int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
>  {
>  	struct kiocb iocb;
> +	struct sock_iocb siocb;
>  	int ret;
>  
>  	init_sync_kiocb(&iocb, NULL);
> +	iocb.private = &siocb;
>  	ret = __sock_sendmsg(&iocb, sock, msg, size);
>  	if (-EIOCBQUEUED == ret)
>  		ret = wait_on_sync_kiocb(&iocb);

That's so much better than what we had before it ain't funny.

Was this runtime tested?

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: kiocb->private is too large for kiocb's on-stack
  2004-06-28  8:12 ` Andrew Morton
@ 2004-06-28  8:20   ` William Lee Irwin III
  2004-06-28 18:25     ` David S. Miller
  2004-06-29  2:02     ` William Lee Irwin III
  0 siblings, 2 replies; 5+ messages in thread
From: William Lee Irwin III @ 2004-06-28  8:20 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, linux-fsdevel, netdev, davem

On Mon, Jun 28, 2004 at 01:12:32AM -0700, Andrew Morton wrote:
> That's so much better than what we had before it ain't funny.
> Was this runtime tested?

Yes. Oracle exercises this, and it survives OAST.

I'll write a dedicated userspace testcase for the aio operations and
follow up with that.


-- wli

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: kiocb->private is too large for kiocb's on-stack
  2004-06-28  8:20   ` William Lee Irwin III
@ 2004-06-28 18:25     ` David S. Miller
  2004-06-29  2:02     ` William Lee Irwin III
  1 sibling, 0 replies; 5+ messages in thread
From: David S. Miller @ 2004-06-28 18:25 UTC (permalink / raw)
  To: William Lee Irwin III; +Cc: akpm, linux-kernel, linux-fsdevel, netdev

On Mon, 28 Jun 2004 01:20:16 -0700
William Lee Irwin III <wli@holomorphy.com> wrote:

> On Mon, Jun 28, 2004 at 01:12:32AM -0700, Andrew Morton wrote:
> > That's so much better than what we had before it ain't funny.
> > Was this runtime tested?
> 
> Yes. Oracle exercises this, and it survives OAST.
> 
> I'll write a dedicated userspace testcase for the aio operations and
> follow up with that.

This all looks fine to me.  Andrew, would you like me to push this
patch along or did you already plan to take care of it.

Nice work William.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: kiocb->private is too large for kiocb's on-stack
  2004-06-28  8:20   ` William Lee Irwin III
  2004-06-28 18:25     ` David S. Miller
@ 2004-06-29  2:02     ` William Lee Irwin III
  1 sibling, 0 replies; 5+ messages in thread
From: William Lee Irwin III @ 2004-06-29  2:02 UTC (permalink / raw)
  To: Andrew Morton, linux-kernel, linux-fsdevel, netdev, davem

/* On Mon, Jun 28, 2004 at 01:12:32AM -0700, Andrew Morton wrote:
>> That's so much better than what we had before it ain't funny.
>> Was this runtime tested?

On Mon, Jun 28, 2004 at 01:20:16AM -0700, William Lee Irwin III wrote:
> Yes. Oracle exercises this, and it survives OAST.
> I'll write a dedicated userspace testcase for the aio operations and
> follow up with that.

This is pretty damn mindless but seems to pound on the stuff. Maybe
something better can be devised (e.g. passing msgs around in a ring).


-- wli */

#include <errno.h>
#include <limits.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/epoll.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <linux/aio_abi.h>

/*
 * die() -- print the failing function's name via perror(), then a
 * function/file/line/errno trailer to stderr, and abort().  It never
 * returns, so any statement placed directly after a die() call (e.g. a
 * cleanup goto) is unreachable.  errno is captured into __err up front
 * in case perror()/fprintf() disturb it before it is printed.
 */
#define die()								\
	do {								\
		int __err = errno;					\
		perror(__FUNCTION__);					\
		fprintf(stderr, "dead in %s() at %s:%d errno = %d\n",	\
			__FUNCTION__, __FILE__, __LINE__, __err);	\
		abort();						\
	} while (0)

/* Thin wrapper: no gettid() in this era's libc, so invoke the raw syscall. */
pid_t gettid(void)
{
	long tid = syscall(__NR_gettid);

	return (pid_t)tid;
}

/* Raw io_submit(2): queue n iocbs on the given AIO context; no libaio. */
long io_submit(aio_context_t context, long n, struct iocb **iocbs)
{
	long ret = syscall(__NR_io_submit, context, n, iocbs);

	return ret;
}

/* Raw io_setup(2): create an AIO context able to hold n in-flight events. */
long io_setup(unsigned n, aio_context_t *context)
{
	long ret = syscall(__NR_io_setup, n, context);

	return ret;
}

/* Raw io_getevents(2): reap between min and n completions, bounded by ts. */
long io_getevents(aio_context_t context, long min, long n,
				struct io_event *events, struct timespec *ts)
{
	long ret = syscall(__NR_io_getevents, context, min, n, events, ts);

	return ret;
}

/* Convenience shorthand for epoll_ctl(EPOLL_CTL_ADD). */
int epoll_add(int epfd, int fd, struct epoll_event *event)
{
	int ret = epoll_ctl(epfd, EPOLL_CTL_ADD, fd, event);

	return ret;
}

/*
 * Thread body: endlessly ping-pong this thread's pthread id over the
 * fds[1] end of its socketpair (the main loop services the other end
 * via AIO), printing a liveness message every 2^16 round trips.
 * Never returns.
 */
static void *client(void *arg)
{
	int *fds = arg;
	int rounds = 0;
	pthread_t id = pthread_self();

	for (;;) {
		write(fds[1], &id, sizeof(pthread_t));
		read(fds[1], &id, sizeof(pthread_t));
		++rounds;
		if (!(rounds % (1 << 16)))
			printf("boo from %d\n", gettid());
	}
}

/*
 * AIO/epoll stress driver: spawns nr_clients threads, each ping-ponging
 * its pthread id over one end of a socketpair, while this loop watches
 * the other ends with edge-triggered epoll and keeps re-arming paired
 * IOCB_CMD_PREAD/IOCB_CMD_PWRITE requests via io_submit()/io_getevents().
 * Runs until killed.
 *
 * Fixes vs. the original posting:
 *  - the calloc()ed arrays were redundantly memset() again, and the
 *    io_events memset used 2*sizeof(struct io_event)*nr_clients while
 *    only nr_clients elements were allocated -- a heap overflow.
 *    calloc() already zero-fills, so the memsets are dropped entirely.
 *  - the bufs overflow guard now matches its 2*sizeof(pthread_t)
 *    per-client allocation.
 */
int main(int argc, char * const argv[])
{
	long i, j, k, nr_clients;
	pthread_t *clients;
	int *svs, epfd;
	struct epoll_event *events;
	aio_context_t context = 0;
	struct iocb *iocbs, **iocbps;
	struct io_event *io_events;
	pthread_t *bufs;

	if (argc != 2)
		return EXIT_FAILURE;
	nr_clients = strtol(argv[1], NULL, 0);
	if (nr_clients >= INT_MAX || nr_clients < 0)
		return EXIT_FAILURE;
	/* Reject counts whose allocation sizes below would overflow size_t. */
	if ((size_t)nr_clients >= SIZE_MAX/(2*sizeof(pthread_t)))
		return EXIT_FAILURE;
	if ((size_t)nr_clients >= SIZE_MAX/(2*sizeof(int)))
		return EXIT_FAILURE;
	if ((size_t)nr_clients >= SIZE_MAX/(2*sizeof(struct iocb)))
		return EXIT_FAILURE;
	if ((size_t)nr_clients >= SIZE_MAX/(2*sizeof(struct iocb *)))
		return EXIT_FAILURE;
	if ((size_t)nr_clients >= SIZE_MAX/(2*sizeof(struct io_event)))
		return EXIT_FAILURE;
	/* calloc() zero-fills every array; no follow-up memset is needed. */
	if (!(bufs = calloc(nr_clients, 2*sizeof(pthread_t))))
		return EXIT_FAILURE;
	/*
	 * NOTE(review): die() aborts, so the gotos after it never run; the
	 * cleanup labels are kept for structure should die() ever soften.
	 */
	if (!(clients = calloc(nr_clients, sizeof(pthread_t)))) {
		die();
		goto free_bufs;
	}
	if (!(svs = calloc(nr_clients, 2*sizeof(int)))) {
		die();
		goto free_clients;
	}
	if (!(events = calloc(nr_clients, sizeof(struct epoll_event)))) {
		die();
		goto free_svs;
	}
	if (!(iocbs = calloc(nr_clients, 2*sizeof(struct iocb)))) {
		die();
		goto free_events;
	}
	if (!(iocbps = calloc(nr_clients, 2*sizeof(struct iocb *)))) {
		die();
		goto free_iocbs;
	}
	if (!(io_events = calloc(nr_clients, sizeof(struct io_event)))) {
		die();
		goto free_iocbps;
	}
	if ((epfd = epoll_create(nr_clients)) < 0) {
		die();
		goto free_io_events;
	}
	if (io_setup(nr_clients, &context)) {
		die();
		goto free_io_events;
	}
	/* One socketpair + one client thread per slot; watch svs[2*i] here. */
	for (i = 0; i < nr_clients; ++i) {
		struct epoll_event event = {
			.events = EPOLLIN|EPOLLET,
			.data.ptr = &svs[2*i],
		};
		if (socketpair(AF_UNIX, SOCK_STREAM, 0, &svs[2*i]))
			die();
		if (epoll_add(epfd, svs[2*i], &event))
			die();
		if (pthread_create(&clients[i], NULL, client, &svs[2*i]))
			abort();
	}
	while (1) {
		struct timespec ts = { .tv_sec = 0, .tv_nsec = 0, };

		/* Non-blocking poll of both event sources (busy-waits by design).
		 * Negative returns simply make the for loops below no-ops. */
		j = epoll_wait(epfd, events, nr_clients, 0);
		k = io_getevents(context, 1, nr_clients, io_events, &ts);
		for (i = 0; i < k; ++i) {
			struct iocb *cb =
				(struct iocb *)(unsigned long)io_events[i].obj;

			cb->aio_data = 0;
			/* NOTE(review): bufs is indexed by iocb slot here but by
			 * epoll-event slot when armed below -- looks inconsistent
			 * with the bufs sizing; confirm intended ring layout. */
			if (cb->aio_lio_opcode == IOCB_CMD_PREAD)
				bufs[2*(cb - iocbs)+1] = bufs[2*(cb - iocbs)];
		}
		for (i = 0; i < j; ++i) {
			/* Recover the client slot n from the registered svs pointer. */
			int m, n = ((int *)events[i].data.ptr - svs)/2;

			m = (n + 1) % nr_clients;
			bufs[2*i+1] = clients[m];
			iocbs[2*n].aio_lio_opcode = IOCB_CMD_PREAD;
			iocbs[2*n+1].aio_lio_opcode = IOCB_CMD_PWRITE;
			iocbs[2*n].aio_fildes = svs[2*n];
			iocbs[2*n+1].aio_fildes = svs[2*n];
			iocbs[2*n].aio_buf = (unsigned long)&bufs[2*i];
			iocbs[2*n+1].aio_buf = (unsigned long)&bufs[2*i+1];
			iocbs[2*n].aio_data = (unsigned long)&bufs[2*i];
			iocbs[2*n+1].aio_data = (unsigned long)&bufs[2*i+1];
			iocbs[2*n].aio_nbytes = sizeof(pthread_t);
			iocbs[2*n+1].aio_nbytes = sizeof(pthread_t);
			iocbs[2*n].aio_offset = 0;
			iocbs[2*n+1].aio_offset = 0;
			iocbps[2*i] = &iocbs[2*n];
			iocbps[2*i+1] = &iocbs[2*n+1];
		}
		if (j)
			io_submit(context, 2*j, iocbps);
	}
free_io_events:
	free(io_events);
free_iocbps:
	free(iocbps);
free_iocbs:
	free(iocbs);
free_events:
	free(events);
free_svs:
	free(svs);
free_clients:
	free(clients);
free_bufs:
	free(bufs);
	die();
}

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2004-06-29  2:02 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-06-28  8:08 kiocb->private is too large for kiocb's on-stack William Lee Irwin III
2004-06-28  8:12 ` Andrew Morton
2004-06-28  8:20   ` William Lee Irwin III
2004-06-28 18:25     ` David S. Miller
2004-06-29  2:02     ` William Lee Irwin III

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).