All of lore.kernel.org
 help / color / mirror / Atom feed
From: Denys Vlasenko <vda.linux@googlemail.com>
To: Davide Libenzi <davidel@xmailserver.org>
Cc: Ulrich Drepper <drepper@redhat.com>,
	Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
	Andrew Morton <akpm@linux-foundation.org>
Subject: Re: F_DUPFD_CLOEXEC implementation
Date: Mon, 1 Oct 2007 11:07:15 +0100	[thread overview]
Message-ID: <200710011107.15846.vda.linux@googlemail.com> (raw)
In-Reply-To: <Pine.LNX.4.64.0709301709490.19355@alien.or.mcafeemobile.com>

[-- Attachment #1: Type: text/plain, Size: 2193 bytes --]

On Monday 01 October 2007 04:15, Davide Libenzi wrote:
> On Mon, 1 Oct 2007, Denys Vlasenko wrote:
> 
> > My use case is: I want to do a nonblocking read on descriptor 0 (stdin).
> > It may be a pipe or a socket.
> > 
> > There may be other processes which share this descriptor with me,
> > I simply cannot know that. And they, too, may want to do reads on it.
> > 
> > I want to do nonblocking read in such a way that neither those other
> > processes will ever see fd switching to O_NONBLOCK and back, and
> > I also want to be safe from other processes doing the same.
> > 
> > I don't see how this can be done using standard unix primitives.
> 
> Indeed. You could simulate non-blocking using poll with zero timeout, but 
> if another task may read/write on it, your following read/write may end up 
> blocking even after a poll returned the required events.
> One way to solve this would be some sort of readx/writex where you pass an 
> extra flags parameter

We have that already. They are called send and recv. ;)

> (this could be done with sys_indirect, assuming  
> we'll ever get that mainline) where you specify the non-blocking 
> requirement for-this-call, and not as global per-file flag. Then, of 
> course, you'll have to modify all the "file->f_flags & O_NONBLOCK" tests 
> (and there are many of them) to check for that flag too (that can be a 
> per task_struct flag).

Attached patch detects send/recv(fd, buf, size, MSG_DONTWAIT) on
non-sockets and turns them into non-blocking write/read.
Since filp->f_flags appear to be read and modified without any locking,
I cannot modify it without potentially affecting other processes
accessing the same file through shared struct file.

Therefore I simply make a temporary copy of struct file, set
O_NONBLOCK in it and pass it to vfs_read/write.
Is this heresy? ;) I see only one spinlock in struct file:

#ifdef CONFIG_EPOLL
        spinlock_t              f_ep_lock;
#endif /* #ifdef CONFIG_EPOLL */

Do I need to take it?

Also attached is ndelaytest.c which can be used to test that
send(MSG_DONTWAIT) indeed is failing with EAGAIN if write would block
and that other processes never see O_NONBLOCK set.

Comments?
--
vda

[-- Attachment #2: ndelaytest.c --]
[-- Type: text/x-csrc, Size: 1463 bytes --]

#include <sys/types.h>
#include <sys/socket.h>
#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <time.h>
#include <signal.h>

#define SECONDS 10

#define STR "."
//#define STR "123456789 123456789 123456789 123456789 "

/* To see send() resulting in EAGAIN:
 * strace -ff -o log ndelaytest | while sleep 11; do break; done
 * log.$PID:
 * send(1, "123456789 123456789 123456789 12"..., 40, MSG_DONTWAIT)
 *                = -1 EAGAIN (Resource temporarily unavailable)
 */

int main()
{
	pid_t pid;
	time_t t;
	int fl;

	puts("starting");
	t = time(0);

	pid = fork();
	if (pid == 0) {
		/* child */
		while ((time(0) - t) < SECONDS-1) {
#if 0 
			/* Uncomment this part and simply run the executable
			 * to see race detection code in action */
#define OP "write"
			fcntl(1, F_SETFL, fcntl(1, F_GETFL) | O_NONBLOCK);
			fl = write(1, STR, sizeof(STR) - 1);
			fcntl(1, F_SETFL, fcntl(1, F_GETFL) & ~O_NONBLOCK);
#else
			/* This part tests whether send(MSG_DONTWAIT)
			 * is racy or not */
#define OP "send"
			fl = send(1, STR, sizeof(STR) - 1, MSG_DONTWAIT);
#endif
			if (fl < 0) {
				perror(OP);
				kill(getppid(), SIGKILL);
				return 1;
			}
		}
		return 0;
	}

	while ((time(0) - t) < SECONDS) {
		fl = fcntl(1, F_GETFL);
		if (fl & O_NONBLOCK) {
			fprintf(stderr, "NONBLOCK:1\n");
			kill(pid, SIGKILL);
			fcntl(1, F_SETFL, fl & ~O_NONBLOCK);
			return 1;
		}
	}
	fprintf(stderr, "NONBLOCK:0\n");
	return 0;
}

[-- Attachment #3: nonblock_linux-2.6.22-rc6.patch --]
[-- Type: text/x-diff, Size: 2902 bytes --]

--- linux-2.6.22-rc6.src/fs/read_write.c	Fri Jun 15 19:30:05 2007
+++ linux-2.6.22-rc6_ndelay/fs/read_write.c	Sun Aug 19 10:43:24 2007
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/syscalls.h>
 #include <linux/pagemap.h>
+#include <linux/socket.h>
 #include "read_write.h"
 
 #include <asm/uaccess.h>
@@ -351,6 +352,36 @@
 static inline void file_pos_write(struct file *file, loff_t pos)
 {
 	file->f_pos = pos;
+}
+
+/* Helper for send/recv on non-sockets */
+ssize_t rw_with_flags(struct file *file, int fput_needed, void __user *buf, size_t count, unsigned flags)
+{
+	int err;
+	loff_t pos;
+	struct file *file_copy;
+
+	file_copy = file;
+	if (flags & MSG_DONTWAIT) {
+		/* We make copy even if O_NONBLOCK is already set. */
+		/* We don't want it to change under our feet. */
+		file_copy = kmalloc(sizeof(*file_copy), GFP_KERNEL);
+		memcpy(file_copy, file, sizeof(*file_copy));
+		file_copy->f_flags |= O_NONBLOCK;
+	}
+
+	pos = file_pos_read(file);
+	if (flags & MSG_OOB) /* MSG_OOB is reused to mean 'write' */
+		err = vfs_write(file_copy, buf, count, &pos);
+	else
+		err = vfs_read(file_copy, buf, count, &pos);
+	file_pos_write(file, pos);
+
+	if (flags & MSG_DONTWAIT) {
+		kfree(file_copy);
+	}
+	fput_light(file, fput_needed);
+	return err;
 }
 
 asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
--- linux-2.6.22-rc6.src/include/linux/fs.h	Wed Jun 27 21:24:18 2007
+++ linux-2.6.22-rc6_ndelay/include/linux/fs.h	Sun Aug 19 10:32:20 2007
@@ -1154,6 +1154,9 @@
 extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
 		unsigned long, loff_t *);
 
+extern ssize_t rw_with_flags(struct file *, int, void __user *, size_t,
+		unsigned);
+
 /*
  * NOTE: write_inode, delete_inode, clear_inode, put_inode can be called
  * without the big kernel lock held in all filesystems.
--- linux-2.6.22-rc6.src/net/socket.c	Fri Jun 15 19:30:08 2007
+++ linux-2.6.22-rc6_ndelay/net/socket.c	Sun Aug 19 11:34:07 2007
@@ -1585,8 +1585,17 @@
 		goto out;
 
 	sock = sock_from_file(sock_file, &err);
-	if (!sock)
-		goto out_put;
+	if (!sock) {
+		if (addr)
+			goto out_put;
+		if (flags & ~MSG_DONTWAIT)
+			goto out_put;
+		/* it's not a socket, but we support a special case:
+		 * send(fd, buf, count, MSG_DONTWAIT)
+		 * (MSG_OOB is reused to mean 'write') */
+		return rw_with_flags(sock_file, fput_needed, buff, len, flags | MSG_OOB);
+	}
+
 	iov.iov_base = buff;
 	iov.iov_len = len;
 	msg.msg_name = NULL;
@@ -1646,8 +1655,15 @@
 		goto out;
 
 	sock = sock_from_file(sock_file, &err);
-	if (!sock)
-		goto out_put;
+	if (!sock) {
+		if (addr)
+			goto out_put;
+		if (flags & ~MSG_DONTWAIT)
+			goto out_put;
+		/* it's not a socket, but we support a special case:
+		 * recv(fd, ubuf, size, MSG_DONTWAIT) */
+		return rw_with_flags(sock_file, fput_needed, ubuf, size, flags);
+	}
 
 	msg.msg_control = NULL;
 	msg.msg_controllen = 0;

  reply	other threads:[~2007-10-01 10:07 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-09-28 17:34 F_DUPFD_CLOEXEC implementation Ulrich Drepper
2007-09-28 18:19 ` Davide Libenzi
2007-09-28 18:23   ` Ulrich Drepper
2007-09-30  0:31 ` Denys Vlasenko
2007-09-30 23:11   ` Davide Libenzi
2007-09-30 23:58     ` Denys Vlasenko
2007-10-01  3:15       ` Davide Libenzi
2007-10-01 10:07         ` Denys Vlasenko [this message]
2007-10-01 18:16           ` Al Viro
2007-10-01 18:49             ` Denys Vlasenko
2007-10-01 19:04               ` Davide Libenzi
2007-10-02  9:28                 ` Denys Vlasenko
2007-10-02 19:52                   ` Davide Libenzi
2007-10-01 18:53             ` Michael Tokarev
2007-10-01  0:59   ` Miquel van Smoorenburg

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=200710011107.15846.vda.linux@googlemail.com \
    --to=vda.linux@googlemail.com \
    --cc=akpm@linux-foundation.org \
    --cc=davidel@xmailserver.org \
    --cc=drepper@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.