public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [patch 6/9] signalfd/timerfd v1 - timerfd core ...
@ 2007-03-09 23:41 Davide Libenzi
  2007-03-10  6:33 ` Nicholas Miell
  0 siblings, 1 reply; 25+ messages in thread
From: Davide Libenzi @ 2007-03-09 23:41 UTC (permalink / raw)
  To: Linux Kernel Mailing List; +Cc: Andrew Morton, Linus Torvalds

This patch introduces a new system call for timers events delivered
though file descriptors. This allows timer event to be used with
standard POSIX poll(2), select(2) and read(2). As a consequence of
supporting the Linux f_op->poll subsystem, they can be used with
epoll(2) too.
The system call is defined as:

int timerfd(int ufd, int tmrtype, const struct timespec *utmr);

The "ufd" parameter allows for re-use (re-programming) of an existing
timerfd w/out going through the close/open cycle (same as signalfd).
If "ufd" is -1, s new file descriptor will be created, otherwise the
existing "ufd" will be re-programmed.
The "tmrtype" parameter allows to specify the timer type. The following
values are supported:

    TFD_TIMER_REL
        The time specified in the "utmr" parameter is a relative time
	from NOW.

    TFD_TIMER_ABS
        The timer specified in the "utmr" parameter is an absolute time.

    TFD_TIMER_SEQ
        The time specified in the "utmr" parameter is an interval at
	which a continuous clock rate will be generated.

The function returns the new (or same, in case "ufd" is a valid timerfd
descriptor) file, or -1 in case of error.
As stated before, the timerfd file descriptor supports poll(2), select(2)
and epoll(2). When a timer event happened on the timerfd, a POLLIN mask
will be returned.
The read(2) call can be used, and it will return a u32 variable holding
the number of "ticks" that happened on the interface since the last call
to read(2). The read(2) call supportes the O_NONBLOCK flag too, and EAGAIN
will be returned if no ticks happened.
A quick test program, shows timerfd working correctly on my amd64 box:

http://www.xmailserver.org/timerfd-test.c




Signed-off-by: Davide Libenzi <davidel@xmailserver.org>



- Davide



Index: linux-2.6.20.ep2/fs/timerfd.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20.ep2/fs/timerfd.c	2007-03-09 15:09:52.000000000 -0800
@@ -0,0 +1,263 @@
+/*
+ *  fs/timerfd.c
+ *
+ *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
+ *
+ */
+
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/time.h>
+#include <linux/hrtimer.h>
+#include <linux/jiffies.h>
+#include <linux/anon_inodes.h>
+#include <linux/timerfd.h>
+
+#include <asm/uaccess.h>
+
+
+
+struct timerfd_ctx {
+	struct hrtimer tmr;
+	ktime_t tval;
+	int tmrtype;
+	spinlock_t lock;
+	wait_queue_head_t wqh;
+	unsigned long ticks;
+};
+
+
+static int timerfd_tmrproc(struct hrtimer *htmr);
+static void timerfd_cleanup(struct timerfd_ctx *ctx);
+static int timerfd_close(struct inode *inode, struct file *file);
+static unsigned int timerfd_poll(struct file *file, poll_table *wait);
+static ssize_t timerfd_read(struct file *file, char *buf, size_t count, loff_t *ppos);
+
+
+
+static const struct file_operations timerfd_fops = {
+	.release	= timerfd_close,
+	.poll		= timerfd_poll,
+	.read		= timerfd_read,
+};
+static struct kmem_cache *timerfd_ctx_cachep;
+
+
+
+static int timerfd_tmrproc(struct hrtimer *htmr)
+{
+	struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, tmr);
+	int rval = HRTIMER_NORESTART;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->lock, flags);
+	ctx->ticks++;
+	__wake_up_locked(&ctx->wqh, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE);
+	if (ctx->tmrtype == TFD_TIMER_SEQ) {
+		hrtimer_forward(htmr, htmr->base->softirq_time, ctx->tval);
+		rval = HRTIMER_RESTART;
+	}
+	spin_unlock_irqrestore(&ctx->lock, flags);
+
+	return rval;
+}
+
+/*
+ * Create a file descriptor that is associated with our signal
+ * state. We can pass it around to others if we want to, but
+ * it will always be _our_ signal state.
+ */
+asmlinkage long sys_timerfd(int ufd, int tmrtype, const struct timespec __user *utmr)
+{
+	int error;
+	struct timerfd_ctx *ctx;
+	struct file *file;
+	struct inode *inode;
+	ktime_t tval, tnow;
+	struct timespec ktmr, tmrnow;
+
+	error = -EFAULT;
+	if (copy_from_user(&ktmr, utmr, sizeof(ktmr)))
+		goto err_exit;
+
+	tval = timespec_to_ktime(ktmr);
+	error = -EINVAL;
+	switch (tmrtype) {
+	case TFD_TIMER_REL:
+	case TFD_TIMER_SEQ:
+		break;
+	case TFD_TIMER_ABS:
+		getnstimeofday(&tmrnow);
+		tnow = timespec_to_ktime(tmrnow);
+		if (ktime_to_ns(tval) <= ktime_to_ns(tnow))
+			goto err_exit;
+		tval = ktime_sub(tval, tnow);
+		break;
+	default:
+		goto err_exit;
+	}
+
+	if (ufd == -1) {
+		error = -ENOMEM;
+		ctx = kmem_cache_alloc(timerfd_ctx_cachep, GFP_KERNEL);
+		if (!ctx)
+			goto err_exit;
+
+		init_waitqueue_head(&ctx->wqh);
+		spin_lock_init(&ctx->lock);
+		ctx->ticks = 0;
+		ctx->tmrtype = tmrtype;
+		ctx->tval = tval;
+		hrtimer_init(&ctx->tmr, CLOCK_MONOTONIC, HRTIMER_REL);
+		ctx->tmr.expires = ctx->tval;
+		ctx->tmr.function = timerfd_tmrproc;
+
+		hrtimer_start(&ctx->tmr, ctx->tval, HRTIMER_REL);
+
+		/*
+		 * When we call this, the initialization must be complete, since
+		 * aino_getfd() will install the fd.
+		 */
+		error = aino_getfd(&ufd, &inode, &file, "[timerfd]",
+				   &timerfd_fops, ctx);
+		if (error)
+			goto err_fdalloc;
+	} else {
+		error = -EBADF;
+		file = fget(ufd);
+		if (!file)
+			goto err_exit;
+		ctx = file->private_data;
+		error = -EINVAL;
+		if (file->f_op != &timerfd_fops) {
+			fput(file);
+			goto err_exit;
+		}
+
+		/*
+		 * We need to stop the exiting timer before. We call
+		 * hrtimer_cancel() w/out holding our lock.
+		 */
+		spin_lock_irq(&ctx->lock);
+		while (hrtimer_active(&ctx->tmr)) {
+			spin_unlock_irq(&ctx->lock);
+			hrtimer_cancel(&ctx->tmr);
+			spin_lock_irq(&ctx->lock);
+		}
+		/*
+		 * Re-program the timer to the new value ...
+		 */
+		ctx->ticks = 0;
+		ctx->tmrtype = tmrtype;
+		ctx->tval = tval;
+		hrtimer_init(&ctx->tmr, CLOCK_MONOTONIC, HRTIMER_REL);
+		ctx->tmr.expires = ctx->tval;
+		ctx->tmr.function = timerfd_tmrproc;
+
+		hrtimer_start(&ctx->tmr, ctx->tval, HRTIMER_REL);
+
+		spin_unlock_irq(&ctx->lock);
+
+		fput(file);
+	}
+
+	return ufd;
+
+err_fdalloc:
+	timerfd_cleanup(ctx);
+err_exit:
+	return error;
+}
+
+
+static void timerfd_cleanup(struct timerfd_ctx *ctx)
+{
+	hrtimer_cancel(&ctx->tmr);
+	kmem_cache_free(timerfd_ctx_cachep, ctx);
+}
+
+
+static int timerfd_close(struct inode *inode, struct file *file)
+{
+	timerfd_cleanup(file->private_data);
+	return 0;
+}
+
+
+static unsigned int timerfd_poll(struct file *file, poll_table *wait)
+{
+	struct timerfd_ctx *ctx = file->private_data;
+
+	poll_wait(file, &ctx->wqh, wait);
+
+	return ctx->ticks ? POLLIN: 0;
+}
+
+
+static ssize_t timerfd_read(struct file *file, char *buf, size_t count, loff_t *ppos)
+{
+	struct timerfd_ctx *ctx = file->private_data;
+	ssize_t res;
+	u32 ticks;
+	DECLARE_WAITQUEUE(wait, current);
+
+	if (count < sizeof(ticks))
+		return -EINVAL;
+	spin_lock_irq(&ctx->lock);
+	res = -EAGAIN;
+	if ((ticks = (u32) ctx->ticks) == 0 &&
+	    !(file->f_flags & O_NONBLOCK)) {
+		__add_wait_queue(&ctx->wqh, &wait);
+		for (res = 0;;) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			if ((ticks = (u32) ctx->ticks) != 0) {
+				res = 0;
+				break;
+			}
+			if (signal_pending(current)) {
+				res = -EINTR;
+				break;
+			}
+			spin_unlock_irq(&ctx->lock);
+			schedule();
+			spin_lock_irq(&ctx->lock);
+		}
+		__remove_wait_queue(&ctx->wqh, &wait);
+		set_current_state(TASK_RUNNING);
+	}
+	if (ticks)
+		ctx->ticks = 0;
+	spin_unlock_irq(&ctx->lock);
+	if (ticks)
+		res = put_user(ticks, buf) ? -EFAULT: sizeof(ticks);
+	return res;
+}
+
+
+static int __init timerfd_init(void)
+{
+	timerfd_ctx_cachep = kmem_cache_create("timerfd_ctx_cache",
+						sizeof(struct timerfd_ctx),
+						0, SLAB_PANIC, NULL, NULL);
+	return 0;
+}
+
+
+static void __exit timerfd_exit(void)
+{
+	kmem_cache_destroy(timerfd_ctx_cachep);
+}
+
+module_init(timerfd_init);
+module_exit(timerfd_exit);
+
+MODULE_LICENSE("GPL");
Index: linux-2.6.20.ep2/include/linux/timerfd.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20.ep2/include/linux/timerfd.h	2007-03-09 10:56:08.000000000 -0800
@@ -0,0 +1,20 @@
+/*
+ *  include/linux/timerfd.h
+ *
+ *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
+ *
+ */
+
+#ifndef _LINUX_TIMERFD_H
+#define _LINUX_TIMERFD_H
+
+
+#define TFD_TIMER_REL 1
+#define TFD_TIMER_ABS 2
+#define TFD_TIMER_SEQ 3
+
+
+
+
+#endif /* _LINUX_TIMERFD_H */
+
Index: linux-2.6.20.ep2/fs/Makefile
===================================================================
--- linux-2.6.20.ep2.orig/fs/Makefile	2007-03-09 10:43:17.000000000 -0800
+++ linux-2.6.20.ep2/fs/Makefile	2007-03-09 10:56:08.000000000 -0800
@@ -11,7 +11,7 @@
 		attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o drop_caches.o splice.o sync.o utimes.o \
-		stack.o anon_inodes.o signalfd.o
+		stack.o anon_inodes.o signalfd.o timerfd.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
Index: linux-2.6.20.ep2/include/linux/syscalls.h
===================================================================
--- linux-2.6.20.ep2.orig/include/linux/syscalls.h	2007-03-09 11:52:08.000000000 -0800
+++ linux-2.6.20.ep2/include/linux/syscalls.h	2007-03-09 11:52:29.000000000 -0800
@@ -603,6 +603,7 @@
 				    size_t len);
 asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct getcpu_cache __user *cache);
 asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask);
+asmlinkage long sys_timerfd(int ufd, int tmrtype, const struct timespec __user *utmr);
 
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 6/9] signalfd/timerfd v1 - timerfd core ...
  2007-03-09 23:41 [patch 6/9] signalfd/timerfd v1 - timerfd core Davide Libenzi
@ 2007-03-10  6:33 ` Nicholas Miell
  2007-03-10  6:38   ` Davide Libenzi
  0 siblings, 1 reply; 25+ messages in thread
From: Nicholas Miell @ 2007-03-10  6:33 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: Linux Kernel Mailing List, Andrew Morton, Linus Torvalds

On Fri, 2007-03-09 at 15:41 -0800, Davide Libenzi wrote:
> This patch introduces a new system call for timers events delivered
> though file descriptors. This allows timer event to be used with
> standard POSIX poll(2), select(2) and read(2). As a consequence of
> supporting the Linux f_op->poll subsystem, they can be used with
> epoll(2) too.
> The system call is defined as:
> 
> int timerfd(int ufd, int tmrtype, const struct timespec *utmr);
> 
> The "ufd" parameter allows for re-use (re-programming) of an existing
> timerfd w/out going through the close/open cycle (same as signalfd).
> If "ufd" is -1, s new file descriptor will be created, otherwise the
> existing "ufd" will be re-programmed.
> The "tmrtype" parameter allows to specify the timer type. The following
> values are supported:
> 
>     TFD_TIMER_REL
>         The time specified in the "utmr" parameter is a relative time
> 	from NOW.
> 
>     TFD_TIMER_ABS
>         The timer specified in the "utmr" parameter is an absolute time.
> 
>     TFD_TIMER_SEQ
>         The time specified in the "utmr" parameter is an interval at
> 	which a continuous clock rate will be generated.
> 
> The function returns the new (or same, in case "ufd" is a valid timerfd
> descriptor) file, or -1 in case of error.
> As stated before, the timerfd file descriptor supports poll(2), select(2)
> and epoll(2). When a timer event happened on the timerfd, a POLLIN mask
> will be returned.
> The read(2) call can be used, and it will return a u32 variable holding
> the number of "ticks" that happened on the interface since the last call
> to read(2). The read(2) call supportes the O_NONBLOCK flag too, and EAGAIN
> will be returned if no ticks happened.
> A quick test program, shows timerfd working correctly on my amd64 box:
> 
> http://www.xmailserver.org/timerfd-test.c
> 

Why did you ignore the existing POSIX timer API?

-- 
Nicholas Miell <nmiell@comcast.net>


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 6/9] signalfd/timerfd v1 - timerfd core ...
  2007-03-10  6:33 ` Nicholas Miell
@ 2007-03-10  6:38   ` Davide Libenzi
  2007-03-10  6:43     ` Nicholas Miell
  0 siblings, 1 reply; 25+ messages in thread
From: Davide Libenzi @ 2007-03-10  6:38 UTC (permalink / raw)
  To: Nicholas Miell; +Cc: Linux Kernel Mailing List, Andrew Morton, Linus Torvalds

On Fri, 9 Mar 2007, Nicholas Miell wrote:

> Why did you ignore the existing POSIX timer API?

The existing POSIX API is a standard and a very good one. Too bad it does 
not deliver to files. The timerfd code is, as you can probably read from 
the code, a really thin wrapper around the existing hrtimer.c Linux code.


- Davide



^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 6/9] signalfd/timerfd v1 - timerfd core ...
  2007-03-10  6:38   ` Davide Libenzi
@ 2007-03-10  6:43     ` Nicholas Miell
  2007-03-10  6:53       ` Davide Libenzi
  0 siblings, 1 reply; 25+ messages in thread
From: Nicholas Miell @ 2007-03-10  6:43 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: Linux Kernel Mailing List, Andrew Morton, Linus Torvalds

On Fri, 2007-03-09 at 22:38 -0800, Davide Libenzi wrote:
> On Fri, 9 Mar 2007, Nicholas Miell wrote:
> 
> > Why did you ignore the existing POSIX timer API?
> 
> The existing POSIX API is a standard and a very good one. Too bad it does 
> not deliver to files. The timerfd code is, as you can probably read from 
> the code, a really thin wrapper around the existing hrtimer.c Linux code.

So extend the existing POSIX timer API to deliver expiry events via a
fd.

-- 
Nicholas Miell <nmiell@comcast.net>


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 6/9] signalfd/timerfd v1 - timerfd core ...
  2007-03-10  6:43     ` Nicholas Miell
@ 2007-03-10  6:53       ` Davide Libenzi
  2007-03-10  7:09         ` Nicholas Miell
  0 siblings, 1 reply; 25+ messages in thread
From: Davide Libenzi @ 2007-03-10  6:53 UTC (permalink / raw)
  To: Nicholas Miell; +Cc: Linux Kernel Mailing List, Andrew Morton, Linus Torvalds

On Fri, 9 Mar 2007, Nicholas Miell wrote:

> On Fri, 2007-03-09 at 22:38 -0800, Davide Libenzi wrote:
> > On Fri, 9 Mar 2007, Nicholas Miell wrote:
> > 
> > > Why did you ignore the existing POSIX timer API?
> > 
> > The existing POSIX API is a standard and a very good one. Too bad it does 
> > not deliver to files. The timerfd code is, as you can probably read from 
> > the code, a really thin wrapper around the existing hrtimer.c Linux code.
> 
> So extend the existing POSIX timer API to deliver expiry events via a
> fd.

It'll be out of standard as timerfd is, w/out code savings. Look at the 
code and tell me what could be saved. Prolly the ten lines of the timer 
callback. Lines that you'll have to drop inside the current posix timer 
layer. Better leave standards alone, especially like in this case, when 
the savings are not there.



- Davide



^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 6/9] signalfd/timerfd v1 - timerfd core ...
  2007-03-10  6:53       ` Davide Libenzi
@ 2007-03-10  7:09         ` Nicholas Miell
  2007-03-10  7:36           ` Davide Libenzi
  0 siblings, 1 reply; 25+ messages in thread
From: Nicholas Miell @ 2007-03-10  7:09 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: Linux Kernel Mailing List, Andrew Morton, Linus Torvalds

On Fri, 2007-03-09 at 22:53 -0800, Davide Libenzi wrote:
> On Fri, 9 Mar 2007, Nicholas Miell wrote:
> 
> > On Fri, 2007-03-09 at 22:38 -0800, Davide Libenzi wrote:
> > > On Fri, 9 Mar 2007, Nicholas Miell wrote:
> > > 
> > > > Why did you ignore the existing POSIX timer API?
> > > 
> > > The existing POSIX API is a standard and a very good one. Too bad it does 
> > > not deliver to files. The timerfd code is, as you can probably read from 
> > > the code, a really thin wrapper around the existing hrtimer.c Linux code.
> > 
> > So extend the existing POSIX timer API to deliver expiry events via a
> > fd.
> 
> It'll be out of standard as timerfd is, w/out code savings. Look at the 
> code and tell me what could be saved. Prolly the ten lines of the timer 
> callback. Lines that you'll have to drop inside the current posix timer 
> layer. Better leave standards alone, especially like in this case, when 
> the savings are not there.
> 

OK, here's a more formal listing of my objections to the introduction of
timerfd in this form:

A) It is a new general-purpose ABI intended for wide-scale usage, and
thus must be maintained forever.

B) It is less functional than the existing ABIs -- modulo their
"delivery via signals only" limitation, which can be corrected (and has
been already in other operating systems).

C) Being an entirely new creation that completely ignores past work in
this area, it has no hope of ever getting into POSIX.

which means

D) At some point in time, Linux is going to get the POSIX version (in
whatever form it takes), making this new ABI useless dead weight (see
point A).


-- 
Nicholas Miell <nmiell@comcast.net>


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 6/9] signalfd/timerfd v1 - timerfd core ...
  2007-03-10  7:09         ` Nicholas Miell
@ 2007-03-10  7:36           ` Davide Libenzi
  2007-03-10 19:52             ` Nicholas Miell
  0 siblings, 1 reply; 25+ messages in thread
From: Davide Libenzi @ 2007-03-10  7:36 UTC (permalink / raw)
  To: Nicholas Miell; +Cc: Linux Kernel Mailing List, Andrew Morton, Linus Torvalds

On Fri, 9 Mar 2007, Nicholas Miell wrote:

> On Fri, 2007-03-09 at 22:53 -0800, Davide Libenzi wrote:
> > On Fri, 9 Mar 2007, Nicholas Miell wrote:
> > > 
> > > So extend the existing POSIX timer API to deliver expiry events via a
> > > fd.
> > 
> > It'll be out of standard as timerfd is, w/out code savings. Look at the 
> > code and tell me what could be saved. Prolly the ten lines of the timer 
> > callback. Lines that you'll have to drop inside the current posix timer 
> > layer. Better leave standards alone, especially like in this case, when 
> > the savings are not there.
> > 
> 
> OK, here's a more formal listing of my objections to the introduction of
> timerfd in this form:
> 
> A) It is a new general-purpose ABI intended for wide-scale usage, and
> thus must be maintained forever.

Yup


> B) It is less functional than the existing ABIs -- modulo their
> "delivery via signals only" limitation, which can be corrected (and has
> been already in other operating systems).

Less functional? Please, do tell me ...



> C) Being an entirely new creation that completely ignores past work in
> this area, it has no hope of ever getting into POSIX.
> 
> which means
> 
> D) At some point in time, Linux is going to get the POSIX version (in
> whatever form it takes), making this new ABI useless dead weight (see
> point A).

Adding parameters/fields to a standard is going to create even more 
confusion than a new *single* function. And the code to cross-link the 
timerfd and the current posix timers is going to end up in being more 
complex than the current one.



- Davide



^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 6/9] signalfd/timerfd v1 - timerfd core ...
  2007-03-10  7:36           ` Davide Libenzi
@ 2007-03-10 19:52             ` Nicholas Miell
  2007-03-10 20:41               ` Davide Libenzi
  0 siblings, 1 reply; 25+ messages in thread
From: Nicholas Miell @ 2007-03-10 19:52 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: Linux Kernel Mailing List, Andrew Morton, Linus Torvalds

On Fri, 2007-03-09 at 23:36 -0800, Davide Libenzi wrote:
> On Fri, 9 Mar 2007, Nicholas Miell wrote:
> 
> > On Fri, 2007-03-09 at 22:53 -0800, Davide Libenzi wrote:
> > > On Fri, 9 Mar 2007, Nicholas Miell wrote:
> > > > 
> > > > So extend the existing POSIX timer API to deliver expiry events via a
> > > > fd.
> > > 
> > > It'll be out of standard as timerfd is, w/out code savings. Look at the 
> > > code and tell me what could be saved. Prolly the ten lines of the timer 
> > > callback. Lines that you'll have to drop inside the current posix timer 
> > > layer. Better leave standards alone, especially like in this case, when 
> > > the savings are not there.
> > > 
> > 
> > OK, here's a more formal listing of my objections to the introduction of
> > timerfd in this form:
> > 
> > A) It is a new general-purpose ABI intended for wide-scale usage, and
> > thus must be maintained forever.
> 
> Yup
> 
> 
> > B) It is less functional than the existing ABIs -- modulo their
> > "delivery via signals only" limitation, which can be corrected (and has
> > been already in other operating systems).
> 
> Less functional? Please, do tell me ...
> 

Try reading the timer_create man page.

In short, you're limited to a single clock, so you can't set timers
based on wall-clock time (subject to NTP correction), monotomic time
(not subject to NTP, will not ever go backwards or skip ticks), the
high-res versions of the previous two clocks, per-thread or per-process
CPU usage time, or any other clocks that may get introduced in the
future.

In addition, you've introduced an entirely new incompatible API that
probably doesn't fit easily into existing software that already uses
POSIX timers.


> 
> > C) Being an entirely new creation that completely ignores past work in
> > this area, it has no hope of ever getting into POSIX.
> > 
> > which means
> > 
> > D) At some point in time, Linux is going to get the POSIX version (in
> > whatever form it takes), making this new ABI useless dead weight (see
> > point A).
> 
> Adding parameters/fields to a standard is going to create even more 
> confusion than a new *single* function. And the code to cross-link the 
> timerfd and the current posix timers is going to end up in being more 
> complex than the current one.
> 

Yes, but the standard explicitly allows you to do this. Furthermore, if
you work within the existing framework, you can lobby for the inclusion
of your API in the next version of POSIX.

Simplicity of the code is only a virtue if you don't have to do the
exact same thing again with a different interface later while keeping
the maintenance burden of the existing proprietary (and, thus,
unpopular) interface.

-- 
Nicholas Miell <nmiell@comcast.net>


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 6/9] signalfd/timerfd v1 - timerfd core ...
  2007-03-10 19:52             ` Nicholas Miell
@ 2007-03-10 20:41               ` Davide Libenzi
  2007-03-10 21:01                 ` Nicholas Miell
  0 siblings, 1 reply; 25+ messages in thread
From: Davide Libenzi @ 2007-03-10 20:41 UTC (permalink / raw)
  To: Nicholas Miell; +Cc: Linux Kernel Mailing List, Andrew Morton, Linus Torvalds

On Sat, 10 Mar 2007, Nicholas Miell wrote:

> Try reading the timer_create man page.
> 
> In short, you're limited to a single clock, so you can't set timers
> based on wall-clock time (subject to NTP correction), monotomic time
> (not subject to NTP, will not ever go backwards or skip ticks), the
> high-res versions of the previous two clocks, per-thread or per-process
> CPU usage time, or any other clocks that may get introduced in the
> future.

One timer per fd yes. So?
The real-time and monotonic selection can be added. 
If you look at the posix timers code, that's a bunch of code over the real 
meat of it, that is hrtimer.c. The timerfd interface goes straight to 
that, without adding yet another meaning to the sigevent structure, and 
yet another case inside the posix timers trigger functions. That will be 
as unstandard as timerfd is, and even more, since you cannot use that 
interface and hope to be portable in any case.
On top of that, handing over files to the posix timers will creates 
problems with references kept around.
The timerfd code is just a *really* thin layer (if you exclude the 
includes, the structure definitions and the fd setup code, there's 
basically *nothing*) over hrtimer.c and does not mess up with other kernel 
code in any way, and offers the same functionalities. I'd like to keep it 
that way.



- Davide



^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 6/9] signalfd/timerfd v1 - timerfd core ...
  2007-03-10 20:41               ` Davide Libenzi
@ 2007-03-10 21:01                 ` Nicholas Miell
  2007-03-10 21:44                   ` Linus Torvalds
  2007-03-10 22:30                   ` Davide Libenzi
  0 siblings, 2 replies; 25+ messages in thread
From: Nicholas Miell @ 2007-03-10 21:01 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: Linux Kernel Mailing List, Andrew Morton, Linus Torvalds

On Sat, 2007-03-10 at 12:41 -0800, Davide Libenzi wrote:
> On Sat, 10 Mar 2007, Nicholas Miell wrote:
> 
> > Try reading the timer_create man page.
> > 
> > In short, you're limited to a single clock, so you can't set timers
> > based on wall-clock time (subject to NTP correction), monotomic time
> > (not subject to NTP, will not ever go backwards or skip ticks), the
> > high-res versions of the previous two clocks, per-thread or per-process
> > CPU usage time, or any other clocks that may get introduced in the
> > future.
> 
> One timer per fd yes. So?

I never complained about one timer per fd (although, now that you
mention it, that would get a bit excessive if you have thousands of
outstanding timers).

> The real-time and monotonic selection can be added. 

IOW, the timerfd patch is not suitable for inclusion as-is. (While
you're at it, you should probably add a flags argument for future
expansion.)

> If you look at the posix timers code, that's a bunch of code over the real 
> meat of it, that is hrtimer.c. The timerfd interface goes straight to 
> that, without adding yet another meaning to the sigevent structure,

That's what the sigevent structure is for -- to describe how events
should be signaled to userspace, whether by signal delivery, thread
creation, or queuing to event completion ports. If if you think
extending it would be bad, I can show you the line in POSIX where it
encourages the contrary.

>  and 
> yet another case inside the posix timers trigger functions. That will be 
> as unstandard as timerfd is, and even more, since you cannot use that 
> interface and hope to be portable in any case.

If Linux were to do a wholesale theft of the Solaris interface (warts
and all), you'd be portable (and, now that I think of it, more
efficient).

Two major unixes using the same interface would probably make it a
shoe-in for the next POSIX, too. (c.f. openat(2) and friends)

> On top of that, handing over files to the posix timers will creates 
> problems with references kept around.
> The timerfd code is just a *really* thin layer (if you exclude the 
> includes, the structure definitions and the fd setup code, there's 
> basically *nothing*) over hrtimer.c and does not mess up with other kernel 
> code in any way, and offers the same functionalities. I'd like to keep it 
> that way.

-- 
Nicholas Miell <nmiell@comcast.net>


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 6/9] signalfd/timerfd v1 - timerfd core ...
  2007-03-10 21:01                 ` Nicholas Miell
@ 2007-03-10 21:44                   ` Linus Torvalds
  2007-03-10 21:56                     ` Nicholas Miell
  2007-03-10 22:30                   ` Davide Libenzi
  1 sibling, 1 reply; 25+ messages in thread
From: Linus Torvalds @ 2007-03-10 21:44 UTC (permalink / raw)
  To: Nicholas Miell; +Cc: Davide Libenzi, Linux Kernel Mailing List, Andrew Morton



On Sat, 10 Mar 2007, Nicholas Miell wrote:
> 
> That's what the sigevent structure is for -- to describe how events
> should be signaled to userspace, whether by signal delivery, thread
> creation, or queuing to event completion ports. If if you think
> extending it would be bad, I can show you the line in POSIX where it
> encourages the contrary.

I'm sorry, but by pointing to the POSIX timer stuff, you're just making 
your argument weaker.

POSIX timers are a horrible crock and over-designed to be a union of 
everything that has ever been done. Nasty. We had tons of bugs in the 
original setup because they were so damn nasty.

I'd rather look at just about *anything* else for good design than from 
some of the abortions that are posix-timers.

		Linus

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 6/9] signalfd/timerfd v1 - timerfd core ...
  2007-03-10 21:44                   ` Linus Torvalds
@ 2007-03-10 21:56                     ` Nicholas Miell
  2007-03-10 22:42                       ` Linus Torvalds
  0 siblings, 1 reply; 25+ messages in thread
From: Nicholas Miell @ 2007-03-10 21:56 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Davide Libenzi, Linux Kernel Mailing List, Andrew Morton

On Sat, 2007-03-10 at 13:44 -0800, Linus Torvalds wrote:
> 
> On Sat, 10 Mar 2007, Nicholas Miell wrote:
> > 
> > That's what the sigevent structure is for -- to describe how events
> > should be signaled to userspace, whether by signal delivery, thread
> > creation, or queuing to event completion ports. If if you think
> > extending it would be bad, I can show you the line in POSIX where it
> > encourages the contrary.
> 
> I'm sorry, but by pointing to the POSIX timer stuff, you're just making 
> your argument weaker.
> 
> POSIX timers are a horrible crock and over-designed to be a union of 
> everything that has ever been done. Nasty. We had tons of bugs in the 
> original setup because they were so damn nasty.
> 

Care to elaborate on why they're a horrible crock?

And are the bugs fixed? If so, why replace them? They work now.

> I'd rather look at just about *anything* else for good design than from 
> some of the abortions that are posix-timers.
> 
> 		Linus

-- 
Nicholas Miell <nmiell@comcast.net>


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 6/9] signalfd/timerfd v1 - timerfd core ...
  2007-03-10 21:01                 ` Nicholas Miell
  2007-03-10 21:44                   ` Linus Torvalds
@ 2007-03-10 22:30                   ` Davide Libenzi
  1 sibling, 0 replies; 25+ messages in thread
From: Davide Libenzi @ 2007-03-10 22:30 UTC (permalink / raw)
  To: Nicholas Miell; +Cc: Linux Kernel Mailing List, Andrew Morton, Linus Torvalds

On Sat, 10 Mar 2007, Nicholas Miell wrote:

> I never complained about one timer per fd (although, now that you
> mention it, that would get a bit excessive if you have thousands of
> outstanding timers).

Right, of course.



> > The real-time and monotonic selection can be added. 
> 
> IOW, the timerfd patch is not suitable for inclusion as-is. (While
> you're at it, you should probably add a flags argument for future
> expansion.)

That's already in.



> > If you look at the posix timers code, that's a bunch of code over the real 
> > meat of it, that is hrtimer.c. The timerfd interface goes straight to 
> > that, without adding yet another meaning to the sigevent structure,
> 
> That's what the sigevent structure is for -- to describe how events
> should be signaled to userspace, whether by signal delivery, thread
> creation, or queuing to event completion ports. If if you think
> extending it would be bad, I can show you the line in POSIX where it
> encourages the contrary.

I'm sorry, I already explained you that linking the two (files and posix 
timers) is going to create more troubles than it actually solves.
The timerfd code provides the same functionality, with zero intrusion in 
existing code, and basically zero code (once if you remove the usual fd 
creation/cleanup).
The code of adding posix timers support would be *all* the existing one
(that is already a thin wrapper that calls hrtimer.c support - like 
posix timers do), plus adding more crud into the posix timers code, plus 
adding file references handling. If *you* want to do that, I can open you 
a door into the timerfd.




- Davide



^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 6/9] signalfd/timerfd v1 - timerfd core ...
  2007-03-10 21:56                     ` Nicholas Miell
@ 2007-03-10 22:42                       ` Linus Torvalds
  2007-03-11  0:25                         ` Nicholas Miell
  2007-03-11  3:42                         ` Davide Libenzi
  0 siblings, 2 replies; 25+ messages in thread
From: Linus Torvalds @ 2007-03-10 22:42 UTC (permalink / raw)
  To: Nicholas Miell; +Cc: Davide Libenzi, Linux Kernel Mailing List, Andrew Morton



On Sat, 10 Mar 2007, Nicholas Miell wrote:
> 
> Care to elaborate on why they're a horrible crock?

It's a *classic* case of an interface that tries to do everything under 
the sun.

Here's a clue: look at any system call that takes a union as part of its 
arguments. Count them. I think we have two:
 - struct siginfo
 - struct sigevent
and they are both broken horrible interfaces where the data structures 
depend on various flags.

It's just not the UNIX system call way. And none of it really makes sense 
if you already have a file descriptor, since at that point you know what 
the notification mechanism is.

I'd actually much rather do POSIX timers the other way around: associate a 
generic notification mechanism with the file descriptor, and then 
implement posix_timer_create() on top of timerfd. Now THAT sounds like a 
clean unix-like interface ("everything is a file") and would imply that 
you'd be able to do the same kind of notification for any file descriptor, 
not just timers.

But posix timers as they are done now are just an abomination. They are 
not unix-like at all.

> And are the bugs fixed? If so, why replace them? They work now.

.. but the reason for the bugs was largely a very baroque interface, which 
didn't get fixed (because it's specified by the standard).

I'd rather have straightforward interfaces. The timerfd() one lookedalot 
more straightforward than posix timers.

(That said, using "struct itimerspec" might be a good idea. That would 
also obviate the need for TFD_TIMER_SEQ, since an itimerspec automatically 
has both "base" and "incremental" parts).

		Linus

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 6/9] signalfd/timerfd v1 - timerfd core ...
  2007-03-10 22:42                       ` Linus Torvalds
@ 2007-03-11  0:25                         ` Nicholas Miell
  2007-03-11  0:35                           ` Linus Torvalds
  2007-03-11  3:42                         ` Davide Libenzi
  1 sibling, 1 reply; 25+ messages in thread
From: Nicholas Miell @ 2007-03-11  0:25 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Davide Libenzi, Linux Kernel Mailing List, Andrew Morton

On Sat, 2007-03-10 at 14:42 -0800, Linus Torvalds wrote:
> 
> On Sat, 10 Mar 2007, Nicholas Miell wrote:
> > 
> > Care to elaborate on why they're a horrible crock?
> 
> It's a *classic* case of an interface that tries to do everything under 
> the sun.
> 
> Here's a clue: look at any system call that takes a union as part of its 
> arguments. Count them. I think we have two:
>  - struct siginfo

No argument here -- just about everything related to signals is stupidly
complex.

>  - struct sigevent

However, this I take issue with.

Conceptually (and what the user ends up actually using), struct sigevent
is just:

struct sigevent
{
	int sigev_notify;			 /* delivery method */
	sigval_t sigev_value			 /* user cookie */
	int sigev_signo;			 /* signal number */
	void (*sigev_notify_function)(sigval_t); /* thread fn */
	pthread_attr_t *sigev_notify_attributes; /* thread attr */
};

You could complain about sigval_t being a union, but that's probably
just because it predates uintptr_t. (Plus, no ugly casting.)

You also could complain that the above isn't what you actually see when
you look at /usr/include/bits/siginfo.h -- there's a union involved and
some macros to hide the fact, but that's just internal implementation
details related to how threads are created and padding out the struct
for any future expansion. 

The actual complexity for understanding and using struct sigevent isn't
all that much, and once you've figured that out, you know how to
configure event delivery for AIO completion, DNS resolution, and
messages queues, not just timers.

> and they are both broken horrible interfaces where the data structures 
> depend on various flags.
> 
> It's just not the UNIX system call way. And none of it really makes sense 
> if you already have a file descriptor, since at that point you know what 
> the notification mechanism is.
> 
> I'd actually much rather do POSIX timers the other way around: associate a 
> generic notification mechanism with the file descriptor, and then 
> implement posix_timer_create() on top of timerfd. Now THAT sounds like a 
> clean unix-like interface ("everything is a file") and would imply that 
> you'd be able to do the same kind of notification for any file descriptor, 
> not just timers.
> 

But timers aren't files or even remotely file-like -- if they were a
real files, you could just
open /dev/timers/realtime/2007/June/3rd/half-past-teatime and get a
timer. (Or, more realisticly, open /dev/timer and use ioctl().)

timerfd() had to be created to coerce them into some semblance of
filehood just to make them work with existing (and new) polling/queuing
interfaces just because those interfaces can only deal with file
descriptors.

Making non-file things look like files just because that's what poll()
and friends can deal with isn't much different from holding a hammer in
your hand and looking for what you have to do in order to turn every
problem into a nail.

Sometimes you need to go back to your toolbox for a screwdriver or a
saw.


> But posix timers as they are done now are just an abomination. They are 
> not unix-like at all.
> 
> > And are the bugs fixed? If so, why replace them? They work now.
> 
> .. but the reason for the bugs was largely a very baroque interface, which 
> didn't get fixed (because it's specified by the standard).
>

But the API isn't baroque.

There's a veritable boutique of clock sources to choose from, but they
all serve specific needs, it's just one parameter to timer_create, and
you probably want CLOCK_MONOTONIC anyway.

struct sigevent  might be a bit complex, but the difficultly in learning
that is amortized across all the other APIs that also use it to specify
how their events are delivered.

Delivering via signals and dealing with struct siginfo is painful, but
everything related to signals is painful. This is what you get when you
take an interface designed essentially for exception handling and start
abusing it for general information delivery. But, hey!, that's what
SIGEV_THREAD and SIGEV_PORT are for.[1]

About the worst that can be said of it is that using timer_settime to
both arm and disarm the timer and set the interval is awkward.






[1] A SIGEV_FUNCTION which skips all the signal baggage and just passes
a supplied cookie and a purpose-specific struct pointer to an
object-specific user-supplied function pointer might be interesting, but
then you run into all of the reentrancy/masking/choosing which thread to
deliver to and other issues that signals already have without the
benefit of the existing signal infrastructure for all that stuff. Gah, I
don't want to think about this anymore.

-- 
Nicholas Miell <nmiell@comcast.net>


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 6/9] signalfd/timerfd v1 - timerfd core ...
  2007-03-11  0:25                         ` Nicholas Miell
@ 2007-03-11  0:35                           ` Linus Torvalds
  2007-03-11  1:49                             ` Nicholas Miell
  0 siblings, 1 reply; 25+ messages in thread
From: Linus Torvalds @ 2007-03-11  0:35 UTC (permalink / raw)
  To: Nicholas Miell; +Cc: Davide Libenzi, Linux Kernel Mailing List, Andrew Morton



On Sat, 10 Mar 2007, Nicholas Miell wrote:
> > 
> > I'd actually much rather do POSIX timers the other way around: associate a 
> > generic notification mechanism with the file descriptor, and then 
> > implement posix_timer_create() on top of timerfd. Now THAT sounds like a 
> > clean unix-like interface ("everything is a file") and would imply that 
> > you'd be able to do the same kind of notification for any file descriptor, 
> > not just timers.
> > 
> 
> But timers aren't files or even remotely file-like

What do you think "a file" is?

In UNIX, a file descriptor is pretty much anything. You could say that 
sockets aren't remotely file-like, and you'd be right. What's your point? 
If you can read on it, it's a file.

And the real point of the whole signalfd() is that there really *are* a 
lot of UNIX interfaces that basically only work with file descriptors. Not 
just read, but select/poll/epoll.

They currently have just one timeout, but the thing is, if UNIX had just 
had "timer file descriptors", they'd not need even that one. And even with 
the timeout, Davide's patch actually makes for a *better* timeout than the 
ones provided by select/poll/epoll, exactly because you can do things like 
repeating timers and absolute time etc.

Much more naturally than the timer interface we currently have for those 
system calls.

The same goes for signals. The whole "pselect()" thing shows that signals 
really *should* have been file descriptors, and suddenly you don't need 
"pselect()" at all.

So the "not remotely file-like" is not actually a real argument. One of 
the big *points* of UNIX was that it unified a lot under the general 
umbrella of a "file descriptor". Davide just unifies even more.

		Linus

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 6/9] signalfd/timerfd v1 - timerfd core ...
  2007-03-11  0:35                           ` Linus Torvalds
@ 2007-03-11  1:49                             ` Nicholas Miell
  2007-03-11  1:57                               ` Davide Libenzi
  2007-03-11  5:31                               ` Linus Torvalds
  0 siblings, 2 replies; 25+ messages in thread
From: Nicholas Miell @ 2007-03-11  1:49 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Davide Libenzi, Linux Kernel Mailing List, Andrew Morton

On Sat, 2007-03-10 at 16:35 -0800, Linus Torvalds wrote:
> 
> On Sat, 10 Mar 2007, Nicholas Miell wrote:
> > > 
> > > I'd actually much rather do POSIX timers the other way around: associate a 
> > > generic notification mechanism with the file descriptor, and then 
> > > implement posix_timer_create() on top of timerfd. Now THAT sounds like a 
> > > clean unix-like interface ("everything is a file") and would imply that 
> > > you'd be able to do the same kind of notification for any file descriptor, 
> > > not just timers.
> > > 
> > 
> > But timers aren't files or even remotely file-like
> 
> What do you think "a file" is?
> 
> In UNIX, a file descriptor is pretty much anything. You could say that 
> sockets aren't remotely file-like, and you'd be right. What's your point? 
> If you can read on it, it's a file.

Ah, I see. You're just interested in fds as a generic handle concept,
and not a more Plan 9 type thing.

If that's the goal, somebody should start thinking about reducing the
contents of struct file to the bare minimum (i.e. not much more than a
file_operations pointer).

> 
> And the real point of the whole signalfd() is that there really *are* a 
> lot of UNIX interfaces that basically only work with file descriptors. Not 
> just read, but select/poll/epoll.

It'd be useful if the polling interfaces could return small datums
beyond just the POLL* flags -- having to do a read on timerfd just to
get the overrun count has a lot of overhead for just an integer, and I
imagine other things would like to pass back stuff too.


> They currently have just one timeout, but the thing is, if UNIX had just 
> had "timer file descriptors", they'd not need even that one. And even with 
> the timeout, Davide's patch actually makes for a *better* timeout than the 
> ones provided by select/poll/epoll, exactly because you can do things like 
> repeating timers and absolute time etc.
> 
> Much more naturally than the timer interface we currently have for those 
> system calls.
> 

You still want timeouts, creating/setting/destroying at timer just for
a single call to select/poll/epoll is probably too heavy weight.

timerfd() still leaves out the basic clock selection functionality
provided by both setitimer() and timer_create().

> The same goes for signals. The whole "pselect()" thing shows that signals 
> really *should* have been file descriptors, and suddenly you don't need 
> "pselect()" at all.
> 
> So the "not remotely file-like" is not actually a real argument. One of 
> the big *points* of UNIX was that it unified a lot under the general 
> umbrella of a "file descriptor". Davide just unifies even more.
>
> 		Linus
-- 
Nicholas Miell <nmiell@comcast.net>


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 6/9] signalfd/timerfd v1 - timerfd core ...
  2007-03-11  1:49                             ` Nicholas Miell
@ 2007-03-11  1:57                               ` Davide Libenzi
  2007-03-11  2:09                                 ` Nicholas Miell
  2007-03-11  5:31                               ` Linus Torvalds
  1 sibling, 1 reply; 25+ messages in thread
From: Davide Libenzi @ 2007-03-11  1:57 UTC (permalink / raw)
  To: Nicholas Miell; +Cc: Linus Torvalds, Linux Kernel Mailing List, Andrew Morton

On Sat, 10 Mar 2007, Nicholas Miell wrote:

> If that's the goal, somebody should start thinking about reducing the
> contents of struct file to the bare minimum (i.e. not much more than a
> file_operations pointer).

That's already pretty smal, and the single inode (and maybe dentry) will 
make it even smaller. Unless you want to create brazillions of signalfds,
timerfds or asyncfds.



> > And the real point of the whole signalfd() is that there really *are* a 
> > lot of UNIX interfaces that basically only work with file descriptors. Not 
> > just read, but select/poll/epoll.
> 
> It'd be useful if the polling interfaces could return small datums
> beyond just the POLL* flags -- having to do a read on timerfd just to
> get the overrun count has a lot of overhead for just an integer, and I
> imagine other things would like to pass back stuff too.
...

> You still want timeouts, creating/setting/destroying at timer just for
> a single call to select/poll/epoll is probably too heavy weight.

Take a look at what timerfd does and what posix timers has to do to 
implement the interface. You'll prolly stop trolling with things like "a 
lot of overhead" or "too heavy weight".



> timerfd() still leaves out the basic clock selection functionality
> provided by both setitimer() and timer_create().

That is coming as soon as I fixed my send-serie script ...




- Davide



^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 6/9] signalfd/timerfd v1 - timerfd core ...
  2007-03-11  1:57                               ` Davide Libenzi
@ 2007-03-11  2:09                                 ` Nicholas Miell
  0 siblings, 0 replies; 25+ messages in thread
From: Nicholas Miell @ 2007-03-11  2:09 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: Linus Torvalds, Linux Kernel Mailing List, Andrew Morton

On Sat, 2007-03-10 at 17:57 -0800, Davide Libenzi wrote:
> On Sat, 10 Mar 2007, Nicholas Miell wrote:
> 
> > If that's the goal, somebody should start thinking about reducing the
> > contents of struct file to the bare minimum (i.e. not much more than a
> > file_operations pointer).
> 
> That's already pretty smal, and the single inode (and maybe dentry) will 
> make it even smaller. Unless you want to create brazillions of signalfds,
> timerfds or asyncfds.
> 

Timers don't need dentry or inode pointers or readahead state, etc., do
they? (Beyond the existing VFS expectation, that is.)

> > > And the real point of the whole signalfd() is that there really *are* a 
> > > lot of UNIX interfaces that basically only work with file descriptors. Not 
> > > just read, but select/poll/epoll.
> > 
> > It'd be useful if the polling interfaces could return small datums
> > beyond just the POLL* flags -- having to do a read on timerfd just to
> > get the overrun count has a lot of overhead for just an integer, and I
> > imagine other things would like to pass back stuff too.
> ...
> 
> > You still want timeouts, creating/setting/destroying at timer just for
> > a single call to select/poll/epoll is probably too heavy weight.
> 
> Take a look at what timerfd does and what posix timers has to do to 
> implement the interface. You'll prolly stop trolling with things like "a 
> lot of overhead" or "too heavy weight".

That wasn't a troll. I was talking about the timerfd()/close() overhead
and the corresponding bookkeeping necessary to keep that fd around
compared to just passing a struct timespec to poll or a millisecond
count to epoll_wait.

> > timerfd() still leaves out the basic clock selection functionality
> > provided by both setitimer() and timer_create().
> 
> That is coming as soon as I fixed my send-serie script ...

Nice.

-- 
Nicholas Miell <nmiell@comcast.net>


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 6/9] signalfd/timerfd v1 - timerfd core ...
  2007-03-10 22:42                       ` Linus Torvalds
  2007-03-11  0:25                         ` Nicholas Miell
@ 2007-03-11  3:42                         ` Davide Libenzi
  2007-03-11  5:35                           ` Linus Torvalds
  1 sibling, 1 reply; 25+ messages in thread
From: Davide Libenzi @ 2007-03-11  3:42 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Nicholas Miell, Linux Kernel Mailing List, Andrew Morton

On Sat, 10 Mar 2007, Linus Torvalds wrote:

> (That said, using "struct itimerspec" might be a good idea. That would 
> also obviate the need for TFD_TIMER_SEQ, since an itimerspec automatically 
> has both "base" and "incremental" parts).

But TFD_TIMER_SEQ is a simple auto-rearm case of TFD_TIMER_REL. So the 
timespec is sufficent too (in all three cases we just need *one* time). 
Actually, the only place where I can find the itimerspec usefull, is 
indeed with TFD_TIMER_SEQ. In cases where you want you clock starting at a 
given time (it_value) *and* with the given frequency (it_interval).



- Davide



^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 6/9] signalfd/timerfd v1 - timerfd core ...
  2007-03-11  1:49                             ` Nicholas Miell
  2007-03-11  1:57                               ` Davide Libenzi
@ 2007-03-11  5:31                               ` Linus Torvalds
  2007-03-11  6:18                                 ` Nicholas Miell
  1 sibling, 1 reply; 25+ messages in thread
From: Linus Torvalds @ 2007-03-11  5:31 UTC (permalink / raw)
  To: Nicholas Miell; +Cc: Davide Libenzi, Linux Kernel Mailing List, Andrew Morton



On Sat, 10 Mar 2007, Nicholas Miell wrote:
> 
> Ah, I see. You're just interested in fds as a generic handle concept,
> and not a more Plan 9 type thing.

Indeed. It's a "handle".

UNIX has pid's for "process" handles, and "file descriptors" for just 
about everything else.

> If that's the goal, somebody should start thinking about reducing the
> contents of struct file to the bare minimum (i.e. not much more than a
> file_operations pointer).

Well, there's more there, but it really is fairly close. If you look at 
it, a "struct file" ends up not having a lot more than the minimal stuff 
required to use it as a a handle: it really isn't a very big structure. 

The biggest part is actually the read-ahead state, which is arguably a 
generic thing for a file handle, even though not all kinds will be able to 
use it. We *could* make that be behind a pointer (along with the "f_pos" 
thing, that really logically goes along with the read-ahead thing), of 
course, but since most files probably do end up being "traditional file" 
structures, it's probably not wrong to just have it in the file.

> It'd be useful if the polling interfaces could return small datums
> beyond just the POLL* flags -- having to do a read on timerfd just to
> get the overrun count has a lot of overhead for just an integer, and I
> imagine other things would like to pass back stuff too.

Well, since a lot of the interfaces harken back to "select()", we really 
are stuck with basically a couple of bits total (poll extends on the 
number of bits, but not a whole lot). So right now we have just "an event 
happened", and if you want to know more, you do need to do a read() or 
similar. That's true of all the traditional file descriptors too, of 
course.

> You still want timeouts, creating/setting/destroying at timer just for
> a single call to select/poll/epoll is probably too heavy weight.

Well, since the interfaces for that already exists, I'm certainly not 
going to disagree.

> timerfd() still leaves out the basic clock selection functionality
> provided by both setitimer() and timer_create().

Well, the setitimer ones do not really make sense for a timer that isn't 
directly associated with one particular process. Once it's associated with 
a file descriptor, it really isn't bound to any particular execution 
context, and as such, virtual and profiling timers really don't make any 
sense any more!

The only thing that exists outside of an execution context is really just 
"relative" and "absolute". Of course, you could still specify just what 
you want your timers to be based on (ie the "realtime" vs "monotonic" 
thing), and possibly the resolution, but it really does boil down to just 
those two choices (and the rest is just confusion).

So I really don't think you lose a lot by just limiting it to "real time" 
vs "relative time". Those really *are* the choices.

			Linus

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 6/9] signalfd/timerfd v1 - timerfd core ...
  2007-03-11  3:42                         ` Davide Libenzi
@ 2007-03-11  5:35                           ` Linus Torvalds
  2007-03-11  5:44                             ` Davide Libenzi
  0 siblings, 1 reply; 25+ messages in thread
From: Linus Torvalds @ 2007-03-11  5:35 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: Nicholas Miell, Linux Kernel Mailing List, Andrew Morton



On Sat, 10 Mar 2007, Davide Libenzi wrote:

> On Sat, 10 Mar 2007, Linus Torvalds wrote:
> 
> > (That said, using "struct itimerspec" might be a good idea. That would 
> > also obviate the need for TFD_TIMER_SEQ, since an itimerspec automatically 
> > has both "base" and "incremental" parts).
> 
> But TFD_TIMER_SEQ is a simple auto-rearm case of TFD_TIMER_REL. So the 
> timespec is sufficent too (in all three cases we just need *one* time). 

Well, people actually do use itimers like "give me a timer every second, 
starting five seconds from now".

> Actually, the only place where I can find the itimerspec usefull, is 
> indeed with TFD_TIMER_SEQ. In cases where you want you clock starting at a 
> given time (it_value) *and* with the given frequency (it_interval).

.. and this is where itimerspec is even better: once you have absolute 
time, *and* a process that might miss ticks (because it does something 
else), the "absolute time start + interval" thing can avoid drifting 
(which a "relative interval" has a really hard time doing).

So if you want a "timer tick every second, *on* the second" kind of 
interface, you really do want a absolute time starting point, and then a 
fixed interval. Two different times.

		Linus

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 6/9] signalfd/timerfd v1 - timerfd core ...
  2007-03-11  5:35                           ` Linus Torvalds
@ 2007-03-11  5:44                             ` Davide Libenzi
  0 siblings, 0 replies; 25+ messages in thread
From: Davide Libenzi @ 2007-03-11  5:44 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Nicholas Miell, Linux Kernel Mailing List, Andrew Morton

On Sat, 10 Mar 2007, Linus Torvalds wrote:

> > Actually, the only place where I can find the itimerspec usefull, is 
> > indeed with TFD_TIMER_SEQ. In cases where you want you clock starting at a 
> > given time (it_value) *and* with the given frequency (it_interval).
> 
> .. and this is where itimerspec is even better: once you have absolute 
> time, *and* a process that might miss ticks (because it does something 
> else), the "absolute time start + interval" thing can avoid drifting 
> (which a "relative interval" has a really hard time doing).
> 
> So if you want a "timer tick every second, *on* the second" kind of 
> interface, you really do want a absolute time starting point, and then a 
> fixed interval. Two different times.

Alrighty, I'll use a itimerspec ...



- Davide



^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 6/9] signalfd/timerfd v1 - timerfd core ...
  2007-03-11  5:31                               ` Linus Torvalds
@ 2007-03-11  6:18                                 ` Nicholas Miell
  2007-03-11 16:29                                   ` Linus Torvalds
  0 siblings, 1 reply; 25+ messages in thread
From: Nicholas Miell @ 2007-03-11  6:18 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Davide Libenzi, Linux Kernel Mailing List, Andrew Morton

On Sat, 2007-03-10 at 21:31 -0800, Linus Torvalds wrote:
> 
> On Sat, 10 Mar 2007, Nicholas Miell wrote:
> > 
> > Ah, I see. You're just interested in fds as a generic handle concept,
> > and not a more Plan 9 type thing.
> 
> Indeed. It's a "handle".
> 
> UNIX has pid's for "process" handles, and "file descriptors" for just 
> about everything else.

And I imagine that somebody will come up with way of getting a fd for a
process sooner or later. 

> > If that's the goal, somebody should start thinking about reducing the
> > contents of struct file to the bare minimum (i.e. not much more than a
> > file_operations pointer).
> 
> Well, there's more there, but it really is fairly close. If you look at 
> it, a "struct file" ends up not having a lot more than the minimal stuff 
> required to use it as a a handle: it really isn't a very big structure. 
> 
> The biggest part is actually the read-ahead state, which is arguably a 
> generic thing for a file handle, even though not all kinds will be able to 
> use it. We *could* make that be behind a pointer (along with the "f_pos" 
> thing, that really logically goes along with the read-ahead thing), of 
> course, but since most files probably do end up being "traditional file" 
> structures, it's probably not wrong to just have it in the file.
> 

Actually, I was thinking reducing struct file to the bare minimum, and
then using that as the common header shared by object-specific
structures. I don't know how unpleasant that would be from a memory
allocation perspective, though.

-- 
Nicholas Miell <nmiell@comcast.net>


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 6/9] signalfd/timerfd v1 - timerfd core ...
  2007-03-11  6:18                                 ` Nicholas Miell
@ 2007-03-11 16:29                                   ` Linus Torvalds
  0 siblings, 0 replies; 25+ messages in thread
From: Linus Torvalds @ 2007-03-11 16:29 UTC (permalink / raw)
  To: Nicholas Miell; +Cc: Davide Libenzi, Linux Kernel Mailing List, Andrew Morton



On Sat, 10 Mar 2007, Nicholas Miell wrote:
> > 
> > UNIX has pid's for "process" handles, and "file descriptors" for just 
> > about everything else.
> 
> And I imagine that somebody will come up with way of getting a fd for a
> process sooner or later. 

Well, /proc/<pid>/ is about as close as you get. And that's largely 
inspired by a Plan-9'ish thing that does indeed expose processes as files.

The problem with processes is that they are actually so *complicated* that 
trying to describe them with a single file isn't all that useful (you 
could use tons of different ioctl's to do different operations, but that's 
against the "stream of bytes" model in UNIX, and even more so against the 
whole Plan-9 model).

> Actually, I was thinking reducing struct file to the bare minimum, and
> then using that as the common header shared by object-specific
> structures. I don't know how unpleasant that would be from a memory
> allocation perspective, though.

It would probably not be a bad idea, but I just doubt that it makes much 
of a difference, at least not for timerfd/signalfd files. There likely 
just won't be that many of them (I'd expect that processes that use them 
would normally just have one or two of each).

It might be more relevant for things like sockets and pty's: do a

	ls -l /proc/*/fd

and see what kind of files you have open, and I suspect most of the files 
will actually be sockets on a normal desktop setup, and even more so on 
some network server thing. And yes, it might be nice to avoid allocating 
memory for the (unnecessary) readahead and f_pos state, but in the end 
you seldom really have all *that* much memory allocated for file 
descriptors. The real memory use ends up being elsewhere..

IOW, I don't think it's a bad idea per se, I just doubt that it is worth 
the complexity and effort.

		Linus

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2007-03-11 16:29 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-03-09 23:41 [patch 6/9] signalfd/timerfd v1 - timerfd core Davide Libenzi
2007-03-10  6:33 ` Nicholas Miell
2007-03-10  6:38   ` Davide Libenzi
2007-03-10  6:43     ` Nicholas Miell
2007-03-10  6:53       ` Davide Libenzi
2007-03-10  7:09         ` Nicholas Miell
2007-03-10  7:36           ` Davide Libenzi
2007-03-10 19:52             ` Nicholas Miell
2007-03-10 20:41               ` Davide Libenzi
2007-03-10 21:01                 ` Nicholas Miell
2007-03-10 21:44                   ` Linus Torvalds
2007-03-10 21:56                     ` Nicholas Miell
2007-03-10 22:42                       ` Linus Torvalds
2007-03-11  0:25                         ` Nicholas Miell
2007-03-11  0:35                           ` Linus Torvalds
2007-03-11  1:49                             ` Nicholas Miell
2007-03-11  1:57                               ` Davide Libenzi
2007-03-11  2:09                                 ` Nicholas Miell
2007-03-11  5:31                               ` Linus Torvalds
2007-03-11  6:18                                 ` Nicholas Miell
2007-03-11 16:29                                   ` Linus Torvalds
2007-03-11  3:42                         ` Davide Libenzi
2007-03-11  5:35                           ` Linus Torvalds
2007-03-11  5:44                             ` Davide Libenzi
2007-03-10 22:30                   ` Davide Libenzi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox