Linux userland API discussions

Linux userland API discussions
 help / color / mirror / Atom feed

* [PATCH RFC 3/6] epoll: Add definition for epoll_mod_wait structures
From: Fam Zheng @ 2015-01-20  9:57 UTC (permalink / raw)
  To: linux-kernel
  Cc: Thomas Gleixner, Ingo Molnar, H. Peter Anvin, x86, Alexander Viro,
	Andrew Morton, Kees Cook, Andy Lutomirski, David Herrmann,
	Alexei Starovoitov, Miklos Szeredi, David Drysdale, Oleg Nesterov,
	David S. Miller, Vivek Goyal, Mike Frysinger, Theodore Ts'o,
	Heiko Carstens, Rasmus Villemoes, Rashika Kheria, Hugh Dickins,
	Mathieu Desnoyers, Fam Zheng, Peter Zijlstra <peter>
In-Reply-To: <1421747878-30744-1-git-send-email-famz@redhat.com>

Two structs involved in the coming syscall are defined. Flags in epoll_mod_cmd
are reserved, which makes better word alignment and may allow future extension.

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 include/uapi/linux/eventpoll.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index bc81fb2..e32a804 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -18,6 +18,8 @@
 #include <linux/fcntl.h>
 #include <linux/types.h>
 
+#include <linux/signal.h>
+
 /* Flags for epoll_create1.  */
 #define EPOLL_CLOEXEC O_CLOEXEC
 
@@ -61,6 +63,24 @@ struct epoll_event {
 	__u64 data;
 } EPOLL_PACKED;
 
+struct epoll_mod_cmd {
+	int flags;
+	int op;
+	int fd;
+	__u32 events;
+	__u64 data;
+	int error;
+} EPOLL_PACKED;
+
+struct epoll_wait_spec {
+	int maxevents;
+	struct epoll_event *events;
+	int clockid;
+	struct timespec timeout;
+	sigset_t *sigmask;
+	size_t sigsetsize;
+} EPOLL_PACKED;
+
 #ifdef CONFIG_PM_SLEEP
 static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
 {
-- 
1.9.3

^ permalink raw reply related

* [PATCH RFC 2/6] epoll: Specify clockid explicitly
From: Fam Zheng @ 2015-01-20  9:57 UTC (permalink / raw)
  To: linux-kernel
  Cc: Thomas Gleixner, Ingo Molnar, H. Peter Anvin, x86, Alexander Viro,
	Andrew Morton, Kees Cook, Andy Lutomirski, David Herrmann,
	Alexei Starovoitov, Miklos Szeredi, David Drysdale, Oleg Nesterov,
	David S. Miller, Vivek Goyal, Mike Frysinger, Theodore Ts'o,
	Heiko Carstens, Rasmus Villemoes, Rashika Kheria, Hugh Dickins,
	Mathieu Desnoyers, Fam Zheng, Peter Zijlstra <peter>
In-Reply-To: <1421747878-30744-1-git-send-email-famz@redhat.com>

Later we will add clockid in the interface, so let's start using explicit
clockid internally. Now we specify CLOCK_MONOTONIC, which is the same as before.

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 fs/eventpoll.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4cf359d..6da143f 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1570,7 +1570,7 @@ static int ep_send_events(struct eventpoll *ep,
  *          error code, in case of error.
  */
 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
-		   int maxevents, const ktime_t timeout)
+		   int maxevents, int clockid, const ktime_t timeout)
 {
 	int res = 0, eavail, timed_out = 0;
 	unsigned long flags;
@@ -1624,7 +1624,8 @@ fetch_events:
 			}
 
 			spin_unlock_irqrestore(&ep->lock, flags);
-			if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+			if (!schedule_hrtimeout_range_clock(to, slack,
+						HRTIMER_MODE_ABS, clockid))
 				timed_out = 1;
 
 			spin_lock_irqsave(&ep->lock, flags);
@@ -1945,7 +1946,8 @@ error_return:
 }
 
 static inline int epoll_wait_do(int epfd, struct epoll_event __user *events,
-				int maxevents, const ktime_t timeout)
+				int maxevents, int clockid, 
+				const ktime_t timeout)
 {
 	int error;
 	struct fd f;
@@ -1979,7 +1981,7 @@ static inline int epoll_wait_do(int epfd, struct epoll_event __user *events,
 	ep = f.file->private_data;
 
 	/* Time to fish for events ... */
-	error = ep_poll(ep, events, maxevents, timeout);
+	error = ep_poll(ep, events, maxevents, clockid, timeout);
 
 error_fput:
 	fdput(f);
@@ -1994,12 +1996,13 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
 		int, maxevents, int, timeout)
 {
 	ktime_t kt = ms_to_ktime(timeout);
-	return epoll_wait_do(epfd, events, maxevents, kt);
+	return epoll_wait_do(epfd, events, maxevents, CLOCK_MONOTONIC, kt);
 }
 
 static inline int epoll_pwait_do(int epfd, struct epoll_event __user *events,
-				 int maxevents, ktime_t timeout,
-				 sigset_t *sigmask, size_t sigsetsize)
+				 int maxevents,
+				 int clockid, ktime_t timeout,
+				 sigset_t *sigmask)
 {
 	int error;
 	sigset_t sigsaved;
@@ -2013,7 +2016,7 @@ static inline int epoll_pwait_do(int epfd, struct epoll_event __user *events,
 		set_current_blocked(sigmask);
 	}
 
-	error = epoll_wait_do(epfd, events, maxevents, timeout);
+	error = epoll_wait_do(epfd, events, maxevents, clockid, timeout);
 
 	/*
 	 * If we changed the signal mask, we need to restore the original one.
@@ -2050,8 +2053,8 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
 		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
 			return -EFAULT;
 	}
-	return epoll_pwait_do(epfd, events, maxevents, kt,
-			      sigmask ? &ksigmask : NULL, sigsetsize);
+	return epoll_pwait_do(epfd, events, maxevents, CLOCK_MONOTONIC, kt,
+			      sigmask ? &ksigmask : NULL);
 }
 
 #ifdef CONFIG_COMPAT
@@ -2073,8 +2076,8 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 		sigset_from_compat(&ksigmask, &csigmask);
 	}
 
-	return epoll_pwait_do(epfd, events, maxevents, kt,
-			      sigmask ? &ksigmask : NULL, sigsetsize);
+	return epoll_pwait_do(epfd, events, maxevents, CLOCK_MONOTONIC, kt,
+			      sigmask ? &ksigmask : NULL);
 }
 #endif
 
-- 
1.9.3

^ permalink raw reply related

* [PATCH RFC 1/6] epoll: Extract epoll_wait_do and epoll_pwait_do
From: Fam Zheng @ 2015-01-20  9:57 UTC (permalink / raw)
  To: linux-kernel
  Cc: Thomas Gleixner, Ingo Molnar, H. Peter Anvin, x86, Alexander Viro,
	Andrew Morton, Kees Cook, Andy Lutomirski, David Herrmann,
	Alexei Starovoitov, Miklos Szeredi, David Drysdale, Oleg Nesterov,
	David S. Miller, Vivek Goyal, Mike Frysinger, Theodore Ts'o,
	Heiko Carstens, Rasmus Villemoes, Rashika Kheria, Hugh Dickins,
	Mathieu Desnoyers, Fam Zheng, Peter Zijlstra <peter>
In-Reply-To: <1421747878-30744-1-git-send-email-famz@redhat.com>

In preparation of epoll_mod_wait, this patch allows reusing the code from
epoll_pwait implementation. The new functions use ktime_t for more accuracy.

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 fs/eventpoll.c | 130 ++++++++++++++++++++++++++-------------------------------
 1 file changed, 59 insertions(+), 71 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index d77f944..4cf359d 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1554,17 +1554,6 @@ static int ep_send_events(struct eventpoll *ep,
 	return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
 }
 
-static inline struct timespec ep_set_mstimeout(long ms)
-{
-	struct timespec now, ts = {
-		.tv_sec = ms / MSEC_PER_SEC,
-		.tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
-	};
-
-	ktime_get_ts(&now);
-	return timespec_add_safe(now, ts);
-}
-
 /**
  * ep_poll - Retrieves ready events, and delivers them to the caller supplied
  *           event buffer.
@@ -1573,17 +1562,15 @@ static inline struct timespec ep_set_mstimeout(long ms)
  * @events: Pointer to the userspace buffer where the ready events should be
  *          stored.
  * @maxevents: Size (in terms of number of events) of the caller event buffer.
- * @timeout: Maximum timeout for the ready events fetch operation, in
- *           milliseconds. If the @timeout is zero, the function will not block,
- *           while if the @timeout is less than zero, the function will block
- *           until at least one event has been retrieved (or an error
- *           occurred).
+ * @timeout: Maximum timeout for the ready events fetch operation.  If 0, the
+ *           function will not block. If negative, the function will block until
+ *           at least one event has been retrieved (or an error occurred).
  *
  * Returns: Returns the number of ready events which have been fetched, or an
  *          error code, in case of error.
  */
 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
-		   int maxevents, long timeout)
+		   int maxevents, const ktime_t timeout)
 {
 	int res = 0, eavail, timed_out = 0;
 	unsigned long flags;
@@ -1591,13 +1578,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	wait_queue_t wait;
 	ktime_t expires, *to = NULL;
 
-	if (timeout > 0) {
-		struct timespec end_time = ep_set_mstimeout(timeout);
-
-		slack = select_estimate_accuracy(&end_time);
-		to = &expires;
-		*to = timespec_to_ktime(end_time);
-	} else if (timeout == 0) {
+	if (!ktime_to_ns(timeout)) {
 		/*
 		 * Avoid the unnecessary trip to the wait queue loop, if the
 		 * caller specified a non blocking operation.
@@ -1605,6 +1586,15 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		timed_out = 1;
 		spin_lock_irqsave(&ep->lock, flags);
 		goto check_events;
+	} else if (ktime_to_ns(timeout) > 0) {
+		struct timespec now, end_time;
+
+		ktime_get_ts(&now);
+		end_time = timespec_add_safe(now, ktime_to_timespec(timeout));
+
+		slack = select_estimate_accuracy(&end_time);
+		to = &expires;
+		*to = timespec_to_ktime(end_time);
 	}
 
 fetch_events:
@@ -1954,12 +1944,8 @@ error_return:
 	return error;
 }
 
-/*
- * Implement the event wait interface for the eventpoll file. It is the kernel
- * part of the user space epoll_wait(2).
- */
-SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
-		int, maxevents, int, timeout)
+static inline int epoll_wait_do(int epfd, struct epoll_event __user *events,
+				int maxevents, const ktime_t timeout)
 {
 	int error;
 	struct fd f;
@@ -2002,29 +1988,32 @@ error_fput:
 
 /*
  * Implement the event wait interface for the eventpoll file. It is the kernel
- * part of the user space epoll_pwait(2).
+ * part of the user space epoll_wait(2).
  */
-SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
-		int, maxevents, int, timeout, const sigset_t __user *, sigmask,
-		size_t, sigsetsize)
+SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
+		int, maxevents, int, timeout)
+{
+	ktime_t kt = ms_to_ktime(timeout);
+	return epoll_wait_do(epfd, events, maxevents, kt);
+}
+
+static inline int epoll_pwait_do(int epfd, struct epoll_event __user *events,
+				 int maxevents, ktime_t timeout,
+				 sigset_t *sigmask, size_t sigsetsize)
 {
 	int error;
-	sigset_t ksigmask, sigsaved;
+	sigset_t sigsaved;
 
 	/*
 	 * If the caller wants a certain signal mask to be set during the wait,
 	 * we apply it here.
 	 */
 	if (sigmask) {
-		if (sigsetsize != sizeof(sigset_t))
-			return -EINVAL;
-		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
-			return -EFAULT;
 		sigsaved = current->blocked;
-		set_current_blocked(&ksigmask);
+		set_current_blocked(sigmask);
 	}
 
-	error = sys_epoll_wait(epfd, events, maxevents, timeout);
+	error = epoll_wait_do(epfd, events, maxevents, timeout);
 
 	/*
 	 * If we changed the signal mask, we need to restore the original one.
@@ -2044,49 +2033,48 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
 	return error;
 }
 
+/*
+ * Implement the event wait interface for the eventpoll file. It is the kernel
+ * part of the user space epoll_pwait(2).
+ */
+SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
+		int, maxevents, int, timeout, const sigset_t __user *, sigmask,
+		size_t, sigsetsize)
+{
+	ktime_t kt = ms_to_ktime(timeout);
+	sigset_t ksigmask;
+
+	if (sigmask) {
+		if (sigsetsize != sizeof(sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
+			return -EFAULT;
+	}
+	return epoll_pwait_do(epfd, events, maxevents, kt,
+			      sigmask ? &ksigmask : NULL, sigsetsize);
+}
+
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
-			struct epoll_event __user *, events,
-			int, maxevents, int, timeout,
-			const compat_sigset_t __user *, sigmask,
-			compat_size_t, sigsetsize)
+		       struct epoll_event __user *, events,
+		       int, maxevents, int, timeout,
+		       const compat_sigset_t __user *, sigmask,
+		       compat_size_t, sigsetsize)
 {
-	long err;
 	compat_sigset_t csigmask;
-	sigset_t ksigmask, sigsaved;
+	sigset_t ksigmask;
+	ktime_t kt = ms_to_ktime(timeout);
 
-	/*
-	 * If the caller wants a certain signal mask to be set during the wait,
-	 * we apply it here.
-	 */
 	if (sigmask) {
 		if (sigsetsize != sizeof(compat_sigset_t))
 			return -EINVAL;
 		if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
 			return -EFAULT;
 		sigset_from_compat(&ksigmask, &csigmask);
-		sigsaved = current->blocked;
-		set_current_blocked(&ksigmask);
-	}
-
-	err = sys_epoll_wait(epfd, events, maxevents, timeout);
-
-	/*
-	 * If we changed the signal mask, we need to restore the original one.
-	 * In case we've got a signal while waiting, we do not restore the
-	 * signal mask yet, and we allow do_signal() to deliver the signal on
-	 * the way back to userspace, before the signal mask is restored.
-	 */
-	if (sigmask) {
-		if (err == -EINTR) {
-			memcpy(&current->saved_sigmask, &sigsaved,
-			       sizeof(sigsaved));
-			set_restore_sigmask();
-		} else
-			set_current_blocked(&sigsaved);
 	}
 
-	return err;
+	return epoll_pwait_do(epfd, events, maxevents, kt,
+			      sigmask ? &ksigmask : NULL, sigsetsize);
 }
 #endif
 
-- 
1.9.3


^ permalink raw reply related

* [PATCH RFC 0/6] epoll: Introduce new syscall "epoll_mod_wait"
From: Fam Zheng @ 2015-01-20  9:57 UTC (permalink / raw)
  To: linux-kernel
  Cc: Thomas Gleixner, Ingo Molnar, H. Peter Anvin, x86, Alexander Viro,
	Andrew Morton, Kees Cook, Andy Lutomirski, David Herrmann,
	Alexei Starovoitov, Miklos Szeredi, David Drysdale, Oleg Nesterov,
	David S. Miller, Vivek Goyal, Mike Frysinger, Theodore Ts'o,
	Heiko Carstens, Rasmus Villemoes, Rashika Kheria, Hugh Dickins,
	Mathieu Desnoyers, Fam Zheng, Peter Zijlstra <peter>

This adds a new system call, epoll_mod_wait. It's described as below:

NAME
       epoll_mod_wait - modify and wait for I/O events on an epoll file
                        descriptor

SYNOPSIS

       int epoll_mod_wait(int epfd, int flags,
                          int ncmds, struct epoll_mod_cmd *cmds,
                          struct epoll_wait_spec *spec);

DESCRIPTION

       The epoll_mod_wait() system call can be seen as an enhanced combination
       of several epoll_ctl(2) calls, which are followed by an epoll_pwait(2)
       call. It is superior in two cases:
       
       1) When epoll_ctl(2) are followed by epoll_wait(2), using epoll_mod_wait
       will save context switches between user mode and kernel mode;
       
       2) When you need higher precision than millisecond for wait timeout.

       The epoll_ctl(2) operations are embedded into this call with ncmds
       and cmds. The latter is an array of command structs:

           struct epoll_mod_cmd {

                  /* Reserved flags for future extension, must be 0 for now. */
                  int flags;

                  /* The same as epoll_ctl() op parameter. */
                  int op;

                  /* The same as epoll_ctl() fd parameter. */
                  int fd;

                  /* The same as the "events" field in struct epoll_event. */
                  uint32_t events;

                  /* The same as the "data" field in struct epoll_event. */
                  uint64_t data;

                  /* Output field, will be set to the return code once this
                   * command is executed by kernel */
                  int error;
           };
       
       There is no guarantee that all the commands are executed in order. Only
       if all the commands are successfully executed (all the error fields are
       set to 0), events are polled.

       The last parameter "spec" is a pointer to struct epoll_wait_spec, which
       contains the information about how to poll the events. If it's NULL, this
       call will immediately return after running all the commands in cmds.

       The structure is defined as below:

           struct epoll_wait_spec {

                  /* The same as "maxevents" in epoll_pwait() */
                  int maxevents;

                  /* The same as "events" in epoll_pwait() */
                  struct epoll_event *events;

                  /* Which clock to use for timeout */
                  int clockid;

                  /* Maximum time to wait if there is no event */
                  struct timespec timeout;

                  /* The same as "sigmask" in epoll_pwait() */
                  sigset_t *sigmask;

                  /* The same as "sigsetsize" in epoll_pwait() */
                  size_t sigsetsize;
           } EPOLL_PACKED;

RETURN VALUE

       When any error occurs, epoll_mod_wait() returns -1 and errno is set
       appropriately. All the "error" fields in cmds are unchanged before they
       are executed, and if any cmds are executed, the "error" fields are set
       to a return code accordingly. See also epoll_ctl for more details of the
       return code.

       When successful, epoll_mod_wait() returns the number of file
       descriptors ready for the requested I/O, or zero if no file descriptor
       became ready during the requested timeout.

       If spec is NULL, it returns 0 if all the commands are successful, and -1
       if an error occurred.

ERRORS

       These errors apply on either the return value of epoll_mod_wait or error
       status for each command, respectively.

       EBADF  epfd or fd is not a valid file descriptor.

       EFAULT The memory area pointed to by events is not accessible with write
              permissions.

       EINTR  The call was interrupted by a signal handler before either any of
              the requested events occurred or the timeout expired; see
              signal(7).

       EINVAL epfd is not an epoll file descriptor, or maxevents is less than
              or equal to zero, or fd is the same as epfd, or the requested
              operation op is not supported by this interface.

       EEXIST op was EPOLL_CTL_ADD, and the supplied file descriptor fd is
              already registered with this epoll instance.

       ENOENT op was EPOLL_CTL_MOD or EPOLL_CTL_DEL, and fd is not registered
              with this epoll instance.

       ENOMEM There was insufficient memory to handle the requested op control
              operation.

       ENOSPC The limit imposed by /proc/sys/fs/epoll/max_user_watches was
              encountered while trying to register (EPOLL_CTL_ADD) a new file
              descriptor on an epoll instance.  See epoll(7) for further
              details.

       EPERM  The target file fd does not support epoll.

CONFORMING TO

       epoll_mod_wait() is Linux-specific.

SEE ALSO

       epoll_create(2), epoll_ctl(2), epoll_wait(2), epoll_pwait(2), epoll(7)

Fam Zheng (6):
  epoll: Extract epoll_wait_do and epoll_pwait_do
  epoll: Specify clockid explicitly
  epoll: Add definition for epoll_mod_wait structures
  epoll: Extract ep_ctl_do
  epoll: Add implementation for epoll_mod_wait
  x86: Hook up epoll_mod_wait syscall

 arch/x86/syscalls/syscall_32.tbl |   1 +
 arch/x86/syscalls/syscall_64.tbl |   1 +
 fs/eventpoll.c                   | 219 +++++++++++++++++++++++++--------------
 include/linux/syscalls.h         |   5 +
 include/uapi/linux/eventpoll.h   |  20 ++++
 5 files changed, 167 insertions(+), 79 deletions(-)

-- 
1.9.3

^ permalink raw reply

* Re: [PATCH v5 5/5] tty/serial: Add Spreadtrum sc9836-uart driver support
From: Orson Zhai @ 2015-01-20  8:41 UTC (permalink / raw)
  To: Lyra Zhang
  Cc: Rob Herring, Chunyan Zhang, Greg Kroah-Hartman, Mark Rutland,
	Arnd Bergmann, One Thousand Gnomes, Mark Brown, Rob Herring,
	Pawel Moll, Ian Campbell, Kumar Gala, Will Deacon,
	Catalin Marinas, Jiri Slaby, Jason Cooper, Heiko Stübner,
	Florian Vaussard, Andrew Lunn, Robert Richter, Hayato Suzuki,
	Grant Likely, Antony Pavlov
In-Reply-To: <CAAfSe-tAwURc_P+-0+m22ao9r+Fud6Ae89JF8FGsWgg_49Mdhg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Tue, Jan 20, 2015 at 3:37 PM, Lyra Zhang <zhang.lyra-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> Hi, Rob
>
> I still have a question to be conform, specific describes below:
>
> On Mon, Jan 19, 2015 at 10:11 PM, Rob Herring <robherring2-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
>> On Mon, Jan 19, 2015 at 3:55 AM, Lyra Zhang <zhang.lyra-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
>>> On Sat, Jan 17, 2015 at 12:41 AM, Rob Herring <robherring2-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
>>>> On Fri, Jan 16, 2015 at 4:00 AM, Chunyan Zhang
>>>> <chunyan.zhang-lxIno14LUO0EEoCn2XhGlw@public.gmane.org> wrote:
>>>>> Add a full sc9836-uart driver for SC9836 SoC which is based on the
>>>>> spreadtrum sharkl64 platform.
>>>>> This driver also support earlycon.
>>>>> This patch also replaced the spaces between the macros and their
>>>>> values with the tabs in serial_core.h
>>
>> [...]
>>
>>>>> +static int __init sprd_serial_init(void)
>>>>> +{
>>>>> +       int ret = 0;
>>>>> +
>>>>> +       ret = uart_register_driver(&sprd_uart_driver);
>>>>
>>>> This can be done in probe now. Then you can use module_platform_driver().
>>>>
>>>
>>> Question:
>>> 1. there are 4 uart ports configured in dt for sprd_serial, so probe
>>> will be called 4 times, but uart_register_driver only needs to be
>>> called one time, so can we use uart_driver.state to check if
>>> uart_register_driver has already been called ?
>>
>> Yes.
>>
>>> 2. if I use module_platform_driver() instead of
>>> module_init(sprd_serial_init)  and  module_exit(sprd_serial_exit) , I
>>> must move uart_unregister_driver() which is now processed in
>>> sprd_serial_exit() to sprd_remove(), there is a similar problem with
>>> probe(), sprd_remove() will also be called 4 times, and actually it
>>> should be called only one time. How can we deal with this case?
>>
>> Look at pl01x or Samsung UART drivers which have done this conversion.
>
> Samsung UART does use module_platform_driver, but pl010/pl011 doesn't.
> In the Samsung UART driver, uart_unregister_driver is processed in
> remove(), like below:
>
> static int s3c24xx_serial_remove(struct platform_device *dev)
> {
>     struct uart_port *port = s3c24xx_dev_to_port(&dev->dev);
>
>     if (port) {
>         s3c24xx_serial_cpufreq_deregister(to_ourport(port));
>         uart_remove_one_port(&s3c24xx_uart_drv, port);
>     }
>
>     uart_unregister_driver(&s3c24xx_uart_drv);
> }
>
> if this serial has more than one ports, uart_unregister_driver() must
> be called multiple times when the device need to be removed.
> I think there may be a problem because that uart_unregister_driver()
> will do kfree(drv->state) every time when it's called.

I think it is not appropriate to call uart_unregister_driver() at first
port removing.
The drv->state buffer was shared with all uart ports.
If there are some cases that only 1 port is needed to be removed, that
will destroy all others, isn't it?

Regards,
Orson


>
> Thanks,
> Chunyan
>
>>
>>> 3. for the second question, we can check the platform_device->id, if
>>> it is equal to the index of last port (e.g. 4 for this case), then
>>> uart_unregister_driver() can be called. Does it work correctly? since
>>> for this case, we must keep the order of releasing ports.
>>
>> The id will not be the line index in the DT case. I don't think you
>> can guarantee the order either.
>>
>> It would be better to make uart_{un}register_driver deal with being
>> called multiple times so drivers don't have to deal with getting this
>> correct. I'm not sure if that is feasible though.
>>
>> Rob
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: kdbus: add documentation
From: Daniel Mack @ 2015-01-20  8:25 UTC (permalink / raw)
  To: Michael Kerrisk (man-pages), Florian Weimer, David Herrmann,
	Greg Kroah-Hartman
  Cc: Arnd Bergmann, Eric W. Biederman, One Thousand Gnomes,
	Tom Gundersen, Jiri Kosina, Andy Lutomirski, Linux API,
	linux-kernel, Djalal Harouni
In-Reply-To: <54BE0D56.5090301-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

Hi Michael,

On 01/20/2015 09:09 AM, Michael Kerrisk (man-pages) wrote:
> On 11/30/2014 06:23 PM, Florian Weimer wrote:
>> * David Herrmann:
>>
>>> On Sun, Nov 30, 2014 at 10:02 AM, Florian Weimer <fw-d32yF4oPJVt0XxTmqZlbVQ@public.gmane.org> wrote:
>>>> * Greg Kroah-Hartman:
>>>>
>>>>> +7.4 Receiving messages
>>
>>>> What happens if this is not possible because the file descriptor limit
>>>> of the processes would be exceeded?  EMFILE, and the message will not
>>>> be received?
>>>
>>> The message is returned without installing the FDs. This is signaled
>>> by EMFILE, but a valid pool offset.
>>
>> Oh.  This is really surprising, so it needs documentation.  But it's
>> probably better than the alternative (return EMFILE and leave the
>> message stuck, so that you receive it immediately again—this behavior
>> makes non-blocking accept rather difficult to use correctly).
> 
> So, was this point in the end explicitly documented? It is not
> obvious that it is documented in the revised kdbus.txt that
> Greg K-H sent out 4 days ago.

No, we've revisited this point and changed the kernel behavior again in
v3. We're no longer returning -EMFILE in this case, but rather set
KDBUS_RECV_RETURN_INCOMPLETE_FDS in a new field in the receive ioctl
struct called 'return_flags'. We believe that's a nicer way of signaling
specific errors. The message will carry -1 for all FDs that failed to
get installed, so the user can actually see which one is missing.

That's also documented in kdbus.txt, but we missed putting it into the
Changelog - sorry for that.


Hope this helps,
Daniel

^ permalink raw reply

* Re: kdbus: add documentation
From: Michael Kerrisk (man-pages) @ 2015-01-20  8:09 UTC (permalink / raw)
  To: Florian Weimer, David Herrmann, Greg Kroah-Hartman, Daniel Mack
  Cc: mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w, Arnd Bergmann,
	Eric W. Biederman, One Thousand Gnomes, Tom Gundersen,
	Jiri Kosina, Andy Lutomirski, Linux API, linux-kernel,
	Djalal Harouni
In-Reply-To: <87vblwtxee.fsf-ZqZwdwZz9NfTBotR3TxKnbNAH6kLmebB@public.gmane.org>

Daniel,  David,

On 11/30/2014 06:23 PM, Florian Weimer wrote:
> * David Herrmann:
> 
>> On Sun, Nov 30, 2014 at 10:02 AM, Florian Weimer <fw-d32yF4oPJVt0XxTmqZlbVQ@public.gmane.org> wrote:
>>> * Greg Kroah-Hartman:
>>>
>>>> +7.4 Receiving messages
> 
>>> What happens if this is not possible because the file descriptor limit
>>> of the processes would be exceeded?  EMFILE, and the message will not
>>> be received?
>>
>> The message is returned without installing the FDs. This is signaled
>> by EMFILE, but a valid pool offset.
> 
> Oh.  This is really surprising, so it needs documentation.  But it's
> probably better than the alternative (return EMFILE and leave the
> message stuck, so that you receive it immediately again—this behavior
> makes non-blocking accept rather difficult to use correctly).

So, was this point in the end explicitly documented? It is not
obvious that it is documented in the revised kdbus.txt that
Greg K-H sent out 4 days ago.

Thanks,

Michael


-- 
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/

^ permalink raw reply

* Re: [PATCH v5 5/5] tty/serial: Add Spreadtrum sc9836-uart driver support
From: Lyra Zhang @ 2015-01-20  7:37 UTC (permalink / raw)
  To: Rob Herring
  Cc: Chunyan Zhang, Greg Kroah-Hartman, Mark Rutland, Arnd Bergmann,
	One Thousand Gnomes, Mark Brown, Rob Herring, Pawel Moll,
	Ian Campbell, Kumar Gala, Will Deacon, Catalin Marinas,
	Jiri Slaby, Jason Cooper, Heiko Stübner, Florian Vaussard,
	Andrew Lunn, Robert Richter, Hayato Suzuki, Grant Likely,
	Antony Pavlov, Joel Schopp
In-Reply-To: <CAL_JsqLTArm1v8ka39_KLq2jMKkK2ZXQ6-an=JSWVgTK1LWmOg@mail.gmail.com>

Hi, Rob

I still have a question to be conform, specific describes below:

On Mon, Jan 19, 2015 at 10:11 PM, Rob Herring <robherring2@gmail.com> wrote:
> On Mon, Jan 19, 2015 at 3:55 AM, Lyra Zhang <zhang.lyra@gmail.com> wrote:
>> On Sat, Jan 17, 2015 at 12:41 AM, Rob Herring <robherring2@gmail.com> wrote:
>>> On Fri, Jan 16, 2015 at 4:00 AM, Chunyan Zhang
>>> <chunyan.zhang@spreadtrum.com> wrote:
>>>> Add a full sc9836-uart driver for SC9836 SoC which is based on the
>>>> spreadtrum sharkl64 platform.
>>>> This driver also support earlycon.
>>>> This patch also replaced the spaces between the macros and their
>>>> values with the tabs in serial_core.h
>
> [...]
>
>>>> +static int __init sprd_serial_init(void)
>>>> +{
>>>> +       int ret = 0;
>>>> +
>>>> +       ret = uart_register_driver(&sprd_uart_driver);
>>>
>>> This can be done in probe now. Then you can use module_platform_driver().
>>>
>>
>> Question:
>> 1. there are 4 uart ports configured in dt for sprd_serial, so probe
>> will be called 4 times, but uart_register_driver only needs to be
>> called one time, so can we use uart_driver.state to check if
>> uart_register_driver has already been called ?
>
> Yes.
>
>> 2. if I use module_platform_driver() instead of
>> module_init(sprd_serial_init)  and  module_exit(sprd_serial_exit) , I
>> must move uart_unregister_driver() which is now processed in
>> sprd_serial_exit() to sprd_remove(), there is a similar problem with
>> probe(), sprd_remove() will also be called 4 times, and actually it
>> should be called only one time. How can we deal with this case?
>
> Look at pl01x or Samsung UART drivers which have done this conversion.

Samsung UART does use module_platform_driver, but pl010/pl011 doesn't.
In the Samsung UART driver, uart_unregister_driver is processed in
remove(), like below:

static int s3c24xx_serial_remove(struct platform_device *dev)
{
    struct uart_port *port = s3c24xx_dev_to_port(&dev->dev);

    if (port) {
        s3c24xx_serial_cpufreq_deregister(to_ourport(port));
        uart_remove_one_port(&s3c24xx_uart_drv, port);
    }

    uart_unregister_driver(&s3c24xx_uart_drv);
}

if this serial has more than one ports, uart_unregister_driver() must
be called multiple times when the device need to be removed.
I think there may be a problem because that uart_unregister_driver()
will do kfree(drv->state) every time when it's called.

Thanks,
Chunyan

>
>> 3. for the second question, we can check the platform_device->id, if
>> it is equal to the index of last port (e.g. 4 for this case), then
>> uart_unregister_driver() can be called. Does it work correctly? since
>> for this case, we must keep the order of releasing ports.
>
> The id will not be the line index in the DT case. I don't think you
> can guarantee the order either.
>
> It would be better to make uart_{un}register_driver deal with being
> called multiple times so drivers don't have to deal with getting this
> correct. I'm not sure if that is feasible though.
>
> Rob

^ permalink raw reply

* Re: Re: [PATCH tip 0/9] tracing: attach eBPF programs to tracepoints/syscalls/kprobe
From: Alexei Starovoitov @ 2015-01-20  3:55 UTC (permalink / raw)
  To: Masami Hiramatsu
  Cc: Ingo Molnar, Steven Rostedt, Namhyung Kim,
	Arnaldo Carvalho de Melo, Jiri Olsa, David S. Miller,
	Daniel Borkmann, Hannes Frederic Sowa, Brendan Gregg, Linux API,
	Network Development, LKML, zhangwei(Jovi),
	yrl.pp-manager.tt-FCd8Q96Dh0JBDgjK7y7TUQ@public.gmane.org

On Mon, Jan 19, 2015 at 6:58 PM, Masami Hiramatsu
<masami.hiramatsu.pt-FCd8Q96Dh0JBDgjK7y7TUQ@public.gmane.org> wrote:
>>
>> it's done already... one can do the same skb->dev->name logic
>> in kprobe attached program... so from bpf program point of view,
>> tracepoints and kprobes feature-wise are exactly the same.
>> Only input is different.
>
> No, I meant that the input should also be same, at least for the first step.
> I guess it is easy to hook the ring buffer committing and fetch arguments
> from the event entry.

No. That would be very slow. See my comment to Steven
and more detailed numbers below.
Allocating ring buffer takes too much time.

> And what I expected scenario was
>
> 1. setup kprobe traceevent with fd, buf, count by using perf-probe.
> 2. load bpf module
> 3. the module processes given event arguments.

from ring buffer? that's too slow.
It's not usable for high frequency events which
need this in-kernel aggregation.
If events are rare, then just dumping everything
into trace buffer is just fine. No in-kernel program is needed.

> Hmm, it sounds like making another systemtap on top of tracepoints and kprobes.
> Why don't you just reuse the existing facilities (perftools and ftrace)
> instead of co-existing with them?

hmm. I don't think we're on the same page yet...
ring buffer and tracing interface is fully reused.
programs are run as soon as event triggers.
They can return non-zero and kernel will allocate ring
buffer which user space will consume.
Please take a look at tracex1

>> Just look how ktap scripts look alike for kprobes and tracepoints.
>
> Ktap is a good example, it provides only a language parser and a runtime engine.
> Actually, currently it lacks a feature to execute "perf-probe" helper from
> script, but it is easy to add such feature.
...
> For this usecase, I've made --output option for perf probe
> https://lkml.org/lkml/2014/10/31/210

you're proposing to call perf binary from ktap binary?
I think packaging headaches and error conditions
will make such approach very hard to use.
it would be much cleaner to have ktap as part of perf
generating bpf on the fly and feeding into kernel.
'perf probe' parsing and functions don't belong in kernel
when userspace can generate them in more efficient way.

Speaking of performance...
I've added temporary tracepoint like this:
TRACE_EVENT(sys_write,
        TP_PROTO(int count),
        TP_fast_assign(
                __entry->cnt = count;
        ),
and call it from SYSCALL_DEFINE3(write,..., count):
 trace_sys_write(count);

and run the following test:
dd if=/dev/zero of=/dev/null count=5000000

1.19343 s, 2.1 GB/s - raw base line
1.53301 s, 1.7 GB/s - echo 1 > enable
1.62742 s, 1.6 GB/s - echo cnt==1234 > filter
and profile looks like:
     6.23%  dd       [kernel.vmlinux]  [k] __clear_user
     6.19%  dd       [kernel.vmlinux]  [k] __srcu_read_lock
     5.94%  dd       [kernel.vmlinux]  [k] system_call
     4.54%  dd       [kernel.vmlinux]  [k] __srcu_read_unlock
     4.14%  dd       [kernel.vmlinux]  [k] system_call_after_swapgs
     3.96%  dd       [kernel.vmlinux]  [k] fsnotify
     3.74%  dd       [kernel.vmlinux]  [k] ring_buffer_discard_commit
     3.18%  dd       [kernel.vmlinux]  [k] rb_reserve_next_event
     1.69%  dd       [kernel.vmlinux]  [k] rb_add_time_stamp

the slowdown due to unconditional buffer allocation
is too high to use this in production for aggregation
of high frequency events.
There is little reason to run bpf program in kernel after
such penalty. User space can just read trace_pipe_raw
and process data there.

Now if program is run right after tracepoint fires
the profile will look like:
    10.01%  dd             [kernel.vmlinux]            [k] __clear_user
     7.50%  dd             [kernel.vmlinux]            [k] system_call
     6.95%  dd             [kernel.vmlinux]            [k] __srcu_read_lock
     6.02%  dd             [kernel.vmlinux]            [k] __srcu_read_unlock
...
     1.15%  dd             [kernel.vmlinux]            [k] ftrace_raw_event_sys_write
     0.90%  dd             [kernel.vmlinux]            [k] __bpf_prog_run
this is much more usable.
For empty bpf program that does 'return 0':
1.23418 s, 2.1 GB/s
For full tracex4 example that does map[log2(count)]++
1.2589 s, 2.0 GB/s

so the cost of doing such in-kernel aggregation is
1.19/1.25 is ~ 5%
which makes the whole solution usable as live
monitoring/analytics tool.
We would only need good set of tracepoints.
kprobe via fentry overhead is also not cheap.
Same tracex4 example via kprobe (instead of tracepoint)
1.45673 s, 1.8 GB/s
So tracepoints are 1.45/1.25 ~ 15% faster than kprobes.
which is huge when the cost of running bpf program
is just 5%.

^ permalink raw reply

* Re: [PATCH 2/6] selftests: Add install target
From: Michael Ellerman @ 2015-01-20  3:13 UTC (permalink / raw)
  To: Shuah Khan
  Cc: linux-kernel, mmarek, gregkh, akpm, rostedt, mingo, davem,
	keescook, tranmanphong, cov, dh.herrmann, hughd, bobby.prani,
	serge.hallyn, ebiederm, tim.bird, josh, koct9i, linux-kbuild,
	linux-api, netdev
In-Reply-To: <54BD326A.6030409@osg.samsung.com>

On Mon, 2015-01-19 at 09:35 -0700, Shuah Khan wrote:
> On 01/18/2015 05:35 PM, Michael Ellerman wrote:
> > On Fri, 2015-01-16 at 10:46 -0700, Shuah Khan wrote:
> >> On 01/09/2015 02:06 AM, Michael Ellerman wrote:
> >>> This adds make install support to selftests. The basic usage is:
> >>>
> >>> $ cd tools/testing/selftests
> >>> $ make install
> >>>
> >>> That installs into tools/testing/selftests/install, which can then be
> >>> copied wherever necessary.
> >>>
> >>> The install destination is also configurable using eg:
> >>>
> >>> $ INSTALL_PATH=/mnt/selftests make install
> >>
> >> Please see my response to [PATCH 4/6] kbuild: add a new
> >> kselftest_install make target to install selftests
> >>
> >> These are addressed by the current approach to use existing
> >> INSTALL_MOD_PATH.
> > 
> > No that's a separate issue.
> > 
> > This patch adds install support for tools/testing/selftests, *completely
> > separate* from the kbuild infrastructure. 
> 
> What's the use-case for this feature? I don't see why we need multiple
> ways to do the install?

Exactly the use case I described in the sentence above.

Currently the selftests directory is usable on its own. You can copy the
selftests directory somewhere and it is functional. That is a useful feature,
and there's no reason to break it.

cheers

^ permalink raw reply

* Re: [PATCH v8 1/2] crypto: AF_ALG: add AEAD support
From: Stephan Mueller @ 2015-01-20  3:08 UTC (permalink / raw)
  To: Herbert Xu
  Cc: 'Quentin Gouchet', Daniel Borkmann,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-crypto-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20150120030017.GA10475-lOAM2aK0SrRLBo1qDEOMRrpzq4S04n8Q@public.gmane.org>

Am Dienstag, 20. Januar 2015, 14:00:17 schrieb Herbert Xu:

Hi Herbert,

>On Fri, Jan 09, 2015 at 04:30:45AM +0100, Stephan Mueller wrote:
>> Am Donnerstag, 8. Januar 2015, 22:09:31 schrieb Herbert Xu:
>> 
>> Hi Herbert,
>> 
>> > On Wed, Jan 07, 2015 at 04:51:38PM +0100, Stephan Mueller wrote:
>> > > +		if (!aead_writable(sk)) {
>> > > +			/*
>> > > +			 * If there is more data to be expected, but we cannot
>> > > +			 * write more data, forcefully define that we do not
>> > > +			 * expect more data to invoke the AEAD operation. This
>> > > +			 * prevents a deadlock in user space.
>> > > +			 */
>> > > +			ctx->more = 0;
>> > 
>> > We should return EMSGSIZE here.  Also we should clear out the
>> > existing data so that the socket may be reused again.
>> 
>> Is this really wise considering that we want to support a threaded
>> caller? For example, one thread sends data and another reads data.
>> For some reason, the reading thread is throttled or slower than the
>> sender. Now, with the current solution, the sender is put on hold
>> (i.e. throttled) until the reader can catch up. I.e. we have an
>> automated synchronization between sender/receiver.
>> 
>> Thus, when we remove the wait here and return an error, the sender
>> will be shut down and there is no synchronization of the
>> reader/writer any more.
>> 
>> Note, the same applies to the very similar code in aead_sendpage too.
>
>No, if we're in this case then something seriously wrong has
>happened.  IOW the application writer has screwed up.  We're
>not able to carry out the wish of user-space because of resource
>limits on the socket.  Attempting to continue at this point will
>only cause confusion.
>
>So we should loudly declare that there was an error.

Ok. Your suggestion implies that it needs to be removed in aead_sendmsg 
and aead_sendpage. That in turn implies aead_wait_for_wmem can go as 
well.

Also, my previous suggestion with MSG_TRUNC can be removed as well.

I will do that with my next installment.

Ciao
Stephan

^ permalink raw reply

* Re: [PATCH 1/6] selftests: Introduce minimal shared logic for running tests
From: Michael Ellerman @ 2015-01-20  3:08 UTC (permalink / raw)
  To: Shuah Khan
  Cc: linux-kernel, mmarek, gregkh, akpm, rostedt, mingo, davem,
	keescook, tranmanphong, cov, dh.herrmann, hughd, bobby.prani,
	serge.hallyn, ebiederm, tim.bird, josh, koct9i, linux-kbuild,
	linux-api, netdev
In-Reply-To: <54BD332D.6010907@osg.samsung.com>

On Mon, 2015-01-19 at 09:39 -0700, Shuah Khan wrote:
> On 01/18/2015 05:35 PM, Michael Ellerman wrote:
> > On Fri, 2015-01-16 at 10:53 -0700, Shuah Khan wrote:
> >> On 01/09/2015 02:06 AM, Michael Ellerman wrote:
> >>> This adds a Make include file which most selftests can then include to
> >>> get the run_tests logic.
> >>>
> >>> On its own this has the advantage of some reduction in repetition, and
> >>> also means the pass/fail message is defined in fewer places.
> >>>
> >>> However the key advantage is it will allow us to implement install very
> >>> simply in a subsequent patch.
> >>>
> >>> The default implementation just executes each program in $(TEST_PROGS).
> >>>
> >>> We use a variable to hold the default implementation of $(RUN_TESTS)
> >>> because that gives us a clean way to override it if necessary, ie. using
> >>> override. The mount, memory-hotplug and mqueue tests use that to provide
> >>> a different implementation.
> >>>
> >>> Tests are not run via /bin/bash, so if they are scripts they must be
> >>> executable, we add u+x to several.
> >>>
> >>> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
> >>
> >> I like the shared logic approach in general provided it leaves the
> >> flexibility to not use the shared logic if a test has the need to
> >> do so.
> > 
> > Yes of course it does, it's entirely optional to include lib.mk.
> > 
> >> This series requires some patch planning. shared logic patch
> >> followed by individual test patches as opposed to a single patch.
> > 
> > It could be a single patch too, but there's no reason to do it that way. The
> > series works fine as I sent it.
> > 
> >> I would like to see the shared logic work done on top of my patch v4
> >> series.
> > 
> > That's a waste of time. This series replaces your v4. Doing this "on top" of
> > your v4 would just mean reverting your v4 series and then applying this.
> 
> Not necessarily, if the work is done as an evolutionary step. In any case,
> I want the first step install target support going into the upcoming
> release and then make improvements to it. Please send separate patch
> for the shared logic and individual test patches that use the shared
> logic if you would like to make the improvements.

No that's pointless.

My series does everything yours does, and more, and is less code.

It is ready to merge in the next release, you just need to remove your series
and merge it.

I'm happy to change the default install path or change other minor details, but
it's pointless to merge your series and then remove it all to merge mine.

cheers

^ permalink raw reply

* Re: [PATCH v8 1/2] crypto: AF_ALG: add AEAD support
From: Herbert Xu @ 2015-01-20  3:00 UTC (permalink / raw)
  To: Stephan Mueller
  Cc: 'Quentin Gouchet', Daniel Borkmann,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-crypto-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1639027.yRSjDuRfFC-PJstQz4BMNNP20K/wil9xYQuADTiUCJX@public.gmane.org>

On Fri, Jan 09, 2015 at 04:30:45AM +0100, Stephan Mueller wrote:
> Am Donnerstag, 8. Januar 2015, 22:09:31 schrieb Herbert Xu:
> 
> Hi Herbert,
> 
> > On Wed, Jan 07, 2015 at 04:51:38PM +0100, Stephan Mueller wrote:
> > > +		if (!aead_writable(sk)) {
> > > +			/*
> > > +			 * If there is more data to be expected, but we cannot
> > > +			 * write more data, forcefully define that we do not
> > > +			 * expect more data to invoke the AEAD operation. This
> > > +			 * prevents a deadlock in user space.
> > > +			 */
> > > +			ctx->more = 0;
> > 
> > We should return EMSGSIZE here.  Also we should clear out the
> > existing data so that the socket may be reused again.
> 
> Is this really wise considering that we want to support a threaded caller? For 
> example, one thread sends data and another reads data. For some reason, the 
> reading thread is throttled or slower than the sender. Now, with the current 
> solution, the sender is put on hold (i.e. throttled) until the reader can 
> catch up. I.e. we have an automated synchronization between sender/receiver.
> 
> Thus, when we remove the wait here and return an error, the sender will be 
> shut down and there is no synchronization of the reader/writer any more.
> 
> Note, the same applies to the very similar code in aead_sendpage too.

No, if we're in this case then something seriously wrong has
happened.  IOW the application writer has screwed up.  We're
not able to carry out the wish of user-space because of resource
limits on the socket.  Attempting to continue at this point will
only cause confusion.

So we should loudly declare that there was an error.

> > > +	ctx->more = msg->msg_flags & MSG_MORE;
> > > +	if (!ctx->more && !aead_sufficient_data(ctx))
> > > +		err = -EINVAL;
> > 
> > Ditto, we should discard the data that's queued up.  Also perhaps
> > use EBADMSG instead of EINVAL.
> 
> Agreed that we should clear out the buffer. I will provide that in the next 
> release for both, the sendmsg and sendpage implementations.
> 
> However, I am not sure whether using EBADMSG is a good idea. The error of 
> EBADMSG in the kernel crypto API is only used for integrity errors of AEAD 
> ciphers. But our error case here has nothing to do with the integrity error.
> 
> I would be fine with any other error number -- EMSGSIZE as you suggested 
> above?

Sure.

> Do you think such an approach makes sense? If yes, which limit on the 
> number of rsgl should we apply -- is ALG_MAX_PAGES good?

Yes I think your solution in v10 is fine.  The current kernel
AEAD interface isn't the best but we're stuck with it for the
time being so this is the best we can do.

Cheers,
-- 
Email: Herbert Xu <herbert-lOAM2aK0SrRLBo1qDEOMRrpzq4S04n8Q@public.gmane.org>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: Re: [PATCH tip 0/9] tracing: attach eBPF programs to tracepoints/syscalls/kprobe
From: Masami Hiramatsu @ 2015-01-20  2:58 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Ingo Molnar, Steven Rostedt, Namhyung Kim,
	Arnaldo Carvalho de Melo, Jiri Olsa, David S. Miller,
	Daniel Borkmann, Hannes Frederic Sowa, Brendan Gregg, Linux API,
	Network Development, LKML, zhangwei(Jovi),
	yrl.pp-manager.tt@hitachi.com
In-Reply-To: <CAMEtUuwzx-HZqEaTS30JFfF_RX5kGaZfsgE8wmogzcb2k5=k1g@mail.gmail.com>

(2015/01/20 5:48), Alexei Starovoitov wrote:
> On Mon, Jan 19, 2015 at 1:52 AM, Masami Hiramatsu
> <masami.hiramatsu.pt@hitachi.com> wrote:
>> If we can write the script as
>>
>> int bpf_prog4(s64 write_size)
>> {
>>    ...
>> }
>>
>> This will be much easier to play with.
> 
> yes. that's the intent for user space to do.
> 
>>>   The example of this arbitrary pointer walking is tracex1_kern.c
>>>   which does skb->dev->name == "lo" filtering.
>>
>> At least I would like to see this way on kprobes event too, since it should be
>> treated as a traceevent.
> 
> it's done already... one can do the same skb->dev->name logic
> in kprobe attached program... so from bpf program point of view,
> tracepoints and kprobes feature-wise are exactly the same.
> Only input is different.

No, I meant that the input should also be same, at least for the first step.
I guess it is easy to hook the ring buffer committing and fetch arguments
from the event entry.

>>> - kprobe programs are architecture dependent and need user scripting
>>>   language like ktap/stap/dtrace/perf that will dynamically generate
>>>   them based on debug info in vmlinux
>>
>> If we can use kprobe event as a normal traceevent, user scripting can be
>> architecture independent too. Only perf-probe fills the gap. All other
>> userspace tools can collaborate with perf-probe to setup the events.
>> If so, we can avoid redundant works on debuginfo. That is my point.
> 
> yes. perf already has infra to read debug info and it can be extended
> to understand C like script as:
> int kprobe:sys_write(int fd, char *buf, size_t count)
> {
>    // do stuff with 'count'
> }
> perf can be made to parse this text, recognize that it wants
> to create kprobe on 'sys_write' function. Then based on
> debuginfo figure out where 'count' is (either register or stack)
> and generate corresponding bpf program either
> using llvm/gcc backends or directly.

And what I expected scenario was

1. setup kprobe traceevent with fd, buf, count by using perf-probe.
2. load bpf module
3. the module processes given event arguments.

> perf facility of extracting debug info can be made into
> library too and used by ktap/dtrace tools for their
> languages.
> User space can innovate in many directions.
> and, yes, once we have a scripting language whether
> it's C like with perf or else, this language hides architecture
> depend things from users.
> Such scripting language will also hide the kernel
> side differences between tracepoint and kprobe.

Hmm, it sounds like making another systemtap on top of tracepoints and kprobes.
Why don't you just reuse the existing facilities (perftools and ftrace)
instead of co-existing with them?

> Just look how ktap scripts look alike for kprobes and tracepoints.

Ktap is a good example, it provides only a language parser and a runtime engine.
Actually, currently it lacks a feature to execute "perf-probe" helper from
script, but it is easy to add such feature.

Jovi, if you use the perf-probe helper, you could do

trace probe:do_sys_open dfd fname flags mode {
...
}

instead of

trace probe:do_sys_open dfd=%di fname=%dx flags=%cx mode=+4($stack) {
...
}

For this usecase, I've made --output option for perf probe
https://lkml.org/lkml/2014/10/31/210

That work is currently stalled, but it would be easy to resume on the latest perf.

Thank you,

> Whether ktap syntax becomes part of perf or perf invents
> its own language, it's going to be good for users regardless.
> The C examples here are just examples. Something
> users can play with already until more user friendly
> tools are being worked on.


-- 
Masami HIRAMATSU
Software Platform Research Dept. Linux Technology Research Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: masami.hiramatsu.pt@hitachi.com

^ permalink raw reply

* Re: [PATCH v3 00/13] Add kdbus implementation
From: Greg Kroah-Hartman @ 2015-01-20  1:13 UTC (permalink / raw)
  To: Johannes Stezenbach
  Cc: arnd-r2nGTMty4D4, ebiederm-aS9lmoZGLiVWk0Htik3J/w,
	gnomes-qBU/x9rampVanCEyBjwyrvXRex20P6io, teg-B22kvLQNl6c,
	jkosina-AlSwsSmVLrQ, luto-kltTT9wpgjJwATOyAt5JVQ,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	daniel-cYrQPVfZoowdnm+yROfE0A, dh.herrmann-Re5JQEeQqe8AvxtiuMwx3w,
	tixxdz-Umm1ozX2/EEdnm+yROfE0A
In-Reply-To: <20150119233812.GA1874-FF7aIK3TAVNeoWH0uzbU5w@public.gmane.org>

On Tue, Jan 20, 2015 at 12:38:12AM +0100, Johannes Stezenbach wrote:
> On Tue, Jan 20, 2015 at 04:31:55AM +0800, Greg Kroah-Hartman wrote:
> > On Mon, Jan 19, 2015 at 09:19:06PM +0100, Johannes Stezenbach wrote:
> > > These two statements somehow contradict. From my admittedly very
> > > limited experience, I never used D-Bus because it did not
> > > fit my usage scenarios: I never needed a bus, only point-to-point
> > > links like pipes or sockets.
> > 
> > Great, then you don't need this, no need to worry about it at all, why
> > are we having this conversation? :)
> 
> Well, for one because that's what I wanted to find out...
> 
> > > Well, it made your intentions a bit clearer, but it does
> > > not help to sell kdbus to me, sorry ;-/
> > 
> > It's not my "goal" to sell kdbus to you, if you don't want it, great,
> 
> I used this language because I think you're not providing
> the facts that would allow me to judge for myself whether
> kdbus is a good idea.  Those automotive applications you
> were talking about, what was the OS they were ported from
> and what was the messaging API they used?

They were ported from QNX and I don't know the exact api, it is wrapped
up in a library layer for them to use.  And typically, they run about
40 thousand messages in the first few seconds of startup time.  Or was
it 400 thousand?  Something huge and crazy to be doing on tiny ARM
chips, but that's the IVI industry for you :(

> > But odds are, you are using a system with D-Bus today, if not, then you
> > are using Linux in a very specific and limited manner, which is
> > wonderful, in that case this whole thread isn't really pertinent.
> > 
> > Lots of people do use D-Bus, and for those users, that is what this
> > patchset is for.
> 
> As I said before, I'm seeing about a dozen D-Bus messages per minute,
> nothing that would justify adding kdbus to the kernel for
> performance reasons.  Wrt security I'm also not aware of any
> open issues with D-Bus.  Thus I doubt normal users of D-Bus
> would see any benefit from kdbus.  I also think none of the
> applications I can install from my distribution has any performance
> issue with D-Bus.

That's because people have not done anything really needing performance
on the desktop over D-Bus in the past due to how slow the current
implementation is.  Now that this is being resolved, that can change,
and there are demos out there of even streaming audio over kdbus with no
problems.

But performance is not the only reason we want this in the kernel,
I listed a whole long range of them.  Sure, it's great to now be faster,
cutting down the number of context switches and copies by a huge amount,
but the other things are equally important for future development
(namespaces, containers, security, early-boot, etc.)

> And this is the point where I ask myself if I missed something.

Don't focus purely on performance for your existing desktop system,
that's not the only use case here.  There are lots of others, as I
document, that can benefit and want this.

One "fun" thing I've been talking to someone about is the ability to
even port binder to be on top of kdbus.  But that's just a research
project, and requires some API changes on the userspace binder side, but
it shows real promise, and would then mean that we could deprecate the
old binder code and a few hundred million devices could then use kdbus
instead.  But that's long-term goals, not really all that relevant here,
but it shows that having a solid bus IPC mechanism is a powerful thing
that we have been missing in the past from Linux.

thanks,

greg k-h

^ permalink raw reply

* Re: [PATCH v3 00/13] Add kdbus implementation
From: Johannes Stezenbach @ 2015-01-19 23:38 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: arnd-r2nGTMty4D4, ebiederm-aS9lmoZGLiVWk0Htik3J/w,
	gnomes-qBU/x9rampVanCEyBjwyrvXRex20P6io, teg-B22kvLQNl6c,
	jkosina-AlSwsSmVLrQ, luto-kltTT9wpgjJwATOyAt5JVQ,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	daniel-cYrQPVfZoowdnm+yROfE0A, dh.herrmann-Re5JQEeQqe8AvxtiuMwx3w,
	tixxdz-Umm1ozX2/EEdnm+yROfE0A
In-Reply-To: <20150119203155.GA15441-U8xfFu+wG4EAvxtiuMwx3w@public.gmane.org>

On Tue, Jan 20, 2015 at 04:31:55AM +0800, Greg Kroah-Hartman wrote:
> On Mon, Jan 19, 2015 at 09:19:06PM +0100, Johannes Stezenbach wrote:
> > These two statements somehow contradict. From my admittedly very
> > limited experience, I never used D-Bus because it did not
> > fit my usage scenarios: I never needed a bus, only point-to-point
> > links like pipes or sockets.
> 
> Great, then you don't need this, no need to worry about it at all, why
> are we having this conversation? :)

Well, for one because that's what I wanted to find out...

> > Well, it made your intentions a bit clearer, but it does
> > not help to sell kdbus to me, sorry ;-/
> 
> It's not my "goal" to sell kdbus to you, if you don't want it, great,

I used this language because I think you're not providing
the facts that would allow me to judge for myself whether
kdbus is a good idea.  Those automotive applications you
were talking about, what was the OS they were ported from
and what was the messaging API they used?

> But odds are, you are using a system with D-Bus today, if not, then you
> are using Linux in a very specific and limited manner, which is
> wonderful, in that case this whole thread isn't really pertinent.
> 
> Lots of people do use D-Bus, and for those users, that is what this
> patchset is for.

As I said before, I'm seeing about a dozen D-Bus messages per minute,
nothing that would justify adding kdbus to the kernel for
performance reasons.  Wrt security I'm also not aware of any
open issues with D-Bus.  Thus I doubt normal users of D-Bus
would see any benefit from kdbus.  I also think none of the
applications I can install from my distribution has any performance
issue with D-Bus.

And this is the point where I ask myself if I missed something.

Thanks,
Johannes

^ permalink raw reply

* Re: [PATCH tip 0/9] tracing: attach eBPF programs to tracepoints/syscalls/kprobe
From: Alexei Starovoitov @ 2015-01-19 20:48 UTC (permalink / raw)
  To: Masami Hiramatsu
  Cc: Ingo Molnar, Steven Rostedt, Namhyung Kim,
	Arnaldo Carvalho de Melo, Jiri Olsa, David S. Miller,
	Daniel Borkmann, Hannes Frederic Sowa, Brendan Gregg, Linux API,
	Network Development, LKML
In-Reply-To: <54BCD3CF.9040205@hitachi.com>

On Mon, Jan 19, 2015 at 1:52 AM, Masami Hiramatsu
<masami.hiramatsu.pt@hitachi.com> wrote:
> If we can write the script as
>
> int bpf_prog4(s64 write_size)
> {
>    ...
> }
>
> This will be much easier to play with.

yes. that's the intent for user space to do.

>>   The example of this arbitrary pointer walking is tracex1_kern.c
>>   which does skb->dev->name == "lo" filtering.
>
> At least I would like to see this way on kprobes event too, since it should be
> treated as a traceevent.

it's done already... one can do the same skb->dev->name logic
in kprobe attached program... so from bpf program point of view,
tracepoints and kprobes feature-wise are exactly the same.
Only input is different.

>> - kprobe programs are architecture dependent and need user scripting
>>   language like ktap/stap/dtrace/perf that will dynamically generate
>>   them based on debug info in vmlinux
>
> If we can use kprobe event as a normal traceevent, user scripting can be
> architecture independent too. Only perf-probe fills the gap. All other
> userspace tools can collaborate with perf-probe to setup the events.
> If so, we can avoid redundant works on debuginfo. That is my point.

yes. perf already has infra to read debug info and it can be extended
to understand C like script as:
int kprobe:sys_write(int fd, char *buf, size_t count)
{
   // do stuff with 'count'
}
perf can be made to parse this text, recognize that it wants
to create kprobe on 'sys_write' function. Then based on
debuginfo figure out where 'count' is (either register or stack)
and generate corresponding bpf program either
using llvm/gcc backends or directly.
perf facility of extracting debug info can be made into
library too and used by ktap/dtrace tools for their
languages.
User space can innovate in many directions.
and, yes, once we have a scripting language, whether
it's C-like with perf or something else, this language hides
architecture-dependent things from users.
Such scripting language will also hide the kernel
side differences between tracepoint and kprobe.
Just look how ktap scripts look alike for kprobes and tracepoints.
Whether ktap syntax becomes part of perf or perf invents
its own language, it's going to be good for users regardless.
The C examples here are just examples. Something
users can play with already until more user friendly
tools are being worked on.

^ permalink raw reply

* Re: [PATCH v3 00/13] Add kdbus implementation
From: Greg Kroah-Hartman @ 2015-01-19 20:31 UTC (permalink / raw)
  To: Johannes Stezenbach
  Cc: arnd, ebiederm, gnomes, teg, jkosina, luto, linux-api,
	linux-kernel, daniel, dh.herrmann, tixxdz
In-Reply-To: <20150119201906.GA337@sig21.net>

On Mon, Jan 19, 2015 at 09:19:06PM +0100, Johannes Stezenbach wrote:
> On Tue, Jan 20, 2015 at 02:38:06AM +0800, Greg Kroah-Hartman wrote:
> > Yes, I do agree, there are lots of existing ipc solutions today that
> > kdbus is not designed for, nor would it be good to use it for.  The
> > majority of them being IPC that crosses the network layer, as there are
> > lots of good solutions today for that problem.  That being said, I do
> > know one research group that has kdbus working cross-network, just "to
> > try it out", but I don't know what ever came of it.
> ...
> > Everyone uses D-Bus today for everything on their system, so by
> > replacing the underlying library with kdbus, they will continue to use
> > it for everything without having to change any application or library
> > code at all.
> 
> These two statements somehow contradict. From my admittedly very
> limited experience, I never used D-Bus because it did not
> fit my usage scenarios: I never needed a bus, only point-to-point
> links like pipes or sockets.

Great, then you don't need this, no need to worry about it at all, why
are we having this conversation? :)

> Let me rephrase my previous, lengthy mail: Will kdbus only
> support the same IPC model as D-Bus (just with higher
> performance and some bells and whistles), or will it
> be useful for other scenarios?  Like, can two programs
> use it to communicate directly without the need of
> any daemon?  (And if so, would there be any advantage
> compared to traditional UNIX IPC methods?)

It's a totally different model, as you point out from what you are
thinking of "traditional" IPC methods (side note, which of the 15+
current IPC methods do you consider "traditional", we have a lot of them
these days...)

> You were comparing kdbus and Binder.  Why?

Why not?  :)

Seriously, they are related in a way, see my long blog post for all of
the details about it if you are curious.

> So far my impression is that D-Bus and Binder are
> completely separate things, not just because of
> the thread vs. event-loop programming model but
> also because Binder is not a bus (i.e. no multicast messaging).

People compare them a lot, which is why I brought it up, it's a
discussion that needed to be made.

> > Hope this helps,
> 
> Well, it made your intentions a bit clearer, but it does
> not help to sell kdbus to me, sorry ;-/

It's not my "goal" to sell kdbus to you, if you don't want it, great,
don't worry about it, don't build it on your kernels, and the world will
be fine.  Consider it like any other "driver" or filesystem, if you
don't need it, there's nothing to even discuss.

But odds are, you are using a system with D-Bus today, if not, then you
are using Linux in a very specific and limited manner, which is
wonderful, in that case this whole thread isn't really pertinent.

Lots of people do use D-Bus, and for those users, that is what this
patchset is for.

Hope that helps clear things up,

greg k-h

^ permalink raw reply

* Re: [PATCH v3 00/13] Add kdbus implementation
From: Johannes Stezenbach @ 2015-01-19 20:19 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: arnd-r2nGTMty4D4, ebiederm-aS9lmoZGLiVWk0Htik3J/w,
	gnomes-qBU/x9rampVanCEyBjwyrvXRex20P6io, teg-B22kvLQNl6c,
	jkosina-AlSwsSmVLrQ, luto-kltTT9wpgjJwATOyAt5JVQ,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	daniel-cYrQPVfZoowdnm+yROfE0A, dh.herrmann-Re5JQEeQqe8AvxtiuMwx3w,
	tixxdz-Umm1ozX2/EEdnm+yROfE0A
In-Reply-To: <20150119183806.GA8479-U8xfFu+wG4EAvxtiuMwx3w@public.gmane.org>

On Tue, Jan 20, 2015 at 02:38:06AM +0800, Greg Kroah-Hartman wrote:
> Yes, I do agree, there are lots of existing ipc solutions today that
> kdbus is not designed for, nor would it be good to use it for.  The
> majority of them being IPC that crosses the network layer, as there are
> lots of good solutions today for that problem.  That being said, I do
> know one research group that has kdbus working cross-network, just "to
> try it out", but I don't know what ever came of it.
...
> Everyone uses D-Bus today for everything on their system, so by
> replacing the underlying library with kdbus, they will continue to use
> it for everything without having to change any application or library
> code at all.

These two statements somehow contradict. From my admittedly very
limited experience, I never used D-Bus because it did not
fit my usage scenarios: I never needed a bus, only point-to-point
links like pipes or sockets.

Let me rephrase my previous, lengthy mail: Will kdbus only
support the same IPC model as D-Bus (just with higher
performance and some bells and whistles), or will it
be useful for other scenarios?  Like, can two programs
use it to communicate directly without the need of
any daemon?  (And if so, would there be any advantage
compared to traditional UNIX IPC methods?)

You were comparing kdbus and Binder.  Why?
So far my impression is that D-Bus and Binder are
completely separate things, not just because of
the thread vs. event-loop programming model but
also because Binder is not a bus (i.e. no multicast messaging).

> Hope this helps,

Well, it made your intentions a bit clearer, but it does
not help to sell kdbus to me, sorry ;-/

Thanks,
Johannes

^ permalink raw reply

* Re: [PATCH net-next v5 0/4] netns: allow to identify peer netns
From: David Miller @ 2015-01-19 19:16 UTC (permalink / raw)
  To: nicolas.dichtel-pdR9zngts4EAvxtiuMwx3w
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, ebiederm-aS9lmoZGLiVWk0Htik3J/w,
	stephen-OTpzqLSitTUnbdJkjeBofR2eb7JE58TQ,
	akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	luto-kltTT9wpgjJwATOyAt5JVQ, cwang-xCSkyg8dI+0RB7SZvlqPiA
In-Reply-To: <1421331078-21622-1-git-send-email-nicolas.dichtel-pdR9zngts4EAvxtiuMwx3w@public.gmane.org>

From: Nicolas Dichtel <nicolas.dichtel-pdR9zngts4EAvxtiuMwx3w@public.gmane.org>
Date: Thu, 15 Jan 2015 15:11:14 +0100

> The goal of this series is to be able to multicast netlink messages with an
> attribute that identifies a peer netns.
> This is needed by the userland to interpret some information contained in
> netlink messages (like IFLA_LINK value, but also some other attributes in case
> of x-netns netdevice (see also
> http://thread.gmane.org/gmane.linux.network/315933/focus=316064 and
> http://thread.gmane.org/gmane.linux.kernel.containers/28301/focus=4239)).
> 
> Ids of peer netns can be set by userland via a new rtnl cmd RTM_NEWNSID. When
> the kernel needs an id for a peer (for example when advertising a new x-netns
> interface via netlink), if the user didn't allocate an id, one will be
> automatically allocated.
> These ids are stored per netns and are local (ie only valid in the netns where
> they are set). To avoid allocating an int for each peer netns, I use
> idr_for_each() to retrieve the id of a peer netns. Note that it will be possible
> to add a table (struct net -> id) later to optimize this lookup if needed.
> 
> Patch 1/4 introduces the rtnetlink API mechanism to set and get these ids.
> Patches 2/4 and 3/4 implement an example of how to use these ids when advertising
> information about a x-netns interface.
> And patch 4/4 shows that the netlink messages can be symmetric between a GET and
> a SET.
 ...

Series applied, thanks.

^ permalink raw reply

* Re: [PATCH v3 00/13] Add kdbus implementation
From: Greg Kroah-Hartman @ 2015-01-19 18:38 UTC (permalink / raw)
  To: Johannes Stezenbach
  Cc: arnd, ebiederm, gnomes, teg, jkosina, luto, linux-api,
	linux-kernel, daniel, dh.herrmann, tixxdz
In-Reply-To: <20150119180642.GA27957@sig21.net>

On Mon, Jan 19, 2015 at 07:06:42PM +0100, Johannes Stezenbach wrote:
> Hi Greg and Daniel,

[Fixing Daniel's email, which I messed up originally...]

> On Fri, Jan 16, 2015 at 11:16:04AM -0800, Greg Kroah-Hartman wrote:
> > kdbus is a kernel-level IPC implementation that aims for resemblance to
> > the protocol layer with the existing userspace D-Bus daemon while
> > enabling some features that couldn't be implemented before in userspace.
> > 
> > The documentation in the first patch in this series explains the
> > protocol and the API details.
> 
> How about the big picture?
> 
> > Reasons why this should be done in the kernel, instead of userspace as
> > it is currently done today include the following:
> [abbreviated]
> > - performance
> > - security
> > - semantics for apps with heavy data payloads
> 
> 
> First of all I wonder about the relationship with D-Bus.
> http://dbus.freedesktop.org/doc/dbus-specification.html says:
> 
>    D-Bus is designed for two specific use cases:
> 
>        A "system bus" for notifications from the system to user
>        sessions, and to allow the system to request input from
>        user sessions.
> 
>        A "session bus" used to implement desktop environments such
>        as GNOME and KDE. 
> 
>    D-Bus is not intended to be a generic IPC system for any
>    possible application, and intentionally omits many features
>    found in other IPC systems for this reason. 
> 
> Does this also apply to kdbus?  If not, what are the
> suggested uses of kdbus beyond those where D-Bus is
> currently used?

I don't really know.  I have heard from lots of random people who are
starting to look into kdbus as to if it will work for their use cases,
which seem quite varied.  I'll leave it to them to pop up and say if it
will work for them outside of the above specific ways.  But even then,
the above two things are something almost all Linux boxes rely on today,
so it's not like this is a solution searching for a problem to solve :)

> Another related quote by Havoc Pennington:
> http://lists.freedesktop.org/archives/dbus/2012-March/015024.html
> 
>    In general, reading this, I think in some cases there are
>    problems that make sense to fix in dbus, and in other cases
>    there are problems that are best solved by not using dbus.
>    ...
>    there are about 10000 IPC solutions already, from ICE (both of
>    them) to ZeroMQ to AMQP to CORBA to X11 to HTTP to SOAP to
>    WebSockets to SUN-RPC to whatever-the-heck. To me, trying to
>    make dbus configurable so that it can substitute for any of
>    these is a Bad Idea (tm).
> 
> Do you think it also applies to kdbus?

Yes, I do agree, there are lots of existing ipc solutions today that
kdbus is not designed for, nor would it be good to use it for.  The
majority of them being IPC that crosses the network layer, as there are
lots of good solutions today for that problem.  That being said, I do
know one research group that has kdbus working cross-network, just "to
try it out", but I don't know what ever came of it.

> Wrt the performance improvement achieved by kdbus, my impression
> about D-Bus is that the number of messages on my system is
> about a dozen per minute.  Are there actually any existing
> applications using D-Bus that have a performance issue?
> Or is this only about future possible uses?

There are a number of existing applications that have this performance
issue today.  The majority of them have been ported from other operating
systems that have a fast message bus, so their process model is all
about messages.  They use a library layer on Linux to turn that message
bus into D-Bus messages, and have suffered a huge hit in performance
from their previous operating system.  Using kdbus has brought it back
in line to make it reasonable to use.

These applications can be usually found in the Automotive sector, which
has been playing with light-weight dbus library implementations for a
while now, and have done some initial kdbus testing to verify this will
work for them.

> Linked from http://kroah.com/log/blog/2014/01/15/kdbus-details/,
> http://lwn.net/Articles/580194/ "The unveiling of kdbus" says:
> 
>    Unlike most other kernels, Linux has never had a well-designed
>    IPC mechanism. Windows and Mac OS have this feature; even
>    Android, based on Linux, has one in the form of the "binder"
>    subsystem. Linux, instead, has only had the primitives —
>    sockets, FIFOs, and shared memory — but those have never been
>    knitted together into a reasonable application-level API. Kdbus
>    is an attempt to do that knitting and create something that is
>    at least as good as the mechanisms found on other systems.
> 
> These are bold words. I'm not sure what Windows and Mac OS
> have in terms of IPC, but the above suggests that kdbus
> is *the* new Linux IPC that everyone will use for everything,
> rather than a special purpose facility.

Everyone uses D-Bus today for everything on their system, so by
replacing the underlying library with kdbus, they will continue to use
it for everything without having to change any application or library
code at all.

Hope this helps,

greg k-h

^ permalink raw reply

* Re: [PATCH v3 00/13] Add kdbus implementation
From: Johannes Stezenbach @ 2015-01-19 18:33 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: arnd-r2nGTMty4D4, ebiederm-aS9lmoZGLiVWk0Htik3J/w,
	gnomes-qBU/x9rampVanCEyBjwyrvXRex20P6io, teg-B22kvLQNl6c,
	jkosina-AlSwsSmVLrQ, luto-kltTT9wpgjJwATOyAt5JVQ,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Daniel Mack,
	dh.herrmann-Re5JQEeQqe8AvxtiuMwx3w, tixxdz-Umm1ozX2/EEdnm+yROfE0A
In-Reply-To: <1421435777-25306-1-git-send-email-gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r@public.gmane.org>

(resend, fix Daniel's email address)

Hi Greg and Daniel,

I don't have a clue so I need to ask some stupid questions...

On Fri, Jan 16, 2015 at 11:16:04AM -0800, Greg Kroah-Hartman wrote:
> kdbus is a kernel-level IPC implementation that aims for resemblance to
> the protocol layer with the existing userspace D-Bus daemon while
> enabling some features that couldn't be implemented before in userspace.
> 
> The documentation in the first patch in this series explains the
> protocol and the API details.

How about the big picture?

> Reasons why this should be done in the kernel, instead of userspace as
> it is currently done today include the following:
[abbreviated]
> - performance
> - security
> - semantics for apps with heavy data payloads

First of all I wonder about the relationship with D-Bus.
http://dbus.freedesktop.org/doc/dbus-specification.html says:

   D-Bus is designed for two specific use cases:

       A "system bus" for notifications from the system to user
       sessions, and to allow the system to request input from
       user sessions.

       A "session bus" used to implement desktop environments such
       as GNOME and KDE. 

   D-Bus is not intended to be a generic IPC system for any
   possible application, and intentionally omits many features
   found in other IPC systems for this reason. 

Does this also apply to kdbus?  If not, what are the
suggested uses of kdbus beyond those where D-Bus is
currently used?

Another related quote by Havoc Pennington:
http://lists.freedesktop.org/archives/dbus/2012-March/015024.html

   In general, reading this, I think in some cases there are
   problems that make sense to fix in dbus, and in other cases
   there are problems that are best solved by not using dbus.
   ...
   there are about 10000 IPC solutions already, from ICE (both of
   them) to ZeroMQ to AMQP to CORBA to X11 to HTTP to SOAP to
   WebSockets to SUN-RPC to whatever-the-heck. To me, trying to
   make dbus configurable so that it can substitute for any of
   these is a Bad Idea (tm).

Do you think it also applies to kdbus?

Wrt the performance improvement achieved by kdbus, my impression
about D-Bus is that the number of messages on my system is
about a dozen per minute.  Are there actually any existing
applications using D-Bus that have a performance issue?
Or is this only about future possible uses?

Linked from http://kroah.com/log/blog/2014/01/15/kdbus-details/,
http://lwn.net/Articles/580194/ "The unveiling of kdbus" says:

   Unlike most other kernels, Linux has never had a well-designed
   IPC mechanism. Windows and Mac OS have this feature; even
   Android, based on Linux, has one in the form of the "binder"
   subsystem. Linux, instead, has only had the primitives —
   sockets, FIFOs, and shared memory — but those have never been
   knitted together into a reasonable application-level API. Kdbus
   is an attempt to do that knitting and create something that is
   at least as good as the mechanisms found on other systems.

These are bold words. I'm not sure what Windows and Mac OS
have in terms of IPC, but the above suggests that kdbus
is *the* new Linux IPC that everyone will use for everything,
rather than a special purpose facility.
True?

Thanks,
Johannes

^ permalink raw reply

* Re: [PATCH v3 00/13] Add kdbus implementation
From: Johannes Stezenbach @ 2015-01-19 18:06 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: arnd-r2nGTMty4D4, ebiederm-aS9lmoZGLiVWk0Htik3J/w,
	gnomes-qBU/x9rampVanCEyBjwyrvXRex20P6io, teg-B22kvLQNl6c,
	jkosina-AlSwsSmVLrQ, luto-kltTT9wpgjJwATOyAt5JVQ,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	daniel-cYrQPVfZooxQFI55V6+gNQ, dh.herrmann-Re5JQEeQqe8AvxtiuMwx3w,
	tixxdz-Umm1ozX2/EEdnm+yROfE0A
In-Reply-To: <1421435777-25306-1-git-send-email-gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r@public.gmane.org>

Hi Greg and Daniel,

I don't have a clue so I need to ask some stupid questions...

On Fri, Jan 16, 2015 at 11:16:04AM -0800, Greg Kroah-Hartman wrote:
> kdbus is a kernel-level IPC implementation that aims for resemblance to
> the protocol layer with the existing userspace D-Bus daemon while
> enabling some features that couldn't be implemented before in userspace.
> 
> The documentation in the first patch in this series explains the
> protocol and the API details.

How about the big picture?

> Reasons why this should be done in the kernel, instead of userspace as
> it is currently done today include the following:
[abbreviated]
> - performance
> - security
> - semantics for apps with heavy data payloads

First of all I wonder about the relationship with D-Bus.
http://dbus.freedesktop.org/doc/dbus-specification.html says:

   D-Bus is designed for two specific use cases:

       A "system bus" for notifications from the system to user
       sessions, and to allow the system to request input from
       user sessions.

       A "session bus" used to implement desktop environments such
       as GNOME and KDE. 

   D-Bus is not intended to be a generic IPC system for any
   possible application, and intentionally omits many features
   found in other IPC systems for this reason. 

Does this also apply to kdbus?  If not, what are the
suggested uses of kdbus beyond those where D-Bus is
currently used?

Another related quote by Havoc Pennington:
http://lists.freedesktop.org/archives/dbus/2012-March/015024.html

   In general, reading this, I think in some cases there are
   problems that make sense to fix in dbus, and in other cases
   there are problems that are best solved by not using dbus.
   ...
   there are about 10000 IPC solutions already, from ICE (both of
   them) to ZeroMQ to AMQP to CORBA to X11 to HTTP to SOAP to
   WebSockets to SUN-RPC to whatever-the-heck. To me, trying to
   make dbus configurable so that it can substitute for any of
   these is a Bad Idea (tm).

Do you think it also applies to kdbus?

Wrt the performance improvement achieved by kdbus, my impression
about D-Bus is that the number of messages on my system is
about a dozen per minute.  Are there actually any existing
applications using D-Bus that have a performance issue?
Or is this only about future possible uses?

Linked from http://kroah.com/log/blog/2014/01/15/kdbus-details/,
http://lwn.net/Articles/580194/ "The unveiling of kdbus" says:

   Unlike most other kernels, Linux has never had a well-designed
   IPC mechanism. Windows and Mac OS have this feature; even
   Android, based on Linux, has one in the form of the "binder"
   subsystem. Linux, instead, has only had the primitives —
   sockets, FIFOs, and shared memory — but those have never been
   knitted together into a reasonable application-level API. Kdbus
   is an attempt to do that knitting and create something that is
   at least as good as the mechanisms found on other systems.

These are bold words. I'm not sure what Windows and Mac OS
have in terms of IPC, but the above suggests that kdbus
is *the* new Linux IPC that everyone will use for everything,
rather than a special purpose facility.
True?

Thanks,
Johannes

^ permalink raw reply

* Re: [PATCH 1/6] selftests: Introduce minimal shared logic for running tests
From: Shuah Khan @ 2015-01-19 16:39 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: linux-kernel, mmarek, gregkh, akpm, rostedt, mingo, davem,
	keescook, tranmanphong, cov, dh.herrmann, hughd, bobby.prani,
	serge.hallyn, ebiederm, tim.bird, josh, koct9i, linux-kbuild,
	linux-api, netdev
In-Reply-To: <1421627742.3787.6.camel@ellerman.id.au>

On 01/18/2015 05:35 PM, Michael Ellerman wrote:
> On Fri, 2015-01-16 at 10:53 -0700, Shuah Khan wrote:
>> On 01/09/2015 02:06 AM, Michael Ellerman wrote:
>>> This adds a Make include file which most selftests can then include to
>>> get the run_tests logic.
>>>
>>> On its own this has the advantage of some reduction in repetition, and
>>> also means the pass/fail message is defined in fewer places.
>>>
>>> However the key advantage is it will allow us to implement install very
>>> simply in a subsequent patch.
>>>
>>> The default implementation just executes each program in $(TEST_PROGS).
>>>
>>> We use a variable to hold the default implementation of $(RUN_TESTS)
>>> because that gives us a clean way to override it if necessary, ie. using
>>> override. The mount, memory-hotplug and mqueue tests use that to provide
>>> a different implementation.
>>>
>>> Tests are not run via /bin/bash, so if they are scripts they must be
>>> executable, we add u+x to several.
>>>
>>> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
>>
>> I like the shared logic approach in general provided it leaves the
>> flexibility to not use the shared logic if a test has the need to
>> do so.
> 
> Yes of course it does, it's entirely optional to include lib.mk.
> 
>> This series requires some patch planning. shared logic patch
>> followed by individual test patches as opposed to a single patch.
> 
> It could be a single patch too, but there's no reason to do it that way. The
> series works fine as I sent it.
> 
>> I would like to see the shared logic work done on top of my patch v4
>> series.
> 
> That's a waste of time. This series replaces your v4. Doing this "on top" of
> your v4 would just mean reverting your v4 series and then applying this.
> 

Not necessarily if the work is done as an evolutionary step. In any case,
I want the first step install target support going into the upcoming
release and then make improvements to it. Please send separate patch
for the shared logic and individual test patches that use the shared
logic if you would like to make the improvements.

thanks,
-- Shuah


-- 
Shuah Khan
Sr. Linux Kernel Developer
Open Source Innovation Group
Samsung Research America (Silicon Valley)
shuahkh@osg.samsung.com | (970) 217-8978

^ permalink raw reply

* Re: [PATCH 2/6] selftests: Add install target
From: Shuah Khan @ 2015-01-19 16:35 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, mmarek-AlSwsSmVLrQ,
	gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r,
	akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	rostedt-nx8X9YLhiw1AfugRpC6u6w, mingo-H+wXaHxf7aLQT0dZR+AlfA,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q, keescook-F7+t8E8rja9g9hUCZPvPmw,
	tranmanphong-Re5JQEeQqe8AvxtiuMwx3w, cov-sgV2jX0FEOL9JmXXK+q4OQ,
	dh.herrmann-Re5JQEeQqe8AvxtiuMwx3w, hughd-hpIqsD4AKlfQT0dZR+AlfA,
	bobby.prani-Re5JQEeQqe8AvxtiuMwx3w,
	serge.hallyn-GeWIH/nMZzLQT0dZR+AlfA,
	ebiederm-aS9lmoZGLiVWk0Htik3J/w, tim.bird-/MT0OVThwyLZJqsBc5GL+g,
	josh-iaAMLnmF4UmaiuxdJuQwMA, koct9i-Re5JQEeQqe8AvxtiuMwx3w,
	linux-kbuild-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1421627752.3787.8.camel-Gsx/Oe8HsFggBc27wqDAHg@public.gmane.org>

On 01/18/2015 05:35 PM, Michael Ellerman wrote:
> On Fri, 2015-01-16 at 10:46 -0700, Shuah Khan wrote:
>> On 01/09/2015 02:06 AM, Michael Ellerman wrote:
>>> This adds make install support to selftests. The basic usage is:
>>>
>>> $ cd tools/testing/selftests
>>> $ make install
>>>
>>> That installs into tools/testing/selftests/install, which can then be
>>> copied wherever necessary.
>>>
>>> The install destination is also configurable using eg:
>>>
>>> $ INSTALL_PATH=/mnt/selftests make install
>>
>> Please see my response to [PATCH 4/6] kbuild: add a new
>> kselftest_install make target to install selftests
>>
>> These are addressed by the current approach to use existing
>> INSTALL_MOD_PATH.
> 
> No that's a separate issue.
> 
> This patch adds install support for tools/testing/selftests, *completely
> separate* from the kbuild infrastructure. 
> 

What's the use-case for this feature? I don't see why we need multiple
ways to do the install?

thanks,
-- Shuah


-- 
Shuah Khan
Sr. Linux Kernel Developer
Open Source Innovation Group
Samsung Research America (Silicon Valley)
shuahkh-JPH+aEBZ4P+UEJcrhfAQsw@public.gmane.org | (970) 217-8978

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox