Linux userland API discussions
 help / color / mirror / Atom feed
* [PATCH pre-squash 08/14] fixup! virtio_pci: macros for PCI layout offsets
From: Michael S. Tsirkin @ 2015-01-21 15:00 UTC (permalink / raw)
  To: linux-kernel-u79uwXL29TY76Z2rM5mHXA
  Cc: Rusty Russell,
	virtualization-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	Gerd Hoffmann, linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1421852375-22604-1-git-send-email-mst-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

virtio_pci_modern: fix up vendor capability macros

Gerd Hoffmann noticed that we implemented
capability layout from an old draft.
Unfortunately the code was copied to host as well,
so we didn't notice.

Luckily we caught this in time.

This fixes commit "virtio_pci: macros for PCI layout offsets"
and should be smashed with it.

Reported-by: Gerd Hoffmann <kraxel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Tested-by: Gerd Hoffmann <kraxel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Michael S. Tsirkin <mst-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
---
 include/uapi/linux/virtio_pci.h    | 14 +++++---------
 drivers/virtio/virtio_pci_modern.c |  6 ++++--
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h
index 0911c62..3b7e4d2 100644
--- a/include/uapi/linux/virtio_pci.h
+++ b/include/uapi/linux/virtio_pci.h
@@ -124,11 +124,6 @@ struct virtio_pci_cap {
 	__le32 length;		/* Length of the structure, in bytes. */
 };
 
-#define VIRTIO_PCI_CAP_BAR_SHIFT	5
-#define VIRTIO_PCI_CAP_BAR_MASK		0x7
-#define VIRTIO_PCI_CAP_TYPE_SHIFT	0
-#define VIRTIO_PCI_CAP_TYPE_MASK	0x7
-
 struct virtio_pci_notify_cap {
 	struct virtio_pci_cap cap;
 	__le32 notify_off_multiplier;	/* Multiplier for queue_notify_off. */
@@ -164,11 +159,12 @@ struct virtio_pci_common_cfg {
 #define VIRTIO_PCI_CAP_VNDR		0
 #define VIRTIO_PCI_CAP_NEXT		1
 #define VIRTIO_PCI_CAP_LEN		2
-#define VIRTIO_PCI_CAP_TYPE_AND_BAR	3
-#define VIRTIO_PCI_CAP_OFFSET		4
-#define VIRTIO_PCI_CAP_LENGTH		8
+#define VIRTIO_PCI_CAP_CFG_TYPE		3
+#define VIRTIO_PCI_CAP_BAR		4
+#define VIRTIO_PCI_CAP_OFFSET		8
+#define VIRTIO_PCI_CAP_LENGTH		12
 
-#define VIRTIO_PCI_NOTIFY_CAP_MULT	12
+#define VIRTIO_PCI_NOTIFY_CAP_MULT	16
 
 #define VIRTIO_PCI_COMMON_DFSELECT	0
 #define VIRTIO_PCI_COMMON_DF		4
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index c86594e..b2e707ad 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -474,8 +474,10 @@ static inline void check_offsets(void)
 		     offsetof(struct virtio_pci_cap, cap_next));
 	BUILD_BUG_ON(VIRTIO_PCI_CAP_LEN !=
 		     offsetof(struct virtio_pci_cap, cap_len));
-	BUILD_BUG_ON(VIRTIO_PCI_CAP_TYPE_AND_BAR !=
-		     offsetof(struct virtio_pci_cap, type_and_bar));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_CFG_TYPE !=
+		     offsetof(struct virtio_pci_cap, cfg_type));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_BAR !=
+		     offsetof(struct virtio_pci_cap, bar));
 	BUILD_BUG_ON(VIRTIO_PCI_CAP_OFFSET !=
 		     offsetof(struct virtio_pci_cap, offset));
 	BUILD_BUG_ON(VIRTIO_PCI_CAP_LENGTH !=
-- 
MST

^ permalink raw reply related

* [PATCH post-squash 2/9] virtio-pci: define layout for virtio 1.0
From: Michael S. Tsirkin @ 2015-01-21 15:05 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-api, virtualization
In-Reply-To: <1421852670-23742-1-git-send-email-mst@redhat.com>

From: Rusty Russell <rusty@rustcorp.com.au>

Based on patches by Michael S. Tsirkin <mst@redhat.com>, but I found it
hard to follow so changed to use structures which are more
self-documenting.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Gerd Hoffmann <kraxel@redhat.com>
---
 include/uapi/linux/virtio_pci.h | 63 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h
index 509d630..a2b2e13 100644
--- a/include/uapi/linux/virtio_pci.h
+++ b/include/uapi/linux/virtio_pci.h
@@ -99,4 +99,67 @@
 /* Vector value used to disable MSI for queue */
 #define VIRTIO_MSI_NO_VECTOR            0xffff
 
+#ifndef VIRTIO_PCI_NO_MODERN
+
+/* IDs for different capabilities.  Must all exist. */
+
+/* Common configuration */
+#define VIRTIO_PCI_CAP_COMMON_CFG	1
+/* Notifications */
+#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
+/* ISR access */
+#define VIRTIO_PCI_CAP_ISR_CFG		3
+/* Device specific confiuration */
+#define VIRTIO_PCI_CAP_DEVICE_CFG	4
+
+/* This is the PCI capability header: */
+struct virtio_pci_cap {
+	__u8 cap_vndr;		/* Generic PCI field: PCI_CAP_ID_VNDR */
+	__u8 cap_next;		/* Generic PCI field: next ptr. */
+	__u8 cap_len;		/* Generic PCI field: capability length */
+	__u8 cfg_type;		/* Identifies the structure. */
+	__u8 bar;		/* Where to find it. */
+	__u8 padding[3];	/* Pad to full dword. */
+	__le32 offset;		/* Offset within bar. */
+	__le32 length;		/* Length of the structure, in bytes. */
+};
+
+#define VIRTIO_PCI_CAP_BAR_SHIFT	5
+#define VIRTIO_PCI_CAP_BAR_MASK		0x7
+#define VIRTIO_PCI_CAP_TYPE_SHIFT	0
+#define VIRTIO_PCI_CAP_TYPE_MASK	0x7
+
+struct virtio_pci_notify_cap {
+	struct virtio_pci_cap cap;
+	__le32 notify_off_multiplier;	/* Multiplier for queue_notify_off. */
+};
+
+/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
+struct virtio_pci_common_cfg {
+	/* About the whole device. */
+	__le32 device_feature_select;	/* read-write */
+	__le32 device_feature;		/* read-only */
+	__le32 guest_feature_select;	/* read-write */
+	__le32 guest_feature;		/* read-write */
+	__le16 msix_config;		/* read-write */
+	__le16 num_queues;		/* read-only */
+	__u8 device_status;		/* read-write */
+	__u8 config_generation;		/* read-only */
+
+	/* About a specific virtqueue. */
+	__le16 queue_select;		/* read-write */
+	__le16 queue_size;		/* read-write, power of 2. */
+	__le16 queue_msix_vector;	/* read-write */
+	__le16 queue_enable;		/* read-write */
+	__le16 queue_notify_off;	/* read-only */
+	__le32 queue_desc_lo;		/* read-write */
+	__le32 queue_desc_hi;		/* read-write */
+	__le32 queue_avail_lo;		/* read-write */
+	__le32 queue_avail_hi;		/* read-write */
+	__le32 queue_used_lo;		/* read-write */
+	__le32 queue_used_hi;		/* read-write */
+};
+
+#endif /* VIRTIO_PCI_NO_MODERN */
+
 #endif
-- 
MST

^ permalink raw reply related

* [PATCH post-squash 4/9] virtio_pci: macros for PCI layout offsets
From: Michael S. Tsirkin @ 2015-01-21 15:05 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-api, virtualization
In-Reply-To: <1421852670-23742-1-git-send-email-mst@redhat.com>

From: Rusty Russell <rusty@rustcorp.com.au>

QEMU wants it, so why not?  Trust, but verify.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Gerd Hoffmann <kraxel@redhat.com>
---
 include/uapi/linux/virtio_pci.h    | 36 +++++++++++++++++++----
 drivers/virtio/virtio_pci_modern.c | 60 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 90 insertions(+), 6 deletions(-)

diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h
index a2b2e13..3b7e4d2 100644
--- a/include/uapi/linux/virtio_pci.h
+++ b/include/uapi/linux/virtio_pci.h
@@ -124,11 +124,6 @@ struct virtio_pci_cap {
 	__le32 length;		/* Length of the structure, in bytes. */
 };
 
-#define VIRTIO_PCI_CAP_BAR_SHIFT	5
-#define VIRTIO_PCI_CAP_BAR_MASK		0x7
-#define VIRTIO_PCI_CAP_TYPE_SHIFT	0
-#define VIRTIO_PCI_CAP_TYPE_MASK	0x7
-
 struct virtio_pci_notify_cap {
 	struct virtio_pci_cap cap;
 	__le32 notify_off_multiplier;	/* Multiplier for queue_notify_off. */
@@ -160,6 +155,37 @@ struct virtio_pci_common_cfg {
 	__le32 queue_used_hi;		/* read-write */
 };
 
+/* Macro versions of offsets for the Old Timers! */
+#define VIRTIO_PCI_CAP_VNDR		0
+#define VIRTIO_PCI_CAP_NEXT		1
+#define VIRTIO_PCI_CAP_LEN		2
+#define VIRTIO_PCI_CAP_CFG_TYPE		3
+#define VIRTIO_PCI_CAP_BAR		4
+#define VIRTIO_PCI_CAP_OFFSET		8
+#define VIRTIO_PCI_CAP_LENGTH		12
+
+#define VIRTIO_PCI_NOTIFY_CAP_MULT	16
+
+#define VIRTIO_PCI_COMMON_DFSELECT	0
+#define VIRTIO_PCI_COMMON_DF		4
+#define VIRTIO_PCI_COMMON_GFSELECT	8
+#define VIRTIO_PCI_COMMON_GF		12
+#define VIRTIO_PCI_COMMON_MSIX		16
+#define VIRTIO_PCI_COMMON_NUMQ		18
+#define VIRTIO_PCI_COMMON_STATUS	20
+#define VIRTIO_PCI_COMMON_CFGGENERATION	21
+#define VIRTIO_PCI_COMMON_Q_SELECT	22
+#define VIRTIO_PCI_COMMON_Q_SIZE	24
+#define VIRTIO_PCI_COMMON_Q_MSIX	26
+#define VIRTIO_PCI_COMMON_Q_ENABLE	28
+#define VIRTIO_PCI_COMMON_Q_NOFF	30
+#define VIRTIO_PCI_COMMON_Q_DESCLO	32
+#define VIRTIO_PCI_COMMON_Q_DESCHI	36
+#define VIRTIO_PCI_COMMON_Q_AVAILLO	40
+#define VIRTIO_PCI_COMMON_Q_AVAILHI	44
+#define VIRTIO_PCI_COMMON_Q_USEDLO	48
+#define VIRTIO_PCI_COMMON_Q_USEDHI	52
+
 #endif /* VIRTIO_PCI_NO_MODERN */
 
 #endif
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index a3d8101..b2e707ad 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -464,9 +464,67 @@ static void virtio_pci_release_dev(struct device *_d)
 	kfree(vp_dev);
 }
 
-/* TODO: validate the ABI statically. */
+/* This is part of the ABI.  Don't screw with it. */
 static inline void check_offsets(void)
 {
+	/* Note: disk space was harmed in compilation of this function. */
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_VNDR !=
+		     offsetof(struct virtio_pci_cap, cap_vndr));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_NEXT !=
+		     offsetof(struct virtio_pci_cap, cap_next));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_LEN !=
+		     offsetof(struct virtio_pci_cap, cap_len));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_CFG_TYPE !=
+		     offsetof(struct virtio_pci_cap, cfg_type));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_BAR !=
+		     offsetof(struct virtio_pci_cap, bar));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_OFFSET !=
+		     offsetof(struct virtio_pci_cap, offset));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_LENGTH !=
+		     offsetof(struct virtio_pci_cap, length));
+	BUILD_BUG_ON(VIRTIO_PCI_NOTIFY_CAP_MULT !=
+		     offsetof(struct virtio_pci_notify_cap,
+			      notify_off_multiplier));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_DFSELECT !=
+		     offsetof(struct virtio_pci_common_cfg,
+			      device_feature_select));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_DF !=
+		     offsetof(struct virtio_pci_common_cfg, device_feature));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_GFSELECT !=
+		     offsetof(struct virtio_pci_common_cfg,
+			      guest_feature_select));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_GF !=
+		     offsetof(struct virtio_pci_common_cfg, guest_feature));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_MSIX !=
+		     offsetof(struct virtio_pci_common_cfg, msix_config));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_NUMQ !=
+		     offsetof(struct virtio_pci_common_cfg, num_queues));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_STATUS !=
+		     offsetof(struct virtio_pci_common_cfg, device_status));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_CFGGENERATION !=
+		     offsetof(struct virtio_pci_common_cfg, config_generation));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SELECT !=
+		     offsetof(struct virtio_pci_common_cfg, queue_select));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SIZE !=
+		     offsetof(struct virtio_pci_common_cfg, queue_size));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_MSIX !=
+		     offsetof(struct virtio_pci_common_cfg, queue_msix_vector));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_ENABLE !=
+		     offsetof(struct virtio_pci_common_cfg, queue_enable));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_NOFF !=
+		     offsetof(struct virtio_pci_common_cfg, queue_notify_off));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCLO !=
+		     offsetof(struct virtio_pci_common_cfg, queue_desc_lo));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCHI !=
+		     offsetof(struct virtio_pci_common_cfg, queue_desc_hi));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILLO !=
+		     offsetof(struct virtio_pci_common_cfg, queue_avail_lo));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILHI !=
+		     offsetof(struct virtio_pci_common_cfg, queue_avail_hi));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDLO !=
+		     offsetof(struct virtio_pci_common_cfg, queue_used_lo));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDHI !=
+		     offsetof(struct virtio_pci_common_cfg, queue_used_hi));
 }
 
 /* the PCI probing function */
-- 
MST

^ permalink raw reply related

* Re: [PATCH 01/13] kdbus: add documentation
From: Theodore Ts'o @ 2015-01-21 15:19 UTC (permalink / raw)
  To: Michael Kerrisk (man-pages)
  Cc: Daniel Mack, Greg Kroah-Hartman, arnd-r2nGTMty4D4,
	ebiederm-aS9lmoZGLiVWk0Htik3J/w,
	gnomes-qBU/x9rampVanCEyBjwyrvXRex20P6io, teg-B22kvLQNl6c,
	jkosina-AlSwsSmVLrQ, luto-kltTT9wpgjJwATOyAt5JVQ,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	dh.herrmann-Re5JQEeQqe8AvxtiuMwx3w, tixxdz-Umm1ozX2/EEdnm+yROfE0A,
	Johannes Stezenbach
In-Reply-To: <54BF805B.4000609-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

On Wed, Jan 21, 2015 at 11:32:59AM +0100, Michael Kerrisk (man-pages) wrote:
> It's rather an optional driver than a core kernel feature.
> 
> Given the various things that I've seen said about kdbus, the
> preceding sentence makes little sense to me:
> 
> * kdbus will be the framework supporting user-space D-Bus in the
>   future, and also used by systemd, and so on pretty much every 
>   desktop system.

I have to agree with Michael here; it's really, **really**
disengenuous to say that that if you don't want kdbus, you can just
#ifconfig it out.  The fact that it systemd will be using it means
that it will very shortly become a core kernel feature which is
absolutely mandatory.  Sure, maybe it can be configured out for "tiny
kernels", just as in theory we can configure out the VM system for
really tiny embedded systems.  But we should be treating this as
something that is not optional, because the reality is that's the way
it's going to be in very short order.  So if that means to use proper
system calls instead of ioctls, we should do that.

       	     	     		   - Ted

^ permalink raw reply

* Re: [PATCH v4 1/3] perf: Use monotonic clock as a source for timestamps
From: Pawel Moll @ 2015-01-21 15:47 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Richard Cochran, Steven Rostedt, Paul Mackerras,
	Arnaldo Carvalho de Melo, John Stultz, Masami Hiramatsu,
	Christopher Covington, Namhyung Kim, David Ahern, Thomas Gleixner,
	Tomeu Vizoso, linux-kernel@vger.kernel.org,
	linux-api@vger.kernel.org
In-Reply-To: <20150105130141.GQ30905@twins.programming.kicks-ass.net>

On Mon, 2015-01-05 at 13:01 +0000, Peter Zijlstra wrote:
> On Thu, Dec 11, 2014 at 01:39:13PM +0000, Pawel Moll wrote:
> > On Thu, 2014-11-27 at 15:05 +0000, Pawel Moll wrote:
> > > It's been 3 weeks without any negative feedback (no feedback at all, but
> > > I take the optimistic view :-)...
> > > 
> > > How about queuing at least this patch alone for the incoming merge
> > > window? Or at least getting it into -next, with the view at 3.20?
> > 
> > It's been another two weeks... How about getting it queued for v3.20
> > then?
> 
> Yes sorry about that, I had me a kid and took a wee bit of time away
> from computers, then xmas and new years happened.

"a kid" as in: you have forked? :-) It will happen to me in 1.5 month
time (or earlier). All the best luck, we'll both need it ;-)

> In any case, best wishes for the new year to all.

Indeed.

Pawel

^ permalink raw reply

* Re: [PATCH v4 1/3] perf: Use monotonic clock as a source for timestamps
From: Pawel Moll @ 2015-01-21 15:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Richard Cochran, Steven Rostedt, Ingo Molnar, Paul Mackerras,
	Arnaldo Carvalho de Melo, John Stultz, Masami Hiramatsu,
	Christopher Covington, Namhyung Kim, David Ahern, Thomas Gleixner,
	Tomeu Vizoso,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <20150105130035.GP30905-ndre7Fmf5hadTX5a5knrm8zTDFooKrT+cvkQGrU6aU0@public.gmane.org>

On Mon, 2015-01-05 at 13:00 +0000, Peter Zijlstra wrote:
> On Thu, Nov 06, 2014 at 04:51:56PM +0000, Pawel Moll wrote:
> >  Documentation/kernel-parameters.txt |  9 +++++++++
> >  kernel/events/core.c                | 37 +++++++++++++++++++++++++++++++++++++
> >  2 files changed, 46 insertions(+)
> 
> > diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
> > index 4c81a86..8ead8d8 100644
> > --- a/Documentation/kernel-parameters.txt
> > +++ b/Documentation/kernel-parameters.txt
> 
> > @@ -2763,6 +2764,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
> >  			allocator.  This parameter is primarily	for debugging
> >  			and performance comparison.
> >  
> > +	perf_use_local_clock
> > +			[PERF]
> > +			Use local_clock() as a source for perf timestamps
> > +			generation. This was be the default behaviour and
> > +			this parameter can be used to maintain backward
> > +			compatibility or on older hardware with expensive
> > +			monotonic clock source.
> > +
> >  	pf.		[PARIDE]
> >  			See Documentation/blockdev/paride.txt.
> 
> So I'm always terminally confused on the naming of kernel parameters,
> sometimes things are modules (even when they're not actually =m capable)
> and get a module::foo naming or so and sometimes they're not.

I guess you mean module.foo?

> So we want to use the module naming thing or not?

Honestly, I don't mind either way. For one thing ftrace doesn't bother
and just uses __setup() as well.

> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index 2b02c9f..5d0aa03 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> 
> > @@ -322,8 +323,41 @@ extern __weak const char *perf_pmu_name(void)
> >  	return "pmu";
> >  }
> >  
> > +static bool perf_use_local_clock;
> > +static int __init perf_use_local_clock_setup(char *__unused)
> > +{
> > +	perf_use_local_clock = true;
> > +	return 1;
> > +}
> > +__setup("perf_use_local_clock", perf_use_local_clock_setup);
> 
> >  static inline u64 perf_clock(void)
> >  {
> > +	if (likely(!perf_use_local_clock))
> > +		return ktime_get_mono_fast_ns();
> > +
> >  	return local_clock();
> >  }
> 
> Since this all is boot time, should we not use things like static_key
> and avoid the 'pointless' conditional at runtime?

Right. it's good to learn new stuff - I didn't know there was
architecture-agnostic support for jump labels. Definitely worth the
effort, will give it a try and spin the patch.

Pawel

^ permalink raw reply

* Re: [PATCH v4 2/3] perf: Userspace event
From: Pawel Moll @ 2015-01-21 16:01 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Richard Cochran, Steven Rostedt, Ingo Molnar, Paul Mackerras,
	Arnaldo Carvalho de Melo, John Stultz, Masami Hiramatsu,
	Christopher Covington, Namhyung Kim, David Ahern, Thomas Gleixner,
	Tomeu Vizoso,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <20150105131237.GR30905-ndre7Fmf5hadTX5a5knrm8zTDFooKrT+cvkQGrU6aU0@public.gmane.org>

On Mon, 2015-01-05 at 13:12 +0000, Peter Zijlstra wrote:
> On Thu, Nov 06, 2014 at 04:51:57PM +0000, Pawel Moll wrote:
> > This patch adds a PR_TASK_PERF_UEVENT prctl call which can be used by
> > any process to inject custom data into perf data stream as a new
> > PERF_RECORD_UEVENT record, if such process is being observed or if it
> > is running on a CPU being observed by the perf framework.
> > 
> > The prctl call takes the following arguments:
> > 
> >         prctl(PR_TASK_PERF_UEVENT, type, size, data, flags);
> > 
> > - type: a number meaning to describe content of the following data.
> >   Kernel does not pay attention to it and merely passes it further in
> >   the perf data, therefore its use must be agreed between the events
> >   producer (the process being observed) and the consumer (performance
> >   analysis tool). The perf userspace tool will contain a repository of
> >   "well known" types and reference implementation of their decoders.
> > - size: Length in bytes of the data.
> > - data: Pointer to the data.
> > - flags: Reserved for future use. Always pass zero.
> > 
> > Perf context that are supposed to receive events generated with the
> > prctl above must be opened with perf_event_attr.uevent set to 1. The
> > PERF_RECORD_UEVENT records consist of a standard perf event header,
> > 32-bit type value, 32-bit data size and the data itself, followed by
> > padding to align the overall record size to 8 bytes and optional,
> > standard sample_id field.
> > 
> > Example use cases:
> > 
> > - "perf_printf" like mechanism to add logging messages to perf data;
> >   in the simplest case it can be just
> > 
> >         prctl(PR_TASK_PERF_UEVENT, 0, 8, "Message", 0);
> > 
> > - synchronisation of performance data generated in user space with the
> >   perf stream coming from the kernel. For example, the marker can be
> >   inserted by a JIT engine after it generated portion of the code, but
> >   before the code is executed for the first time, allowing the
> >   post-processor to pick the correct debugging information.
> 
> The think I remember being raised was a unified means of these msgs
> across perf/ftrace/lttng. I am not seeing that mentioned.

Right. I was considering the "well known types repository" an attempt in
this direction. Having said that - ftrace also takes a random blob as
the trace marker, so the unification has to happen in userspace anyway.
I'll have a look what LTTng has to say in this respect.

> Also, I would like a stronger rationale for the @type argument, if it
> has no actual meaning why is it separate from the binary msg data?

Valid point. Without type 0 defined as a string, it doesn't bring
anything into the equation. I just have a gut feeling that sooner than
later we will want to split the messages somehow. Maybe we should make
it a "reserved for future use, use 0 now" field?

        * struct {
        *      struct perf_event_header        header;
        *      u32                             __reserved; /* always 0 */
        *      u32                             size;
        *      char                            data[size];
        *      char                            __padding[-size & 7];
        *      struct sample_id                sample_id;
        * };

or, probably even better, make it a version value at a known offset
(currently always 1, with just size and random sized data following).

        * struct {
        *      struct perf_event_header        header;
        *      u32                             version; /* use 1 */
        *      u32                             size;
        *      char                            data[size];
        *      char                            __padding[-size & 7];
        *      struct sample_id                sample_id;
        * };

So that we can mutate the user events format without too much of the
pain - the parsers will simply complain about unknown format if such
occurs and with the size of the record in the header, it is possible to
skip it.

Pawel

^ permalink raw reply

* Re: [PATCH 01/13] kdbus: add documentation
From: Daniel Mack @ 2015-01-21 16:58 UTC (permalink / raw)
  To: Michael Kerrisk (man-pages), Greg Kroah-Hartman, arnd-r2nGTMty4D4,
	ebiederm-aS9lmoZGLiVWk0Htik3J/w,
	gnomes-qBU/x9rampVanCEyBjwyrvXRex20P6io, teg-B22kvLQNl6c,
	jkosina-AlSwsSmVLrQ, luto-kltTT9wpgjJwATOyAt5JVQ,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
  Cc: dh.herrmann-Re5JQEeQqe8AvxtiuMwx3w, tixxdz-Umm1ozX2/EEdnm+yROfE0A,
	Johannes Stezenbach
In-Reply-To: <54BF805B.4000609-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

Hi Michael,

On 01/21/2015 11:32 AM, Michael Kerrisk (man-pages) wrote:
> On 01/20/2015 07:23 PM, Daniel Mack wrote:

>> It's rather an optional driver than a core kernel feature.
> 
> Given the various things that I've seen said about kdbus, the
> preceding sentence makes little sense to me:
> 
> * kdbus will be the framework supporting user-space D-Bus in the
>   future, and also used by systemd, and so on pretty much every 
>   desktop system.
> * kdbus solves much of the bandwidth problem of D-Bus1. That,
>   along with a host of other features mean that there will be
>   a lot of user-space developers interested in using this API.
> * Various parties in user space are already expressing strong 
>   interest in kdbus.
> 
> My guess from the above? This will NOT be an "optional driver". 
> It *will be* a core kernel feature.

There will be userlands that will depend on kdbus, but that will still
not turn the "driver" into a core Linux kernel feature. We really want
it to be losely coupled from the rest of the kernel and optional after
all. The kernel people are working toward making more and more things
optional these days, and there will still be lots of systems that won't
be using kdbus.

>> Also, the context the kdbus commands operate on originate from a
>> mountable special-purpose file system.
> 
> It's not clear to me how this point implies any particular API design
> choice.

It emphasizes the fact that our ioctl API can only be used with the
nodes exposed by kdbusfs, and vice versa. I think operations on
driver-specific files do not justify a new 'generic' syscall API.

> Notwithstanding the fact that there's a lot of (good) information in
> kdbus.txt, there's not nearly enough for someone to create useful, 
> robust applications that use that API (and not enough that I as a
> reviewer feel comfortable about reviewing the API). As things stand,
> user-space developers will be forced to decipher large amounts of kernel
> code and existing applications in order to actually build things. And
> when they do, they'll be using one of the worst APIs known to man: ioctl(),
> an API that provides no type safety at all.

I don't see how ioctls are any worse than syscalls with pointers to
structures. One can screw up compatibility either way. How is an ioctl
wrapper/prototype any less type-safe than a syscall wrapper?

> ioctl() is a get-out-of-jail free card when it comes to API design.

And how are syscalls different in that regard when they would both
transport the same data structures? Also note that all kdbus ioctls
necessarily operate on a file descriptor context, which an ioctl passes
along by default.

> Rather
> than thinking carefully and long about a set of coherent, stable APIs that 
> provide some degree of type-safety to user-space, one just adds/changes/removes
> an ioctl.

Adding another ioctl in the future for cases we didn't think about
earlier would of course be considered a workaround; and even though such
things happen all the time, it's certainly something we'd like to avoid.

However, we would also need to add another syscall in such cases, which
is even worse IMO. So that's really not an argument against ioctls after
all. What fundamental difference between a raw syscall and a ioctl,
especially with regards to type safety, is there which would help us here?

> And that process seems to be frequent and ongoing even now. (And 
> it's to your great credit that the API/ABI breaks are clearly and honestly 
> marked in the kdbus.h changelog.) All of this lightens the burden of API
> design for kernel developers, but I'm concerned that the long-term pain
> for user-space developers who use an API which (in my estimation) may
> come to be widely used will be enormous.

Yes, we've jointly reviewed the API details again until just recently to
unify structs and enums etc, and added fields to make the ioctls structs
more versatile for possible future additions. By that, we effectively
broke the ABI, but we did that because we know we can't do such things
again in the future.

But again - I don't see how this would be different when using syscalls
rather than ioctls to transport information between the driver and
userspace. Could you elaborate?

> Concretely, I'd like to see the following in kdbus.txt:
> * A lot more detail, adding the various pieces that are currently missing.
>   I've mentioned already the absence of detail on the item blob structures, 
>   but there's probably several other pieces as well. My problem is that the
>   API is so big and hard to grok that it's hard to even begin to work out
>   what's missing from the documentation.
> * Fleshing out the API summaries with code snippets that illustrate the
>   use of the APIs.
> * At least one simple working example application, complete with a walk
>   through of how it's built and run.
> 
> Yes, all of this is a big demand. But this is a big API that is being added 
> to the kernel, and one that may become widely used and for a long time.

Fair enough. Everything that helps people understand and use the API in
a sane way is a good thing to have, and so is getting an assessment from
people how are exposed to this API for the first time.

We'll be working on restructuring the documentation.


Thanks,
Daniel

^ permalink raw reply

* Re: [PATCH v4 3/3] perf: Sample additional clock value
From: Pawel Moll @ 2015-01-21 17:12 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Richard Cochran, Steven Rostedt, Ingo Molnar, Paul Mackerras,
	Arnaldo Carvalho de Melo, John Stultz, Masami Hiramatsu,
	Christopher Covington, Namhyung Kim, David Ahern, Thomas Gleixner,
	Tomeu Vizoso,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <20150105134514.GS30905-ndre7Fmf5hadTX5a5knrm8zTDFooKrT+cvkQGrU6aU0@public.gmane.org>

On Mon, 2015-01-05 at 13:45 +0000, Peter Zijlstra wrote:
> On Thu, Nov 06, 2014 at 04:51:58PM +0000, Pawel Moll wrote:
> > Currently three clocks are implemented: CLOCK_REALITME = 0,
> > CLOCK_MONOTONIC = 1 and CLOCK_MONOTONIC_RAW = 2. The clock field is
> > 5 bits wide to allow for future extension to custom, non-POSIX clock
> > sources(MAX_CLOCK for those is 16, see include/uapi/linux/time.h) like
> > ARM CoreSight (hardware trace) timestamp generator.
> 
> > @@ -304,7 +305,16 @@ struct perf_event_attr {
> >  				mmap2          :  1, /* include mmap with inode data     */
> >  				comm_exec      :  1, /* flag comm events that are due to an exec */
> >  				uevents        :  1, /* allow uevents into the buffer */
> > -				__reserved_1   : 38;
> > +
> > +				/*
> > +				 * clock: one of the POSIX clock IDs:
> > +				 *
> > +				 * 0 - CLOCK_REALTIME
> > +				 * 1 - CLOCK_MONOTONIC
> > +				 * 4 - CLOCK_MONOTONIC_RAW
> > +				 */
> > +				clock          :  5, /* clock type */
> > +				__reserved_1   : 33;
> >  
> >  	union {
> >  		__u32		wakeup_events;	  /* wakeup every n events */
> 
> This would put a constraint on actually changing MAX_CLOCKS, are the
> time people OK with that? Thomas, John?

I admit I have some doubts about it myself. But I don't think we can
afford full int for the clock id, can we?

> I'm also not quite sure of using the >MAX_CLOCKS space for 'special'
> clocks, preferably those would register themselves with the POSIX clock
> interface.

That may be a hard sell - John never was particularly keen on extending
the hard-coded clocks with random sources. And the hardware trace clock
I had on mind would be probably one of them - its meaning depends a lot
on the . Of course I'm looking forward to being surprised :-)

> > @@ -4631,6 +4637,24 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
> >  		data->cpu_entry.cpu	 = raw_smp_processor_id();
> >  		data->cpu_entry.reserved = 0;
> >  	}
> > +
> > +	if (sample_type & PERF_SAMPLE_CLOCK) {
> > +		switch (event->attr.clock) {
> > +		case CLOCK_REALTIME:
> > +			data->clock = ktime_get_real_ns();
> > +			break;
> > +		case CLOCK_MONOTONIC:
> > +			data->clock = ktime_get_mono_fast_ns();
> > +			break;
> > +		case CLOCK_MONOTONIC_RAW:
> > +			data->clock = ktime_get_raw_ns();
> > +			break;
> > +		default:
> > +			data->clock = 0;
> > +			break;
> > +		}
> > +	}
> > +
> >  }
> 
> This is broken. There is nothing stopping this from being called from
> NMI context and only the MONO one is NMI safe.

Uh, right. 

> Also, one would expect something like:
> 
> 	default: {
> 		struct k_clock *kc = clockid_to_kclock(event->attr.clock);
> 		struct timespec ts;
> 		if (kc) {
> 			kc->clock_get(event->attr.clock, &ts);
> 			data->clock = ktime_to_ns(timespec_to_ktime(ts));
> 		} else {
> 			data->clock = 0;
> 		}
> 	}
> 
> Albeit preferably slightly less horrible -- of course, one would first
> need to deal with the NMI issue.

Ok, generally this patch clearly needs more work. I'll have to revisit
what Thomas did with the monotonic clock to understand the potential
solutions.

Maybe, in this case, giving such a wide choice of clocks to be sampled
is not the right thing after all? Maybe, when we face the issue of a
trace clock for real, simple "PERF_SAMPLE_TRACE_CLOCK" will suffice?
With the monotonic clock as a normal time base merged (hopefully
soon :-) the correlation between kernel and user space (which was the
original reason for having this change) won't be an issue any more.

Pawel

^ permalink raw reply

* Re: [PATCH v4 3/3] perf: Sample additional clock value
From: John Stultz @ 2015-01-21 17:44 UTC (permalink / raw)
  To: Pawel Moll
  Cc: Peter Zijlstra, Richard Cochran, Steven Rostedt, Ingo Molnar,
	Paul Mackerras, Arnaldo Carvalho de Melo, Masami Hiramatsu,
	Christopher Covington, Namhyung Kim, David Ahern, Thomas Gleixner,
	Tomeu Vizoso,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <1421860365.14076.91.camel-5wv7dgnIgG8@public.gmane.org>

On Wed, Jan 21, 2015 at 9:12 AM, Pawel Moll <pawel.moll-5wv7dgnIgG8@public.gmane.org> wrote:
> On Mon, 2015-01-05 at 13:45 +0000, Peter Zijlstra wrote:
>> On Thu, Nov 06, 2014 at 04:51:58PM +0000, Pawel Moll wrote:
>> > Currently three clocks are implemented: CLOCK_REALITME = 0,
>> > CLOCK_MONOTONIC = 1 and CLOCK_MONOTONIC_RAW = 2. The clock field is
>> > 5 bits wide to allow for future extension to custom, non-POSIX clock
>> > sources(MAX_CLOCK for those is 16, see include/uapi/linux/time.h) like
>> > ARM CoreSight (hardware trace) timestamp generator.
>>
>> > @@ -304,7 +305,16 @@ struct perf_event_attr {
>> >                             mmap2          :  1, /* include mmap with inode data     */
>> >                             comm_exec      :  1, /* flag comm events that are due to an exec */
>> >                             uevents        :  1, /* allow uevents into the buffer */
>> > -                           __reserved_1   : 38;
>> > +
>> > +                           /*
>> > +                            * clock: one of the POSIX clock IDs:
>> > +                            *
>> > +                            * 0 - CLOCK_REALTIME
>> > +                            * 1 - CLOCK_MONOTONIC
>> > +                            * 4 - CLOCK_MONOTONIC_RAW
>> > +                            */
>> > +                           clock          :  5, /* clock type */
>> > +                           __reserved_1   : 33;
>> >
>> >     union {
>> >             __u32           wakeup_events;    /* wakeup every n events */
>>
>> This would put a constraint on actually changing MAX_CLOCKS, are the
>> time people OK with that? Thomas, John?
>
> I admit I have some doubts about it myself. But I don't think we can
> afford full int for the clock id, can we?
>
>> I'm also not quite sure of using the >MAX_CLOCKS space for 'special'
>> clocks, preferably those would register themselves with the POSIX clock
>> interface.
>
> That may be a hard sell - John never was particularly keen on extending
> the hard-coded clocks with random sources. And the hardware trace clock
> I had on mind would be probably one of them - its meaning depends a lot
> on the . Of course I'm looking forward to being surprised :-)

Yea. I'm definitely still wanting to be cautious about adding new
clockids. Basically if there's a new well defined time domain, then
I'm open to it,  (for example, I'm expecting there will be a smeared
leapsecond time domain at some point in the future), but we've already
grown more then I'm comfortable with given the existing MAX_CLOCKS
limit.

For example, I regret adding the _ALARM clockids. Those are basically
duplicative time domains from the readers perspective, and only have
unique value for setting timers, which should have been handled via a
flag to the timer interfaces.

I suspect we'll have to bump MAX_CLOCKS that at some point and hope it
doesn't break anyone.

That said, there is the dynamic posix clockids. I'm not sure if it
would make sense, but even if we don't bump MAX_CLOCKS, might there
be some case where someone wants to use a dynamic posix clock for the
perf reference?

thanks
-john

^ permalink raw reply

* Re: [PATCH v4 3/3] perf: Sample additional clock value
From: Pawel Moll @ 2015-01-21 17:54 UTC (permalink / raw)
  To: John Stultz
  Cc: Peter Zijlstra, Richard Cochran, Steven Rostedt, Ingo Molnar,
	Paul Mackerras, Arnaldo Carvalho de Melo, Masami Hiramatsu,
	Christopher Covington, Namhyung Kim, David Ahern, Thomas Gleixner,
	Tomeu Vizoso,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <CALAqxLV6ggCO81ntWuYuDjrqNeMePkZBpq92G9-iwMHm7ONuFg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Wed, 2015-01-21 at 17:44 +0000, John Stultz wrote:
> That said, there is the dynamic posix clockids. I'm not sure if it
> would make sense, but even if we don't bump MAX_CLOCKS, might there
> be some case where someone wants to use a dynamic posix clock for the
> perf reference?

If I remember correctly, last time I tried to use dynamic posix clocks
in the perf context, one needed to open a ptp character device in order
to get a file descriptor, than translated into a clockid_t value -
correct me if I'm wrong. But here you get the fd from the
sys_perf_open() and clock_*() doesn't know anything about such
descriptor.

I was looking into a way of associating a random clock with a random fd,
so that perf could "attach" itself to the clock API at will, but it
turned out not to be trivial (I'd have to dig through old threads to
remember all the nasty details).

The good thing is that it looks like the immediate need for this was no
more, with perf using monotonic clock as the clock source. It will come
back when we get into hardware trace correlation, but one step at a
time...

Pawel

^ permalink raw reply

* Re: [PATCH v4 3/3] perf: Sample additional clock value
From: John Stultz @ 2015-01-21 18:05 UTC (permalink / raw)
  To: Pawel Moll
  Cc: Peter Zijlstra, Richard Cochran, Steven Rostedt, Ingo Molnar,
	Paul Mackerras, Arnaldo Carvalho de Melo, Masami Hiramatsu,
	Christopher Covington, Namhyung Kim, David Ahern, Thomas Gleixner,
	Tomeu Vizoso,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <1421862883.14076.99.camel-5wv7dgnIgG8@public.gmane.org>

On Wed, Jan 21, 2015 at 9:54 AM, Pawel Moll <pawel.moll-5wv7dgnIgG8@public.gmane.org> wrote:
> On Wed, 2015-01-21 at 17:44 +0000, John Stultz wrote:
>> That said, there is the dynamic posix clockids. I'm not sure if it
>> would make sense, but even if we don't bump MAX_CLOCKS, might there
>> be some case where someone wants to use a dynamic posix clock for the
>> perf reference?
>
> If I remember correctly, last time I tried to use dynamic posix clocks
> in the perf context, one needed to open a ptp character device in order
> to get a file descriptor, than translated into a clockid_t value -
> correct me if I'm wrong. But here you get the fd from the
> sys_perf_open() and clock_*() doesn't know anything about such
> descriptor.

Sorry for losing context here then. Yes, the dynamic clockid has to be
exported via some other interface to userland (likely via the driver
that provides it), but once the id is known it can be used via the
clock_*() functions.   I was thinking here since the perf_event_attr
wants to associate with a clockid, including the possibility for
dynamic clockids would be wise, but I didn't read closely enough to
see how that clockid was specified.

> I was looking into a way of associating a random clock with a random fd,
> so that perf could "attach" itself to the clock API at will, but it
> turned out not to be trivial (I'd have to dig through old threads to
> remember all the nasty details).
>
> The good thing is that it looks like the immediate need for this was no
> more, with perf using monotonic clock as the clock source. It will come
> back when we get into hardware trace correlation, but one step at a
> time...

Ok. I'm eager to see this settled (and the current approach I don't
have any objections to, although I'm not paying super close attention
now that its all in the perf core).   I know this has taken far longer
then you'd have liked, so thanks for your persistence!

-john

^ permalink raw reply

* Re: [PATCH v4 1/3] perf: Use monotonic clock as a source for timestamps
From: Pawel Moll @ 2015-01-21 19:48 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Richard Cochran, Steven Rostedt, Ingo Molnar, Paul Mackerras,
	Arnaldo Carvalho de Melo, John Stultz, Masami Hiramatsu,
	Christopher Covington, Namhyung Kim, David Ahern, Thomas Gleixner,
	Tomeu Vizoso,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <1421855543.14076.68.camel-5wv7dgnIgG8@public.gmane.org>

On Wed, 2015-01-21 at 15:52 +0000, Pawel Moll wrote:
> On Mon, 2015-01-05 at 13:00 +0000, Peter Zijlstra wrote:
> > On Thu, Nov 06, 2014 at 04:51:56PM +0000, Pawel Moll wrote:
> > >  Documentation/kernel-parameters.txt |  9 +++++++++
> > >  kernel/events/core.c                | 37 +++++++++++++++++++++++++++++++++++++
> > >  2 files changed, 46 insertions(+)
> > 
> > > diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
> > > index 4c81a86..8ead8d8 100644
> > > --- a/Documentation/kernel-parameters.txt
> > > +++ b/Documentation/kernel-parameters.txt
> > 
> > > @@ -2763,6 +2764,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
> > >  			allocator.  This parameter is primarily	for debugging
> > >  			and performance comparison.
> > >  
> > > +	perf_use_local_clock
> > > +			[PERF]
> > > +			Use local_clock() as a source for perf timestamps
> > > +			generation. This was be the default behaviour and
> > > +			this parameter can be used to maintain backward
> > > +			compatibility or on older hardware with expensive
> > > +			monotonic clock source.
> > > +
> > >  	pf.		[PARIDE]
> > >  			See Documentation/blockdev/paride.txt.
> > 
> > So I'm always terminally confused on the naming of kernel parameters,
> > sometimes things are modules (even when they're not actually =m capable)
> > and get a module::foo naming or so and sometimes they're not.
> 
> I guess you mean module.foo?
> 
> > So we want to use the module naming thing or not?
> 
> Honestly, I don't mind either way. For one thing ftrace doesn't bother
> and just uses __setup() as well.

There's one more thing to this - as far as I remember, the module name
is actually derived from the compilation unit name (at some level of
Kbuild). I may be wrong (will have to double check), but a module
parameter defined in kernel/events/core.c may be called something like
"core.parameter" ;-).

Pawel

^ permalink raw reply

* Re: [PATCH v4 1/3] perf: Use monotonic clock as a source for timestamps
From: Pawel Moll @ 2015-01-21 20:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Richard Cochran, Steven Rostedt, Ingo Molnar, Paul Mackerras,
	Arnaldo Carvalho de Melo, John Stultz, Masami Hiramatsu,
	Christopher Covington, Namhyung Kim, David Ahern, Thomas Gleixner,
	Tomeu Vizoso,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <1421869684.14076.105.camel-5wv7dgnIgG8@public.gmane.org>

On Wed, 2015-01-21 at 19:48 +0000, Pawel Moll wrote:
> On Wed, 2015-01-21 at 15:52 +0000, Pawel Moll wrote:
> > On Mon, 2015-01-05 at 13:00 +0000, Peter Zijlstra wrote:
> > > On Thu, Nov 06, 2014 at 04:51:56PM +0000, Pawel Moll wrote:
> > > >  Documentation/kernel-parameters.txt |  9 +++++++++
> > > >  kernel/events/core.c                | 37 +++++++++++++++++++++++++++++++++++++
> > > >  2 files changed, 46 insertions(+)
> > > 
> > > > diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
> > > > index 4c81a86..8ead8d8 100644
> > > > --- a/Documentation/kernel-parameters.txt
> > > > +++ b/Documentation/kernel-parameters.txt
> > > 
> > > > @@ -2763,6 +2764,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
> > > >  			allocator.  This parameter is primarily	for debugging
> > > >  			and performance comparison.
> > > >  
> > > > +	perf_use_local_clock
> > > > +			[PERF]
> > > > +			Use local_clock() as a source for perf timestamps
> > > > +			generation. This was be the default behaviour and
> > > > +			this parameter can be used to maintain backward
> > > > +			compatibility or on older hardware with expensive
> > > > +			monotonic clock source.
> > > > +
> > > >  	pf.		[PARIDE]
> > > >  			See Documentation/blockdev/paride.txt.
> > > 
> > > So I'm always terminally confused on the naming of kernel parameters,
> > > sometimes things are modules (even when they're not actually =m capable)
> > > and get a module::foo naming or so and sometimes they're not.
> > 
> > I guess you mean module.foo?
> > 
> > > So we want to use the module naming thing or not?
> > 
> > Honestly, I don't mind either way. For one thing ftrace doesn't bother
> > and just uses __setup() as well.
> 
> There's one more thing to this - as far as I remember, the module name
> is actually derived from the compilation unit name (at some level of
> Kbuild). I may be wrong (will have to double check), but a module
> parameter defined in kernel/events/core.c may be called something like
> "core.parameter" ;-).

Ok, so it's possible to enforce "perf." prefix:

/* You can override this manually, but generally this should match the
   module name. */
#ifdef MODULE
#define MODULE_PARAM_PREFIX /* empty */
#else
#define MODULE_PARAM_PREFIX KBUILD_MODNAME "."
#endif

So, perf_use_local_clock or perf.use_local_clock? :-)

Pawel

^ permalink raw reply

* [PATCH v5] perf: Use monotonic clock as a source for timestamps
From: Pawel Moll @ 2015-01-21 20:27 UTC (permalink / raw)
  To: Richard Cochran, Steven Rostedt, Ingo Molnar, Peter Zijlstra,
	Paul Mackerras, Arnaldo Carvalho de Melo, John Stultz,
	Masami Hiramatsu, Christopher Covington, Namhyung Kim,
	David Ahern, Thomas Gleixner, Tomeu Vizoso
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Pawel Moll
In-Reply-To: <1415292718-19785-2-git-send-email-pawel.moll-5wv7dgnIgG8@public.gmane.org>

Until now, perf framework never defined the meaning of the timestamps
captured as PERF_SAMPLE_TIME sample type. The values were obtaining
from local (sched) clock, which is unavailable in userspace. This made
it impossible to correlate perf data with any other events. Other
tracing solutions have the source configurable (ftrace) or just share
a common time domain between kernel and userspace (LTTng).

Follow the trend by using monotonic clock, which is readily available
as POSIX CLOCK_MONOTONIC.

Also add a sysctl "perf_sample_time_clk_id" attribute (usually available
as "/proc/sys/kernel/perf_sample_time_clk_id") which can be used by the
user to obtain the clk_id to be used with POSIX clock API (eg.
clock_gettime()) to obtain a time value comparable with perf samples.

Old behaviour can be restored by using "perf_use_local_clock" kernel
parameter.

Signed-off-by: Pawel Moll <pawel.moll-5wv7dgnIgG8@public.gmane.org>
---
Changes since v4:

- using jump labels to reduce runtime overhead of the local/mono check

 Documentation/kernel-parameters.txt |  9 ++++++++
 kernel/events/core.c                | 43 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 4df73da..3cccd0e 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -91,6 +91,7 @@ parameter is applicable:
 	NUMA	NUMA support is enabled.
 	NFS	Appropriate NFS support is enabled.
 	OSS	OSS sound support is enabled.
+	PERF	Performance events and counters support is enabled.
 	PV_OPS	A paravirtualized kernel is enabled.
 	PARIDE	The ParIDE (parallel port IDE) subsystem is enabled.
 	PARISC	The PA-RISC architecture is enabled.
@@ -2795,6 +2796,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			allocator.  This parameter is primarily	for debugging
 			and performance comparison.
 
+	perf_use_local_clock
+			[PERF]
+			Use local_clock() as a source for perf timestamps
+			generation. This was be the default behaviour and
+			this parameter can be used to maintain backward
+			compatibility or on older hardware with expensive
+			monotonic clock source.
+
 	pf.		[PARIDE]
 			See Documentation/blockdev/paride.txt.
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 882f835..f45434d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -42,6 +42,8 @@
 #include <linux/module.h>
 #include <linux/mman.h>
 #include <linux/compat.h>
+#include <linux/sysctl.h>
+#include <linux/jump_label.h>
 
 #include "internal.h"
 
@@ -322,9 +324,43 @@ extern __weak const char *perf_pmu_name(void)
 	return "pmu";
 }
 
+static struct static_key perf_use_local_clock_key = STATIC_KEY_INIT_FALSE;
+static bool perf_use_local_clock_param __initdata;
+static int __init perf_use_local_clock_setup(char *__unused)
+{
+	perf_use_local_clock_param = true;
+	return 1;
+}
+__setup("perf_use_local_clock", perf_use_local_clock_setup);
+
+static int sysctl_perf_sample_time_clk_id = CLOCK_MONOTONIC;
+
+static struct ctl_table perf_sample_time_kern_table[] = {
+	{
+		.procname       = "perf_sample_time_clk_id",
+		.data           = &sysctl_perf_sample_time_clk_id,
+		.maxlen         = sizeof(int),
+		.mode           = 0444,
+		.proc_handler   = proc_dointvec,
+	},
+	{}
+};
+
+static struct ctl_table perf_sample_time_root_table[] = {
+	{
+		.procname	= "kernel",
+		.mode		= 0555,
+		.child		= perf_sample_time_kern_table,
+	},
+	{}
+};
+
 static inline u64 perf_clock(void)
 {
-	return local_clock();
+	if (static_key_false(&perf_use_local_clock_key))
+		return local_clock();
+	else
+		return ktime_get_mono_fast_ns();
 }
 
 static inline struct perf_cpu_context *
@@ -8271,6 +8307,11 @@ void __init perf_event_init(void)
 	 */
 	BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
 		     != 1024);
+
+	if (perf_use_local_clock_param)
+		static_key_slow_inc(&perf_use_local_clock_key);
+	else
+		register_sysctl_table(perf_sample_time_root_table);
 }
 
 static int __init perf_event_sysfs_init(void)
-- 
2.1.0

^ permalink raw reply related

* Re: [PATCH tip 0/9] tracing: attach eBPF programs to tracepoints/syscalls/kprobe
From: Namhyung Kim @ 2015-01-22  1:03 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Steven Rostedt, Ingo Molnar, Arnaldo Carvalho de Melo, Jiri Olsa,
	David S. Miller, Daniel Borkmann, Hannes Frederic Sowa,
	Brendan Gregg, Linux API, Network Development, LKML
In-Reply-To: <CAMEtUuxhjef+9QaFccfSyA3DbVJ3H6qP=eu-uZCkku1rVWViSQ@mail.gmail.com>

Hi Alexei,

On Fri, Jan 16, 2015 at 10:57:15AM -0800, Alexei Starovoitov wrote:
> On Fri, Jan 16, 2015 at 7:02 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
> > One last thing. If the ebpf is used for anything but filtering, it
> > should go into the trigger file. The filtering is only a way to say if
> > the event should be recorded or not. But the trigger could do something
> > else (a printk, a stacktrace, etc).
> 
> it does way more than just filtering, but
> invoking program as a trigger is too slow.
> When program is called as soon as tracepoint fires,
> it can fetch other fields, evaluate them, printk some of them,
> optionally dump stack, aggregate into maps.
> We can let it call triggers too, so that user program will
> be able to enable/disable other events.
> I'm not against invoking programs as a trigger, but I don't
> see a use case for it. It's just too slow for production
> analytics that needs to act on huge number of events
> per second.

AFAIK a trigger can be fired before allocating a ring buffer if it
doesn't use the event record (i.e. has filter) or ->post_trigger bit
set (stacktrace).  Please see ftrace_trigger_soft_disabled().

This also makes it keeping events in the soft-disabled state.

Thanks,
Namhyung


> We must minimize the overhead between tracepoint
> firing and program executing, so that programs can
> be used on events like packet receive which will be
> in millions per second. Every nsec counts.
> For example:
> - raw dd if=/dev/zero of=/dev/null
>   does 760 MB/s (on my debug kernel)
> - echo 1 > events/syscalls/sys_enter_write/enable
>   drops it to 400 MB/s
> - echo "echo "count == 123 " > events/syscalls/sys_enter_write/filter
>   drops it even further down to 388 MB/s
> This slowdown is too high for this to be used on a live system.
> - tracex4 that computes histogram of sys_write sizes
>   and stores log2(count) into a map does 580 MB/s
>   This is still not great, but this slowdown is now usable
>   and we can work further on minimizing the overhead.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply

* Re: [PATCH tip 0/9] tracing: attach eBPF programs to tracepoints/syscalls/kprobe
From: Alexei Starovoitov @ 2015-01-22  1:49 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Steven Rostedt, Ingo Molnar, Arnaldo Carvalho de Melo, Jiri Olsa,
	David S. Miller, Daniel Borkmann, Hannes Frederic Sowa,
	Brendan Gregg, Linux API, Network Development, LKML

On Wed, Jan 21, 2015 at 5:03 PM, Namhyung Kim <namhyung-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org> wrote:
>
> AFAIK a trigger can be fired before allocating a ring buffer if it
> doesn't use the event record (i.e. has filter) or ->post_trigger bit
> set (stacktrace).  Please see ftrace_trigger_soft_disabled().

yes, but such trigger has no arguments, so I would have to hack
ftrace_trigger_soft_disabled() to pass 'ctx' further down
and through all pointer dereferences and list walking.
Also there is no return value, so I have to add it as well
similar to post-triggers.
that's quite a bit of overhead that I would like to avoid.

Actually now I'm thinking to move condition
if (ftrace_file->flags & TRACE_EVENT_FL_BPF)
before ftrace_trigger_soft_disabled() check.
So programs always run first and if they return non-zero
then all standard processing will follow.
May be return value from the program can influence triggers.
That will nicely replace bpf_dump_stack()...the program
will return ETT_STACKTRACE constant to trigger dump.

> This also makes it keeping events in the soft-disabled state.

I was never able to figure out the use case for soft-disabled state.
Probably historical before static_key was done.

^ permalink raw reply

* Re: [PATCH tip 0/9] tracing: attach eBPF programs to tracepoints/syscalls/kprobe
From: Steven Rostedt @ 2015-01-22  1:56 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Namhyung Kim, Ingo Molnar, Arnaldo Carvalho de Melo, Jiri Olsa,
	David S. Miller, Daniel Borkmann, Hannes Frederic Sowa,
	Brendan Gregg, Linux API, Network Development, LKML
In-Reply-To: <CAMEtUux8v2LDtLcgpT9hCvJgnrCwT2fkzsSvAPFSuEUx+itxyQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Wed, 21 Jan 2015 17:49:08 -0800
Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org> wrote:


> > This also makes it keeping events in the soft-disabled state.
> 
> I was never able to figure out the use case for soft-disabled state.
> Probably historical before static_key was done.

No, it's not historical at all. The "soft-disable" is a way to enable
from any context. You can't enable a static key from NMI or interrupt
context, but you can enable a "soft-disable" there.

As you can enable or disable events from any function that the function
tracer may trace, I needed a way to enable them (make the tracepoint
active), but do nothing until something else turns them on.

-- Steve

^ permalink raw reply

* Re: [PATCH tip 0/9] tracing: attach eBPF programs to tracepoints/syscalls/kprobe
From: Alexei Starovoitov @ 2015-01-22  2:13 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Namhyung Kim, Ingo Molnar, Arnaldo Carvalho de Melo, Jiri Olsa,
	David S. Miller, Daniel Borkmann, Hannes Frederic Sowa,
	Brendan Gregg, Linux API, Network Development, LKML
In-Reply-To: <20150121205643.4d8a3516-f9ZlEuEWxVcJvu8Pb33WZ0EMvNT87kid@public.gmane.org>

On Wed, Jan 21, 2015 at 5:56 PM, Steven Rostedt <rostedt-nx8X9YLhiw1AfugRpC6u6w@public.gmane.org> wrote:
> On Wed, 21 Jan 2015 17:49:08 -0800
> Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org> wrote:
>
>
>> > This also makes it keeping events in the soft-disabled state.
>>
>> I was never able to figure out the use case for soft-disabled state.
>> Probably historical before static_key was done.
>
> No, it's not historical at all. The "soft-disable" is a way to enable
> from any context. You can't enable a static key from NMI or interrupt
> context, but you can enable a "soft-disable" there.
>
> As you can enable or disable events from any function that the function
> tracer may trace, I needed a way to enable them (make the tracepoint
> active), but do nothing until something else turns them on.

Thanks for explanation. Makes sense.
Speaking of nmi... I think I will add a check that if (in_nmi())
just skip running the program, since supporting this use
case is not needed at the moment.

^ permalink raw reply

* Re: [PATCH v9 0/11] fs: Introduce FALLOC_FL_INSERT_RANGE for fallocate
From: Michael Kerrisk @ 2015-01-22  8:16 UTC (permalink / raw)
  To: Namjae Jeon
  Cc: Theodore Ts'o, Namjae Jeon, Linux API, bfoster, Linux Kernel,
	xfs, a.sangwan, Linux-Fsdevel, linux-ext4
In-Reply-To: <1421853126-4505-1-git-send-email-linkinjeon@gmail.com>

[CC += linux-api]

Hello Namjae

Please, all patches such as this that touch the user-space ABI must CC
linux-api@. (The kernel source file Documentation/SubmitChecklist
notes that all Linux kernel patches that change userspace interfaces
should be CCed to linux-api@vger.kernel.org.  See also
https://www.kernel.org/doc/man-pages/linux-api-ml.html )

Thanks,

Michael


On Wed, Jan 21, 2015 at 4:11 PM, Namjae Jeon <linkinjeon@gmail.com> wrote:
> From: Namjae Jeon <namjae.jeon@samsung.com>
>
> In continuation of the work of making the process of non linear editing of
> media files faster, we introduce here the new flag FALLOC_FL_INSERT_RANGE
> for fallocate.
>
> This flag will work opposite to the FALLOC_FL_COLLAPSE_RANGE flag.
> As such, specifying FALLOC_FL_INSERT_RANGE flag will create new space inside file
> by inserting a hole within the range specified by offset and len.
> User can write new data in this space. e.g. ads.
> Like collapse range, currently we have the limitation that offset and len should
> be block size aligned for both XFS and Ext4.
>
> The semantics of the flag are :
> 1) It creates space within file by inserting a hole of  len bytes starting
>    at offset byte without overwriting any existing data. All the data blocks
>    from offset to EOF are shifted towards right to make hole space.
> 2) It should be used exclusively. No other fallocate flag in combination.
> 3) Offset and length supplied to fallocate should be fs block size aligned
>    in case of xfs and ext4.
> 4) Insert range does not work for the case when offset is overlapping/beyond
>    i_size. If the user wants to insert space at the end of file they are
>    advised to use either ftruncate(2) or fallocate(2) with mode 0.
> 5) It increses the size of file by len bytes.
>
>
> Namjae Jeon (11):
>  fs: Add support FALLOC_FL_INSERT_RANGE for fallocate
>  xfs: Add support FALLOC_FL_INSERT_RANGE for fallocate
>  ext4: Add support FALLOC_FL_INSERT_RANGE for fallocate
>  xfsprogs: xfs_io: add finsert command for insert range via fallocate
>  xfstests: generic/039: Standard insert range tests
>  xfstests: generic/040: Delayed allocation insert range
>  xfstests: generic/041: Multi insert range tests
>  xfstests: generic/042: Delayed allocation multi insert
>  xfstests: generic/043: Test multiple fallocate insert/collapse range calls
>  xfstests: fsstress: Add fallocate insert range operation
>  xfstests: fsx: Add fallocate insert range operation
> --
> 1.7.9.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/



-- 
Michael Kerrisk Linux man-pages maintainer;
http://www.kernel.org/doc/man-pages/
Author of "The Linux Programming Interface", http://blog.man7.org/

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply

* RE: [PATCH v9 0/11] fs: Introduce FALLOC_FL_INSERT_RANGE for fallocate
From: Namjae Jeon @ 2015-01-22  8:52 UTC (permalink / raw)
  To: 'Michael Kerrisk'
  Cc: 'Namjae Jeon', 'Theodore Ts'o',
	'Linux API', bfoster, 'Linux Kernel', xfs,
	a.sangwan, 'Linux-Fsdevel', linux-ext4
In-Reply-To: <CAHO5Pa3HnzP=KcBQOWDEkBqppMGXJ2kzfNKj21ruQ5SvWnms2g@mail.gmail.com>

> [CC += linux-api]
> 
> Hello Namjae
> 
> Please, all patches such as this that touch the user-space ABI must CC
> linux-api@. (The kernel source file Documentation/SubmitChecklist
> notes that all Linux kernel patches that change userspace interfaces
> should be CCed to linux-api@vger.kernel.org.  See also
> https://www.kernel.org/doc/man-pages/linux-api-ml.html )
Hi Michael,

I will keep that in mind.
Thank you!

> 
> Thanks,
> 
> Michael
> 
> 
> On Wed, Jan 21, 2015 at 4:11 PM, Namjae Jeon <linkinjeon@gmail.com> wrote:
> > From: Namjae Jeon <namjae.jeon@samsung.com>
> >
> > In continuation of the work of making the process of non linear editing of
> > media files faster, we introduce here the new flag FALLOC_FL_INSERT_RANGE
> > for fallocate.
> >
> > This flag will work opposite to the FALLOC_FL_COLLAPSE_RANGE flag.
> > As such, specifying FALLOC_FL_INSERT_RANGE flag will create new space inside file
> > by inserting a hole within the range specified by offset and len.
> > User can write new data in this space. e.g. ads.
> > Like collapse range, currently we have the limitation that offset and len should
> > be block size aligned for both XFS and Ext4.
> >
> > The semantics of the flag are :
> > 1) It creates space within file by inserting a hole of  len bytes starting
> >    at offset byte without overwriting any existing data. All the data blocks
> >    from offset to EOF are shifted towards right to make hole space.
> > 2) It should be used exclusively. No other fallocate flag in combination.
> > 3) Offset and length supplied to fallocate should be fs block size aligned
> >    in case of xfs and ext4.
> > 4) Insert range does not work for the case when offset is overlapping/beyond
> >    i_size. If the user wants to insert space at the end of file they are
> >    advised to use either ftruncate(2) or fallocate(2) with mode 0.
> > 5) It increses the size of file by len bytes.
> >
> >
> > Namjae Jeon (11):
> >  fs: Add support FALLOC_FL_INSERT_RANGE for fallocate
> >  xfs: Add support FALLOC_FL_INSERT_RANGE for fallocate
> >  ext4: Add support FALLOC_FL_INSERT_RANGE for fallocate
> >  xfsprogs: xfs_io: add finsert command for insert range via fallocate
> >  xfstests: generic/039: Standard insert range tests
> >  xfstests: generic/040: Delayed allocation insert range
> >  xfstests: generic/041: Multi insert range tests
> >  xfstests: generic/042: Delayed allocation multi insert
> >  xfstests: generic/043: Test multiple fallocate insert/collapse range calls
> >  xfstests: fsstress: Add fallocate insert range operation
> >  xfstests: fsx: Add fallocate insert range operation
> > --
> > 1.7.9.5
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at  http://www.tux.org/lkml/
> 
> 
> 
> --
> Michael Kerrisk Linux man-pages maintainer;
> http://www.kernel.org/doc/man-pages/
> Author of "The Linux Programming Interface", http://blog.man7.org/

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply

* Re: [PATCH 01/13] kdbus: add documentation
From: Michael Kerrisk (man-pages) @ 2015-01-22 10:18 UTC (permalink / raw)
  To: Daniel Mack, Greg Kroah-Hartman, arnd-r2nGTMty4D4,
	ebiederm-aS9lmoZGLiVWk0Htik3J/w,
	gnomes-qBU/x9rampVanCEyBjwyrvXRex20P6io, teg-B22kvLQNl6c,
	jkosina-AlSwsSmVLrQ, luto-kltTT9wpgjJwATOyAt5JVQ,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
  Cc: mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
	dh.herrmann-Re5JQEeQqe8AvxtiuMwx3w, tixxdz-Umm1ozX2/EEdnm+yROfE0A,
	Johannes Stezenbach, Theodore T'so
In-Reply-To: <54BFDAAA.50203-cYrQPVfZoowdnm+yROfE0A@public.gmane.org>

Hi Daniel,

On 01/21/2015 05:58 PM, Daniel Mack wrote:
> Hi Michael,
> 
> On 01/21/2015 11:32 AM, Michael Kerrisk (man-pages) wrote:
>> On 01/20/2015 07:23 PM, Daniel Mack wrote:
> 
>>> It's rather an optional driver than a core kernel feature.
>>
>> Given the various things that I've seen said about kdbus, the
>> preceding sentence makes little sense to me:
>>
>> * kdbus will be the framework supporting user-space D-Bus in the
>>   future, and also used by systemd, and so on pretty much every 
>>   desktop system.
>> * kdbus solves much of the bandwidth problem of D-Bus1. That,
>>   along with a host of other features mean that there will be
>>   a lot of user-space developers interested in using this API.
>> * Various parties in user space are already expressing strong 
>>   interest in kdbus.
>>
>> My guess from the above? This will NOT be an "optional driver". 
>> It *will be* a core kernel feature.
> 
> There will be userlands that will depend on kdbus, but that will still
> not turn the "driver" into a core Linux kernel feature. We really want
> it to be losely coupled from the rest of the kernel and optional after
> all. The kernel people are working toward making more and more things
> optional these days, and there will still be lots of systems that won't
> be using kdbus.

Make it optional and configurable if you want. But that misses my
point. kdbus is very likely to become an essential, widely used
piece of *general-purpose* piece of ABI infrastructure that will
be configured into virtually every type of system. As such, the same
standards should apply as for a "core kernel feature", whether or
not you want to cal it that.

>>> Also, the context the kdbus commands operate on originate from a
>>> mountable special-purpose file system.
>>
>> It's not clear to me how this point implies any particular API design
>> choice.
> 
> It emphasizes the fact that our ioctl API can only be used with the
> nodes exposed by kdbusfs, and vice versa. I think operations on
> driver-specific files do not justify a new 'generic' syscall API.

I see your (repeated) use of words like "driver" as just a distraction, 
I'm sorry. "Driver-specific files" is an implementation detail. The
point is that kdbus provides a complex, general-purpose feature for all
of userland. It is core infrastructure that will be used by key pieces 
of the plumbing layer, and likely by many other applications as well.
*Please* stop suggesting that it is not core infrastructure: kdbus
has the potential to be a great IPC that will be very useful to many,
many user-space developers.

(By the way, we have precedents for device/filesystem-specific system
calls. Even a recent one, in memfd_create().)

>> Notwithstanding the fact that there's a lot of (good) information in
>> kdbus.txt, there's not nearly enough for someone to create useful, 
>> robust applications that use that API (and not enough that I as a
>> reviewer feel comfortable about reviewing the API). As things stand,
>> user-space developers will be forced to decipher large amounts of kernel
>> code and existing applications in order to actually build things. And
>> when they do, they'll be using one of the worst APIs known to man: ioctl(),
>> an API that provides no type safety at all.
> 
> I don't see how ioctls are any worse than syscalls with pointers to
> structures. One can screw up compatibility either way. How is an ioctl
> wrapper/prototype any less type-safe than a syscall wrapper?

Taking that argument to the extreme, we would have no system calls 
at all, just one gigantic ioctl() ;-).

>> ioctl() is a get-out-of-jail free card when it comes to API design.
> 
> And how are syscalls different in that regard when they would both
> transport the same data structures? 

My general impression is that when it comes to system calls,
there's usually a lot more up front thought about API design.

> Also note that all kdbus ioctls
> necessarily operate on a file descriptor context, which an ioctl passes
> along by default.

I fail to see your point here. The same statement applies to multiple
special-purpose system calls (epoll, inotify, various shared memory APIs, 
and so on).

>> Rather
>> than thinking carefully and long about a set of coherent, stable APIs that 
>> provide some degree of type-safety to user-space, one just adds/changes/removes
>> an ioctl.
> 
> Adding another ioctl in the future for cases we didn't think about
> earlier would of course be considered a workaround; and even though such
> things happen all the time, it's certainly something we'd like to avoid.
> 
> However, we would also need to add another syscall in such cases, which
> is even worse IMO. So that's really not an argument against ioctls after
> all. What fundamental difference between a raw syscall and a ioctl,
> especially with regards to type safety, is there which would help us here?

Type safety helps user-space application developers. System calls have 
it, ioctl() does not.

>> And that process seems to be frequent and ongoing even now. (And 
>> it's to your great credit that the API/ABI breaks are clearly and honestly 
>> marked in the kdbus.h changelog.) All of this lightens the burden of API
>> design for kernel developers, but I'm concerned that the long-term pain
>> for user-space developers who use an API which (in my estimation) may
>> come to be widely used will be enormous.
> 
> Yes, we've jointly reviewed the API details again until just recently to
> unify structs and enums etc, and added fields to make the ioctls structs
> more versatile for possible future additions. By that, we effectively
> broke the ABI, but we did that because we know we can't do such things
> again in the future.
> 
> But again - I don't see how this would be different when using syscalls
> rather than ioctls to transport information between the driver and
> userspace. Could you elaborate?

My suspicion is that not nearly enough thinking has yet been done about
the design of the API. That's based on these observations:

* Documentation that is, considering the size of the API, *way* too thin.
* Some parts of the API not documented at all (various kdbus_item blobs)
* ABI changes happening even quite recently
* API oddities such as the 'kernel_flags' fields. Why do I need to
  be told what flags the kernel supports on *every* operation?

The above is just after a day of looking hard at kdbus.txt. I strongly
suspect I'd find a lot of other issues if I spent more time on kdbus.

>> Concretely, I'd like to see the following in kdbus.txt:
>> * A lot more detail, adding the various pieces that are currently missing.
>>   I've mentioned already the absence of detail on the item blob structures, 
>>   but there's probably several other pieces as well. My problem is that the
>>   API is so big and hard to grok that it's hard to even begin to work out
>>   what's missing from the documentation.
>> * Fleshing out the API summaries with code snippets that illustrate the
>>   use of the APIs.
>> * At least one simple working example application, complete with a walk
>>   through of how it's built and run.
>>
>> Yes, all of this is a big demand. But this is a big API that is being added 
>> to the kernel, and one that may become widely used and for a long time.
> 
> Fair enough. Everything that helps people understand and use the API in
> a sane way is a good thing to have, and so is getting an assessment from
> people how are exposed to this API for the first time.
> 
> We'll be working on restructuring the documentation.

Thanks. I know it's a big effort, and I wish you success.

Cheers,

Michael


-- 
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/

^ permalink raw reply

* [PATCH v6 0/2] Add Spreadtrum SoC bindings and serial driver support
From: Chunyan Zhang @ 2015-01-22 13:35 UTC (permalink / raw)
  To: gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r,
	robh+dt-DgEjT+Ai2ygdnm+yROfE0A
  Cc: mark.rutland-5wv7dgnIgG8, arnd-r2nGTMty4D4,
	gnomes-qBU/x9rampVanCEyBjwyrvXRex20P6io,
	peter-WaGBZJeGNqdsbIuE7sb01tBPR1lH4CV8,
	shawn.guo-QSEj5FYQhm4dnm+yROfE0A, pawel.moll-5wv7dgnIgG8,
	ijc+devicetree-KcIKpvwj1kUDXYZnReoRVg,
	galak-sgV2jX0FEOL9JmXXK+q4OQ, jslaby-AlSwsSmVLrQ,
	grant.likely-QSEj5FYQhm4dnm+yROfE0A, heiko-4mtYJXux2i+zQB+pC5nmwQ,
	jason-NLaQJdtUoK4Be96aLqz0jA, florian.vaussard-p8DiymsW2f8,
	andrew-g2DYL2Zd6BY, hytszk-Re5JQEeQqe8AvxtiuMwx3w,
	antonynpavlov-Re5JQEeQqe8AvxtiuMwx3w,
	orsonzhai-Re5JQEeQqe8AvxtiuMwx3w, geng.ren-lxIno14LUO0EEoCn2XhGlw,
	zhizhou.zhang-lxIno14LUO0EEoCn2XhGlw,
	lanqing.liu-lxIno14LUO0EEoCn2XhGlw,
	zhang.lyra-Re5JQEeQqe8AvxtiuMwx3w,
	wei.qiao-lxIno14LUO0EEoCn2XhGlw,
	devicetree-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-serial-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <sprd-serial-v6>

This patch-set split the last version, and addressed the review comments from
last version on serial driver code.

Changes from v5:
	- Used Spreadtrum instead of SPRD in menus
	- Changed TTY name to 'ttyS'
	- Moved uart_register_driver() to probe()
	- Added spinlock as needed
	- Removed register states saving and restoring in suspend() and resume()

Chunyan Zhang (2):
  Documentation: DT: Add bindings for Spreadtrum SoC Platform
  tty/serial: Add Spreadtrum sc9836-uart driver support

 Documentation/devicetree/bindings/arm/sprd.txt     |   11 +
 .../devicetree/bindings/serial/sprd-uart.txt       |    7 +
 .../devicetree/bindings/vendor-prefixes.txt        |    1 +
 drivers/tty/serial/Kconfig                         |   18 +
 drivers/tty/serial/Makefile                        |    1 +
 drivers/tty/serial/sprd_serial.c                   |  801 ++++++++++++++++++++
 include/uapi/linux/serial_core.h                   |    3 +
 7 files changed, 842 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/arm/sprd.txt
 create mode 100644 Documentation/devicetree/bindings/serial/sprd-uart.txt
 create mode 100644 drivers/tty/serial/sprd_serial.c

-- 
1.7.9.5

^ permalink raw reply

* [PATCH v6 1/2] Documentation: DT: Add bindings for Spreadtrum SoC Platform
From: Chunyan Zhang @ 2015-01-22 13:35 UTC (permalink / raw)
  To: gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r,
	robh+dt-DgEjT+Ai2ygdnm+yROfE0A
  Cc: mark.rutland-5wv7dgnIgG8, arnd-r2nGTMty4D4,
	gnomes-qBU/x9rampVanCEyBjwyrvXRex20P6io,
	peter-WaGBZJeGNqdsbIuE7sb01tBPR1lH4CV8,
	shawn.guo-QSEj5FYQhm4dnm+yROfE0A, pawel.moll-5wv7dgnIgG8,
	ijc+devicetree-KcIKpvwj1kUDXYZnReoRVg,
	galak-sgV2jX0FEOL9JmXXK+q4OQ, jslaby-AlSwsSmVLrQ,
	grant.likely-QSEj5FYQhm4dnm+yROfE0A, heiko-4mtYJXux2i+zQB+pC5nmwQ,
	jason-NLaQJdtUoK4Be96aLqz0jA, florian.vaussard-p8DiymsW2f8,
	andrew-g2DYL2Zd6BY, hytszk-Re5JQEeQqe8AvxtiuMwx3w,
	antonynpavlov-Re5JQEeQqe8AvxtiuMwx3w,
	orsonzhai-Re5JQEeQqe8AvxtiuMwx3w, geng.ren-lxIno14LUO0EEoCn2XhGlw,
	zhizhou.zhang-lxIno14LUO0EEoCn2XhGlw,
	lanqing.liu-lxIno14LUO0EEoCn2XhGlw,
	zhang.lyra-Re5JQEeQqe8AvxtiuMwx3w,
	wei.qiao-lxIno14LUO0EEoCn2XhGlw,
	devicetree-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-serial-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1421933706-4277-1-git-send-email-chunyan.zhang-lxIno14LUO0EEoCn2XhGlw@public.gmane.org>

Adds Spreadtrum's prefix "sprd" to vendor-prefixes file.
Adds the devicetree binding documentations for Spreadtrum's sc9836-uart
and SC9836 SoC based on the Sharkl64 Platform which is a 64-bit SoC
Platform of Spreadtrum.

Signed-off-by: Chunyan Zhang <chunyan.zhang-lxIno14LUO0EEoCn2XhGlw@public.gmane.org>
Signed-off-by: Orson Zhai <orson.zhai-lxIno14LUO0EEoCn2XhGlw@public.gmane.org>
---
 Documentation/devicetree/bindings/arm/sprd.txt     |   11 +++++++++++
 .../devicetree/bindings/serial/sprd-uart.txt       |    7 +++++++
 .../devicetree/bindings/vendor-prefixes.txt        |    1 +
 3 files changed, 19 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/arm/sprd.txt
 create mode 100644 Documentation/devicetree/bindings/serial/sprd-uart.txt

diff --git a/Documentation/devicetree/bindings/arm/sprd.txt b/Documentation/devicetree/bindings/arm/sprd.txt
new file mode 100644
index 0000000..31a629d
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/sprd.txt
@@ -0,0 +1,11 @@
+Spreadtrum SoC Platforms Device Tree Bindings
+----------------------------------------------------
+
+Sharkl64 is a Spreadtrum's SoC Platform which is based
+on ARM 64-bit processor.
+
+SC9836 openphone board with SC9836 SoC based on the
+Sharkl64 Platform shall have the following properties.
+
+Required root node properties:
+        - compatible = "sprd,sc9836-openphone", "sprd,sc9836";
diff --git a/Documentation/devicetree/bindings/serial/sprd-uart.txt b/Documentation/devicetree/bindings/serial/sprd-uart.txt
new file mode 100644
index 0000000..2aff0f2
--- /dev/null
+++ b/Documentation/devicetree/bindings/serial/sprd-uart.txt
@@ -0,0 +1,7 @@
+* Spreadtrum serial UART
+
+Required properties:
+- compatible: must be "sprd,sc9836-uart"
+- reg: offset and length of the register set for the device
+- interrupts: exactly one interrupt specifier
+- clocks: phandles to input clocks.
diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt
index b1df0ad..0a8384f 100644
--- a/Documentation/devicetree/bindings/vendor-prefixes.txt
+++ b/Documentation/devicetree/bindings/vendor-prefixes.txt
@@ -153,6 +153,7 @@ snps	Synopsys, Inc.
 solidrun	SolidRun
 sony	Sony Corporation
 spansion	Spansion Inc.
+sprd	Spreadtrum Communications Inc.
 st	STMicroelectronics
 ste	ST-Ericsson
 stericsson	ST-Ericsson
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v6 2/2] tty/serial: Add Spreadtrum sc9836-uart driver support
From: Chunyan Zhang @ 2015-01-22 13:35 UTC (permalink / raw)
  To: gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r,
	robh+dt-DgEjT+Ai2ygdnm+yROfE0A
  Cc: mark.rutland-5wv7dgnIgG8, arnd-r2nGTMty4D4,
	gnomes-qBU/x9rampVanCEyBjwyrvXRex20P6io,
	peter-WaGBZJeGNqdsbIuE7sb01tBPR1lH4CV8,
	shawn.guo-QSEj5FYQhm4dnm+yROfE0A, pawel.moll-5wv7dgnIgG8,
	ijc+devicetree-KcIKpvwj1kUDXYZnReoRVg,
	galak-sgV2jX0FEOL9JmXXK+q4OQ, jslaby-AlSwsSmVLrQ,
	grant.likely-QSEj5FYQhm4dnm+yROfE0A, heiko-4mtYJXux2i+zQB+pC5nmwQ,
	jason-NLaQJdtUoK4Be96aLqz0jA, florian.vaussard-p8DiymsW2f8,
	andrew-g2DYL2Zd6BY, hytszk-Re5JQEeQqe8AvxtiuMwx3w,
	antonynpavlov-Re5JQEeQqe8AvxtiuMwx3w,
	orsonzhai-Re5JQEeQqe8AvxtiuMwx3w, geng.ren-lxIno14LUO0EEoCn2XhGlw,
	zhizhou.zhang-lxIno14LUO0EEoCn2XhGlw,
	lanqing.liu-lxIno14LUO0EEoCn2XhGlw,
	zhang.lyra-Re5JQEeQqe8AvxtiuMwx3w,
	wei.qiao-lxIno14LUO0EEoCn2XhGlw,
	devicetree-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-serial-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1421933706-4277-1-git-send-email-chunyan.zhang-lxIno14LUO0EEoCn2XhGlw@public.gmane.org>

Add a full sc9836-uart driver for SC9836 SoC which is based on the
spreadtrum sharkl64 platform.
This driver also support earlycon.
This patch also replaced the spaces between the macros and their
values with the tabs in serial_core.h

Originally-by: Lanqing Liu <lanqing.liu-lxIno14LUO0EEoCn2XhGlw@public.gmane.org>
Signed-off-by: Orson Zhai <orson.zhai-lxIno14LUO0EEoCn2XhGlw@public.gmane.org>
Signed-off-by: Chunyan Zhang <chunyan.zhang-lxIno14LUO0EEoCn2XhGlw@public.gmane.org>
---
 drivers/tty/serial/Kconfig       |   18 +
 drivers/tty/serial/Makefile      |    1 +
 drivers/tty/serial/sprd_serial.c |  801 ++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/serial_core.h |    3 +
 4 files changed, 823 insertions(+)
 create mode 100644 drivers/tty/serial/sprd_serial.c

diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig
index c79b43c..13211f7 100644
--- a/drivers/tty/serial/Kconfig
+++ b/drivers/tty/serial/Kconfig
@@ -1577,6 +1577,24 @@ config SERIAL_MEN_Z135
 	  This driver can also be build as a module. If so, the module will be called
 	  men_z135_uart.ko
 
+config SERIAL_SPRD
+	tristate "Support for Spreadtrum serial"
+	depends on ARCH_SPRD
+	select SERIAL_CORE
+	help
+	  This enables the driver for the Spreadtrum's serial.
+
+config SERIAL_SPRD_CONSOLE
+	bool "Spreadtrum UART console support"
+	depends on SERIAL_SPRD=y
+	select SERIAL_CORE_CONSOLE
+	select SERIAL_EARLYCON
+	help
+	  Support for early debug console using Spreadtrum's serial. This enables
+	  the console before standard serial driver is probed. This is enabled
+	  with "earlycon" on the kernel command line. The console is
+	  enabled when early_param is processed.
+
 endmenu
 
 config SERIAL_MCTRL_GPIO
diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile
index 9a548ac..4801aca 100644
--- a/drivers/tty/serial/Makefile
+++ b/drivers/tty/serial/Makefile
@@ -93,6 +93,7 @@ obj-$(CONFIG_SERIAL_ARC)	+= arc_uart.o
 obj-$(CONFIG_SERIAL_RP2)	+= rp2.o
 obj-$(CONFIG_SERIAL_FSL_LPUART)	+= fsl_lpuart.o
 obj-$(CONFIG_SERIAL_MEN_Z135)	+= men_z135_uart.o
+obj-$(CONFIG_SERIAL_SPRD) += sprd_serial.o
 
 # GPIOLIB helpers for modem control lines
 obj-$(CONFIG_SERIAL_MCTRL_GPIO)	+= serial_mctrl_gpio.o
diff --git a/drivers/tty/serial/sprd_serial.c b/drivers/tty/serial/sprd_serial.c
new file mode 100644
index 0000000..fd98541
--- /dev/null
+++ b/drivers/tty/serial/sprd_serial.c
@@ -0,0 +1,801 @@
+/*
+ * Copyright (C) 2012-2015 Spreadtrum Communications Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/clk.h>
+#include <linux/console.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/serial_core.h>
+#include <linux/serial.h>
+#include <linux/slab.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+
+/* device name */
+#define UART_NR_MAX		8
+#define SPRD_TTY_NAME		"ttyS"
+#define SPRD_FIFO_SIZE		128
+#define SPRD_DEF_RATE		26000000
+#define SPRD_TIMEOUT		2048
+
+/* the offset of serial registers and BITs for them */
+/* data registers */
+#define SPRD_TXD		0x0000
+#define SPRD_RXD		0x0004
+
+/* line status register and its BITs  */
+#define SPRD_LSR		0x0008
+#define SPRD_LSR_OE		BIT(4)
+#define SPRD_LSR_FE		BIT(3)
+#define SPRD_LSR_PE		BIT(2)
+#define SPRD_LSR_BI		BIT(7)
+#define SPRD_LSR_TX_OVER	BIT(15)
+
+/* data number in TX and RX fifo */
+#define SPRD_STS1		0x000C
+
+/* interrupt enable register and its BITs */
+#define SPRD_IEN		0x0010
+#define SPRD_IEN_RX_FULL	BIT(0)
+#define SPRD_IEN_TX_EMPTY	BIT(1)
+#define SPRD_IEN_BREAK_DETECT	BIT(7)
+#define SPRD_IEN_TIMEOUT	BIT(13)
+
+/* interrupt clear register */
+#define SPRD_ICLR		0x0014
+
+/* line control register */
+#define SPRD_LCR		0x0018
+#define SPRD_LCR_STOP_1BIT	0x10
+#define SPRD_LCR_STOP_2BIT	0x30
+#define SPRD_LCR_DATA_LEN	(BIT(2) | BIT(3))
+#define SPRD_LCR_DATA_LEN5	0x0
+#define SPRD_LCR_DATA_LEN6	0x4
+#define SPRD_LCR_DATA_LEN7	0x8
+#define SPRD_LCR_DATA_LEN8	0xc
+#define SPRD_LCR_PARITY		(BIT(0) | BIT(1))
+#define SPRD_LCR_PARITY_EN	0x2
+#define SPRD_LCR_EVEN_PAR	0x0
+#define SPRD_LCR_ODD_PAR	0x1
+
+/* control register 1 */
+#define SPRD_CTL1		0x001C
+#define RX_HW_FLOW_CTL_THLD	BIT(6)
+#define RX_HW_FLOW_CTL_EN	BIT(7)
+#define TX_HW_FLOW_CTL_EN	BIT(8)
+
+/* fifo threshold register */
+#define SPRD_CTL2		0x0020
+#define THLD_TX_EMPTY		0x40
+#define THLD_RX_FULL		0x40
+
+/* config baud rate register */
+#define SPRD_CLKD0		0x0024
+#define SPRD_CLKD1		0x0028
+
+/* interrupt mask status register */
+#define SPRD_IMSR		0x002C
+#define SPRD_IMSR_RX_FIFO_FULL	BIT(0)
+#define SPRD_IMSR_TX_FIFO_EMPTY	BIT(1)
+#define SPRD_IMSR_BREAK_DETECT	BIT(7)
+#define SPRD_IMSR_TIMEOUT	BIT(13)
+
+struct reg_backup {
+	uint32_t ien;
+	uint32_t ctrl0;
+	uint32_t ctrl1;
+	uint32_t ctrl2;
+	uint32_t clkd0;
+	uint32_t clkd1;
+	uint32_t dspwait;
+};
+
+struct sprd_uart_port {
+	struct uart_port port;
+	struct reg_backup reg_bak;
+	char name[16];
+};
+
+static struct sprd_uart_port *sprd_port[UART_NR_MAX] = { NULL };
+
+static inline unsigned int serial_in(struct uart_port *port, int offset)
+{
+	return readl_relaxed(port->membase + offset);
+}
+
+static inline void serial_out(struct uart_port *port, int offset, int value)
+{
+	writel_relaxed(value, port->membase + offset);
+}
+
+static unsigned int sprd_tx_empty(struct uart_port *port)
+{
+	if (serial_in(port, SPRD_STS1) & 0xff00)
+		return 0;
+	else
+		return TIOCSER_TEMT;
+}
+
+static unsigned int sprd_get_mctrl(struct uart_port *port)
+{
+	return TIOCM_DSR | TIOCM_CTS;
+}
+
+static void sprd_set_mctrl(struct uart_port *port, unsigned int mctrl)
+{
+	/* nothing to do */
+}
+
+static void sprd_stop_tx(struct uart_port *port)
+{
+	unsigned int ien, iclr;
+
+	iclr = serial_in(port, SPRD_ICLR);
+	ien = serial_in(port, SPRD_IEN);
+
+	iclr |= SPRD_IEN_TX_EMPTY;
+	ien &= ~SPRD_IEN_TX_EMPTY;
+
+	serial_out(port, SPRD_ICLR, iclr);
+	serial_out(port, SPRD_IEN, ien);
+}
+
+static void sprd_start_tx(struct uart_port *port)
+{
+	unsigned int ien;
+
+	ien = serial_in(port, SPRD_IEN);
+	if (!(ien & SPRD_IEN_TX_EMPTY)) {
+		ien |= SPRD_IEN_TX_EMPTY;
+		serial_out(port, SPRD_IEN, ien);
+	}
+}
+
+static void sprd_stop_rx(struct uart_port *port)
+{
+	unsigned int ien, iclr;
+
+	iclr = serial_in(port, SPRD_ICLR);
+	ien = serial_in(port, SPRD_IEN);
+
+	ien &= ~(SPRD_IEN_RX_FULL | SPRD_IEN_BREAK_DETECT);
+	iclr |= SPRD_IEN_RX_FULL | SPRD_IEN_BREAK_DETECT;
+
+	serial_out(port, SPRD_IEN, ien);
+	serial_out(port, SPRD_ICLR, iclr);
+}
+
+/* The Sprd serial does not support this function. */
+static void sprd_break_ctl(struct uart_port *port, int break_state)
+{
+	/* nothing to do */
+}
+
+static inline int handle_lsr_errors(struct uart_port *port,
+				    unsigned int *flag,
+				    unsigned int *lsr)
+{
+	int ret = 0;
+
+	/* statistics */
+	if (*lsr & SPRD_LSR_BI) {
+		*lsr &= ~(SPRD_LSR_FE | SPRD_LSR_PE);
+		port->icount.brk++;
+		ret = uart_handle_break(port);
+		if (ret)
+			return ret;
+	} else if (*lsr & SPRD_LSR_PE)
+		port->icount.parity++;
+	else if (*lsr & SPRD_LSR_FE)
+		port->icount.frame++;
+	if (*lsr & SPRD_LSR_OE)
+		port->icount.overrun++;
+
+	/* mask off conditions which should be ignored */
+	*lsr &= port->read_status_mask;
+	if (*lsr & SPRD_LSR_BI)
+		*flag = TTY_BREAK;
+	else if (*lsr & SPRD_LSR_PE)
+		*flag = TTY_PARITY;
+	else if (*lsr & SPRD_LSR_FE)
+		*flag = TTY_FRAME;
+
+	return ret;
+}
+
+static inline void sprd_rx(int irq, void *dev_id)
+{
+	struct uart_port *port = dev_id;
+	struct tty_port *tty = &port->state->port;
+	unsigned int ch, flag, lsr, max_count = SPRD_TIMEOUT;
+
+	while ((serial_in(port, SPRD_STS1) & 0x00ff) && max_count--) {
+		lsr = serial_in(port, SPRD_LSR);
+		ch = serial_in(port, SPRD_RXD);
+		flag = TTY_NORMAL;
+		port->icount.rx++;
+
+		if (lsr & (SPRD_LSR_BI | SPRD_LSR_PE
+			| SPRD_LSR_FE | SPRD_LSR_OE))
+			if (handle_lsr_errors(port, &lsr, &flag))
+				continue;
+		if (uart_handle_sysrq_char(port, ch))
+			continue;
+
+		uart_insert_char(port, lsr, SPRD_LSR_OE, ch, flag);
+	}
+
+	tty_flip_buffer_push(tty);
+}
+
+static inline void sprd_tx(int irq, void *dev_id)
+{
+	struct uart_port *port = dev_id;
+	struct circ_buf *xmit = &port->state->xmit;
+	int count;
+
+	if (port->x_char) {
+		serial_out(port, SPRD_TXD, port->x_char);
+		port->icount.tx++;
+		port->x_char = 0;
+		return;
+	}
+
+	if (uart_circ_empty(xmit) || uart_tx_stopped(port)) {
+		sprd_stop_tx(port);
+		return;
+	}
+
+	count = THLD_TX_EMPTY;
+	do {
+		serial_out(port, SPRD_TXD, xmit->buf[xmit->tail]);
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
+		port->icount.tx++;
+		if (uart_circ_empty(xmit))
+			break;
+	} while (--count > 0);
+
+	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		uart_write_wakeup(port);
+
+	if (uart_circ_empty(xmit))
+		sprd_stop_tx(port);
+}
+
+/* this handles the interrupt from one port */
+static irqreturn_t sprd_handle_irq(int irq, void *dev_id)
+{
+	struct uart_port *port = (struct uart_port *)dev_id;
+	unsigned int ims;
+
+	spin_lock(&port->lock);
+
+	ims = serial_in(port, SPRD_IMSR);
+
+	if (!ims)
+		return IRQ_NONE;
+
+	serial_out(port, SPRD_ICLR, ~0);
+
+	if (ims & (SPRD_IMSR_RX_FIFO_FULL |
+		SPRD_IMSR_BREAK_DETECT | SPRD_IMSR_TIMEOUT))
+		sprd_rx(irq, port);
+
+	if (ims & SPRD_IMSR_TX_FIFO_EMPTY)
+		sprd_tx(irq, port);
+
+	spin_unlock(&port->lock);
+
+	return IRQ_HANDLED;
+}
+
+static int sprd_startup(struct uart_port *port)
+{
+	int ret = 0;
+	unsigned int ien, ctrl1;
+	unsigned int timeout;
+	struct sprd_uart_port *sp;
+
+	serial_out(port, SPRD_CTL2, ((THLD_TX_EMPTY << 8) | THLD_RX_FULL));
+
+	/* clear rx fifo */
+	timeout = SPRD_TIMEOUT;
+	while (timeout-- && serial_in(port, SPRD_STS1) & 0x00ff)
+		serial_in(port, SPRD_RXD);
+
+	/* clear tx fifo */
+	timeout = SPRD_TIMEOUT;
+	while (timeout-- && serial_in(port, SPRD_STS1) & 0xff00)
+		cpu_relax();
+
+	/* clear interrupt */
+	serial_out(port, SPRD_IEN, 0x0);
+	serial_out(port, SPRD_ICLR, ~0);
+
+	/* allocate irq */
+	sp = container_of(port, struct sprd_uart_port, port);
+	snprintf(sp->name, sizeof(sp->name), "sprd_serial%d", port->line);
+	ret = devm_request_irq(port->dev, port->irq, sprd_handle_irq,
+				IRQF_SHARED, sp->name, port);
+	if (ret) {
+		dev_err(port->dev, "fail to request serial irq %d, ret=%d\n",
+			port->irq, ret);
+		return ret;
+	}
+	ctrl1 = serial_in(port, SPRD_CTL1);
+	ctrl1 |= 0x3e00 | THLD_RX_FULL;
+	serial_out(port, SPRD_CTL1, ctrl1);
+
+	/* enable interrupt */
+	spin_lock(&port->lock);
+	ien = serial_in(port, SPRD_IEN);
+	ien |= SPRD_IEN_RX_FULL | SPRD_IEN_BREAK_DETECT | SPRD_IEN_TIMEOUT;
+	serial_out(port, SPRD_IEN, ien);
+	spin_unlock(&port->lock);
+
+	return 0;
+}
+
+static void sprd_shutdown(struct uart_port *port)
+{
+	serial_out(port, SPRD_IEN, 0x0);
+	serial_out(port, SPRD_ICLR, ~0);
+	devm_free_irq(port->dev, port->irq, port);
+}
+
+static void sprd_set_termios(struct uart_port *port,
+				    struct ktermios *termios,
+				    struct ktermios *old)
+{
+	unsigned int baud, quot;
+	unsigned int lcr, fc;
+	unsigned long flags;
+
+	/* ask the core to calculate the divisor for us */
+	baud = uart_get_baud_rate(port, termios, old, 1200, 3000000);
+
+	quot = (unsigned int)((port->uartclk + baud / 2) / baud);
+
+	/* set data length */
+	lcr = serial_in(port, SPRD_LCR);
+	lcr &= ~SPRD_LCR_DATA_LEN;
+	switch (termios->c_cflag & CSIZE) {
+	case CS5:
+		lcr |= SPRD_LCR_DATA_LEN5;
+		break;
+	case CS6:
+		lcr |= SPRD_LCR_DATA_LEN6;
+		break;
+	case CS7:
+		lcr |= SPRD_LCR_DATA_LEN7;
+		break;
+	case CS8:
+	default:
+		lcr |= SPRD_LCR_DATA_LEN8;
+		break;
+	}
+
+	/* calculate stop bits */
+	lcr &= ~(SPRD_LCR_STOP_1BIT | SPRD_LCR_STOP_2BIT);
+	if (termios->c_cflag & CSTOPB)
+		lcr |= SPRD_LCR_STOP_2BIT;
+	else
+		lcr |= SPRD_LCR_STOP_1BIT;
+
+	/* calculate parity */
+	lcr &= ~SPRD_LCR_PARITY;
+	termios->c_cflag &= ~CMSPAR;	/* no support mark/space */
+	if (termios->c_cflag & PARENB) {
+		lcr |= SPRD_LCR_PARITY_EN;
+		if (termios->c_cflag & PARODD)
+			lcr |= SPRD_LCR_ODD_PAR;
+		else
+			lcr |= SPRD_LCR_EVEN_PAR;
+	}
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	/* update the per-port timeout */
+	uart_update_timeout(port, termios->c_cflag, baud);
+
+	port->read_status_mask = SPRD_LSR_OE;
+	if (termios->c_iflag & INPCK)
+		port->read_status_mask |= SPRD_LSR_FE | SPRD_LSR_PE;
+	if (termios->c_iflag & (BRKINT | PARMRK))
+		port->read_status_mask |= SPRD_LSR_BI;
+
+	/* characters to ignore */
+	port->ignore_status_mask = 0;
+	if (termios->c_iflag & IGNPAR)
+		port->ignore_status_mask |= SPRD_LSR_PE | SPRD_LSR_FE;
+	if (termios->c_iflag & IGNBRK) {
+		port->ignore_status_mask |= SPRD_LSR_BI;
+		/*
+		 * If we're ignoring parity and break indicators,
+		 * ignore overruns too (for real raw support).
+		 */
+		if (termios->c_iflag & IGNPAR)
+			port->ignore_status_mask |= SPRD_LSR_OE;
+	}
+
+	/* flow control */
+	fc = serial_in(port, SPRD_CTL1);
+	fc &= ~(RX_HW_FLOW_CTL_THLD | RX_HW_FLOW_CTL_EN | TX_HW_FLOW_CTL_EN);
+	if (termios->c_cflag & CRTSCTS) {
+		fc |= RX_HW_FLOW_CTL_THLD;
+		fc |= RX_HW_FLOW_CTL_EN;
+		fc |= TX_HW_FLOW_CTL_EN;
+	}
+
+	/* clock divider bit0~bit15 */
+	serial_out(port, SPRD_CLKD0, quot & 0xffff);
+
+	/* clock divider bit16~bit20 */
+	serial_out(port, SPRD_CLKD1, (quot & 0x1f0000) >> 16);
+	serial_out(port, SPRD_LCR, lcr);
+	fc |= 0x3e00 | THLD_RX_FULL;
+	serial_out(port, SPRD_CTL1, fc);
+
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	/* Don't rewrite B0 */
+	if (tty_termios_baud_rate(termios))
+		tty_termios_encode_baud_rate(termios, baud, baud);
+}
+
+static const char *sprd_type(struct uart_port *port)
+{
+	return "SPX";
+}
+
+static void sprd_release_port(struct uart_port *port)
+{
+	/* nothing to do */
+}
+
+static int sprd_request_port(struct uart_port *port)
+{
+	return 0;
+}
+
+static void sprd_config_port(struct uart_port *port, int flags)
+{
+	if (flags & UART_CONFIG_TYPE)
+		port->type = PORT_SPRD;
+}
+
+static int sprd_verify_port(struct uart_port *port,
+				   struct serial_struct *ser)
+{
+	if (ser->type != PORT_SPRD)
+		return -EINVAL;
+	if (port->irq != ser->irq)
+		return -EINVAL;
+	return 0;
+}
+
+static struct uart_ops serial_sprd_ops = {
+	.tx_empty = sprd_tx_empty,
+	.get_mctrl = sprd_get_mctrl,
+	.set_mctrl = sprd_set_mctrl,
+	.stop_tx = sprd_stop_tx,
+	.start_tx = sprd_start_tx,
+	.stop_rx = sprd_stop_rx,
+	.break_ctl = sprd_break_ctl,
+	.startup = sprd_startup,
+	.shutdown = sprd_shutdown,
+	.set_termios = sprd_set_termios,
+	.type = sprd_type,
+	.release_port = sprd_release_port,
+	.request_port = sprd_request_port,
+	.config_port = sprd_config_port,
+	.verify_port = sprd_verify_port,
+};
+
+#ifdef CONFIG_SERIAL_SPRD_CONSOLE
+static inline void wait_for_xmitr(struct uart_port *port)
+{
+	unsigned int status, tmout = 10000;
+
+	/* wait up to 10ms for the character(s) to be sent */
+	do {
+		status = serial_in(port, SPRD_STS1);
+		if (--tmout == 0)
+			break;
+		udelay(1);
+	} while (status & 0xff00);
+}
+
+static void sprd_console_putchar(struct uart_port *port, int ch)
+{
+	wait_for_xmitr(port);
+	serial_out(port, SPRD_TXD, ch);
+}
+
+static void sprd_console_write(struct console *co, const char *s,
+				      unsigned int count)
+{
+	struct uart_port *port = &sprd_port[co->index]->port;
+	int locked = 1;
+	unsigned long flags;
+
+	if (oops_in_progress)
+		locked = spin_trylock(&port->lock);
+	else
+		spin_lock_irqsave(&port->lock, flags);
+
+	uart_console_write(port, s, count, sprd_console_putchar);
+
+	/* wait for transmitter to become empty */
+	wait_for_xmitr(port);
+
+	if (locked)
+		spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static int __init sprd_console_setup(struct console *co, char *options)
+{
+	struct uart_port *port;
+	int baud = 115200;
+	int bits = 8;
+	int parity = 'n';
+	int flow = 'n';
+
+	if (co->index >= UART_NR_MAX || co->index < 0)
+		co->index = 0;
+
+	port = &sprd_port[co->index]->port;
+	if (port == NULL) {
+		pr_info("serial port %d not yet initialized\n", co->index);
+		return -ENODEV;
+	}
+	if (options)
+		uart_parse_options(options, &baud, &parity, &bits, &flow);
+
+	return uart_set_options(port, co, baud, parity, bits, flow);
+}
+
+static struct uart_driver sprd_uart_driver;
+static struct console sprd_console = {
+	.name = SPRD_TTY_NAME,
+	.write = sprd_console_write,
+	.device = uart_console_device,
+	.setup = sprd_console_setup,
+	.flags = CON_PRINTBUFFER,
+	.index = -1,
+	.data = &sprd_uart_driver,
+};
+
+#define SPRD_CONSOLE	(&sprd_console)
+
+/* Support for earlycon */
+static void sprd_putc(struct uart_port *port, int c)
+{
+	unsigned int timeout = SPRD_TIMEOUT;
+
+	while (timeout-- &&
+		   !(readl(port->membase + SPRD_LSR) & SPRD_LSR_TX_OVER))
+		cpu_relax();
+
+	writeb(c, port->membase + SPRD_TXD);
+}
+
+static void sprd_early_write(struct console *con, const char *s,
+				    unsigned n)
+{
+	struct earlycon_device *dev = con->data;
+
+	uart_console_write(&dev->port, s, n, sprd_putc);
+}
+
+static int __init sprd_early_console_setup(
+				struct earlycon_device *device,
+				const char *opt)
+{
+	if (!device->port.membase)
+		return -ENODEV;
+
+	device->con->write = sprd_early_write;
+	return 0;
+}
+
+EARLYCON_DECLARE(sprd_serial, sprd_early_console_setup);
+OF_EARLYCON_DECLARE(sprd_serial, "sprd,sc9836-uart",
+		    sprd_early_console_setup);
+
+#else /* !CONFIG_SERIAL_SPRD_CONSOLE */
+#define SPRD_CONSOLE		NULL
+#endif
+
+static struct uart_driver sprd_uart_driver = {
+	.owner = THIS_MODULE,
+	.driver_name = "sprd_serial",
+	.dev_name = SPRD_TTY_NAME,
+	.major = 0,
+	.minor = 0,
+	.nr = UART_NR_MAX,
+	.cons = SPRD_CONSOLE,
+};
+
+static int sprd_probe_dt_alias(int index, struct device *dev)
+{
+	struct device_node *np;
+	static bool seen_dev_with_alias;
+	static bool seen_dev_without_alias;
+	int ret = index;
+
+	if (!IS_ENABLED(CONFIG_OF))
+		return ret;
+
+	np = dev->of_node;
+	if (!np)
+		return ret;
+
+	ret = of_alias_get_id(np, "serial");
+	if (IS_ERR_VALUE(ret)) {
+		seen_dev_without_alias = true;
+		ret = index;
+	} else {
+		seen_dev_with_alias = true;
+		if (ret >= ARRAY_SIZE(sprd_port) || sprd_port[ret] != NULL) {
+			dev_warn(dev, "requested serial port %d  not available.\n", ret);
+			ret = index;
+		}
+	}
+
+	if (seen_dev_with_alias && seen_dev_without_alias)
+		dev_warn(dev, "aliased and non-aliased serial devices found in device tree. Serial port enumeration may be unpredictable.\n");
+
+	return ret;
+}
+
+static int sprd_remove(struct platform_device *dev)
+{
+	struct sprd_uart_port *sup = platform_get_drvdata(dev);
+	bool busy = false;
+	int i;
+
+	if (sup)
+		uart_remove_one_port(&sprd_uart_driver, &sup->port);
+
+	for (i = 0; i < ARRAY_SIZE(sprd_port); i++)
+		if (sprd_port[i] == sup)
+			sprd_port[i] = NULL;
+		else if (sprd_port[i])
+			busy = true;
+
+	if (!busy)
+		uart_unregister_driver(&sprd_uart_driver);
+
+	return 0;
+}
+
+static int sprd_probe(struct platform_device *pdev)
+{
+	struct resource *res;
+	struct uart_port *up;
+	struct clk *clk;
+	int irq;
+	int index;
+	int ret;
+
+	for (index = 0; index < ARRAY_SIZE(sprd_port); index++)
+		if (sprd_port[index] == NULL)
+			break;
+
+	if (index == ARRAY_SIZE(sprd_port))
+		return -EBUSY;
+
+	index = sprd_probe_dt_alias(index, &pdev->dev);
+
+	sprd_port[index] = devm_kzalloc(&pdev->dev,
+		sizeof(*sprd_port[index]), GFP_KERNEL);
+	if (!sprd_port[index])
+		return -ENOMEM;
+
+	up = &sprd_port[index]->port;
+	up->dev = &pdev->dev;
+	up->line = index;
+	up->type = PORT_SPRD;
+	up->iotype = SERIAL_IO_PORT;
+	up->uartclk = SPRD_DEF_RATE;
+	up->fifosize = SPRD_FIFO_SIZE;
+	up->ops = &serial_sprd_ops;
+	up->flags = ASYNC_BOOT_AUTOCONF;
+
+	clk = devm_clk_get(&pdev->dev, NULL);
+	if (!IS_ERR(clk))
+		up->uartclk = clk_get_rate(clk);
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(&pdev->dev, "not provide mem resource\n");
+		return -ENODEV;
+	}
+	up->mapbase = res->start;
+	up->membase = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(up->membase))
+		return PTR_ERR(up->membase);
+
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0) {
+		dev_err(&pdev->dev, "not provide irq resource\n");
+		return -ENODEV;
+	}
+	up->irq = irq;
+
+	if (!sprd_uart_driver.state) {
+		ret = uart_register_driver(&sprd_uart_driver);
+		if (ret < 0) {
+			pr_err("Failed to register SPRD-UART driver\n");
+			return ret;
+		}
+	}
+
+	ret = uart_add_one_port(&sprd_uart_driver, up);
+	if (ret) {
+		sprd_port[index] = NULL;
+		sprd_remove(pdev);
+	}
+
+	platform_set_drvdata(pdev, up);
+
+	return ret;
+}
+
+static int sprd_suspend(struct device *dev)
+{
+	int id = to_platform_device(dev)->id;
+	struct uart_port *port = &sprd_port[id]->port;
+
+	uart_suspend_port(&sprd_uart_driver, port);
+
+	return 0;
+}
+
+static int sprd_resume(struct device *dev)
+{
+	int id = to_platform_device(dev)->id;
+	struct uart_port *port = &sprd_port[id]->port;
+
+	uart_resume_port(&sprd_uart_driver, port);
+
+	return 0;
+}
+
+static SIMPLE_DEV_PM_OPS(sprd_pm_ops, sprd_suspend, sprd_resume);
+
+static const struct of_device_id serial_ids[] = {
+	{.compatible = "sprd,sc9836-uart",},
+	{}
+};
+
+static struct platform_driver sprd_platform_driver = {
+	.probe		= sprd_probe,
+	.remove		= sprd_remove,
+	.driver		= {
+		.name	= "sprd_serial",
+		.of_match_table = of_match_ptr(serial_ids),
+		.pm	= &sprd_pm_ops,
+	},
+};
+
+module_platform_driver(sprd_platform_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Spreadtrum SoC serial driver series");
diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h
index c172180..7e6eb39 100644
--- a/include/uapi/linux/serial_core.h
+++ b/include/uapi/linux/serial_core.h
@@ -248,4 +248,7 @@
 /* MESON */
 #define PORT_MESON	109
 
+/* SPRD SERIAL  */
+#define PORT_SPRD	110
+
 #endif /* _UAPILINUX_SERIAL_CORE_H */
-- 
1.7.9.5

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox