Linux userland API discussions
 help / color / mirror / Atom feed
* Re: [PATCH v8 2/2] tty/serial: Add Spreadtrum sc9836-uart driver support
From: Lyra Zhang @ 2015-01-27 15:51 UTC (permalink / raw)
  To: Peter Hurley, gregkh@linuxfoundation.org
  Cc: Chunyan Zhang, robh+dt@kernel.org, Mark Rutland, Arnd Bergmann,
	gnomes@lxorguk.ukuu.org.uk, Pawel Moll,
	ijc+devicetree@hellion.org.uk, Kumar Gala, Grant Likely,
	jslaby@suse.cz, Heiko Stübner, jason@lakedaemon.net,
	florian.vaussard@epfl.ch, andrew@lunn.ch, Hayato Suzuki,
	antonynpavlov@gmail.com, Shawn Guo, Orson Zhai,
	geng.ren@spreadtrum.com, zhizhou.zhang
In-Reply-To: <54C7A514.6090206@hurleysoftware.com>

On Tue, Jan 27, 2015 at 10:47 PM, Peter Hurley <peter@hurleysoftware.com> wrote:
> Hi Chunyan,
>
> Minor but important fixes below.
>
> And for the v9 version, please only use "To:" for
> "Greg Kroah-Hartman <gregkh@linuxfoundation.org>"
>

Ok, thank you, I'll address your comments below and send the v9 to Greg.

Greg,
sorry, I'll send you an updated version tomorrow.

> All other recipients should only be Cc:
>
> Regards,
> Peter Hurley
>
>
> On 01/27/2015 02:56 AM, Chunyan Zhang wrote:
>> Add a full sc9836-uart driver for SC9836 SoC which is based on the
>> spreadtrum sharkl64 platform.
>> This driver also supports earlycon.
>
> [...]
>
>> +static int sprd_probe_dt_alias(int index, struct device *dev)
>> +{
>> +     struct device_node *np;
>> +     static bool seen_dev_with_alias;
>> +     static bool seen_dev_without_alias;
>         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>
> delete these two lines; these were used for the message deleted in a
> previous patch version.
>
>> +     int ret = index;
>> +
>> +     if (!IS_ENABLED(CONFIG_OF))
>> +             return ret;
>> +
>> +     np = dev->of_node;
>> +     if (!np)
>> +             return ret;
>> +
>> +     ret = of_alias_get_id(np, "serial");
>> +     if (IS_ERR_VALUE(ret)) {
>> +             seen_dev_without_alias = true;
>                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> delete this line.
>
>> +             ret = index;
>> +     } else {
>> +             seen_dev_with_alias = true;
>                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
> delete this line.
>
>> +             if (ret >= ARRAY_SIZE(sprd_port) || sprd_port[ret] != NULL) {
>> +                     dev_warn(dev, "requested serial port %d  not available.\n", ret);
>> +                     ret = index;
>> +             }
>> +     }
>
> Simplify the entire "if (IS_ERR_VALUE(ret))" statement to:
>
>         if (IS_ERR_VALUE(ret))
>                 ret = index;
>         else if (ret >= ..................) {
>                 dev_warn(.....);
>                 ret = index;
>         }
>
>
>> +
>> +     return ret;
>> +}
>> +
>> +static int sprd_remove(struct platform_device *dev)
>> +{
>> +     struct sprd_uart_port *sup = platform_get_drvdata(dev);
>> +
>> +     if (sup) {
>> +             uart_remove_one_port(&sprd_uart_driver, &sup->port);
>> +             sprd_port[sup->port.line] = NULL;
>> +             sprd_ports_num--;
>> +     }
>> +
>> +     if (!sprd_ports_num)
>> +             uart_unregister_driver(&sprd_uart_driver);
>> +
>> +     return 0;
>> +}
>> +
>> +static int sprd_probe(struct platform_device *pdev)
>> +{
>> +     struct resource *res;
>> +     struct uart_port *up;
>> +     struct clk *clk;
>> +     int irq;
>> +     int index;
>> +     int ret;
>> +
>> +     for (index = 0; index < ARRAY_SIZE(sprd_port); index++)
>> +             if (sprd_port[index] == NULL)
>> +                     break;
>> +
>> +     if (index == ARRAY_SIZE(sprd_port))
>> +             return -EBUSY;
>> +
>> +     index = sprd_probe_dt_alias(index, &pdev->dev);
>> +
>> +     sprd_port[index] = devm_kzalloc(&pdev->dev,
>> +             sizeof(*sprd_port[index]), GFP_KERNEL);
>> +     if (!sprd_port[index])
>> +             return -ENOMEM;
>> +
>> +     pdev->id = index;
>         ^^^^^^^^^^^^^^^^
> delete this line.
>
> The platform device id cannot be assigned by the driver.
> (This was left over from trying to fix sprd_suspend/sprd_resume
> but that's fixed correctly now.)
>
>> +
>> +     up = &sprd_port[index]->port;
>> +     up->dev = &pdev->dev;
>> +     up->line = index;
>> +     up->type = PORT_SPRD;
>> +     up->iotype = SERIAL_IO_PORT;
>> +     up->uartclk = SPRD_DEF_RATE;
>> +     up->fifosize = SPRD_FIFO_SIZE;
>> +     up->ops = &serial_sprd_ops;
>> +     up->flags = UPF_BOOT_AUTOCONF;
>> +
>> +     clk = devm_clk_get(&pdev->dev, NULL);
>> +     if (!IS_ERR(clk))
>> +             up->uartclk = clk_get_rate(clk);
>> +
>> +     res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
>> +     if (!res) {
>> +             dev_err(&pdev->dev, "not provide mem resource\n");
>> +             return -ENODEV;
>> +     }
>> +     up->mapbase = res->start;
>> +     up->membase = devm_ioremap_resource(&pdev->dev, res);
>> +     if (IS_ERR(up->membase))
>> +             return PTR_ERR(up->membase);
>> +
>> +     irq = platform_get_irq(pdev, 0);
>> +     if (irq < 0) {
>> +             dev_err(&pdev->dev, "not provide irq resource\n");
>> +             return -ENODEV;
>> +     }
>> +     up->irq = irq;
>> +
>> +     if (!sprd_ports_num) {
>> +             ret = uart_register_driver(&sprd_uart_driver);
>> +             if (ret < 0) {
>> +                     pr_err("Failed to register SPRD-UART driver\n");
>> +                     return ret;
>> +             }
>> +     }
>> +     sprd_ports_num++;
>> +
>> +     ret = uart_add_one_port(&sprd_uart_driver, up);
>> +     if (ret) {
>> +             sprd_port[index] = NULL;
>> +             sprd_remove(pdev);
>> +     }
>> +
>> +     platform_set_drvdata(pdev, up);
>> +
>> +     return ret;
>> +}
>

^ permalink raw reply

* Re: [PATCH 01/13] kdbus: add documentation
From: Andy Lutomirski @ 2015-01-27 16:03 UTC (permalink / raw)
  To: David Herrmann
  Cc: Michael Kerrisk (man-pages), Greg Kroah-Hartman,
	Austin S Hemmelgarn, Daniel Mack, Arnd Bergmann,
	Eric W. Biederman, One Thousand Gnomes, Tom Gundersen,
	Theodore T'so, Linux API, linux-kernel, Djalal Harouni,
	Johannes Stezenbach, Christoph Hellwig
In-Reply-To: <CANq1E4SkHhs1pWUe-TzG7bzk1M-Q++mB2vmQGuYx0RMF53wg4Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Tue, Jan 27, 2015 at 7:05 AM, David Herrmann <dh.herrmann-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> Hi
>
> On Mon, Jan 26, 2015 at 3:46 PM, Michael Kerrisk (man-pages)
> <mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
>> Hello Greg,
>>
>> On 01/23/2015 05:08 PM, Greg Kroah-Hartman wrote:
>>> On Thu, Jan 22, 2015 at 09:49:00AM -0500, Austin S Hemmelgarn wrote:
>>>> While I agree that there should be a way for userspace to get the list of
>>>> supported operations, userspace apps will only actually care about that
>>>> once, when they begin talking to kdbus, because (ignoring the live kernel
>>>> patching that people have been working on recently) the list of supported
>>>> operations isn't going to change while the system is running.  While a u64
>>>> copy has relatively low overhead, it does have overhead, and that is very
>>>> significant when you consider part of the reason some people want kdbus is
>>>> for the performance gain.  Especially for those automotive applications that
>>>> have been mentioned which fire off thousands of messages during start-up,
>>>> every little bit of performance is significant.
>>>
>>> A single u64 in a structure is not going to be measurable at all,
>>> processors just copy memory too fast these days for 4 extra bytes to be
>>> noticeable.
>>
>> It depends on the definition of measurable, I suppose, but this statement
>> appears incorrect to me. In some cases (e.g., kdbus_msg_info) we're talking
>> about *two* u64 fields (kernel_gs, kernel_msg_flags) being used to pass back
>> sets of valid flags. That's 16 bytes, and it definitely makes a difference.
>> Simply running a loop that does a naive memcpy() in a tight user-space
>> loop (code below), I see the following for the execution of 1e9 loops:
>>
>>     Including the two extra u64 fields: 3.2 sec
>>     Without the two extra u64 fields:   2.6 sec
>>
>> On the same box, doing 1e9 calls to getppid() (i.e., pretty much the
>> simplest syscall, giving us a rough measure of the context switch) takes
>> 68 seconds. In other words, the cost of copying those 16 bytes is about 1%
>> of the base context switch/syscall cost. I assume the costs of copying
>> those 16 bytes across the kernel-user-space boundary would not be cheaper,
>> but have not tested that. If my assumption is correct, then 1% seems a
>> significant figure to me in an API whose raison d'être is speed.
>
> I have no idea how this is related to any kdbus ioctl?
>
> A 16byte copy does not affect the performance of kdbus message
> transactions in any way that matters.
>

Sorry for jumping in so late.  Since this version of kdbus was sent,
I've been on vacation for part of the time and I had the flu for the
rest of the time.

What are the performance goals of kdbus?  How fast is it ever intended
to be?  The reason I ask is that, in the current design, kdbus
collects "metadata" (credentials and other identifying information,
collected in kdbus_meta_proc_collect) from the sender of every message
*at send time*. [1]  This is slow, and it will always be slow.  The
slowness of this operation will, in my personal system performance
crystal ball, overshadow the cost of a 16 byte copy by several orders
of magnitude.

[1] After much discussion last time around, I'm at least convinced
that the kdbus people have reasons to like the idea of capturing
metadata for each message.  I still think the design is wrong even
from a security standpoint, but right now I'm talking about
performance.  If you want the data plane to be fast, it should be
separated from the control plane as much as possible, and this design
is the opposite.

--Andy

^ permalink raw reply

* Re: [tpmdd-devel] [PATCH] tpm: fix suspend/resume paths for TPM 2.0
From: Scot Doyle @ 2015-01-27 16:52 UTC (permalink / raw)
  To: Jarkko Sakkinen
  Cc: peterhuewe, ashley, christophe.ricard, jason.gunthorpe, linux-api,
	linux-kernel, tpmdd-devel, trousers-tech
In-Reply-To: <1422356564-17312-1-git-send-email-jarkko.sakkinen@linux.intel.com>

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1214 bytes --]

On Tue, 27 Jan 2015, Jarkko Sakkinen wrote:
> Fixed suspend/resume paths for TPM 2.0 and consolidated all the
> associated code to the tpm_pm_suspend() and tpm_pm_resume()
> functions. Resume path should be handled by the firmware, i.e.
> Startup(CLEAR) for hibernate and Startup(STATE) for suspend.
> 
> There might be some non-PC embedded devices in the future where
> Startup() is not handled by the FW but fixing the code for
> those IMHO should be postponed until there is hardware available
> to test the fixes although extra Startup in the driver code is
> essentially a NOP.
> 
> Reported-by: Peter Hüwe <PeterHuewe@gmx.de>
> Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> ---

...

> --- a/drivers/char/tpm/tpm_tis.c
> +++ b/drivers/char/tpm/tpm_tis.c
> @@ -865,25 +865,23 @@ static void tpm_tis_reenable_interrupts(struct tpm_chip *chip)
>  static int tpm_tis_resume(struct device *dev)
>  {

...

> +	/* TPM 1.2 requires self-test on resume. */
> +	if (!(chip->flags & TPM_CHIP_FLAG_TPM2)) {
> +		ret = tpm_do_selftest(chip);
> +		if (ret < 0)
> +			return ret;

Just to note, the return value from tpm_do_selftest() on TPM 1.2 chips was 
previously ignored. Mine does return 0.

^ permalink raw reply

* Re: [tpmdd-devel] [PATCH] tpm: fix suspend/resume paths for TPM 2.0
From: Jarkko Sakkinen @ 2015-01-27 16:57 UTC (permalink / raw)
  To: Scot Doyle
  Cc: peterhuewe-Mmb7MZpHnFY, ashley-fm2HMyfA2y6tG0bUXCXiUA,
	christophe.ricard-Re5JQEeQqe8AvxtiuMwx3w,
	jason.gunthorpe-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	tpmdd-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	trousers-tech-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
In-Reply-To: <alpine.DEB.2.11.1501271620520.1725-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>

On Tue, 2015-01-27 at 16:52 +0000, Scot Doyle wrote:
> On Tue, 27 Jan 2015, Jarkko Sakkinen wrote:
> > Fixed suspend/resume paths for TPM 2.0 and consolidated all the
> > associated code to the tpm_pm_suspend() and tpm_pm_resume()
> > functions. Resume path should be handled by the firmware, i.e.
> > Startup(CLEAR) for hibernate and Startup(STATE) for suspend.
> > 
> > There might be some non-PC embedded devices in the future where
> > Startup() is not handled by the FW but fixing the code for
> > those IMHO should be postponed until there is hardware available
> > to test the fixes although extra Startup in the driver code is
> > essentially a NOP.
> > 
> > Reported-by: Peter Hüwe <PeterHuewe-Mmb7MZpHnFY@public.gmane.org>
> > Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
> > ---
> 
> ...
> 
> > --- a/drivers/char/tpm/tpm_tis.c
> > +++ b/drivers/char/tpm/tpm_tis.c
> > @@ -865,25 +865,23 @@ static void tpm_tis_reenable_interrupts(struct tpm_chip *chip)
> >  static int tpm_tis_resume(struct device *dev)
> >  {
> 
> ...
> 
> > +	/* TPM 1.2 requires self-test on resume. */
> > +	if (!(chip->flags & TPM_CHIP_FLAG_TPM2)) {
> > +		ret = tpm_do_selftest(chip);
> > +		if (ret < 0)
> > +			return ret;
> 
> Just to note, the return value from tpm_do_selftest() on TPM 1.2 chips was 
> previously ignored. Mine does return 0.

Right. I can update the patch to ignore return value if the majority
wants that.

/Jarkko

^ permalink raw reply

* Re: [tpmdd-devel] [PATCH] tpm: fix suspend/resume paths for TPM 2.0
From: Jason Gunthorpe @ 2015-01-27 17:03 UTC (permalink / raw)
  To: Jarkko Sakkinen
  Cc: Scot Doyle, peterhuewe-Mmb7MZpHnFY, ashley-fm2HMyfA2y6tG0bUXCXiUA,
	christophe.ricard-Re5JQEeQqe8AvxtiuMwx3w,
	jason.gunthorpe-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	tpmdd-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	trousers-tech-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
In-Reply-To: <1422377842.2912.1.camel-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>

On Tue, Jan 27, 2015 at 06:57:22PM +0200, Jarkko Sakkinen wrote:
> > > +	/* TPM 1.2 requires self-test on resume. */
> > > +	if (!(chip->flags & TPM_CHIP_FLAG_TPM2)) {
> > > +		ret = tpm_do_selftest(chip);
> > > +		if (ret < 0)
> > > +			return ret;
> > 
> > Just to note, the return value from tpm_do_selftest() on TPM 1.2 chips was 
> > previously ignored. Mine does return 0.
> 
> Right. I can update the patch to ignore return value if the majority
> wants that.

What happens to the system when pnp_driver.resume() returns failure?

Should tpm ever report failure on resume to the rest of the kernel?

Shouldn't this stuff be in tpm_pm_resume common code anyhow?

Jason

^ permalink raw reply

* Re: [tpmdd-devel] [PATCH] tpm: fix suspend/resume paths for TPM 2.0
From: Jarkko Sakkinen @ 2015-01-27 17:23 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Scot Doyle, peterhuewe-Mmb7MZpHnFY, ashley-fm2HMyfA2y6tG0bUXCXiUA,
	christophe.ricard-Re5JQEeQqe8AvxtiuMwx3w,
	jason.gunthorpe-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	tpmdd-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	trousers-tech-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
In-Reply-To: <20150127170308.GA10140-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>

On Tue, 2015-01-27 at 10:03 -0700, Jason Gunthorpe wrote:
> On Tue, Jan 27, 2015 at 06:57:22PM +0200, Jarkko Sakkinen wrote:
> > > > +	/* TPM 1.2 requires self-test on resume. */
> > > > +	if (!(chip->flags & TPM_CHIP_FLAG_TPM2)) {
> > > > +		ret = tpm_do_selftest(chip);
> > > > +		if (ret < 0)
> > > > +			return ret;
> > > 
> > > Just to note, the return value from tpm_do_selftest() on TPM 1.2 chips was 
> > > previously ignored. Mine does return 0.
> > 
> > Right. I can update the patch to ignore return value if the majority
> > wants that.
> 
> What happens to the system when pnp_driver.resume() returns failure?
> 
> Should tpm ever report failure on resume to the rest of the kernel?
> 
> Shouldn't this stuff be in tpm_pm_resume common code anyhow?

I think it should but not in the scope of this bug fix IMHO.

> Jason

/Jarkko

^ permalink raw reply

* Re: [tpmdd-devel] [PATCH] tpm: fix suspend/resume paths for TPM 2.0
From: Jarkko Sakkinen @ 2015-01-27 17:33 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Scot Doyle, peterhuewe, ashley, christophe.ricard,
	jason.gunthorpe, linux-api, linux-kernel, tpmdd-devel,
	trousers-tech
In-Reply-To: <1422379417.2912.4.camel@linux.intel.com>

On Tue, 2015-01-27 at 19:23 +0200, Jarkko Sakkinen wrote:
> On Tue, 2015-01-27 at 10:03 -0700, Jason Gunthorpe wrote:
> > On Tue, Jan 27, 2015 at 06:57:22PM +0200, Jarkko Sakkinen wrote:
> > > > > +	/* TPM 1.2 requires self-test on resume. */
> > > > > +	if (!(chip->flags & TPM_CHIP_FLAG_TPM2)) {
> > > > > +		ret = tpm_do_selftest(chip);
> > > > > +		if (ret < 0)
> > > > > +			return ret;
> > > > 
> > > > Just to note, the return value from tpm_do_selftest() on TPM 1.2 chips was 
> > > > previously ignored. Mine does return 0.
> > > 
> > > Right. I can update the patch to ignore return value if the majority
> > > wants that.
> > 
> > What happens to the system when pnp_driver.resume() returns failure?
> > 
> > Should tpm ever report failure on resume to the rest of the kernel?
> > 
> > Shouldn't this stuff be in tpm_pm_resume common code anyhow?
> 
> I think it should but not in the scope of this bug fix IMHO.

This may sound stupid but maybe I should not handle the return value of
tpm_do_selftest() with the same reasoning (not in the scope of this fix)
because it modifies semantics and my fix only fixes TPM 2.0 stuff.

I could leave a comment there, as a reminder, that this return value is
not handled.

> > Jason

/Jarkko

^ permalink raw reply

* Re: [PATCH net-next v1 05/18] net: tx4939: use __ethtool_get_ksettings
From: David Decotigny @ 2015-01-27 17:38 UTC (permalink / raw)
  To: Sergei Shtylyov
  Cc: David S. Miller, Ben Hutchings, Amir Vadai,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Eric Dumazet,
	Eugenia Emantayev, Or Gerlitz, Ido Shamay, Joe Perches,
	Saeed Mahameed, Govindarajulu Varadarajan, Venkata Duvvuru,
	Jeff Kirsher, Eyal Perry, Pravin B Shelar, Ed Swierk
In-Reply-To: <54C78540.6080404-M4DtvfQ/ZS1MRgGoP+s0PdBPR1lH4CV8@public.gmane.org>

On Tue, Jan 27, 2015 at 4:32 AM, Sergei Shtylyov
<sergei.shtylyov-M4DtvfQ/ZS1MRgGoP+s0PdBPR1lH4CV8@public.gmane.org> wrote:
> linux-mips-6z/3iImG2C/i7sgoIIk9UQ@public.gmane.org

Thanks, added mips + usnic + fcoe in my copy for the next wave of
reviews. Also updated the subject line.

^ permalink raw reply

* Re: [PATCH 01/13] kdbus: add documentation
From: Michael Kerrisk (man-pages) @ 2015-01-27 17:53 UTC (permalink / raw)
  To: David Herrmann
  Cc: mtk.manpages, Tom Gundersen, Greg Kroah-Hartman, Daniel Mack,
	Arnd Bergmann, Eric W. Biederman, One Thousand Gnomes,
	Jiri Kosina, Andy Lutomirski, Linux API, LKML, Djalal Harouni,
	Johannes Stezenbach, Theodore T'so, christoph Hellwig
In-Reply-To: <CANq1E4TtBhA6Ygu9qMU8xVxbL9q+ZBqnfdU0ceeTTfam9aT+=w@mail.gmail.com>

On 01/27/2015 04:23 PM, David Herrmann wrote:
> Hi
> 
> On Mon, Jan 26, 2015 at 5:45 PM, Michael Kerrisk (man-pages)
> <mtk.manpages@gmail.com> wrote:
>> On 01/26/2015 04:26 PM, Tom Gundersen wrote:
>>> On Mon, Jan 26, 2015 at 3:42 PM, Michael Kerrisk (man-pages)
>>> <mtk.manpages@gmail.com> wrote:
>>>> 2. Is the API to be invoked directly by applications or is intended to
>>>>    be used only behind specific libraries? You seem to be saying that
>>>>    the latter is the case (here, I'm referring to your comment above
>>>>    about sd-bus). However, when I asked David Herrmann a similar
>>>>    question I got this response:
>>>>
>>>>       "kdbus is in no way bound to systemd. There are ongoing efforts
>>>>        to port glib and qt to kdbus natively. The API is pretty simple
>>>>        and I don't see how a libkdbus would simplify things. In fact,
>>>>        even our tests only have slim wrappers around the ioctls to
>>>>        simplify error-handling in test-scenarios."
>>>>
>>>>    To me, that implies that users will employ the raw kernel API.
>>>
>>> The way I read this is that there will (probably) be a handful of
>>> users, namely the existing dbus libraries: libdbus, sd-bus, glib, Qt,
>>> ell, and maybe a few others. However, third-party developers will not
>>> know/care about the details of kdbus, they'll just be coding against
>>> the dbus libraries as before (might be minor changes, but they
>>> certainly won't need to know anything about the kernel API). Similarly
>>> to how userspace developers now code against their libc of choice,
>>> rather than use kernel syscalls directly.
>>
>> Thanks, Tom, for the input. I'm still confused though, since elsewhere
>> in this thread David Herrmann said in response to a question of mine:
>>
>>     I think we can agree that we want it to be generically useful,
>>     like other ipc mechanisms, including UDS and netlink.
>>
>> Again, that sounds to me like the vision is not "a handful of users".
>> Hopefully Greg and David can clarify.
> 
> I only expect a handful of users to call the ioctls directly. The
> libraries that implement the payload-marshaling, in particular. It's a
> similar situation with netlink.

Thanks, David, for the clarification. I think it would have been helpful
to have that more clearly stated up front, especially as some comments 
in this thread, such as the above, could be interpreted to mean quite 
the opposite. Can I suggest that some text on this point be added to 
kdbus.txt?

Thanks,

Michael

-- 
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/

^ permalink raw reply

* Re: [PATCH 01/13] kdbus: add documentation
From: Michael Kerrisk (man-pages) @ 2015-01-27 18:03 UTC (permalink / raw)
  To: David Herrmann
  Cc: mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w, Greg Kroah-Hartman,
	Austin S Hemmelgarn, Daniel Mack, Arnd Bergmann,
	Eric W. Biederman, One Thousand Gnomes, Tom Gundersen,
	Theodore T'so, Andy Lutomirski, Linux API, linux-kernel,
	Djalal Harouni, Johannes Stezenbach, Christoph Hellwig
In-Reply-To: <CANq1E4SkHhs1pWUe-TzG7bzk1M-Q++mB2vmQGuYx0RMF53wg4Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

Hi David,

On 01/27/2015 04:05 PM, David Herrmann wrote:
> Hi
> 
> On Mon, Jan 26, 2015 at 3:46 PM, Michael Kerrisk (man-pages)
> <mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
>> Hello Greg,
>>
>> On 01/23/2015 05:08 PM, Greg Kroah-Hartman wrote:
>>> On Thu, Jan 22, 2015 at 09:49:00AM -0500, Austin S Hemmelgarn wrote:
>>>> While I agree that there should be a way for userspace to get the list of
>>>> supported operations, userspace apps will only actually care about that
>>>> once, when they begin talking to kdbus, because (ignoring the live kernel
>>>> patching that people have been working on recently) the list of supported
>>>> operations isn't going to change while the system is running.  While a u64
>>>> copy has relatively low overhead, it does have overhead, and that is very
>>>> significant when you consider part of the reason some people want kdbus is
>>>> for the performance gain.  Especially for those automotive applications that
>>>> have been mentioned which fire off thousands of messages during start-up,
>>>> every little bit of performance is significant.
>>>
>>> A single u64 in a structure is not going to be measurable at all,
>>> processors just copy memory too fast these days for 4 extra bytes to be
>>> noticeable.
>>
>> It depends on the definition of measurable, I suppose, but this statement
>> appears incorrect to me. In some cases (e.g., kdbus_msg_info) we're talking
>> about *two* u64 fields (kernel_gs, kernel_msg_flags) being used to pass back
>> sets of valid flags. That's 16 bytes, and it definitely makes a difference.
>> Simply running a loop that does a naive memcpy() in a tight user-space
>> loop (code below), I see the following for the execution of 1e9 loops:
>>
>>     Including the two extra u64 fields: 3.2 sec
>>     Without the two extra u64 fields:   2.6 sec
>>
>> On the same box, doing 1e9 calls to getppid() (i.e., pretty much the
>> simplest syscall, giving us a rough measure of the context switch) takes
>> 68 seconds. In other words, the cost of copying those 16 bytes is about 1%
>> of the base context switch/syscall cost. I assume the costs of copying
>> those 16 bytes across the kernel-user-space boundary would not be cheaper,
>> but have not tested that. If my assumption is correct, then 1% seems a
>> significant figure to me in an API whose raison d'être is speed.
> 
> I have no idea how this is related to any kdbus ioctl?
> 
> A 16byte copy does not affect the performance of kdbus message
> transactions in any way that matters.

I'm not sure if it's related/significant or not, since I'm ignorant
of the performance figures for kdbus. I just got curious when Greg
stated that the cost of copying would not be noticeable. (I got curious 
also about my assumption, and did an experiment with a dummy system call
that throws bytes across the fence into user space. The cost of an
extra 16 bytes (56 to 72 bytes) is about 3% of the base syscall/context 
switch cost.)

>>> So let's make this as easy as possible for userspace, making
>>> it simpler logic there, which is much more important than saving
>>> theoretical time in the kernel.
>>
>> But this also missed the other part of the point. Copying these fields on
>> every operation, when in fact they are only needed once, clutters the API,
>> in my opinion. Good APIs are as simple as they can be to do their job.
>> Redundancy is an enemy of simplicity. Simplest would have been a one time
>> API that returns a structure containing all of the supported flags across
>> the API. Alternatively, the traditional EINVAL approach is well understood,
>> and suffices.
> 
> We're going to drop "kernel_flags" in favor of a new
> KDBUS_FLAG_NEGOTIATE flag which asks the kernel to do feature
> negotiation for this ioctl and return the supported flags/items inline
> (overwriting the passed data). The ioctl will not be executed and will
> not affect the state of the FD.
> I hope this keeps the API simple.

Not sure I quite understand the details from your description, but I assume 
it'll end up in the doc, and I'll try to take a look later.

Thanks,

Michael

-- 
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/

^ permalink raw reply

* Re: [PATCH 01/13] kdbus: add documentation
From: Daniel Mack @ 2015-01-27 18:14 UTC (permalink / raw)
  To: Michael Kerrisk (man-pages), David Herrmann
  Cc: Tom Gundersen, Greg Kroah-Hartman, Arnd Bergmann,
	Eric W. Biederman, One Thousand Gnomes, Jiri Kosina,
	Andy Lutomirski, Linux API, LKML, Djalal Harouni,
	Johannes Stezenbach, Theodore T'so, christoph Hellwig
In-Reply-To: <54C7D0A3.4000900-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

Hi Michael,

On 01/27/2015 06:53 PM, Michael Kerrisk (man-pages) wrote:
> On 01/27/2015 04:23 PM, David Herrmann wrote:

>> I only expect a handful of users to call the ioctls directly. The
>> libraries that implement the payload-marshaling, in particular. It's a
>> similar situation with netlink.
> 
> Thanks, David, for the clarification. I think it would have been helpful
> to have that more clearly stated up front, especially as some comments 
> in this thread, such as the above, could be interpreted to mean quite 
> the opposite. Can I suggest that some text on this point be added to 
> kdbus.txt?

We're currently working on a set of comprehensive man pages to
document all the commands in the API, along with every struct, enum etc.
We do that so that developers are able to actually understand every
detail of the API, even though most people - as David explained - will
not use that interface directly in the first place but let one of the
high-level libraries help them integrate D-Bus functionality into their
applications.

If you want, have a look at the upstream repository for a preliminary
version of the new docs.


Thanks,
Daniel

^ permalink raw reply

* Re: [PATCH] arm: sunxi: input: RFC: Add sysfs voltage for sun4i-lradc driver
From: Dmitry Torokhov @ 2015-01-27 19:31 UTC (permalink / raw)
  To: Hans de Goede
  Cc: Priit Laes, linux-sunxi-/JYPxA39Uh5TLH3MbocFFw, Maxime Ripard,
	ABI/API, moderated list:ARM/Allwinner A1X..., open list,
	open list:SUN4I LOW RES ADC...
In-Reply-To: <54C75467.7010909-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

On Tue, Jan 27, 2015 at 10:03:35AM +0100, Hans de Goede wrote:
> Hi,
> 
> On 26-01-15 23:06, Dmitry Torokhov wrote:
> >On Mon, Jan 26, 2015 at 08:28:29PM +0100, Hans de Goede wrote:
> >>Hi,
> >>
> >>On 26-01-15 17:58, Priit Laes wrote:
> >>
> >>No commit message? Please write an informative commit msg, like why we want this patch,
> >>I guess it is to help figuring out the voltage levels for various buttons when creating
> >>a dts, but I would prefer to not guess, which is where a good commit message would
> >>come in handy ...
> >>
> >>>---
> >>>  .../ABI/testing/sysfs-driver-input-sun4i-lradc     |  4 ++
> >>>  drivers/input/keyboard/sun4i-lradc-keys.c          | 49 +++++++++++++++++-----
> >>>  2 files changed, 43 insertions(+), 10 deletions(-)
> >>>  create mode 100644 Documentation/ABI/testing/sysfs-driver-input-sun4i-lradc
> >>>
> >>>diff --git a/Documentation/ABI/testing/sysfs-driver-input-sun4i-lradc b/Documentation/ABI/testing/sysfs-driver-input-sun4i-lradc
> >>>new file mode 100644
> >>>index 0000000..e4e6448
> >>>--- /dev/null
> >>>+++ b/Documentation/ABI/testing/sysfs-driver-input-sun4i-lradc
> >>>@@ -0,0 +1,4 @@
> >>>+What:		/sys/class/input/input(x)/device/voltage
> >>>+Date:		February 2015
> >>>+Contact:	Priit Laes <plaes-q/aMd4JkU83YtjvyW6yDsg@public.gmane.org>
> >>>+Description:	ADC output voltage in microvolts or 0 if device is not opened.
> >>>diff --git a/drivers/input/keyboard/sun4i-lradc-keys.c b/drivers/input/keyboard/sun4i-lradc-keys.c
> >>>index cc8f7dd..c0ab8ec 100644
> >>>--- a/drivers/input/keyboard/sun4i-lradc-keys.c
> >>>+++ b/drivers/input/keyboard/sun4i-lradc-keys.c
> >>>@@ -79,10 +79,27 @@ struct sun4i_lradc_data {
> >>>  	u32 vref;
> >>>  };
> >>>
> >>>+static u32 sun4i_lradc_read_voltage(struct sun4i_lradc_data *lradc)
> >>>+{
> >>>+	u32 val = readl(lradc->base + LRADC_DATA0) & 0x3f;
> >>>+	return val * lradc->vref / 63;
> >>>+};
> >>>+
> >>>+static ssize_t
> >>>+sun4i_lradc_dev_voltage_show(struct device *dev,
> >>>+			struct device_attribute *attr, char *buf)
> >>>+{
> >>>+	struct sun4i_lradc_data *lradc = dev_get_drvdata(dev);
> >>>+
> >>>+	return sprintf(buf, "%u\n", sun4i_lradc_read_voltage(lradc));
> >>>+}
> >>>+
> >>>+static const DEVICE_ATTR(voltage, S_IRUGO, sun4i_lradc_dev_voltage_show, NULL);
> >>>+
> >>>  static irqreturn_t sun4i_lradc_irq(int irq, void *dev_id)
> >>>  {
> >>>  	struct sun4i_lradc_data *lradc = dev_id;
> >>>-	u32 i, ints, val, voltage, diff, keycode = 0, closest = 0xffffffff;
> >>>+	u32 i, ints, voltage, diff, keycode = 0, closest = 0xffffffff;
> >>>
> >>>  	ints  = readl(lradc->base + LRADC_INTS);
> >>>
> >>>@@ -97,8 +114,7 @@ static irqreturn_t sun4i_lradc_irq(int irq, void *dev_id)
> >>>  	}
> >>>
> >>>  	if ((ints & CHAN0_KEYDOWN_IRQ) && lradc->chan0_keycode == 0) {
> >>>-		val = readl(lradc->base + LRADC_DATA0) & 0x3f;
> >>>-		voltage = val * lradc->vref / 63;
> >>>+		voltage = sun4i_lradc_read_voltage(lradc);
> >>>
> >>>  		for (i = 0; i < lradc->chan0_map_count; i++) {
> >>>  			diff = abs(lradc->chan0_map[i].voltage - voltage);
> >>>@@ -156,7 +172,7 @@ static void sun4i_lradc_close(struct input_dev *dev)
> >>>  }
> >>>
> >>>  static int sun4i_lradc_load_dt_keymap(struct device *dev,
> >>>-				      struct sun4i_lradc_data *lradc)
> >>>+				    struct sun4i_lradc_data *lradc)
> >>>  {
> >>>  	struct device_node *np, *pp;
> >>>  	int i;
> >>
> >>Why this indentation change ?
> >>
> >>>@@ -168,8 +184,8 @@ static int sun4i_lradc_load_dt_keymap(struct device *dev,
> >>>
> >>>  	lradc->chan0_map_count = of_get_child_count(np);
> >>>  	if (lradc->chan0_map_count == 0) {
> >>>-		dev_err(dev, "keymap is missing in device tree\n");
> >>>-		return -EINVAL;
> >>>+		dev_info(dev, "keymap is missing in device tree\n");
> >>>+		return 0;
> >>>  	}
> >>>
> >>>  	lradc->chan0_map = devm_kmalloc_array(dev, lradc->chan0_map_count,
> >>
> >>I assume this is so that people can still use the sysfs node to create a dts, right?
> >>Not sure I like this; it might be better to document to simply create a dts with
> >>a single button mapping for 200 mV (most boards use 200 mV steps between the buttons).
> >>
> >>>@@ -185,19 +201,19 @@ static int sun4i_lradc_load_dt_keymap(struct device *dev,
> >>>
> >>>  		error = of_property_read_u32(pp, "channel", &channel);
> >>>  		if (error || channel != 0) {
> >>>-			dev_err(dev, "%s: Inval channel prop\n", pp->name);
> >>>+			dev_err(dev, "%s: Invalid 'channel' property\n", pp->name);
> >>>  			return -EINVAL;
> >>>  		}
> >>>
> >>>  		error = of_property_read_u32(pp, "voltage", &map->voltage);
> >>>  		if (error) {
> >>>-			dev_err(dev, "%s: Inval voltage prop\n", pp->name);
> >>>+			dev_err(dev, "%s: Invalid 'voltage' property\n", pp->name);
> >>>  			return -EINVAL;
> >>>  		}
> >>>
> >>>  		error = of_property_read_u32(pp, "linux,code", &map->keycode);
> >>>  		if (error) {
> >>>-			dev_err(dev, "%s: Inval linux,code prop\n", pp->name);
> >>>+			dev_err(dev, "%s: Invalid 'linux,code' property\n", pp->name);
> >>>  			return -EINVAL;
> >>>  		}
> >>>
> >>
> >>This hunk / 3 changes belong in a separate patch. Also please run checkpatch, I think
> >>you're running over 80 chars here.
> >>
> >>
> >>>@@ -257,14 +273,26 @@ static int sun4i_lradc_probe(struct platform_device *pdev)
> >>>  	if (error)
> >>>  		return error;
> >>>
> >>>-	error = input_register_device(lradc->input);
> >>>+	error = device_create_file(dev, &dev_attr_voltage);
> >>>  	if (error)
> >>>  		return error;
> >>>
> >>>+	error = input_register_device(lradc->input);
> >>>+	if (error) {
> >>>+		device_remove_file(&pdev->dev, &dev_attr_voltage);
> >>>+		return error;
> >>>+	}
> >>>+
> >>>  	platform_set_drvdata(pdev, lradc);
> >>>  	return 0;
> >>>  }
> >>>
> >>>+static int sun4i_lradc_remove(struct platform_device *pdev)
> >>>+{
> >>>+	device_remove_file(&pdev->dev, &dev_attr_voltage);
> >>>+	return 0;
> >>>+}
> >>>+
> >>
> >>This looks wrong, I think (*) that we've a bug here because we're not
> >>unregistering the input device, so maybe do 2 patches, 1 fixing the
> >>not unregistering bug, and then just add the device_remove_file()
> >>in the sysfs patch.
> >
> >The unregister was not necessary since the input device is managed.
> 
> Ah right, looking at the code again I see we use devm_input_allocate_device()
> is there no devm_create_file for creating sysfs entries ?

Greg was pushing the viewpoint that no drivers should create device attributes
manually (since it is somewhat racy - attributes are created after devices show
up) but I do not think he's gonna win that ever. So if someone were to add
devm_create_attribute_group() API I think that would be great. In absence of
this there is always devm_add_action().

Thanks.

-- 
Dmitry

^ permalink raw reply

* Re: [RFC][PATCH v2] procfs: Always expose /proc/<pid>/map_files/ and make it readable
From: Kees Cook @ 2015-01-27 19:53 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Andrew Morton, Kirill A. Shutemov, Calvin Owens, Alexey Dobriyan,
	Oleg Nesterov, Eric W. Biederman, Al Viro, Kirill A. Shutemov,
	Peter Feiner, Grant Likely, Siddhesh Poyarekar, LKML,
	kernel-team-b10kYP2dOMg, Pavel Emelyanov, Linux API
In-Reply-To: <20150127073713.GJ651@moon>

On Mon, Jan 26, 2015 at 11:37 PM, Cyrill Gorcunov <gorcunov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> On Mon, Jan 26, 2015 at 04:15:26PM -0800, Kees Cook wrote:
>> >
>> > akpm3:/usr/src/25> grep -r map_files Documentation
>>
>> If akpm's comments weren't clear: this needs to be fixed. Everything
>> in /proc should appear in Documentation.
>
> I'll do that.
>
>> > The 640708a2cff7f81 changelog says:
>> >
>> > :     This one behaves similarly to the /proc/<pid>/fd/ one - it contains
>> > :     symlinks one for each mapping with file, the name of a symlink is
>> > :     "vma->vm_start-vma->vm_end", the target is the file.  Opening a symlink
>> > :     results in a file that point exactly to the same inode as them vma's one.
>> > :
>> > :     For example the ls -l of some arbitrary /proc/<pid>/map_files/
>> > :
>> > :      | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
>> > :      | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
>> > :      | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
>> > :      | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
>> > :      | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so
>>
>> How is mmap offset represented in this output?
>
> We're printing vm_area_struct:[vm_start;vm_end] only.
>
>> > afacit this info is also available in /proc/pid/maps, so things
>> > shouldn't get worse if the /proc/pid/map_files permissions are at least
>> > as restrictive as the /proc/pid/maps permissions.  Is that the case?
>> > (Please add to changelog).
>>
>> Both maps and map_files use ptrace_may_access (via mm_access) with
>> PTRACE_MODE_READ, so I'm happy from a info leak perspective.
>>
>> Are mount namespaces handled in this output?
>
> Could you clarify this moment, i'm not sure i get it.

I changed how I asked this question in my review of the documentation,
but it looks like these symlinks aren't "regular" symlinks (that are
up to the follower to have access to the file system path shown), but
rather they bypass VFS. As a result, I'm wondering how things like
mount namespaces might change this behavior: what is shown, the path
from the perspective of the target, or from the viewer (which may be
in separate mount namespaces).

-Kees

>
>>
>> > There's one other problem here: we're assuming that the map_files
>> > implementation doesn't have bugs.  If it does have bugs then relaxing
>> > permissions like this will create new vulnerabilities.  And the
>> > map_files implementation is surprisingly complex.  Is it bug-free?



-- 
Kees Cook
Chrome OS Security

^ permalink raw reply

* Re: [RFC][PATCH v2] procfs: Always expose /proc/<pid>/map_files/ and make it readable
From: Cyrill Gorcunov @ 2015-01-27 21:35 UTC (permalink / raw)
  To: Kees Cook, Pavel Emelyanov
  Cc: Andrew Morton, Kirill A. Shutemov, Calvin Owens, Alexey Dobriyan,
	Oleg Nesterov, Eric W. Biederman, Al Viro, Kirill A. Shutemov,
	Peter Feiner, Grant Likely, Siddhesh Poyarekar, LKML,
	kernel-team-b10kYP2dOMg, Linux API
In-Reply-To: <CAGXu5jJFFib7F7uKYgvX4ecyMnbincd22FaO_bFy=VRVKdFbvA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Tue, Jan 27, 2015 at 11:53:19AM -0800, Kees Cook wrote:
> >>
> >> Are mount namespaces handled in this output?
> >
> > Could you clarify this moment, i'm not sure i get it.
> 
> I changed how I asked this question in my review of the documentation,
> but it looks like these symlinks aren't "regular" symlinks (that are
> up to the follower to have access to the file system path shown), but
> rather they bypass VFS. As a result, I'm wondering how things like
> mount namespaces might change this behavior: what is shown, the path
> from the perspective of the target, or from the viewer (which may be
> in separate mount namespaces).

I must admit I personally haven't investigated how mount namespaces
might interact with map-files. Pavel, could you share your thoughts?

^ permalink raw reply

* Re: [RFC][PATCH v2] procfs: Always expose /proc/<pid>/map_files/ and make it readable
From: Pavel Emelyanov @ 2015-01-27 21:46 UTC (permalink / raw)
  To: Kees Cook, Cyrill Gorcunov
  Cc: Andrew Morton, Kirill A. Shutemov, Calvin Owens, Alexey Dobriyan,
	Oleg Nesterov, Eric W. Biederman, Al Viro, Kirill A. Shutemov,
	Peter Feiner, Grant Likely, Siddhesh Poyarekar, LKML, kernel-team,
	Pavel Emelyanov, Linux API
In-Reply-To: <CAGXu5jJFFib7F7uKYgvX4ecyMnbincd22FaO_bFy=VRVKdFbvA@mail.gmail.com>


>>> Are mount namespaces handled in this output?
>>
>> Could you clarify this moment, i'm not sure i get it.
> 
> I changed how I asked this question in my review of the documentation,
> but it looks like these symlinks aren't "regular" symlinks (that are
> up to the follower to have access to the file system path shown), but
> rather they bypass VFS. As a result, I'm wondering how things like
> mount namespaces might change this behavior: what is shown, the path
> from the perspective of the target, or from the viewer (which may be
> in separate mount namespaces).

These work just like the /proc/$pid/fd/$n links do. When you readlink
one, d_path() is called, which walks up the dentry/vfsmnt tree
until it reaches either current root or the global one. For "another"
mount namespace case it produces the path relative to this namespace's
root.

Thanks,
Pavel

^ permalink raw reply

* Re: [v8 4/5] ext4: adds FS_IOC_FSSETXATTR/FS_IOC_FSGETXATTR interface support
From: Dave Chinner @ 2015-01-28  0:37 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: Andy Lutomirski, Li Xi, Linux FS Devel,
	linux-ext4@vger.kernel.org, Linux API, Theodore Ts'o,
	Andreas Dilger, Jan Kara, Al Viro, Christoph Hellwig, dmonakhov,
	Eric W. Biederman
In-Reply-To: <54C76C3D.4070404@yandex-team.ru>

On Tue, Jan 27, 2015 at 01:45:17PM +0300, Konstantin Khlebnikov wrote:
> On 27.01.2015 11:02, Dave Chinner wrote:
> >On Fri, Jan 23, 2015 at 03:59:04PM -0800, Andy Lutomirski wrote:
> >>On Fri, Jan 23, 2015 at 3:30 PM, Dave Chinner <david@fromorbit.com> wrote:
> >>>On Fri, Jan 23, 2015 at 02:58:09PM +0300, Konstantin Khlebnikov wrote:
> >>
> >>I think I must be missing something simple here.  In a hypothetical
> >>world where the code used nsown_capable, if an admin wants to stick a
> >>container in /mnt/container1 with associated prid 1 and a userns,
> >>shouldn't it just map only prid 1 into the user ns?  Then a user in
> >>that userns can't try to change the prid of a file to 2 because the
> >>number "2" is unmapped for that user and translation will fail.
> >
> >You've effectively said "yes, project quotas are enabled, but you
> >only have a single ID, it's always turned on and you can't change it
> >to anything else.
> >
> >So, why do they need to be mapped via user namespaces to enable
> >this? Think about it a little harder:
> >
> >	- Project IDs are not user IDs.
> >	- Project IDs are not a security/permission mechanism.
> >	- Project quotas only provide a mechanism for
> >	  resource usage control.
> >
> >Think about that last one some more. Perhaps, as a hint, I should
> >relate it to control groups? :) i.e:
> >
> >	- Project quotas can be used as an effective mount ns space
> >	  usage controller.
> >
> >But this can only be done safely and reliably by keeping the project IDs
> >inaccessible from the containers themselves. I don't see why a
> >mechanism that controls the amount of filesystem space used by a
> >container should be considered any differently to a memory control
> >group that limits the amount of memory the container can use.
> >
> >However, nobody on the container side of things would answer any of
> >my questions about how project quotas were going to be used,
> >limited, managed, etc back when we had to make a decision to enable
> >XFS user ns support, I did what was needed to support the obvious
> >container use case and close any possible loop hole that containers
> >might be able to use to subvert that use case.
> 
> I have a solution: Hierarchical Project Quota! Each project might have
> parent project and so on. Each level keeps usage, limits and also keeps
> some preallocation from parent level to reduce count of quota updates.

That's an utter nightmare to manage - just ask the gluster guys who
thought this was a good idea when they first implemented quotas.

Besides, following down the path of hierarchical control groups
doesn't seem like a good idea to me because that path has already
proven to be a bad idea for container resource controllers. There's
good reason why control groups have gone back to a flattened ID
space like we already have for project quotas, so I don't think we
want to go that way.

> This might be useful even without containers : normal user quota has
> two levels and admins might classify users into groups and set group
> quota for them. Project quota is flat and cannot provide any control
> if we want classify projects.

I don't follow. project ID is exactly what allows you to control
project classification.

> For containers hierarchy provide full virtualization: user-namespace
> maps maps second-level and projects into subset of real projects.

It's not the mapping that matters - if project quotas are used
outside containers as a resource controller, then they can't be
used inside containers even with a unique mapping range because
we can only store a single project ID per inode.

Besides, I'm struggling to see the use case for project quotas
inside small containers that run single applications and typically
only have a single user. Project quotas have traditionally been used
to manage space in large filesystems shared by many users along
bounds that don't follow any specific hierarchy or permission set.

IOWs, you haven't described your use case for needing project quotas
inside containers, so I've got no idea what problem you are trying
to solve or whether project quotas are even appropriate as a
solution.

> Changing limits and other managing for second-level project quotas
> could be done in user-space by system service (systemd I suppose. lol),
> so we don't have to manage this stuff inside the kernel.

So you are proposing a fourth on-disk quota here i.e.  user, group,
project, new_2nd_level_project? If so, forget about using systemd to
manage it, the first thing we'll need is full support in
existing quota tools so that the regression tests you write for
xfstests are self contained.

> [ I'm already working on prototype for ext4 ]

You really need to post a use case description and a design document
for review so we can actually discuss what you are planning. So far
everything you are doing strikes me as a "because they are there and
it sounds cool" type of development, not because people actually
need them. Let's have a discussion about the real problems and
architecture, not waste time on a stupid "solution looking for a
problem" discussion...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply

* Re: [v8 4/5] ext4: adds FS_IOC_FSSETXATTR/FS_IOC_FSGETXATTR interface support
From: Andy Lutomirski @ 2015-01-28  0:45 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Konstantin Khlebnikov, Li Xi, Linux FS Devel,
	linux-ext4-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Linux API,
	Theodore Ts'o, Andreas Dilger, Jan Kara, Al Viro,
	Christoph Hellwig, dmonakhov-GEFAQzZX7r8dnm+yROfE0A,
	Eric W. Biederman
In-Reply-To: <20150127080239.GQ16552@dastard>

On Tue, Jan 27, 2015 at 12:02 AM, Dave Chinner <david-FqsqvQoI3Ljby3iVrkZq2A@public.gmane.org> wrote:
> On Fri, Jan 23, 2015 at 03:59:04PM -0800, Andy Lutomirski wrote:
>> On Fri, Jan 23, 2015 at 3:30 PM, Dave Chinner <david-FqsqvQoI3Ljby3iVrkZq2A@public.gmane.org> wrote:
>> > On Fri, Jan 23, 2015 at 02:58:09PM +0300, Konstantin Khlebnikov wrote:
>> >> On 23.01.2015 04:53, Dave Chinner wrote:
>> >> >On Thu, Jan 22, 2015 at 06:28:51PM +0300, Konstantin Khlebnikov wrote:
>> >> >>>+  kprojid = make_kprojid(&init_user_ns, (projid_t)projid);
>> >> >>
>> >> >>Maybe current_user_ns()?
>> >> >>This code should be user-namespace aware from the beginning.
>> >> >
>> >> >No, the code is correct. Project quotas have nothing to do with
>> >> >UIDs and so should never have been included in the uid/gid
>> >> >namespace mapping infrastructure in the first place.
>> >>
>> >> Right, but user-namespace provides id mapping for project-id too.
>> >> This infrastructure adds support for nested project quotas with
>> >> virtualized ids in sub-containers. I couldn't say that this is
>> >> must have feature but implementation is trivial because whole
>> >> infrastructure is already here.
>> >
>> > This is an extremely common misunderstanding of project IDs. Project
>> > IDs are completely separate to the UID/GID namespace.  Project
>> > quotas were originally designed specifically for
>> > accounting/enforcing quotas in situations where uid/gid
>> > accounting/enforcing is not possible. This design intent goes back
>> > 25 years - it predates XFS...
>> >
>> > IOWs, mapping prids via user namespaces defeats the purpose
>> > for which prids were originally intended for.
>> >
>> >> >Point in case: directory subtree quotas can be used as a resource
>> >> >controller for limiting space usage within separate containers that
>> >> >share the same underlying (large) filesystem via mount namespaces.
>> >>
>> >> That's exactly my use-case: 'sub-volumes' for containers with
>> >> quota for space usage/inodes count.
>> >
>> > That doesn't require mapped project IDs. Hard container space limits
>> > can only be controlled by the init namespace, and because inodes can
>> > hold only one project ID the current ns cannot be allowed to change
>> > the project ID on the inode because that allows them to escape the
>> > resource limits set on the project ID associated with the sub-mount
>> > set up by the init namespace...
>> >
>> > i.e.
>> >
>> > /mnt                    prid = 0, default for entire fs.
>> > /mnt/container1/        prid = 1, inherit, 10GB space limit
>> > /mnt/container2/        prid = 2, inherit, 50GB space limit
>> > .....
>> > /mnt/containerN/        prid = N, inherit, 20GB space limit
>> >
>> > And you clone the mount namespace for each container so the root is
>> > at the appropriate /mnt/containerX/.  Now the containers have a
>> > fixed amount of space they can use in the parent filesystem they
>> > know nothing about, and it is enforced by directory subquotas
>> > controlled by the init namespace.  This "fixed amount of space" is
>> > reflected in the container namespace when "df" is run as it will
>> > report the project quota space limits. Adding or removing space to a
>> > container is as simple as changing the project quota limits from the
>> > init namespace. i.e. an admin operation controlled by the host, not
>> > the container....
>> >
>> > Allowing the container to modify the prid and/or the inherit bit of
>> > inodes in its namespace then means the user can define their own
>> > space usage limits, even turn them off. It's not a resource
>> > container at that point because the user can define their own
>> > limits.  Hence, only if the current_ns cannot change project quotas
>> > will we have a hard fence on space usage that the container *cannot
>> > exceed*.
>>
>> I think I must be missing something simple here.  In a hypothetical
>> world where the code used nsown_capable, if an admin wants to stick a
>> container in /mnt/container1 with associated prid 1 and a userns,
>> shouldn't it just map only prid 1 into the user ns?  Then a user in
>> that userns can't try to change the prid of a file to 2 because the
>> number "2" is unmapped for that user and translation will fail.
>
> You've effectively said "yes, project quotas are enabled, but you
> only have a single ID, it's always turned on and you can't change it
> to anything else.

It's got to be assigned somehow.  Inheritance from the parent
directory probably works too, though.

>
> So, why do they need to be mapped via user namespaces to enable
> this? Think about it a little harder:
>
>         - Project IDs are not user IDs.
>         - Project IDs are not a security/permission mechanism.
>         - Project quotas only provide a mechanism for
>           resource usage control.
>
> Think about that last one some more. Perhaps, as a hint, I should
> relate it to control groups? :) i.e:
>
>         - Project quotas can be used as an effective mount ns space
>           usage controller.
>
> But this can only be done safely and reliably by keeping the project IDs
> inaccessible from the containers themselves. I don't see why a
> mechanism that controls the amount of filesystem space used by a
> container should be considered any differently to a memory control
> group that limits the amount of memory the container can use.
>

Cgroups are ephemeral, and I'd want my containers' quotas to survive
container restarts and even reboots.  I'm sure it *could* be done,
though.

> However, nobody on the container side of things would answer any of
> my questions about how project quotas were going to be used,
> limited, managed, etc back when we had to make a decision to enable
> XFS user ns support, I did what was needed to support the obvious
> container use case and close any possible loop hole that containers
> might be able to use to subvert that use case.
>
> If we want to do anything different, then there's a *lot* of
> userns aware regression tests needed to be written for xfstests....

Agreed.

--Andy

^ permalink raw reply

* [PATCH v9 0/2] Add Spreadtrum SoC bindings and serial driver support
From: Chunyan Zhang @ 2015-01-28  2:47 UTC (permalink / raw)
  To: gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r
  Cc: robh+dt-DgEjT+Ai2ygdnm+yROfE0A, mark.rutland-5wv7dgnIgG8,
	arnd-r2nGTMty4D4, gnomes-qBU/x9rampVanCEyBjwyrvXRex20P6io,
	peter-WaGBZJeGNqdsbIuE7sb01tBPR1lH4CV8, pawel.moll-5wv7dgnIgG8,
	ijc+devicetree-KcIKpvwj1kUDXYZnReoRVg,
	galak-sgV2jX0FEOL9JmXXK+q4OQ, grant.likely-QSEj5FYQhm4dnm+yROfE0A,
	jslaby-AlSwsSmVLrQ, heiko-4mtYJXux2i+zQB+pC5nmwQ,
	jason-NLaQJdtUoK4Be96aLqz0jA, florian.vaussard-p8DiymsW2f8,
	andrew-g2DYL2Zd6BY, hytszk-Re5JQEeQqe8AvxtiuMwx3w,
	antonynpavlov-Re5JQEeQqe8AvxtiuMwx3w,
	shawn.guo-QSEj5FYQhm4dnm+yROfE0A,
	orsonzhai-Re5JQEeQqe8AvxtiuMwx3w, geng.ren-lxIno14LUO0EEoCn2XhGlw,
	zhizhou.zhang-lxIno14LUO0EEoCn2XhGlw,
	lanqing.liu-lxIno14LUO0EEoCn2XhGlw,
	zhang.lyra-Re5JQEeQqe8AvxtiuMwx3w,
	wei.qiao-lxIno14LUO0EEoCn2XhGlw,
	devicetree-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-serial-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r
In-Reply-To: <sc9836-serial-v9>

Changes from v8:
	- Removed a few unused lines of code.

Chunyan Zhang (2):
  Documentation: DT: Add bindings for Spreadtrum SoC Platform
  tty/serial: Add Spreadtrum sc9836-uart driver support

 Documentation/devicetree/bindings/arm/sprd.txt     |   11 +
 .../devicetree/bindings/serial/sprd-uart.txt       |    7 +
 .../devicetree/bindings/vendor-prefixes.txt        |    1 +
 drivers/tty/serial/Kconfig                         |   18 +
 drivers/tty/serial/Makefile                        |    1 +
 drivers/tty/serial/sprd_serial.c                   |  789 ++++++++++++++++++++
 include/uapi/linux/serial_core.h                   |    3 +
 7 files changed, 830 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/arm/sprd.txt
 create mode 100644 Documentation/devicetree/bindings/serial/sprd-uart.txt
 create mode 100644 drivers/tty/serial/sprd_serial.c

-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH v9 1/2] Documentation: DT: Add bindings for Spreadtrum SoC Platform
From: Chunyan Zhang @ 2015-01-28  2:47 UTC (permalink / raw)
  To: gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r
  Cc: robh+dt-DgEjT+Ai2ygdnm+yROfE0A, mark.rutland-5wv7dgnIgG8,
	arnd-r2nGTMty4D4, gnomes-qBU/x9rampVanCEyBjwyrvXRex20P6io,
	peter-WaGBZJeGNqdsbIuE7sb01tBPR1lH4CV8, pawel.moll-5wv7dgnIgG8,
	ijc+devicetree-KcIKpvwj1kUDXYZnReoRVg,
	galak-sgV2jX0FEOL9JmXXK+q4OQ, grant.likely-QSEj5FYQhm4dnm+yROfE0A,
	jslaby-AlSwsSmVLrQ, heiko-4mtYJXux2i+zQB+pC5nmwQ,
	jason-NLaQJdtUoK4Be96aLqz0jA, florian.vaussard-p8DiymsW2f8,
	andrew-g2DYL2Zd6BY, hytszk-Re5JQEeQqe8AvxtiuMwx3w,
	antonynpavlov-Re5JQEeQqe8AvxtiuMwx3w,
	shawn.guo-QSEj5FYQhm4dnm+yROfE0A,
	orsonzhai-Re5JQEeQqe8AvxtiuMwx3w, geng.ren-lxIno14LUO0EEoCn2XhGlw,
	zhizhou.zhang-lxIno14LUO0EEoCn2XhGlw,
	lanqing.liu-lxIno14LUO0EEoCn2XhGlw,
	zhang.lyra-Re5JQEeQqe8AvxtiuMwx3w,
	wei.qiao-lxIno14LUO0EEoCn2XhGlw,
	devicetree-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-serial-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r
In-Reply-To: <1422413261-17184-1-git-send-email-chunyan.zhang-lxIno14LUO0EEoCn2XhGlw@public.gmane.org>

Adds Spreadtrum's prefix "sprd" to vendor-prefixes file.
Adds the devicetree binding documentations for Spreadtrum's sc9836-uart
and SC9836 SoC based on the Sharkl64 Platform which is a 64-bit SoC
Platform of Spreadtrum.

Signed-off-by: Chunyan Zhang <chunyan.zhang-lxIno14LUO0EEoCn2XhGlw@public.gmane.org>
---
 Documentation/devicetree/bindings/arm/sprd.txt     |   11 +++++++++++
 .../devicetree/bindings/serial/sprd-uart.txt       |    7 +++++++
 .../devicetree/bindings/vendor-prefixes.txt        |    1 +
 3 files changed, 19 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/arm/sprd.txt
 create mode 100644 Documentation/devicetree/bindings/serial/sprd-uart.txt

diff --git a/Documentation/devicetree/bindings/arm/sprd.txt b/Documentation/devicetree/bindings/arm/sprd.txt
new file mode 100644
index 0000000..31a629d
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/sprd.txt
@@ -0,0 +1,11 @@
+Spreadtrum SoC Platforms Device Tree Bindings
+----------------------------------------------------
+
+Sharkl64 is a Spreadtrum's SoC Platform which is based
+on ARM 64-bit processor.
+
+SC9836 openphone board with SC9836 SoC based on the
+Sharkl64 Platform shall have the following properties.
+
+Required root node properties:
+        - compatible = "sprd,sc9836-openphone", "sprd,sc9836";
diff --git a/Documentation/devicetree/bindings/serial/sprd-uart.txt b/Documentation/devicetree/bindings/serial/sprd-uart.txt
new file mode 100644
index 0000000..2aff0f2
--- /dev/null
+++ b/Documentation/devicetree/bindings/serial/sprd-uart.txt
@@ -0,0 +1,7 @@
+* Spreadtrum serial UART
+
+Required properties:
+- compatible: must be "sprd,sc9836-uart"
+- reg: offset and length of the register set for the device
+- interrupts: exactly one interrupt specifier
+- clocks: phandles to input clocks.
diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt
index b1df0ad..0a8384f 100644
--- a/Documentation/devicetree/bindings/vendor-prefixes.txt
+++ b/Documentation/devicetree/bindings/vendor-prefixes.txt
@@ -153,6 +153,7 @@ snps	Synopsys, Inc.
 solidrun	SolidRun
 sony	Sony Corporation
 spansion	Spansion Inc.
+sprd	Spreadtrum Communications Inc.
 st	STMicroelectronics
 ste	ST-Ericsson
 stericsson	ST-Ericsson
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH v9 2/2] tty/serial: Add Spreadtrum sc9836-uart driver support
From: Chunyan Zhang @ 2015-01-28  2:47 UTC (permalink / raw)
  To: gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r
  Cc: robh+dt-DgEjT+Ai2ygdnm+yROfE0A, mark.rutland-5wv7dgnIgG8,
	arnd-r2nGTMty4D4, gnomes-qBU/x9rampVanCEyBjwyrvXRex20P6io,
	peter-WaGBZJeGNqdsbIuE7sb01tBPR1lH4CV8, pawel.moll-5wv7dgnIgG8,
	ijc+devicetree-KcIKpvwj1kUDXYZnReoRVg,
	galak-sgV2jX0FEOL9JmXXK+q4OQ, grant.likely-QSEj5FYQhm4dnm+yROfE0A,
	jslaby-AlSwsSmVLrQ, heiko-4mtYJXux2i+zQB+pC5nmwQ,
	jason-NLaQJdtUoK4Be96aLqz0jA, florian.vaussard-p8DiymsW2f8,
	andrew-g2DYL2Zd6BY, hytszk-Re5JQEeQqe8AvxtiuMwx3w,
	antonynpavlov-Re5JQEeQqe8AvxtiuMwx3w,
	shawn.guo-QSEj5FYQhm4dnm+yROfE0A,
	orsonzhai-Re5JQEeQqe8AvxtiuMwx3w, geng.ren-lxIno14LUO0EEoCn2XhGlw,
	zhizhou.zhang-lxIno14LUO0EEoCn2XhGlw,
	lanqing.liu-lxIno14LUO0EEoCn2XhGlw,
	zhang.lyra-Re5JQEeQqe8AvxtiuMwx3w,
	wei.qiao-lxIno14LUO0EEoCn2XhGlw,
	devicetree-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-serial-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r
In-Reply-To: <1422413261-17184-1-git-send-email-chunyan.zhang-lxIno14LUO0EEoCn2XhGlw@public.gmane.org>

Add a full sc9836-uart driver for SC9836 SoC which is based on the
spreadtrum sharkl64 platform.
This driver also supports earlycon.

Originally-by: Lanqing Liu <lanqing.liu-lxIno14LUO0EEoCn2XhGlw@public.gmane.org>
Signed-off-by: Orson Zhai <orson.zhai-lxIno14LUO0EEoCn2XhGlw@public.gmane.org>
Signed-off-by: Chunyan Zhang <chunyan.zhang-lxIno14LUO0EEoCn2XhGlw@public.gmane.org>
Acked-by: Arnd Bergmann <arnd-r2nGTMty4D4@public.gmane.org>
---
 drivers/tty/serial/Kconfig       |   18 +
 drivers/tty/serial/Makefile      |    1 +
 drivers/tty/serial/sprd_serial.c |  789 ++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/serial_core.h |    3 +
 4 files changed, 811 insertions(+)
 create mode 100644 drivers/tty/serial/sprd_serial.c

diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig
index c79b43c..13211f7 100644
--- a/drivers/tty/serial/Kconfig
+++ b/drivers/tty/serial/Kconfig
@@ -1577,6 +1577,24 @@ config SERIAL_MEN_Z135
 	  This driver can also be build as a module. If so, the module will be called
 	  men_z135_uart.ko
 
+config SERIAL_SPRD
+	tristate "Support for Spreadtrum serial"
+	depends on ARCH_SPRD
+	select SERIAL_CORE
+	help
+	  This enables the driver for the Spreadtrum's serial.
+
+config SERIAL_SPRD_CONSOLE
+	bool "Spreadtrum UART console support"
+	depends on SERIAL_SPRD=y
+	select SERIAL_CORE_CONSOLE
+	select SERIAL_EARLYCON
+	help
+	  Support for early debug console using Spreadtrum's serial. This enables
+	  the console before standard serial driver is probed. This is enabled
+	  with "earlycon" on the kernel command line. The console is
+	  enabled when early_param is processed.
+
 endmenu
 
 config SERIAL_MCTRL_GPIO
diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile
index 9a548ac..4801aca 100644
--- a/drivers/tty/serial/Makefile
+++ b/drivers/tty/serial/Makefile
@@ -93,6 +93,7 @@ obj-$(CONFIG_SERIAL_ARC)	+= arc_uart.o
 obj-$(CONFIG_SERIAL_RP2)	+= rp2.o
 obj-$(CONFIG_SERIAL_FSL_LPUART)	+= fsl_lpuart.o
 obj-$(CONFIG_SERIAL_MEN_Z135)	+= men_z135_uart.o
+obj-$(CONFIG_SERIAL_SPRD) += sprd_serial.o
 
 # GPIOLIB helpers for modem control lines
 obj-$(CONFIG_SERIAL_MCTRL_GPIO)	+= serial_mctrl_gpio.o
diff --git a/drivers/tty/serial/sprd_serial.c b/drivers/tty/serial/sprd_serial.c
new file mode 100644
index 0000000..a08a57f
--- /dev/null
+++ b/drivers/tty/serial/sprd_serial.c
@@ -0,0 +1,789 @@
+/*
+ * Copyright (C) 2012-2015 Spreadtrum Communications Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/clk.h>
+#include <linux/console.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/serial_core.h>
+#include <linux/serial.h>
+#include <linux/slab.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+
+/* device name */
+#define UART_NR_MAX		8
+#define SPRD_TTY_NAME		"ttyS"
+#define SPRD_FIFO_SIZE		128
+#define SPRD_DEF_RATE		26000000
+#define SPRD_BAUD_IO_LIMIT	3000000
+#define SPRD_TIMEOUT		256
+
+/* the offset of serial registers and BITs for them */
+/* data registers */
+#define SPRD_TXD		0x0000
+#define SPRD_RXD		0x0004
+
+/* line status register and its BITs  */
+#define SPRD_LSR		0x0008
+#define SPRD_LSR_OE		BIT(4)
+#define SPRD_LSR_FE		BIT(3)
+#define SPRD_LSR_PE		BIT(2)
+#define SPRD_LSR_BI		BIT(7)
+#define SPRD_LSR_TX_OVER	BIT(15)
+
+/* data number in TX and RX fifo */
+#define SPRD_STS1		0x000C
+
+/* interrupt enable register and its BITs */
+#define SPRD_IEN		0x0010
+#define SPRD_IEN_RX_FULL	BIT(0)
+#define SPRD_IEN_TX_EMPTY	BIT(1)
+#define SPRD_IEN_BREAK_DETECT	BIT(7)
+#define SPRD_IEN_TIMEOUT	BIT(13)
+
+/* interrupt clear register */
+#define SPRD_ICLR		0x0014
+
+/* line control register */
+#define SPRD_LCR		0x0018
+#define SPRD_LCR_STOP_1BIT	0x10
+#define SPRD_LCR_STOP_2BIT	0x30
+#define SPRD_LCR_DATA_LEN	(BIT(2) | BIT(3))
+#define SPRD_LCR_DATA_LEN5	0x0
+#define SPRD_LCR_DATA_LEN6	0x4
+#define SPRD_LCR_DATA_LEN7	0x8
+#define SPRD_LCR_DATA_LEN8	0xc
+#define SPRD_LCR_PARITY	(BIT(0) | BIT(1))
+#define SPRD_LCR_PARITY_EN	0x2
+#define SPRD_LCR_EVEN_PAR	0x0
+#define SPRD_LCR_ODD_PAR	0x1
+
+/* control register 1 */
+#define SPRD_CTL1			0x001C
+#define RX_HW_FLOW_CTL_THLD	BIT(6)
+#define RX_HW_FLOW_CTL_EN	BIT(7)
+#define TX_HW_FLOW_CTL_EN	BIT(8)
+#define RX_TOUT_THLD_DEF	0x3E00
+#define RX_HFC_THLD_DEF	0x40
+
+/* fifo threshold register */
+#define SPRD_CTL2		0x0020
+#define THLD_TX_EMPTY	0x40
+#define THLD_RX_FULL	0x40
+
+/* config baud rate register */
+#define SPRD_CLKD0		0x0024
+#define SPRD_CLKD1		0x0028
+
+/* interrupt mask status register */
+#define SPRD_IMSR			0x002C
+#define SPRD_IMSR_RX_FIFO_FULL		BIT(0)
+#define SPRD_IMSR_TX_FIFO_EMPTY	BIT(1)
+#define SPRD_IMSR_BREAK_DETECT		BIT(7)
+#define SPRD_IMSR_TIMEOUT		BIT(13)
+
+struct reg_backup {
+	u32 ien;
+	u32 ctrl0;
+	u32 ctrl1;
+	u32 ctrl2;
+	u32 clkd0;
+	u32 clkd1;
+	u32 dspwait;
+};
+
+struct sprd_uart_port {
+	struct uart_port port;
+	struct reg_backup reg_bak;
+	char name[16];
+};
+
+static struct sprd_uart_port *sprd_port[UART_NR_MAX];
+static int sprd_ports_num;
+
+static inline unsigned int serial_in(struct uart_port *port, int offset)
+{
+	return readl_relaxed(port->membase + offset);
+}
+
+static inline void serial_out(struct uart_port *port, int offset, int value)
+{
+	writel_relaxed(value, port->membase + offset);
+}
+
+static unsigned int sprd_tx_empty(struct uart_port *port)
+{
+	if (serial_in(port, SPRD_STS1) & 0xff00)
+		return 0;
+	else
+		return TIOCSER_TEMT;
+}
+
+static unsigned int sprd_get_mctrl(struct uart_port *port)
+{
+	return TIOCM_DSR | TIOCM_CTS;
+}
+
+static void sprd_set_mctrl(struct uart_port *port, unsigned int mctrl)
+{
+	/* nothing to do */
+}
+
+static void sprd_stop_tx(struct uart_port *port)
+{
+	unsigned int ien, iclr;
+
+	iclr = serial_in(port, SPRD_ICLR);
+	ien = serial_in(port, SPRD_IEN);
+
+	iclr |= SPRD_IEN_TX_EMPTY;
+	ien &= ~SPRD_IEN_TX_EMPTY;
+
+	serial_out(port, SPRD_ICLR, iclr);
+	serial_out(port, SPRD_IEN, ien);
+}
+
+static void sprd_start_tx(struct uart_port *port)
+{
+	unsigned int ien;
+
+	ien = serial_in(port, SPRD_IEN);
+	if (!(ien & SPRD_IEN_TX_EMPTY)) {
+		ien |= SPRD_IEN_TX_EMPTY;
+		serial_out(port, SPRD_IEN, ien);
+	}
+}
+
+static void sprd_stop_rx(struct uart_port *port)
+{
+	unsigned int ien, iclr;
+
+	iclr = serial_in(port, SPRD_ICLR);
+	ien = serial_in(port, SPRD_IEN);
+
+	ien &= ~(SPRD_IEN_RX_FULL | SPRD_IEN_BREAK_DETECT);
+	iclr |= SPRD_IEN_RX_FULL | SPRD_IEN_BREAK_DETECT;
+
+	serial_out(port, SPRD_IEN, ien);
+	serial_out(port, SPRD_ICLR, iclr);
+}
+
+/* The Sprd serial does not support this function. */
+static void sprd_break_ctl(struct uart_port *port, int break_state)
+{
+	/* nothing to do */
+}
+
+static int handle_lsr_errors(struct uart_port *port,
+			     unsigned int *flag,
+			     unsigned int *lsr)
+{
+	int ret = 0;
+
+	/* statistics */
+	if (*lsr & SPRD_LSR_BI) {
+		*lsr &= ~(SPRD_LSR_FE | SPRD_LSR_PE);
+		port->icount.brk++;
+		ret = uart_handle_break(port);
+		if (ret)
+			return ret;
+	} else if (*lsr & SPRD_LSR_PE)
+		port->icount.parity++;
+	else if (*lsr & SPRD_LSR_FE)
+		port->icount.frame++;
+	if (*lsr & SPRD_LSR_OE)
+		port->icount.overrun++;
+
+	/* mask off conditions which should be ignored */
+	*lsr &= port->read_status_mask;
+	if (*lsr & SPRD_LSR_BI)
+		*flag = TTY_BREAK;
+	else if (*lsr & SPRD_LSR_PE)
+		*flag = TTY_PARITY;
+	else if (*lsr & SPRD_LSR_FE)
+		*flag = TTY_FRAME;
+
+	return ret;
+}
+
+/*
+ * Drain the RX FIFO into the tty flip buffer.
+ *
+ * Loops while the low byte of SPRD_STS1 is non-zero, consuming at most
+ * SPRD_TIMEOUT characters per call. For each character the line status
+ * is sampled before the data register is read. Called from
+ * sprd_handle_irq() with port->lock held.
+ */
+static inline void sprd_rx(struct uart_port *port)
+{
+	struct tty_port *tty = &port->state->port;
+	unsigned int ch, flag, lsr, max_count = SPRD_TIMEOUT;
+
+	while ((serial_in(port, SPRD_STS1) & 0x00ff) && max_count--) {
+		lsr = serial_in(port, SPRD_LSR);
+		ch = serial_in(port, SPRD_RXD);
+		flag = TTY_NORMAL;
+		port->icount.rx++;
+
+		/*
+		 * handle_lsr_errors() takes (port, flag, lsr): it updates
+		 * the tty flag and masks the status word in place. A
+		 * non-zero return means the character must be dropped.
+		 */
+		if (lsr & (SPRD_LSR_BI | SPRD_LSR_PE |
+			SPRD_LSR_FE | SPRD_LSR_OE))
+			if (handle_lsr_errors(port, &flag, &lsr))
+				continue;
+		if (uart_handle_sysrq_char(port, ch))
+			continue;
+
+		uart_insert_char(port, lsr, SPRD_LSR_OE, ch, flag);
+	}
+
+	tty_flip_buffer_push(tty);
+}
+
+static inline void sprd_tx(struct uart_port *port)
+{
+	struct circ_buf *xmit = &port->state->xmit;
+	int count;
+
+	if (port->x_char) {
+		serial_out(port, SPRD_TXD, port->x_char);
+		port->icount.tx++;
+		port->x_char = 0;
+		return;
+	}
+
+	if (uart_circ_empty(xmit) || uart_tx_stopped(port)) {
+		sprd_stop_tx(port);
+		return;
+	}
+
+	count = THLD_TX_EMPTY;
+	do {
+		serial_out(port, SPRD_TXD, xmit->buf[xmit->tail]);
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
+		port->icount.tx++;
+		if (uart_circ_empty(xmit))
+			break;
+	} while (--count > 0);
+
+	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		uart_write_wakeup(port);
+
+	if (uart_circ_empty(xmit))
+		sprd_stop_tx(port);
+}
+
+/*
+ * Interrupt handler for a single port.
+ *
+ * Reads the masked interrupt status (SPRD_IMSR), acknowledges everything
+ * through SPRD_ICLR and then services RX and/or TX under port->lock.
+ * Returns IRQ_NONE when no masked status bit is set (the line is
+ * requested with IRQF_SHARED), IRQ_HANDLED otherwise.
+ */
+static irqreturn_t sprd_handle_irq(int irq, void *dev_id)
+{
+	struct uart_port *port = dev_id;
+	unsigned int ims;
+
+	spin_lock(&port->lock);
+
+	ims = serial_in(port, SPRD_IMSR);
+
+	if (!ims) {
+		/* not our interrupt: release the lock before bailing out */
+		spin_unlock(&port->lock);
+		return IRQ_NONE;
+	}
+
+	serial_out(port, SPRD_ICLR, ~0);
+
+	if (ims & (SPRD_IMSR_RX_FIFO_FULL |
+		SPRD_IMSR_BREAK_DETECT | SPRD_IMSR_TIMEOUT))
+		sprd_rx(port);
+
+	if (ims & SPRD_IMSR_TX_FIFO_EMPTY)
+		sprd_tx(port);
+
+	spin_unlock(&port->lock);
+
+	return IRQ_HANDLED;
+}
+
+static int sprd_startup(struct uart_port *port)
+{
+	int ret = 0;
+	unsigned int ien, fc;
+	unsigned int timeout;
+	struct sprd_uart_port *sp;
+	unsigned long flags;
+
+	serial_out(port, SPRD_CTL2, ((THLD_TX_EMPTY << 8) | THLD_RX_FULL));
+
+	/* clear rx fifo */
+	timeout = SPRD_TIMEOUT;
+	while (timeout-- && serial_in(port, SPRD_STS1) & 0x00ff)
+		serial_in(port, SPRD_RXD);
+
+	/* clear tx fifo */
+	timeout = SPRD_TIMEOUT;
+	while (timeout-- && serial_in(port, SPRD_STS1) & 0xff00)
+		cpu_relax();
+
+	/* clear interrupt */
+	serial_out(port, SPRD_IEN, 0);
+	serial_out(port, SPRD_ICLR, ~0);
+
+	/* allocate irq */
+	sp = container_of(port, struct sprd_uart_port, port);
+	snprintf(sp->name, sizeof(sp->name), "sprd_serial%d", port->line);
+	ret = devm_request_irq(port->dev, port->irq, sprd_handle_irq,
+				IRQF_SHARED, sp->name, port);
+	if (ret) {
+		dev_err(port->dev, "fail to request serial irq %d, ret=%d\n",
+			port->irq, ret);
+		return ret;
+	}
+	fc = serial_in(port, SPRD_CTL1);
+	fc |= RX_TOUT_THLD_DEF | RX_HFC_THLD_DEF;
+	serial_out(port, SPRD_CTL1, fc);
+
+	/* enable interrupt */
+	spin_lock_irqsave(&port->lock, flags);
+	ien = serial_in(port, SPRD_IEN);
+	ien |= SPRD_IEN_RX_FULL | SPRD_IEN_BREAK_DETECT | SPRD_IEN_TIMEOUT;
+	serial_out(port, SPRD_IEN, ien);
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	return 0;
+}
+
+static void sprd_shutdown(struct uart_port *port)
+{
+	serial_out(port, SPRD_IEN, 0);
+	serial_out(port, SPRD_ICLR, ~0);
+	devm_free_irq(port->dev, port->irq, port);
+}
+
+static void sprd_set_termios(struct uart_port *port,
+				    struct ktermios *termios,
+				    struct ktermios *old)
+{
+	unsigned int baud, quot;
+	unsigned int lcr = 0, fc;
+	unsigned long flags;
+
+	/* ask the core to calculate the divisor for us */
+	baud = uart_get_baud_rate(port, termios, old, 0, SPRD_BAUD_IO_LIMIT);
+
+	quot = (unsigned int)((port->uartclk + baud / 2) / baud);
+
+	/* set data length */
+	switch (termios->c_cflag & CSIZE) {
+	case CS5:
+		lcr |= SPRD_LCR_DATA_LEN5;
+		break;
+	case CS6:
+		lcr |= SPRD_LCR_DATA_LEN6;
+		break;
+	case CS7:
+		lcr |= SPRD_LCR_DATA_LEN7;
+		break;
+	case CS8:
+	default:
+		lcr |= SPRD_LCR_DATA_LEN8;
+		break;
+	}
+
+	/* calculate stop bits */
+	lcr &= ~(SPRD_LCR_STOP_1BIT | SPRD_LCR_STOP_2BIT);
+	if (termios->c_cflag & CSTOPB)
+		lcr |= SPRD_LCR_STOP_2BIT;
+	else
+		lcr |= SPRD_LCR_STOP_1BIT;
+
+	/* calculate parity */
+	lcr &= ~SPRD_LCR_PARITY;
+	termios->c_cflag &= ~CMSPAR;	/* no support mark/space */
+	if (termios->c_cflag & PARENB) {
+		lcr |= SPRD_LCR_PARITY_EN;
+		if (termios->c_cflag & PARODD)
+			lcr |= SPRD_LCR_ODD_PAR;
+		else
+			lcr |= SPRD_LCR_EVEN_PAR;
+	}
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	/* update the per-port timeout */
+	uart_update_timeout(port, termios->c_cflag, baud);
+
+	port->read_status_mask = SPRD_LSR_OE;
+	if (termios->c_iflag & INPCK)
+		port->read_status_mask |= SPRD_LSR_FE | SPRD_LSR_PE;
+	if (termios->c_iflag & (IGNBRK | BRKINT | PARMRK))
+		port->read_status_mask |= SPRD_LSR_BI;
+
+	/* characters to ignore */
+	port->ignore_status_mask = 0;
+	if (termios->c_iflag & IGNPAR)
+		port->ignore_status_mask |= SPRD_LSR_PE | SPRD_LSR_FE;
+	if (termios->c_iflag & IGNBRK) {
+		port->ignore_status_mask |= SPRD_LSR_BI;
+		/*
+		 * If we're ignoring parity and break indicators,
+		 * ignore overruns too (for real raw support).
+		 */
+		if (termios->c_iflag & IGNPAR)
+			port->ignore_status_mask |= SPRD_LSR_OE;
+	}
+
+	/* flow control */
+	fc = serial_in(port, SPRD_CTL1);
+	fc &= ~(RX_HW_FLOW_CTL_THLD | RX_HW_FLOW_CTL_EN | TX_HW_FLOW_CTL_EN);
+	if (termios->c_cflag & CRTSCTS) {
+		fc |= RX_HW_FLOW_CTL_THLD;
+		fc |= RX_HW_FLOW_CTL_EN;
+		fc |= TX_HW_FLOW_CTL_EN;
+	}
+
+	/* clock divider bit0~bit15 */
+	serial_out(port, SPRD_CLKD0, quot & 0xffff);
+
+	/* clock divider bit16~bit20 */
+	serial_out(port, SPRD_CLKD1, (quot & 0x1f0000) >> 16);
+	serial_out(port, SPRD_LCR, lcr);
+	fc |= RX_TOUT_THLD_DEF | RX_HFC_THLD_DEF;
+	serial_out(port, SPRD_CTL1, fc);
+
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	/* Don't rewrite B0 */
+	if (tty_termios_baud_rate(termios))
+		tty_termios_encode_baud_rate(termios, baud, baud);
+}
+
+static const char *sprd_type(struct uart_port *port)
+{
+	return "SPX";
+}
+
+static void sprd_release_port(struct uart_port *port)
+{
+	/* nothing to do */
+}
+
+static int sprd_request_port(struct uart_port *port)
+{
+	return 0;
+}
+
+static void sprd_config_port(struct uart_port *port, int flags)
+{
+	if (flags & UART_CONFIG_TYPE)
+		port->type = PORT_SPRD;
+}
+
+static int sprd_verify_port(struct uart_port *port,
+				   struct serial_struct *ser)
+{
+	if (ser->type != PORT_SPRD)
+		return -EINVAL;
+	if (port->irq != ser->irq)
+		return -EINVAL;
+	return 0;
+}
+
+static struct uart_ops serial_sprd_ops = {
+	.tx_empty = sprd_tx_empty,
+	.get_mctrl = sprd_get_mctrl,
+	.set_mctrl = sprd_set_mctrl,
+	.stop_tx = sprd_stop_tx,
+	.start_tx = sprd_start_tx,
+	.stop_rx = sprd_stop_rx,
+	.break_ctl = sprd_break_ctl,
+	.startup = sprd_startup,
+	.shutdown = sprd_shutdown,
+	.set_termios = sprd_set_termios,
+	.type = sprd_type,
+	.release_port = sprd_release_port,
+	.request_port = sprd_request_port,
+	.config_port = sprd_config_port,
+	.verify_port = sprd_verify_port,
+};
+
+#ifdef CONFIG_SERIAL_SPRD_CONSOLE
+static inline void wait_for_xmitr(struct uart_port *port)
+{
+	unsigned int status, tmout = 10000;
+
+	/* wait up to 10ms for the character(s) to be sent */
+	do {
+		status = serial_in(port, SPRD_STS1);
+		if (--tmout == 0)
+			break;
+		udelay(1);
+	} while (status & 0xff00);
+}
+
+static void sprd_console_putchar(struct uart_port *port, int ch)
+{
+	wait_for_xmitr(port);
+	serial_out(port, SPRD_TXD, ch);
+}
+
+static void sprd_console_write(struct console *co, const char *s,
+				      unsigned int count)
+{
+	struct uart_port *port = &sprd_port[co->index]->port;
+	int locked = 1;
+	unsigned long flags;
+
+	if (port->sysrq)
+		locked = 0;
+	else if (oops_in_progress)
+		locked = spin_trylock_irqsave(&port->lock, flags);
+	else
+		spin_lock_irqsave(&port->lock, flags);
+
+	uart_console_write(port, s, count, sprd_console_putchar);
+
+	/* wait for transmitter to become empty */
+	wait_for_xmitr(port);
+
+	if (locked)
+		spin_unlock_irqrestore(&port->lock, flags);
+}
+
+/*
+ * Console ->setup() hook.
+ *
+ * An out-of-range console index falls back to port 0. Returns -ENODEV
+ * if that port has not been probed yet; otherwise parses the optional
+ * "baud/parity/bits/flow" string (defaults: 115200, 'n', 8, 'n') and
+ * returns the result of uart_set_options().
+ */
+static int __init sprd_console_setup(struct console *co, char *options)
+{
+	struct uart_port *port;
+	int baud = 115200;
+	int bits = 8;
+	int parity = 'n';
+	int flow = 'n';
+
+	if (co->index >= UART_NR_MAX || co->index < 0)
+		co->index = 0;
+
+	/* test the slot itself, not an address derived from a NULL slot */
+	if (!sprd_port[co->index]) {
+		pr_info("serial port %d not yet initialized\n", co->index);
+		return -ENODEV;
+	}
+	port = &sprd_port[co->index]->port;
+
+	if (options)
+		uart_parse_options(options, &baud, &parity, &bits, &flow);
+
+	return uart_set_options(port, co, baud, parity, bits, flow);
+}
+
+static struct uart_driver sprd_uart_driver;
+static struct console sprd_console = {
+	.name = SPRD_TTY_NAME,
+	.write = sprd_console_write,
+	.device = uart_console_device,
+	.setup = sprd_console_setup,
+	.flags = CON_PRINTBUFFER,
+	.index = -1,
+	.data = &sprd_uart_driver,
+};
+
+#define SPRD_CONSOLE	(&sprd_console)
+
+/* Support for earlycon */
+static void sprd_putc(struct uart_port *port, int c)
+{
+	unsigned int timeout = SPRD_TIMEOUT;
+
+	while (timeout-- &&
+		   !(readl(port->membase + SPRD_LSR) & SPRD_LSR_TX_OVER))
+		cpu_relax();
+
+	writeb(c, port->membase + SPRD_TXD);
+}
+
+static void sprd_early_write(struct console *con, const char *s,
+				    unsigned n)
+{
+	struct earlycon_device *dev = con->data;
+
+	uart_console_write(&dev->port, s, n, sprd_putc);
+}
+
+static int __init sprd_early_console_setup(
+				struct earlycon_device *device,
+				const char *opt)
+{
+	if (!device->port.membase)
+		return -ENODEV;
+
+	device->con->write = sprd_early_write;
+	return 0;
+}
+
+EARLYCON_DECLARE(sprd_serial, sprd_early_console_setup);
+OF_EARLYCON_DECLARE(sprd_serial, "sprd,sc9836-uart",
+		    sprd_early_console_setup);
+
+#else /* !CONFIG_SERIAL_SPRD_CONSOLE */
+#define SPRD_CONSOLE		NULL
+#endif
+
+static struct uart_driver sprd_uart_driver = {
+	.owner = THIS_MODULE,
+	.driver_name = "sprd_serial",
+	.dev_name = SPRD_TTY_NAME,
+	.major = 0,
+	.minor = 0,
+	.nr = UART_NR_MAX,
+	.cons = SPRD_CONSOLE,
+};
+
+static int sprd_probe_dt_alias(int index, struct device *dev)
+{
+	struct device_node *np;
+	int ret = index;
+
+	if (!IS_ENABLED(CONFIG_OF))
+		return ret;
+
+	np = dev->of_node;
+	if (!np)
+		return ret;
+
+	ret = of_alias_get_id(np, "serial");
+	if (IS_ERR_VALUE(ret))
+		ret = index;
+	else if (ret >= ARRAY_SIZE(sprd_port) || sprd_port[ret] != NULL) {
+		dev_warn(dev, "requested serial port %d not available.\n", ret);
+		ret = index;
+	}
+
+	return ret;
+}
+
+static int sprd_remove(struct platform_device *dev)
+{
+	struct sprd_uart_port *sup = platform_get_drvdata(dev);
+
+	if (sup) {
+		uart_remove_one_port(&sprd_uart_driver, &sup->port);
+		sprd_port[sup->port.line] = NULL;
+		sprd_ports_num--;
+	}
+
+	if (!sprd_ports_num)
+		uart_unregister_driver(&sprd_uart_driver);
+
+	return 0;
+}
+
+/*
+ * Bind one UART instance.
+ *
+ * Picks a free slot in sprd_port[] (sprd_probe_dt_alias() may override
+ * it with the devicetree "serial" alias), fills in the uart_port from
+ * the platform resources, registers the uart driver when this is the
+ * first port, and adds the port to the serial core.
+ *
+ * Returns 0 on success or a negative errno. Every failure path after the
+ * allocation clears the sprd_port[] slot again: the port structure is
+ * devm-allocated, so the slot must not keep pointing at it once probe
+ * has failed.
+ */
+static int sprd_probe(struct platform_device *pdev)
+{
+	struct resource *res;
+	struct uart_port *up;
+	struct clk *clk;
+	int irq;
+	int index;
+	int ret;
+
+	for (index = 0; index < ARRAY_SIZE(sprd_port); index++)
+		if (sprd_port[index] == NULL)
+			break;
+
+	if (index == ARRAY_SIZE(sprd_port))
+		return -EBUSY;
+
+	index = sprd_probe_dt_alias(index, &pdev->dev);
+
+	sprd_port[index] = devm_kzalloc(&pdev->dev,
+		sizeof(*sprd_port[index]), GFP_KERNEL);
+	if (!sprd_port[index])
+		return -ENOMEM;
+
+	up = &sprd_port[index]->port;
+	up->dev = &pdev->dev;
+	up->line = index;
+	up->type = PORT_SPRD;
+	/*
+	 * NOTE(review): registers are accessed through membase; confirm
+	 * that SERIAL_IO_PORT rather than a memory-mapped iotype is
+	 * intended here.
+	 */
+	up->iotype = SERIAL_IO_PORT;
+	up->uartclk = SPRD_DEF_RATE;
+	up->fifosize = SPRD_FIFO_SIZE;
+	up->ops = &serial_sprd_ops;
+	up->flags = UPF_BOOT_AUTOCONF;
+
+	/* keep SPRD_DEF_RATE when no clock can be obtained */
+	clk = devm_clk_get(&pdev->dev, NULL);
+	if (!IS_ERR(clk))
+		up->uartclk = clk_get_rate(clk);
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(&pdev->dev, "not provide mem resource\n");
+		ret = -ENODEV;
+		goto err_free_slot;
+	}
+	up->mapbase = res->start;
+	up->membase = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(up->membase)) {
+		ret = PTR_ERR(up->membase);
+		goto err_free_slot;
+	}
+
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0) {
+		dev_err(&pdev->dev, "not provide irq resource\n");
+		ret = -ENODEV;
+		goto err_free_slot;
+	}
+	up->irq = irq;
+
+	/* the uart driver is registered by the first port to probe */
+	if (!sprd_ports_num) {
+		ret = uart_register_driver(&sprd_uart_driver);
+		if (ret < 0) {
+			pr_err("Failed to register SPRD-UART driver\n");
+			goto err_free_slot;
+		}
+	}
+	sprd_ports_num++;
+
+	ret = uart_add_one_port(&sprd_uart_driver, up);
+	if (ret)
+		goto err_unregister;
+
+	/*
+	 * remove/suspend/resume read this back as struct sprd_uart_port *,
+	 * which relies on 'port' being the first member of that struct.
+	 */
+	platform_set_drvdata(pdev, up);
+
+	return 0;
+
+err_unregister:
+	/* undo the port count and drop the driver if we were the only user */
+	if (!--sprd_ports_num)
+		uart_unregister_driver(&sprd_uart_driver);
+err_free_slot:
+	sprd_port[index] = NULL;
+	return ret;
+}
+
+static int sprd_suspend(struct device *dev)
+{
+	struct sprd_uart_port *sup = dev_get_drvdata(dev);
+
+	uart_suspend_port(&sprd_uart_driver, &sup->port);
+
+	return 0;
+}
+
+static int sprd_resume(struct device *dev)
+{
+	struct sprd_uart_port *sup = dev_get_drvdata(dev);
+
+	uart_resume_port(&sprd_uart_driver, &sup->port);
+
+	return 0;
+}
+
+static SIMPLE_DEV_PM_OPS(sprd_pm_ops, sprd_suspend, sprd_resume);
+
+static const struct of_device_id serial_ids[] = {
+	{.compatible = "sprd,sc9836-uart",},
+	{}
+};
+
+static struct platform_driver sprd_platform_driver = {
+	.probe		= sprd_probe,
+	.remove		= sprd_remove,
+	.driver		= {
+		.name	= "sprd_serial",
+		.of_match_table = of_match_ptr(serial_ids),
+		.pm	= &sprd_pm_ops,
+	},
+};
+
+module_platform_driver(sprd_platform_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Spreadtrum SoC serial driver series");
diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h
index c172180..7e6eb39 100644
--- a/include/uapi/linux/serial_core.h
+++ b/include/uapi/linux/serial_core.h
@@ -248,4 +248,7 @@
 /* MESON */
 #define PORT_MESON	109
 
+/* SPRD SERIAL  */
+#define PORT_SPRD	110
+
 #endif /* _UAPILINUX_SERIAL_CORE_H */
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re: [PATCH v9 2/2] tty/serial: Add Spreadtrum sc9836-uart driver support
From: Peter Hurley @ 2015-01-28  3:59 UTC (permalink / raw)
  To: Chunyan Zhang, gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r
  Cc: robh+dt-DgEjT+Ai2ygdnm+yROfE0A, mark.rutland-5wv7dgnIgG8,
	arnd-r2nGTMty4D4, gnomes-qBU/x9rampVanCEyBjwyrvXRex20P6io,
	pawel.moll-5wv7dgnIgG8, ijc+devicetree-KcIKpvwj1kUDXYZnReoRVg,
	galak-sgV2jX0FEOL9JmXXK+q4OQ, grant.likely-QSEj5FYQhm4dnm+yROfE0A,
	jslaby-AlSwsSmVLrQ, heiko-4mtYJXux2i+zQB+pC5nmwQ,
	jason-NLaQJdtUoK4Be96aLqz0jA, florian.vaussard-p8DiymsW2f8,
	andrew-g2DYL2Zd6BY, hytszk-Re5JQEeQqe8AvxtiuMwx3w,
	antonynpavlov-Re5JQEeQqe8AvxtiuMwx3w,
	shawn.guo-QSEj5FYQhm4dnm+yROfE0A,
	orsonzhai-Re5JQEeQqe8AvxtiuMwx3w, geng.ren-lxIno14LUO0EEoCn2XhGlw,
	zhizhou.zhang-lxIno14LUO0EEoCn2XhGlw,
	lanqing.liu-lxIno14LUO0EEoCn2XhGlw,
	zhang.lyra-Re5JQEeQqe8AvxtiuMwx3w,
	wei.qiao-lxIno14LUO0EEoCn2XhGlw,
	devicetree-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-serial-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r
In-Reply-To: <1422413261-17184-3-git-send-email-chunyan.zhang-lxIno14LUO0EEoCn2XhGlw@public.gmane.org>

On 01/27/2015 09:47 PM, Chunyan Zhang wrote:
> Add a full sc9836-uart driver for SC9836 SoC which is based on the
> spreadtrum sharkl64 platform.
> This driver also support earlycon.

Reviewed-by: Peter Hurley <peter-WaGBZJeGNqdsbIuE7sb01tBPR1lH4CV8@public.gmane.org>


--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH v2 linux-trace 0/8] tracing: attach eBPF programs to tracepoints/syscalls/kprobe
From: Alexei Starovoitov @ 2015-01-28  4:06 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, linux-api, netdev, linux-kernel

Hi Steven,

This patch set is for linux-trace/for-next
It adds ability to attach eBPF programs to tracepoints, syscalls and kprobes.

The programs are run after soft_disabled() check, but before trace_buffer
is allocated to have minimal impact on a system, which can be demonstrated
by 'dd if=/dev/zero of=/dev/null count=5000000' test:
1.19343 s, 2.1 GB/s - no tracing (raw base line)
1.53301 s, 1.7 GB/s - echo 1 > enable
1.62742 s, 1.6 GB/s - echo cnt==1234 > filter
1.23418 s, 2.1 GB/s - attached bpf program does 'return 0'
1.25890 s, 2.0 GB/s - attached bpf program does 'map[log2(count)]++'

Though tracex1 example is an example of event/filter equivalent logic,
should we create a new file '/sys/.../tracing/events/.../bpf' and use
that for attaching instead of overloading 'filter' file meaning?
That will move bpf related logic out of trace_events_filter.c into new
file and we'll be able to use both bpf program as a 'pre filter' and
existing filter code that runs on allocated trace_buffer at the same time.
In this patch set bpf programs co-exist with TP_printk and triggers.

Anyway, resending with accumulated fixes:
V1->V2:
- dropped bpf_dump_stack() and bpf_printk() helpers,
  trigger 'stacktrace' can be used instead of bpf_dump_stack()
- disabled running programs in_nmi
- other minor cleanups

V1 cover letter:
----------------
Mechanism of attaching:
- load program via bpf() syscall and receive program_fd
- event_fd = open("/sys/kernel/debug/tracing/events/.../filter")
- write 'bpf-123' to event_fd where 123 is program_fd
- program will be attached to particular event and event automatically enabled
- close(event_fd) will detach bpf program from event and event disabled

Program attach point and input arguments:
- programs attached to kprobes receive 'struct pt_regs *' as an input.
  See tracex4_kern.c that demonstrates how users can write a C program like:
  SEC("events/kprobes/sys_write")
  int bpf_prog4(struct pt_regs *regs)
  {
     long write_size = regs->dx; 
     // here user need to know the proto of sys_write() from kernel
     // sources and x64 calling convention to know that register $rdx
     // contains 3rd argument to sys_write() which is 'size_t count'

  it's obviously architecture dependent, but allows building sophisticated
  user tools on top, that can see from debug info of vmlinux which variables
  are in which registers or stack locations and fetch it from there.
  'perf probe' can potentially use this hook to generate programs in user space
  and insert them instead of letting kernel parse string during kprobe creation.

- programs attached to tracepoints and syscalls receive 'struct bpf_context *':
  u64 arg1, arg2, ..., arg6;
  for syscalls they match syscall arguments.
  for tracepoints these args match arguments passed to tracepoint.
  For example:
  trace_sched_migrate_task(p, new_cpu); from sched/core.c
  arg1 <- p        which is 'struct task_struct *'
  arg2 <- new_cpu  which is 'unsigned int'
  arg3..arg6 = 0
  the program can use bpf_fetch_u8/16/32/64/ptr() helpers to walk 'task_struct'
  or any other kernel data structures.
  These helpers are using probe_kernel_read() similar to 'perf probe' which is
  not 100% safe in both cases, but good enough.
  To access task_struct's pid inside 'sched_migrate_task' tracepoint
  the program can do:
  struct task_struct *task = (struct task_struct *)ctx->arg1;
  u32 pid = bpf_fetch_u32(&task->pid);
  Since struct layout is kernel configuration specific such programs are not
  portable and require access to kernel headers to be compiled,
  but in this case we don't need debug info.
  llvm with bpf backend will statically compute task->pid offset as a constant
  based on kernel headers only.
  The example of this arbitrary pointer walking is tracex1_kern.c
  which does skb->dev->name == "lo" filtering.

In all cases the programs are called before trace buffer is allocated to
minimize the overhead, since we want to filter huge number of events, but
buffer alloc/free and argument copy for every event is too costly.
Theoretically we can invoke programs after buffer is allocated, but it
doesn't seem needed, since above approach is faster and achieves the same.

Note, tracepoint/syscall and kprobe programs are two different types:
BPF_PROG_TYPE_TRACING_FILTER and BPF_PROG_TYPE_KPROBE_FILTER,
since they expect different input.
Both use the same set of helper functions:
- map access (lookup/update/delete)
- fetch (probe_kernel_read wrappers)
- memcmp (probe_kernel_read + memcmp)

Portability:
- kprobe programs are architecture dependent and need user scripting
  language like ktap/stap/dtrace/perf that will dynamically generate
  them based on debug info in vmlinux
- tracepoint programs are architecture independent, but if arbitrary pointer
  walking (with fetch() helpers) is used, they need data struct layout to match.
  Debug info is not necessary
- for networking use case we need to access 'struct sk_buff' fields in portable
  way (user space needs to fetch packet length without knowing skb->len offset),
  so for some frequently used data structures we will add helper functions
  or pseudo instructions to access them. I've hacked few ways specifically
  for skb, but abandoned them in favor of more generic type/field infra.
  That work is still wip. Not part of this set.
  Once it's ready tracepoint programs that access common data structs
  will be kernel independent.

Program return value:
- programs return 0 to discard an event
- and return non-zero to proceed with event (allocate trace buffer, copy
  arguments there and print it eventually in trace_pipe in traditional way)

Examples:
- dropmon.c - simple kfree_skb() accounting in eBPF assembler, similar
  to dropmon tool
- tracex1_kern.c - does net/netif_receive_skb event filtering
  for skb->dev->name == "lo" condition
- tracex2_kern.c - same kfree_skb() accounting like dropmon, but now in C
  plus computes histogram of all write sizes from sys_write syscall
  and prints the histogram in userspace
- tracex3_kern.c - most sophisticated example that computes IO latency
  between block/block_rq_issue and block/block_rq_complete events
  and prints 'heatmap' using gray shades of text terminal.
  Useful to analyze disk performance.
- tracex4_kern.c - computes histogram of write sizes from sys_write syscall
  using kprobe mechanism instead of syscall. Since kprobe is optimized into
  ftrace the overhead of instrumentation is smaller than in example 2.

The user space tools like ktap/dtrace/systemtap/perf that have access
to debug info would probably want to use kprobe attachment point, since kprobe
can be inserted anywhere and all registers are available in the program.
tracepoint attachments are useful without debug info, so standalone tools
like iosnoop will use them.

The main difference vs existing perf_probe/ftrace infra is in kernel aggregation
and conditional walking of arbitrary data structures.

Thanks!

Alexei Starovoitov (8):
  tracing: attach eBPF programs to tracepoints and syscalls
  tracing: allow eBPF programs to call ktime_get_ns()
  samples: bpf: simple tracing example in eBPF assembler
  samples: bpf: simple tracing example in C
  samples: bpf: counting example for kfree_skb tracepoint and write
    syscall
  samples: bpf: IO latency analysis (iosnoop/heatmap)
  tracing: attach eBPF programs to kprobe/kretprobe
  samples: bpf: simple kprobe example

 include/linux/ftrace_event.h       |    6 ++
 include/trace/bpf_trace.h          |   25 +++++
 include/trace/ftrace.h             |   29 ++++++
 include/uapi/linux/bpf.h           |    9 ++
 kernel/trace/Kconfig               |    1 +
 kernel/trace/Makefile              |    1 +
 kernel/trace/bpf_trace.c           |  178 ++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h               |    3 +
 kernel/trace/trace_events.c        |   33 ++++++-
 kernel/trace/trace_events_filter.c |   83 ++++++++++++++++-
 kernel/trace/trace_kprobe.c        |   11 ++-
 kernel/trace/trace_syscalls.c      |   31 +++++++
 samples/bpf/Makefile               |   18 ++++
 samples/bpf/bpf_helpers.h          |   14 +++
 samples/bpf/bpf_load.c             |   62 +++++++++++--
 samples/bpf/bpf_load.h             |    3 +
 samples/bpf/dropmon.c              |  129 ++++++++++++++++++++++++++
 samples/bpf/tracex1_kern.c         |   28 ++++++
 samples/bpf/tracex1_user.c         |   24 +++++
 samples/bpf/tracex2_kern.c         |   71 ++++++++++++++
 samples/bpf/tracex2_user.c         |   95 +++++++++++++++++++
 samples/bpf/tracex3_kern.c         |   92 +++++++++++++++++++
 samples/bpf/tracex3_user.c         |  150 ++++++++++++++++++++++++++++++
 samples/bpf/tracex4_kern.c         |   36 ++++++++
 samples/bpf/tracex4_user.c         |   83 +++++++++++++++++
 25 files changed, 1206 insertions(+), 9 deletions(-)
 create mode 100644 include/trace/bpf_trace.h
 create mode 100644 kernel/trace/bpf_trace.c
 create mode 100644 samples/bpf/dropmon.c
 create mode 100644 samples/bpf/tracex1_kern.c
 create mode 100644 samples/bpf/tracex1_user.c
 create mode 100644 samples/bpf/tracex2_kern.c
 create mode 100644 samples/bpf/tracex2_user.c
 create mode 100644 samples/bpf/tracex3_kern.c
 create mode 100644 samples/bpf/tracex3_user.c
 create mode 100644 samples/bpf/tracex4_kern.c
 create mode 100644 samples/bpf/tracex4_user.c

-- 
1.7.9.5

^ permalink raw reply

* [PATCH v2 linux-trace 1/8] tracing: attach eBPF programs to tracepoints and syscalls
From: Alexei Starovoitov @ 2015-01-28  4:06 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, linux-api, netdev, linux-kernel
In-Reply-To: <1422417973-10195-1-git-send-email-ast@plumgrid.com>

User interface:
fd = open("/sys/kernel/debug/tracing/__event__/filter")

write(fd, "bpf_123")

where 123 is process local FD associated with eBPF program previously loaded.
__event__ is static tracepoint event or syscall.
(kprobe support is in next patch)
Once program is successfully attached to tracepoint event, the tracepoint
will be auto-enabled

close(fd)
auto-disables tracepoint event and detaches eBPF program from it

eBPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- memcmp
- fetch_ptr/u64/u32/u16/u8 values from unsafe address via probe_kernel_read(),
  so that eBPF program can walk any kernel data structures

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 include/linux/ftrace_event.h       |    4 ++
 include/trace/bpf_trace.h          |   25 +++++++
 include/trace/ftrace.h             |   29 ++++++++
 include/uapi/linux/bpf.h           |    7 ++
 kernel/trace/Kconfig               |    1 +
 kernel/trace/Makefile              |    1 +
 kernel/trace/bpf_trace.c           |  129 ++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h               |    3 +
 kernel/trace/trace_events.c        |   33 ++++++++-
 kernel/trace/trace_events_filter.c |   79 +++++++++++++++++++++-
 kernel/trace/trace_syscalls.c      |   31 +++++++++
 11 files changed, 340 insertions(+), 2 deletions(-)
 create mode 100644 include/trace/bpf_trace.h
 create mode 100644 kernel/trace/bpf_trace.c

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 0bebb5c348b8..79de230b7df3 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -248,6 +248,7 @@ enum {
 	TRACE_EVENT_FL_WAS_ENABLED_BIT,
 	TRACE_EVENT_FL_USE_CALL_FILTER_BIT,
 	TRACE_EVENT_FL_TRACEPOINT_BIT,
+	TRACE_EVENT_FL_BPF_BIT,
 };
 
 /*
@@ -270,6 +271,7 @@ enum {
 	TRACE_EVENT_FL_WAS_ENABLED	= (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT),
 	TRACE_EVENT_FL_USE_CALL_FILTER	= (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT),
 	TRACE_EVENT_FL_TRACEPOINT	= (1 << TRACE_EVENT_FL_TRACEPOINT_BIT),
+	TRACE_EVENT_FL_BPF		= (1 << TRACE_EVENT_FL_BPF_BIT),
 };
 
 struct ftrace_event_call {
@@ -544,6 +546,8 @@ event_trigger_unlock_commit_regs(struct ftrace_event_file *file,
 		event_triggers_post_call(file, tt);
 }
 
+unsigned int trace_filter_call_bpf(struct event_filter *filter, void *ctx);
+
 enum {
 	FILTER_OTHER = 0,
 	FILTER_STATIC_STRING,
diff --git a/include/trace/bpf_trace.h b/include/trace/bpf_trace.h
new file mode 100644
index 000000000000..4e64f61f484d
--- /dev/null
+++ b/include/trace/bpf_trace.h
@@ -0,0 +1,25 @@
+/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef _LINUX_KERNEL_BPF_TRACE_H
+#define _LINUX_KERNEL_BPF_TRACE_H
+
+/* For tracepoint filters argN fields match one to one to arguments
+ * passed to tracepoint events
+ *
+ * For syscall entry filters argN fields match syscall arguments
+ * For syscall exit filters arg1 is a return value
+ */
+struct bpf_context {
+	u64 arg1;
+	u64 arg2;
+	u64 arg3;
+	u64 arg4;
+	u64 arg5;
+	u64 arg6;
+};
+
+#endif /* _LINUX_KERNEL_BPF_TRACE_H */
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 139b5067345b..07b68332f149 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -17,6 +17,7 @@
  */
 
 #include <linux/ftrace_event.h>
+#include <trace/bpf_trace.h>
 
 /*
  * DECLARE_EVENT_CLASS can be used to add a generic function
@@ -617,6 +618,24 @@ static inline notrace int ftrace_get_offsets_##call(			\
 #undef __perf_task
 #define __perf_task(t)	(t)
 
+/* zero extend integer, pointer or aggregate type to u64 without warnings */
+#define __CAST_TO_U64(expr) ({ \
+	u64 ret = 0; \
+	switch (sizeof(expr)) { \
+	case 8: ret = *(u64 *) &expr; break; \
+	case 4: ret = *(u32 *) &expr; break; \
+	case 2: ret = *(u16 *) &expr; break; \
+	case 1: ret = *(u8 *) &expr; break; \
+	} \
+	ret; })
+
+#define __BPF_CAST1(a,...) __CAST_TO_U64(a)
+#define __BPF_CAST2(a,...) __CAST_TO_U64(a), __BPF_CAST1(__VA_ARGS__)
+#define __BPF_CAST3(a,...) __CAST_TO_U64(a), __BPF_CAST2(__VA_ARGS__)
+#define __BPF_CAST4(a,...) __CAST_TO_U64(a), __BPF_CAST3(__VA_ARGS__)
+#define __BPF_CAST5(a,...) __CAST_TO_U64(a), __BPF_CAST4(__VA_ARGS__)
+#define __BPF_CAST6(a,...) __CAST_TO_U64(a), __BPF_CAST5(__VA_ARGS__)
+
 #undef DECLARE_EVENT_CLASS
 #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
 									\
@@ -632,6 +651,16 @@ ftrace_raw_event_##call(void *__data, proto)				\
 	if (ftrace_trigger_soft_disabled(ftrace_file))			\
 		return;							\
 									\
+	if (ftrace_file->flags & TRACE_EVENT_FL_BPF) {			\
+		__maybe_unused const u64 z = 0;				\
+		struct bpf_context __ctx = ((struct bpf_context) {	\
+				__BPF_CAST6(args, z, z, z, z, z)	\
+			});						\
+									\
+		if (!trace_filter_call_bpf(ftrace_file->filter, &__ctx))\
+			return;						\
+	}								\
+									\
 	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
 									\
 	entry = ftrace_event_buffer_reserve(&fbuffer, ftrace_file,	\
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 45da7ec7d274..3bf42875287c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -118,6 +118,7 @@ enum bpf_map_type {
 enum bpf_prog_type {
 	BPF_PROG_TYPE_UNSPEC,
 	BPF_PROG_TYPE_SOCKET_FILTER,
+	BPF_PROG_TYPE_TRACING_FILTER,
 };
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
@@ -162,6 +163,12 @@ enum bpf_func_id {
 	BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */
 	BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
 	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
+	BPF_FUNC_fetch_ptr,       /* void *bpf_fetch_ptr(void *unsafe_ptr) */
+	BPF_FUNC_fetch_u64,       /* u64 bpf_fetch_u64(void *unsafe_ptr) */
+	BPF_FUNC_fetch_u32,       /* u32 bpf_fetch_u32(void *unsafe_ptr) */
+	BPF_FUNC_fetch_u16,       /* u16 bpf_fetch_u16(void *unsafe_ptr) */
+	BPF_FUNC_fetch_u8,        /* u8 bpf_fetch_u8(void *unsafe_ptr) */
+	BPF_FUNC_memcmp,          /* int bpf_memcmp(void *unsafe_ptr, void *safe_ptr, int size) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a5da09c899dd..eb60b234b824 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -75,6 +75,7 @@ config FTRACE_NMI_ENTER
 
 config EVENT_TRACING
 	select CONTEXT_SWITCH_TRACER
+	select BPF_SYSCALL
 	bool
 
 config CONTEXT_SWITCH_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 979ccde26720..ef821d90f3f5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+obj-$(CONFIG_EVENT_TRACING) += bpf_trace.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
 obj-$(CONFIG_TRACEPOINTS) += power-traces.o
 ifeq ($(CONFIG_PM),y)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
new file mode 100644
index 000000000000..4aabbe2626c5
--- /dev/null
+++ b/kernel/trace/bpf_trace.c
@@ -0,0 +1,129 @@
+/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/uaccess.h>
+#include <trace/bpf_trace.h>
+#include "trace.h"
+
+static u64 bpf_fetch_ptr(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	void *unsafe_ptr = (void *) (long) r1;
+	void *ptr = NULL;
+
+	probe_kernel_read(&ptr, unsafe_ptr, sizeof(ptr));
+	return (u64) (unsigned long) ptr;
+}
+
+#define FETCH(SIZE) \
+static u64 bpf_fetch_##SIZE(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)	\
+{									\
+	void *unsafe_ptr = (void *) (long) r1;				\
+	SIZE val = 0;							\
+									\
+	probe_kernel_read(&val, unsafe_ptr, sizeof(val));		\
+	return (u64) (SIZE) val;					\
+}
+FETCH(u64)
+FETCH(u32)
+FETCH(u16)
+FETCH(u8)
+#undef FETCH
+
+static u64 bpf_memcmp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	void *unsafe_ptr = (void *) (long) r1;
+	void *safe_ptr = (void *) (long) r2;
+	u32 size = (u32) r3;
+	char buf[64];
+	int err;
+
+	if (size < 64) {
+		err = probe_kernel_read(buf, unsafe_ptr, size);
+		if (err)
+			return err;
+		return memcmp(buf, safe_ptr, size);
+	}
+	return -1;
+}
+
+static struct bpf_func_proto tracing_filter_funcs[] = {
+#define FETCH(SIZE)				\
+	[BPF_FUNC_fetch_##SIZE] = {		\
+		.func = bpf_fetch_##SIZE,	\
+		.gpl_only = true,		\
+		.ret_type = RET_INTEGER,	\
+	},
+	FETCH(ptr)
+	FETCH(u64)
+	FETCH(u32)
+	FETCH(u16)
+	FETCH(u8)
+#undef FETCH
+	[BPF_FUNC_memcmp] = {
+		.func = bpf_memcmp,
+		.gpl_only = false,
+		.ret_type = RET_INTEGER,
+		.arg1_type = ARG_ANYTHING,
+		.arg2_type = ARG_PTR_TO_STACK,
+		.arg3_type = ARG_CONST_STACK_SIZE,
+	},
+};
+
+static const struct bpf_func_proto *tracing_filter_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	default:
+		if (func_id < 0 || func_id >= ARRAY_SIZE(tracing_filter_funcs))
+			return NULL;
+		return &tracing_filter_funcs[func_id];
+	}
+}
+
+/* check access to argN fields of 'struct bpf_context' from program */
+static bool tracing_filter_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+	/* check bounds */
+	if (off < 0 || off >= sizeof(struct bpf_context))
+		return false;
+
+	/* only read is allowed */
+	if (type != BPF_READ)
+		return false;
+
+	/* disallow misaligned access */
+	if (off % size != 0)
+		return false;
+
+	return true;
+}
+
+static struct bpf_verifier_ops tracing_filter_ops = {
+	.get_func_proto = tracing_filter_func_proto,
+	.is_valid_access = tracing_filter_is_valid_access,
+};
+
+static struct bpf_prog_type_list tl = {
+	.ops = &tracing_filter_ops,
+	.type = BPF_PROG_TYPE_TRACING_FILTER,
+};
+
+static int __init register_tracing_filter_ops(void)
+{
+	bpf_register_prog_type(&tl);
+	return 0;
+}
+late_initcall(register_tracing_filter_ops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8de48bac1ce2..d667547c6f0e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -977,12 +977,15 @@ struct ftrace_event_field {
 	int			is_signed;
 };
 
+struct bpf_prog;
+
 struct event_filter {
 	int			n_preds;	/* Number assigned */
 	int			a_preds;	/* allocated */
 	struct filter_pred	*preds;
 	struct filter_pred	*root;
 	char			*filter_string;
+	struct bpf_prog		*prog;
 };
 
 struct event_subsystem {
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index b03a0ea77b99..70482817231a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1084,6 +1084,26 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 	return r;
 }
 
+static int event_filter_release(struct inode *inode, struct file *filp)
+{
+	struct ftrace_event_file *file;
+	char buf[2] = "0";
+
+	mutex_lock(&event_mutex);
+	file = event_file_data(filp);
+	if (file) {
+		if (file->flags & TRACE_EVENT_FL_BPF) {
+			/* auto-disable the filter */
+			ftrace_event_enable_disable(file, 0);
+
+			/* if BPF filter was used, clear it on fd close */
+			apply_event_filter(file, buf);
+		}
+	}
+	mutex_unlock(&event_mutex);
+	return 0;
+}
+
 static ssize_t
 event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		   loff_t *ppos)
@@ -1107,8 +1127,18 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 	mutex_lock(&event_mutex);
 	file = event_file_data(filp);
-	if (file)
+	if (file) {
+		/*
+		 * note to user space tools:
+		 * write() into debugfs/tracing/events/xxx/filter file
+		 * must be done with the same privilege level as open()
+		 */
 		err = apply_event_filter(file, buf);
+		if (!err && file->flags & TRACE_EVENT_FL_BPF)
+			/* once filter is applied, auto-enable it */
+			ftrace_event_enable_disable(file, 1);
+	}
+
 	mutex_unlock(&event_mutex);
 
 	free_page((unsigned long) buf);
@@ -1363,6 +1393,7 @@ static const struct file_operations ftrace_event_filter_fops = {
 	.open = tracing_open_generic,
 	.read = event_filter_read,
 	.write = event_filter_write,
+	.release = event_filter_release,
 	.llseek = default_llseek,
 };
 
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index ced69da0ff55..e0303b3cc9fb 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -23,6 +23,9 @@
 #include <linux/mutex.h>
 #include <linux/perf_event.h>
 #include <linux/slab.h>
+#include <linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include <linux/filter.h>
 
 #include "trace.h"
 #include "trace_output.h"
@@ -541,6 +544,21 @@ static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred,
 	return WALK_PRED_DEFAULT;
 }
 
+unsigned int trace_filter_call_bpf(struct event_filter *filter, void *ctx)
+{
+	unsigned int ret;
+
+	if (in_nmi()) /* not supported yet */
+		return 0;
+
+	rcu_read_lock();
+	ret = BPF_PROG_RUN(filter->prog, ctx);
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(trace_filter_call_bpf);
+
 /* return 1 if event matches, 0 otherwise (discard) */
 int filter_match_preds(struct event_filter *filter, void *rec)
 {
@@ -795,6 +813,8 @@ static void __free_filter(struct event_filter *filter)
 	if (!filter)
 		return;
 
+	if (filter->prog)
+		bpf_prog_put(filter->prog);
 	__free_preds(filter);
 	kfree(filter->filter_string);
 	kfree(filter);
@@ -1874,6 +1894,50 @@ static int create_filter_start(char *filter_str, bool set_str,
 	return err;
 }
 
+static int create_filter_bpf(char *filter_str, struct event_filter **filterp)
+{
+	struct event_filter *filter;
+	struct bpf_prog *prog;
+	long ufd;
+	int err = 0;
+
+	*filterp = NULL;
+
+	filter = __alloc_filter();
+	if (!filter)
+		return -ENOMEM;
+
+	err = replace_filter_string(filter, filter_str);
+	if (err)
+		goto free_filter;
+
+	err = kstrtol(filter_str + 4, 0, &ufd);
+	if (err)
+		goto free_filter;
+
+	prog = bpf_prog_get(ufd);
+	if (IS_ERR(prog)) {
+		err = PTR_ERR(prog);
+		goto free_filter;
+	}
+
+	filter->prog = prog;
+
+	if (prog->aux->prog_type != BPF_PROG_TYPE_TRACING_FILTER) {
+		/* valid fd, but invalid bpf program type */
+		err = -EINVAL;
+		goto free_filter;
+	}
+
+	*filterp = filter;
+
+	return 0;
+
+free_filter:
+	__free_filter(filter);
+	return err;
+}
+
 static void create_filter_finish(struct filter_parse_state *ps)
 {
 	if (ps) {
@@ -1971,6 +2035,7 @@ int apply_event_filter(struct ftrace_event_file *file, char *filter_string)
 		filter_disable(file);
 		filter = event_filter(file);
 
+		file->flags &= ~TRACE_EVENT_FL_BPF;
 		if (!filter)
 			return 0;
 
@@ -1983,7 +2048,19 @@ int apply_event_filter(struct ftrace_event_file *file, char *filter_string)
 		return 0;
 	}
 
-	err = create_filter(call, filter_string, true, &filter);
+	/*
+	 * 'bpf_123' string is a request to attach the eBPF program whose fd == 123
+	 * also accept 'bpf 123', 'bpf.123', 'bpf-123' variants
+	 */
+	if (memcmp(filter_string, "bpf", 3) == 0 && filter_string[3] != 0 &&
+	    filter_string[4] != 0) {
+		err = create_filter_bpf(filter_string, &filter);
+		if (!err)
+			file->flags |= TRACE_EVENT_FL_BPF;
+	} else {
+		err = create_filter(call, filter_string, true, &filter);
+		file->flags &= ~TRACE_EVENT_FL_BPF;
+	}
 
 	/*
 	 * Always swap the call filter with the new filter
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index c6ee36fcbf90..e1b25a834cc7 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -7,6 +7,7 @@
 #include <linux/ftrace.h>
 #include <linux/perf_event.h>
 #include <asm/syscall.h>
+#include <trace/bpf_trace.h>
 
 #include "trace_output.h"
 #include "trace.h"
@@ -290,6 +291,20 @@ static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
 	return ret;
 }
 
+static void populate_bpf_ctx(struct bpf_context *ctx, struct pt_regs *regs)
+{
+	struct task_struct *task = current;
+	unsigned long args[6];
+
+	syscall_get_arguments(task, regs, 0, 6, args);
+	ctx->arg1 = args[0];
+	ctx->arg2 = args[1];
+	ctx->arg3 = args[2];
+	ctx->arg4 = args[3];
+	ctx->arg5 = args[4];
+	ctx->arg6 = args[5];
+}
+
 static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 {
 	struct trace_array *tr = data;
@@ -319,6 +334,14 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 	if (!sys_data)
 		return;
 
+	if (ftrace_file->flags & TRACE_EVENT_FL_BPF) {
+		struct bpf_context ctx;
+
+		populate_bpf_ctx(&ctx, regs);
+		if (!trace_filter_call_bpf(ftrace_file->filter, &ctx))
+			return;
+	}
+
 	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
 
 	local_save_flags(irq_flags);
@@ -366,6 +389,14 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 	if (!sys_data)
 		return;
 
+	if (ftrace_file->flags & TRACE_EVENT_FL_BPF) {
+		struct bpf_context ctx = {};
+
+		ctx.arg1 = syscall_get_return_value(current, regs);
+		if (!trace_filter_call_bpf(ftrace_file->filter, &ctx))
+			return;
+	}
+
 	local_save_flags(irq_flags);
 	pc = preempt_count();
 
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v2 linux-trace 2/8] tracing: allow eBPF programs to call ktime_get_ns()
From: Alexei Starovoitov @ 2015-01-28  4:06 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, linux-api, netdev, linux-kernel
In-Reply-To: <1422417973-10195-1-git-send-email-ast@plumgrid.com>

bpf_ktime_get_ns() is used by programs to compute the time delta between events
or as a timestamp.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 include/uapi/linux/bpf.h |    1 +
 kernel/trace/bpf_trace.c |   10 ++++++++++
 2 files changed, 11 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3bf42875287c..227a4e404726 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -169,6 +169,7 @@ enum bpf_func_id {
 	BPF_FUNC_fetch_u16,       /* u16 bpf_fetch_u16(void *unsafe_ptr) */
 	BPF_FUNC_fetch_u8,        /* u8 bpf_fetch_u8(void *unsafe_ptr) */
 	BPF_FUNC_memcmp,          /* int bpf_memcmp(void *unsafe_ptr, void *safe_ptr, int size) */
+	BPF_FUNC_ktime_get_ns,    /* u64 bpf_ktime_get_ns(void) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 4aabbe2626c5..1c07f55702d6 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -54,6 +54,11 @@ static u64 bpf_memcmp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 	return -1;
 }
 
+static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	return ktime_get_ns();
+}
+
 static struct bpf_func_proto tracing_filter_funcs[] = {
 #define FETCH(SIZE)				\
 	[BPF_FUNC_fetch_##SIZE] = {		\
@@ -75,6 +80,11 @@ static struct bpf_func_proto tracing_filter_funcs[] = {
 		.arg2_type = ARG_PTR_TO_STACK,
 		.arg3_type = ARG_CONST_STACK_SIZE,
 	},
+	[BPF_FUNC_ktime_get_ns] = {
+		.func = bpf_ktime_get_ns,
+		.gpl_only = true,
+		.ret_type = RET_INTEGER,
+	},
 };
 
 static const struct bpf_func_proto *tracing_filter_func_proto(enum bpf_func_id func_id)
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v2 linux-trace 3/8] samples: bpf: simple tracing example in eBPF assembler
From: Alexei Starovoitov @ 2015-01-28  4:06 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, linux-api, netdev, linux-kernel
In-Reply-To: <1422417973-10195-1-git-send-email-ast@plumgrid.com>

simple packet drop monitor:
- in-kernel eBPF program attaches to kfree_skb() event and records number
  of packet drops at given location
- userspace iterates over the map every second and prints stats

Usage:
$ sudo dropmon
location 0xffffffff81695995 count 1
location 0xffffffff816d0da9 count 2

location 0xffffffff81695995 count 2
location 0xffffffff816d0da9 count 2

location 0xffffffff81695995 count 3
location 0xffffffff816d0da9 count 2

$ addr2line -ape ./bld_x64/vmlinux 0xffffffff81695995 0xffffffff816d0da9
0xffffffff81695995: ./bld_x64/../net/ipv4/icmp.c:1038
0xffffffff816d0da9: ./bld_x64/../net/unix/af_unix.c:1231

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 samples/bpf/Makefile  |    2 +
 samples/bpf/dropmon.c |  129 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 131 insertions(+)
 create mode 100644 samples/bpf/dropmon.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index b5b3600dcdf5..789691374562 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -6,7 +6,9 @@ hostprogs-y := test_verifier test_maps
 hostprogs-y += sock_example
 hostprogs-y += sockex1
 hostprogs-y += sockex2
+hostprogs-y += dropmon
 
+dropmon-objs := dropmon.o libbpf.o
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
 sock_example-objs := sock_example.o libbpf.o
diff --git a/samples/bpf/dropmon.c b/samples/bpf/dropmon.c
new file mode 100644
index 000000000000..9a2cd3344d69
--- /dev/null
+++ b/samples/bpf/dropmon.c
@@ -0,0 +1,129 @@
+/* simple packet drop monitor:
+ * - in-kernel eBPF program attaches to kfree_skb() event and records number
+ *   of packet drops at given location
+ * - userspace iterates over the map every second and prints stats
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include <linux/unistd.h>
+#include <string.h>
+#include <linux/filter.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include "libbpf.h"
+
+#define TRACEPOINT "/sys/kernel/debug/tracing/events/skb/kfree_skb/"
+
+static int write_to_file(const char *file, const char *str, bool keep_open)
+{
+	int fd, err;
+
+	fd = open(file, O_WRONLY);
+	err = write(fd, str, strlen(str));
+	(void) err;
+
+	if (keep_open) {
+		return fd;
+	} else {
+		close(fd);
+		return -1;
+	}
+}
+
+static int dropmon(void)
+{
+	long long key, next_key, value = 0;
+	int prog_fd, map_fd, i;
+	char fmt[32];
+
+	map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), 1024);
+	if (map_fd < 0) {
+		printf("failed to create map '%s'\n", strerror(errno));
+		goto cleanup;
+	}
+
+	/* the following eBPF program is equivalent to C:
+	 * int filter(struct bpf_context *ctx)
+	 * {
+	 *   long loc = ctx->arg2;
+	 *   long init_val = 1;
+	 *   long *value;
+	 *
+	 *   value = bpf_map_lookup_elem(MAP_ID, &loc);
+	 *   if (value) {
+	 *      __sync_fetch_and_add(value, 1);
+	 *   } else {
+	 *      bpf_map_update_elem(MAP_ID, &loc, &init_val, BPF_ANY);
+	 *   }
+	 *   return 0;
+	 * }
+	 */
+	struct bpf_insn prog[] = {
+		BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8), /* r2 = *(u64 *)(r1 + 8) */
+		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), /* *(u64 *)(fp - 8) = r2 */
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = fp - 8 */
+		BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+		BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
+		BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+		BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+		BPF_EXIT_INSN(),
+		BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, 1), /* *(u64 *)(fp - 16) = 1 */
+		BPF_MOV64_IMM(BPF_REG_4, BPF_ANY),
+		BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -16), /* r3 = fp - 16 */
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = fp - 8 */
+		BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_update_elem),
+		BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+		BPF_EXIT_INSN(),
+	};
+
+	prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACING_FILTER, prog,
+				sizeof(prog), "GPL");
+	if (prog_fd < 0) {
+		printf("failed to load prog '%s'\n%s", strerror(errno), bpf_log_buf);
+		return -1;
+	}
+
+	sprintf(fmt, "bpf_%d", prog_fd);
+
+	write_to_file(TRACEPOINT "filter", fmt, true);
+
+	for (i = 0; i < 10; i++) {
+		key = 0;
+		while (bpf_get_next_key(map_fd, &key, &next_key) == 0) {
+			bpf_lookup_elem(map_fd, &next_key, &value);
+			printf("location 0x%llx count %lld\n", next_key, value);
+			key = next_key;
+		}
+		if (key)
+			printf("\n");
+		sleep(1);
+	}
+
+cleanup:
+	/* maps, programs, tracepoint filters will auto cleanup on process exit */
+
+	return 0;
+}
+
+int main(void)
+{
+	FILE *f;
+
+	/* start ping in the background to get some kfree_skb events */
+	f = popen("ping -c5 localhost", "r");
+	(void) f;
+
+	dropmon();
+	return 0;
+}
-- 
1.7.9.5

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox