Linux userland API discussions

Linux userland API discussions
 help / color / mirror / Atom feed

* Re: [PATCH v8 02/18] vfio: platform: probe to devices on the platform bus
From: Alex Williamson @ 2014-10-21 16:17 UTC (permalink / raw)
  To: Antonios Motakis
  Cc: open list:VFIO DRIVER, eric.auger-QSEj5FYQhm4dnm+yROfE0A,
	marc.zyngier-5wv7dgnIgG8, open list:ABI/API,
	will.deacon-5wv7dgnIgG8, open list,
	iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	tech-lrHrjnjw1UfHK3s98zE1ajGjJy/sRE9J,
	kvmarm-FPEHb7Xf0XXUo1n7N8X6UoWGPAHP3yOg,
	christoffer.dall-QSEj5FYQhm4dnm+yROfE0A
In-Reply-To: <1413205825-6370-3-git-send-email-a.motakis-lrHrjnjw1UfHK3s98zE1ajGjJy/sRE9J@public.gmane.org>

On Mon, 2014-10-13 at 15:10 +0200, Antonios Motakis wrote:
> Driver to bind to Linux platform devices, and callbacks to discover their
> resources to be used by the main VFIO PLATFORM code.
> 
> Signed-off-by: Antonios Motakis <a.motakis-lrHrjnjw1UfHK3s98zE1ajGjJy/sRE9J@public.gmane.org>
> ---
>  drivers/vfio/platform/vfio_platform.c | 107 ++++++++++++++++++++++++++++++++++
>  include/uapi/linux/vfio.h             |   1 +
>  2 files changed, 108 insertions(+)
>  create mode 100644 drivers/vfio/platform/vfio_platform.c
> 
> diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c
> new file mode 100644
> index 0000000..baeb7da
> --- /dev/null
> +++ b/drivers/vfio/platform/vfio_platform.c
> @@ -0,0 +1,107 @@
> +/*
> + * Copyright (C) 2013 - Virtual Open Systems
> + * Author: Antonios Motakis <a.motakis-lrHrjnjw1UfHK3s98zE1ajGjJy/sRE9J@public.gmane.org>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + */
> +
> +#include <linux/device.h>
> +#include <linux/eventfd.h>
> +#include <linux/interrupt.h>
> +#include <linux/iommu.h>
> +#include <linux/module.h>
> +#include <linux/mutex.h>
> +#include <linux/notifier.h>
> +#include <linux/pm_runtime.h>
> +#include <linux/slab.h>
> +#include <linux/types.h>
> +#include <linux/uaccess.h>
> +#include <linux/vfio.h>
> +#include <linux/io.h>
> +#include <linux/platform_device.h>
> +#include <linux/irq.h>
> +
> +#include "vfio_platform_private.h"
> +
> +#define DRIVER_VERSION  "0.8"
> +#define DRIVER_AUTHOR   "Antonios Motakis <a.motakis-lrHrjnjw1UfHK3s98zE1ajGjJy/sRE9J@public.gmane.org>"
> +#define DRIVER_DESC     "VFIO for platform devices - User Level meta-driver"
> +
> +/* probing devices from the linux platform bus */
> +
> +static struct resource *get_platform_resource(struct vfio_platform_device *vdev,
> +						int num)
> +{
> +	struct platform_device *dev = (struct platform_device *) vdev->opaque;
> +	int i;
> +
> +	for (i = 0; i < dev->num_resources; i++) {
> +		struct resource *r = &dev->resource[i];
> +
> +		if (resource_type(r) & (IORESOURCE_MEM|IORESOURCE_IO)) {
> +			num--;
> +
> +			if (!num)
> +				return r;

Has this been tested?  What happens when we call this with num = 0?

> +		}
> +	}
> +	return NULL;
> +}
> +
> +static int get_platform_irq(struct vfio_platform_device *vdev, int i)
> +{
> +	struct platform_device *pdev = (struct platform_device *) vdev->opaque;
> +
> +	return platform_get_irq(pdev, i);
> +}
> +
> +
> +static int vfio_platform_probe(struct platform_device *pdev)
> +{
> +	struct vfio_platform_device *vdev;
> +	int ret;
> +
> +	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
> +	if (!vdev)
> +		return -ENOMEM;
> +
> +	vdev->opaque = (void *) pdev;
> +	vdev->name = pdev->name;
> +	vdev->flags = VFIO_DEVICE_FLAGS_PLATFORM;
> +	vdev->get_resource = get_platform_resource;
> +	vdev->get_irq = get_platform_irq;
> +
> +	ret = vfio_platform_probe_common(vdev, &pdev->dev);
> +	if (ret)
> +		kfree(vdev);
> +
> +	return ret;
> +}
> +
> +static int vfio_platform_remove(struct platform_device *pdev)
> +{
> +	return vfio_platform_remove_common(&pdev->dev);
> +}
> +
> +static struct platform_driver vfio_platform_driver = {
> +	.probe		= vfio_platform_probe,
> +	.remove		= vfio_platform_remove,
> +	.driver	= {
> +		.name	= "vfio-platform",
> +		.owner	= THIS_MODULE,
> +	},
> +};
> +
> +module_platform_driver(vfio_platform_driver);
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 111b5e8..aca6d3e 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -158,6 +158,7 @@ struct vfio_device_info {
>  	__u32	flags;
>  #define VFIO_DEVICE_FLAGS_RESET	(1 << 0)	/* Device supports reset */
>  #define VFIO_DEVICE_FLAGS_PCI	(1 << 1)	/* vfio-pci device */
> +#define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)	/* vfio-platform device */
>  	__u32	num_regions;	/* Max region index + 1 */
>  	__u32	num_irqs;	/* Max IRQ index + 1 */
>  };

^ permalink raw reply

* Re: [PATCH] staging: android: binder: move to the "real" part of the kernel
From: Arnd Bergmann @ 2014-10-21 14:12 UTC (permalink / raw)
  To: Pavel Machek
  Cc: devel, Anup Patel, Greg Kroah-Hartman, lkml,
	Arve Hjønnevåg, John Stultz, viro, Linux API,
	Rebecca Schultz Zavin, Santosh Shilimkar, Sumit Semwal,
	Christoffer Dall
In-Reply-To: <20141021103622.GB23161@amd>

On Tuesday 21 October 2014 12:36:22 Pavel Machek wrote:
> On Fri 2014-10-17 01:12:21, Greg Kroah-Hartman wrote:
> > On Thu, Oct 16, 2014 at 10:09:04AM -0700, John Stultz wrote:
> > > On Thu, Oct 16, 2014 at 5:47 AM, Greg Kroah-Hartman
> > > <gregkh@linuxfoundation.org> wrote:
> > > > From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> 
> > > Are the Android guys comfortable with the ABI stability rules they'll
> > > now face?
> > 
> > Just because something is in staging, doesn't mean you don't have to
> > follow the same ABI stability rules as the rest of the kernel.  If a
> > change had happened to this code that broke userspace in the past, I
> > would have reverted it.  So this should not be anything different from
> > what has been happening inthe past.
> 
> Actually, there's big difference.
> 
> If Al Viro changes core filesystem in a way that breaks
> staging/binder, binder is broken, and if it can't be fixed... well it
> can't be fixed.
> 
> If Al Viro changes core filesystem in a way that breaks
> drivers/binder, Al's change is going to be reverted.

One might have argued that we'd have to do that already, but the reasons
for doing that with binder in the main kernel are certainly stronger.

> It is really hard to review without API documentation. Normally, API
> documentation is required for stuff like this.
> 
> For example: does it add new files in /proc?
> 
> Given that it is stable, can we get rid of binder_debug() and
> especially BINDER_DEBUG_ENTRY stuff?

Good point. We require documentation for every single sysfs attribute
that gets added to a driver (some escape the review, but that doesn't
change the rule), so we should not make an exception for a new procfs
file here.

> Checkpatch warns about 98 too long lines. Some of them could be fixed
> easily.

I don't think this should be an argument.

> This looks scary:
> 
>                         trace_binder_transaction_fd(t, fp->handle,
>                         target_fd);
> 			                binder_debug(BINDER_DEBUG_TRANSACTION,
>                                      "        fd %d -> %d\n",
>                         fp->handle, target_fd);
>                         /* TODO: fput? */
>                         fp->handle = target_fd;
> 			        } break;
> 
> Could binder_transcation() be split to smaller functions according to
> CodingStyle? 17 goto targets at the end of function are not exactly
> easy to read.
> 
> ginder_thread_read/write also needs splitting.

Yes, in principle, but this is still a detail that would mainly serve
to simplify review. The problem is more the lack of review and
documentation of the API.

> binder_ioctl_write_read: just use direct return, no need to goto out
> if it just returns.

details, and a lot of people actually like the style used there.

>    proc->user_buffer_offset = vma->vm_start - (uintptr_t)proc->buffer;
>         mutex_unlock(&binder_mmap_lock);
> 
> #ifdef CONFIG_CPU_CACHE_VIPT
>         if (cache_is_vipt_aliasing()) {
>                 while (CACHE_COLOUR((vma->vm_start ^
>         (uint32_t)proc->buffer))) {
> 
> Should this be (uintptr_t)?

It should probably call an architecture specific helper.

	Arnd

^ permalink raw reply

* Re: [PATCH 2/5] vfio: introduce the VFIO_DMA_MAP_FLAG_NOEXEC flag
From: Antonios Motakis @ 2014-10-21 12:23 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: open list:VFIO DRIVER, Eric Auger, Marc Zyngier,
	open list:ABI/API, Will Deacon, open list, Linux IOMMU,
	VirtualOpenSystems Technical Team, kvm-arm, Christoffer Dall
In-Reply-To: <CALCETrWzxjpKrou6J63_T75x=ZEGWCGbc4KEWT_AMvzSQNn1eg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Mon, Oct 20, 2014 at 11:37 PM, Andy Lutomirski <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org> wrote:
> On Mon, Oct 13, 2014 at 6:09 AM, Antonios Motakis
> <a.motakis-lrHrjnjw1UfHK3s98zE1ajGjJy/sRE9J@public.gmane.org> wrote:
>> We introduce the VFIO_DMA_MAP_FLAG_NOEXEC flag to the VFIO dma map call,
>> and expose its availability via the capability VFIO_DMA_NOEXEC_IOMMU.
>> This way the user can control whether the XN flag will be set on the
>> requested mappings. The IOMMU_NOEXEC flag needs to be available for all
>> the IOMMUs of the container used.
>
> Since you sent this to the linux-api list, I'll bite: what's the XN
> flag?  I know what PROT_EXEC does when you mmap something, and I
> presume that vfio is mmappable, but I don't actually have any clue
> what this patch does.
>
> I assume that this does not have anything to do with a non-CPU DMA
> master executing code in main memory, because that makes rather little
> sense.  (Or maybe it really does, in which case: weird.)

It does actually. For example, the ARM PL330 DMA controller will fetch
from memory code with DMA instructions, and it will respect this flag.
It is not code that can be executed on the CPU of course, but it is
executable on the DMAC.


>
> --Andy



-- 
Antonios Motakis
Virtual Open Systems

^ permalink raw reply

* Re: [PATCH 2/5] vfio: introduce the VFIO_DMA_MAP_FLAG_NOEXEC flag
From: Antonios Motakis @ 2014-10-21 12:17 UTC (permalink / raw)
  To: Alex Williamson
  Cc: kvm-arm, Linux IOMMU, Will Deacon,
	VirtualOpenSystems Technical Team, Christoffer Dall, Eric Auger,
	Kim Phillips, Marc Zyngier, open list:VFIO DRIVER,
	open list:ABI/API, open list
In-Reply-To: <1413840583.4202.111.camel@ul30vt.home>

On Mon, Oct 20, 2014 at 11:29 PM, Alex Williamson
<alex.williamson@redhat.com> wrote:
> On Mon, 2014-10-13 at 15:09 +0200, Antonios Motakis wrote:
>> We introduce the VFIO_DMA_MAP_FLAG_NOEXEC flag to the VFIO dma map call,
>> and expose its availability via the capability VFIO_DMA_NOEXEC_IOMMU.
>> This way the user can control whether the XN flag will be set on the
>> requested mappings. The IOMMU_NOEXEC flag needs to be available for all
>> the IOMMUs of the container used.
>>
>> Signed-off-by: Antonios Motakis <a.motakis@virtualopensystems.com>
>> ---
>>  include/uapi/linux/vfio.h | 2 ++
>>  1 file changed, 2 insertions(+)
>>
>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>> index 6612974..111b5e8 100644
>> --- a/include/uapi/linux/vfio.h
>> +++ b/include/uapi/linux/vfio.h
>> @@ -29,6 +29,7 @@
>>   * capability is subject to change as groups are added or removed.
>>   */
>>  #define VFIO_DMA_CC_IOMMU            4
>> +#define VFIO_DMA_NOEXEC_IOMMU                5
>>
>>  /* Check if EEH is supported */
>>  #define VFIO_EEH                     5
>                                     ^^
> 5 is still already used.  Feel free to convert to enum so we stop making
> this mistake.

Oops :) will do.

>
>> @@ -401,6 +402,7 @@ struct vfio_iommu_type1_dma_map {
>>       __u32   flags;
>>  #define VFIO_DMA_MAP_FLAG_READ (1 << 0)              /* readable from device */
>>  #define VFIO_DMA_MAP_FLAG_WRITE (1 << 1)     /* writable from device */
>> +#define VFIO_DMA_MAP_FLAG_NOEXEC (1 << 2)    /* not executable from device */
>>       __u64   vaddr;                          /* Process virtual address */
>>       __u64   iova;                           /* IO virtual address */
>>       __u64   size;                           /* Size of mapping (bytes) */
>
>
>



-- 
Antonios Motakis
Virtual Open Systems

^ permalink raw reply

* Re: [PATCH] staging: android: binder: move to the "real" part of the kernel
From: Christoph Hellwig @ 2014-10-21 10:46 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: devel-gWbeCf7V1WCQmaza687I9mD2FQJk+8+b, Anup Patel,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Arve Hj?nnev?g,
	Santosh Shilimkar, Rebecca Schultz Zavin, John Stultz,
	Sumit Semwal, Christoffer Dall
In-Reply-To: <20141019220450.GB3780-U8xfFu+wG4EAvxtiuMwx3w@public.gmane.org>

On Mon, Oct 20, 2014 at 06:04:50AM +0800, Greg Kroah-Hartman wrote:
> There is now work to resolve the interface, it requires someone who has
> the rights to push to Android userspace.  But that is going to be a
> "total rewrite", and until then, this code needs to be used, no matter
> how much we hate this.

It helps to qualify why it absolutely has to, and why this is different
from other interfaces we haven't merged.  Is this the last building
block to run upstream Linux on a common Android device out of the box
without needing any patches or out of tree drivers?  Is there any other
good reason I might have missed.

To convince other people that merging a piece like this absolutely needs
to get merged I'd suggest you start with presenting factual argument,
and then let the broader audience weight their merrits.

I think with the known problems of the code, and the fact that the
real user ABI is a library anyway the stakes are quite high here.

So as a start please prepare a list of arguments, a detailed description
of the ABI, and post a proper patch (not a move) that suggests adding
this driver to all the relevant lists (most importantly linux-fsdevel
and linux-api) so that people with the right experience can review it.

^ permalink raw reply

* Re: [PATCH] staging: android: binder: move to the "real" part of the kernel
From: Pavel Machek @ 2014-10-21 10:36 UTC (permalink / raw)
  To: Greg Kroah-Hartman, viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn
  Cc: John Stultz, lkml, devel-gWbeCf7V1WCQmaza687I9mD2FQJk+8+b,
	Linux API, Santosh Shilimkar, Arve Hjønnevåg,
	Sumit Semwal, Rebecca Schultz Zavin, Christoffer Dall, Anup Patel
In-Reply-To: <20141016231221.GA13592-U8xfFu+wG4EAvxtiuMwx3w@public.gmane.org>

On Fri 2014-10-17 01:12:21, Greg Kroah-Hartman wrote:
> On Thu, Oct 16, 2014 at 10:09:04AM -0700, John Stultz wrote:
> > On Thu, Oct 16, 2014 at 5:47 AM, Greg Kroah-Hartman
> > <gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r@public.gmane.org> wrote:
> > > From: Greg Kroah-Hartman <gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r@public.gmane.org>

> > Are the Android guys comfortable with the ABI stability rules they'll
> > now face?
> 
> Just because something is in staging, doesn't mean you don't have to
> follow the same ABI stability rules as the rest of the kernel.  If a
> change had happened to this code that broke userspace in the past, I
> would have reverted it.  So this should not be anything different from
> what has been happening inthe past.

Actually, there's big difference.

If Al Viro changes core filesystem in a way that breaks
staging/binder, binder is broken, and if it can't be fixed... well it
can't be fixed.

If Al Viro changes core filesystem in a way that breaks
drivers/binder, Al's change is going to be reverted.

It is really hard to review without API documentation. Normally, API
documentation is required for stuff like this.

For example: does it add new files in /proc?

Given that it is stable, can we get rid of binder_debug() and
especially BINDER_DEBUG_ENTRY stuff?

Checkpatch warns about 98 too long lines. Some of them could be fixed
easily.

This looks scary:

                        trace_binder_transaction_fd(t, fp->handle,
                        target_fd);
			                binder_debug(BINDER_DEBUG_TRANSACTION,
                                     "        fd %d -> %d\n",
                        fp->handle, target_fd);
                        /* TODO: fput? */
                        fp->handle = target_fd;
			        } break;

Could binder_transcation() be split to smaller functions according to
CodingStyle? 17 goto targets at the end of function are not exactly
easy to read.

ginder_thread_read/write also needs splitting.

binder_ioctl_write_read: just use direct return, no need to goto out
if it just returns.

   proc->user_buffer_offset = vma->vm_start - (uintptr_t)proc->buffer;
        mutex_unlock(&binder_mmap_lock);

#ifdef CONFIG_CPU_CACHE_VIPT
        if (cache_is_vipt_aliasing()) {
                while (CACHE_COLOUR((vma->vm_start ^
        (uint32_t)proc->buffer))) {

Should this be (uintptr_t)?

        /*pr_info("binder_mmap: %d %lx-%lx maps %p\n",                                   

Delete the code, don't comment it out. It is on more than one place.

static void print_binder_thread(struct seq_file *m,
                                struct binder_thread *thread,
                                int print_always)
{
        struct binder_transaction *t;
	struct binder_work *w;
        size_t start_pos = m->count;
	size_t header_pos;

        seq_printf(m, "  thread %d: l %02x\n", thread->pid,
        thread->looper);
        header_pos = m->count;
        t = thread->transaction_stack;
	while (t) {
 	        if (t->from == thread) {
                        print_binder_transaction(m,
				                                 "
        outgoing transaction", t);
                        t = t->from_parent;

Is anyone depending on the debugfs files? Can it be deleted?

Code indentation is "interesting" in binder_thread_read(). See the "}
break;" lines. {}s should not be needed...?

I don't think this code would get merged if it was submitted for
normal inclusion in kernel. I don't think it is good idea to push it
through the back door, without documenting what it does and without
patches even going to the lists.

									Pavel

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply

* Re: [PATCH v2 2/5] mfd: max77693: Map charger device to its own of_node
From: Lee Jones @ 2014-10-21 10:27 UTC (permalink / raw)
  To: Krzysztof Kozlowski
  Cc: Sebastian Reichel, Dmitry Eremin-Solenikov, David Woodhouse,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-pm-u79uwXL29TY76Z2rM5mHXA, Samuel Ortiz,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	devicetree-u79uwXL29TY76Z2rM5mHXA, Kyungmin Park,
	Marek Szyprowski, Bartlomiej Zolnierkiewicz
In-Reply-To: <1413886845.26980.18.camel@AMDC1943>

On Tue, 21 Oct 2014, Krzysztof Kozlowski wrote:

> On wto, 2014-10-21 at 11:17 +0100, Lee Jones wrote:
> > On Mon, 20 Oct 2014, Krzysztof Kozlowski wrote:
> > 
> > > On pon, 2014-10-20 at 16:06 +0100, Lee Jones wrote:
> > > > On Mon, 20 Oct 2014, Krzysztof Kozlowski wrote:
> > > > 
> > > > > Add a "maxim,max77693-charger" of_compatible to the mfd_cell so the MFD
> > > > > child device (the charger) will have its own of_node set. This will be
> > > > > used by the max77693 charger driver in next patches to obtain battery
> > > > > configuration from DTS.
> > > > > 
> > > > > Signed-off-by: Krzysztof Kozlowski <k.kozlowski-Sze3O3UU22JBDgjK7y7TUQ@public.gmane.org>
> > > > > ---
> > > > >  drivers/mfd/max77693.c | 5 ++++-
> > > > >  1 file changed, 4 insertions(+), 1 deletion(-)
> > > > > 
> > > > > diff --git a/drivers/mfd/max77693.c b/drivers/mfd/max77693.c
> > > > > index cf008f45968c..2277a11b6629 100644
> > > > > --- a/drivers/mfd/max77693.c
> > > > > +++ b/drivers/mfd/max77693.c
> > > > > @@ -43,7 +43,10 @@
> > > > >  
> > > > >  static const struct mfd_cell max77693_devs[] = {
> > > > >  	{ .name = "max77693-pmic", },
> > > > > -	{ .name = "max77693-charger", },
> > > > > +	{
> > > > > +		.name = "max77693-charger",
> > > > > +		.of_compatible = "maxim,max77693-charger",
> > > > > +	},
> > > > >  	{ .name = "max77693-muic", },
> > > > >  	{ .name = "max77693-haptic", },
> > > > >  	{
> > > > 
> > > > I'm guessing this can be applied separately?
> > > 
> > > Yes. Patch 3/5 also (it adds necessary defines and symbols for
> > > chargers).
> > 
> > I'll take this patch, but I think 3/5 is required by some of the other
> > patches in the set, no?
> 
> Yes, it is required by 4/5 (charger driver).

Okay, so it must go in with the others then.

I need Acks for that.

-- 
Lee Jones
Linaro STMicroelectronics Landing Team Lead
Linaro.org │ Open source software for ARM SoCs
Follow Linaro: Facebook | Twitter | Blog

^ permalink raw reply

* Re: [PATCH v2 2/5] mfd: max77693: Map charger device to its own of_node
From: Krzysztof Kozlowski @ 2014-10-21 10:20 UTC (permalink / raw)
  To: Lee Jones
  Cc: Sebastian Reichel, Dmitry Eremin-Solenikov, David Woodhouse,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-pm-u79uwXL29TY76Z2rM5mHXA, Samuel Ortiz,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	devicetree-u79uwXL29TY76Z2rM5mHXA, Kyungmin Park,
	Marek Szyprowski, Bartlomiej Zolnierkiewicz
In-Reply-To: <20141021101705.GF26842@x1>

On wto, 2014-10-21 at 11:17 +0100, Lee Jones wrote:
> On Mon, 20 Oct 2014, Krzysztof Kozlowski wrote:
> 
> > On pon, 2014-10-20 at 16:06 +0100, Lee Jones wrote:
> > > On Mon, 20 Oct 2014, Krzysztof Kozlowski wrote:
> > > 
> > > > Add a "maxim,max77693-charger" of_compatible to the mfd_cell so the MFD
> > > > child device (the charger) will have its own of_node set. This will be
> > > > used by the max77693 charger driver in next patches to obtain battery
> > > > configuration from DTS.
> > > > 
> > > > Signed-off-by: Krzysztof Kozlowski <k.kozlowski-Sze3O3UU22JBDgjK7y7TUQ@public.gmane.org>
> > > > ---
> > > >  drivers/mfd/max77693.c | 5 ++++-
> > > >  1 file changed, 4 insertions(+), 1 deletion(-)
> > > > 
> > > > diff --git a/drivers/mfd/max77693.c b/drivers/mfd/max77693.c
> > > > index cf008f45968c..2277a11b6629 100644
> > > > --- a/drivers/mfd/max77693.c
> > > > +++ b/drivers/mfd/max77693.c
> > > > @@ -43,7 +43,10 @@
> > > >  
> > > >  static const struct mfd_cell max77693_devs[] = {
> > > >  	{ .name = "max77693-pmic", },
> > > > -	{ .name = "max77693-charger", },
> > > > +	{
> > > > +		.name = "max77693-charger",
> > > > +		.of_compatible = "maxim,max77693-charger",
> > > > +	},
> > > >  	{ .name = "max77693-muic", },
> > > >  	{ .name = "max77693-haptic", },
> > > >  	{
> > > 
> > > I'm guessing this can be applied separately?
> > 
> > Yes. Patch 3/5 also (it adds necessary defines and symbols for
> > chargers).
> 
> I'll take this patch, but I think 3/5 is required by some of the other
> patches in the set, no?

Yes, it is required by 4/5 (charger driver).

Best regards,
Krzysztof

^ permalink raw reply

* Re: [PATCH v2 2/5] mfd: max77693: Map charger device to its own of_node
From: Lee Jones @ 2014-10-21 10:17 UTC (permalink / raw)
  To: Krzysztof Kozlowski
  Cc: Sebastian Reichel, Dmitry Eremin-Solenikov, David Woodhouse,
	linux-kernel, linux-pm, Samuel Ortiz, linux-api, devicetree,
	Kyungmin Park, Marek Szyprowski, Bartlomiej Zolnierkiewicz
In-Reply-To: <1413808489-3524-3-git-send-email-k.kozlowski@samsung.com>

On Mon, 20 Oct 2014, Krzysztof Kozlowski wrote:

> Add a "maxim,max77693-charger" of_compatible to the mfd_cell so the MFD
> child device (the charger) will have its own of_node set. This will be
> used by the max77693 charger driver in next patches to obtain battery
> configuration from DTS.
> 
> Signed-off-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
> ---
>  drivers/mfd/max77693.c | 5 ++++-
>  1 file changed, 4 insertions(+), 1 deletion(-)

Applied, thanks.

> diff --git a/drivers/mfd/max77693.c b/drivers/mfd/max77693.c
> index cf008f45968c..2277a11b6629 100644
> --- a/drivers/mfd/max77693.c
> +++ b/drivers/mfd/max77693.c
> @@ -43,7 +43,10 @@
>  
>  static const struct mfd_cell max77693_devs[] = {
>  	{ .name = "max77693-pmic", },
> -	{ .name = "max77693-charger", },
> +	{
> +		.name = "max77693-charger",
> +		.of_compatible = "maxim,max77693-charger",
> +	},
>  	{ .name = "max77693-muic", },
>  	{ .name = "max77693-haptic", },
>  	{

-- 
Lee Jones
Linaro STMicroelectronics Landing Team Lead
Linaro.org │ Open source software for ARM SoCs
Follow Linaro: Facebook | Twitter | Blog

^ permalink raw reply

* Re: [PATCH v2 2/5] mfd: max77693: Map charger device to its own of_node
From: Lee Jones @ 2014-10-21 10:17 UTC (permalink / raw)
  To: Krzysztof Kozlowski
  Cc: Sebastian Reichel, Dmitry Eremin-Solenikov, David Woodhouse,
	linux-kernel, linux-pm, Samuel Ortiz, linux-api, devicetree,
	Kyungmin Park, Marek Szyprowski, Bartlomiej Zolnierkiewicz
In-Reply-To: <1413817781.11298.0.camel@AMDC1943>

On Mon, 20 Oct 2014, Krzysztof Kozlowski wrote:

> On pon, 2014-10-20 at 16:06 +0100, Lee Jones wrote:
> > On Mon, 20 Oct 2014, Krzysztof Kozlowski wrote:
> > 
> > > Add a "maxim,max77693-charger" of_compatible to the mfd_cell so the MFD
> > > child device (the charger) will have its own of_node set. This will be
> > > used by the max77693 charger driver in next patches to obtain battery
> > > configuration from DTS.
> > > 
> > > Signed-off-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
> > > ---
> > >  drivers/mfd/max77693.c | 5 ++++-
> > >  1 file changed, 4 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/drivers/mfd/max77693.c b/drivers/mfd/max77693.c
> > > index cf008f45968c..2277a11b6629 100644
> > > --- a/drivers/mfd/max77693.c
> > > +++ b/drivers/mfd/max77693.c
> > > @@ -43,7 +43,10 @@
> > >  
> > >  static const struct mfd_cell max77693_devs[] = {
> > >  	{ .name = "max77693-pmic", },
> > > -	{ .name = "max77693-charger", },
> > > +	{
> > > +		.name = "max77693-charger",
> > > +		.of_compatible = "maxim,max77693-charger",
> > > +	},
> > >  	{ .name = "max77693-muic", },
> > >  	{ .name = "max77693-haptic", },
> > >  	{
> > 
> > I'm guessing this can be applied separately?
> 
> Yes. Patch 3/5 also (it adds necessary defines and symbols for
> chargers).

I'll take this patch, but I think 3/5 is required by some of the other
patches in the set, no?

-- 
Lee Jones
Linaro STMicroelectronics Landing Team Lead
Linaro.org │ Open source software for ARM SoCs
Follow Linaro: Facebook | Twitter | Blog

^ permalink raw reply

* Re: [PATCH] staging: android: binder: move to the "real" part of the kernel
From: Pavel Machek @ 2014-10-21 10:01 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: devel, Anup Patel, Linux API, lkml, Arve Hjønnevåg,
	John Stultz, Michael Kerrisk (man-pages), Rebecca Schultz Zavin,
	Santosh Shilimkar, Sumit Semwal, Christoffer Dall
In-Reply-To: <20141016231457.GB13592@kroah.com>

On Fri 2014-10-17 01:14:57, Greg Kroah-Hartman wrote:
> On Thu, Oct 16, 2014 at 04:18:02PM +0200, Michael Kerrisk (man-pages) wrote:
> > On Thu, Oct 16, 2014 at 2:47 PM, Greg Kroah-Hartman
> > <gregkh@linuxfoundation.org> wrote:
> > > From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> > >
> > > The Android binder code has been "stable" for many years now.  No matter
> > > what comes in the future, we are going to have to support this API, so
> > > might as well move it to the "real" part of the kernel as there's no
> > > real work that needs to be done to the existing code.
> > 
> > Where does one find the canonical documentation of the user-space API?
> 
> There really is only one "canonical" thing, and that is in the libbinder
> code in the Android userspace repository.  And it's not really
> "documentation" so much as, "a C file that interacts with the ioctls in
> the binder kernel code" :(
> 
> Think of this as just a random character driver with some funny ioctls
> that will never get really documented as there is only one user of it.

This is not random character driver, it is communication mechanism. It
should _not_ be a character driver.

And it should really be documented.
									Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply

* Re: [PATCH 2/2] fs: Support compiling out sendfile
From: Josh Triplett @ 2014-10-21  9:50 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Pieter Smith, Alexander Viro, Andrew Morton, Eric Paris,
	Matt Turner, Michal Hocko, Paul E. McKenney, Fabian Frederick,
	Tejun Heo, ?????????, Luis R. Rodriguez, Peter Foley,
	Konstantin Khlebnikov, Eric W. Biederman, H. Peter Anvin,
	Oleg Nesterov, Andy Lutomirski, David Herrmann, Kees Cook,
	linux-fsdevel, open list, open list:ABI/API
In-Reply-To: <20141021091356.GA26330@infradead.org>

On Tue, Oct 21, 2014 at 02:13:56AM -0700, Christoph Hellwig wrote:
> On Tue, Oct 21, 2014 at 02:04:22AM -0700, Josh Triplett wrote:
> > That's the plan, but since sendfile depends on some of the splice bits,
> > sendfile needs to be optional as well; SENDFILE_SYSCALL will then select
> > SPLICE_SYSCALLS.
> 
> Just include sendfile with the splice syscalls - we don't really need a
> config option for every obscure syscall.

No objection here.  Pieter, since you're planning to remove splice
anyway, can you just fold the two together under the same Kconfig
option?  That should simplify the patch series, since you won't need to
split the two.

- Josh Triplett

^ permalink raw reply

* Re: [PATCH 2/2] fs: Support compiling out sendfile
From: Christoph Hellwig @ 2014-10-21  9:13 UTC (permalink / raw)
  To: Josh Triplett
  Cc: Christoph Hellwig, Pieter Smith, Alexander Viro, Andrew Morton,
	Eric Paris, Matt Turner, Michal Hocko, Paul E. McKenney,
	Fabian Frederick, Tejun Heo, ?????????, Luis R. Rodriguez,
	Peter Foley, Konstantin Khlebnikov, Eric W. Biederman,
	H. Peter Anvin, Oleg Nesterov, Andy Lutomirski, David Herrmann,
	Kees Cook, linux-fsdevel, open list, open list:ABI/API
In-Reply-To: <20141021090421.GB17604@thin>

On Tue, Oct 21, 2014 at 02:04:22AM -0700, Josh Triplett wrote:
> That's the plan, but since sendfile depends on some of the splice bits,
> sendfile needs to be optional as well; SENDFILE_SYSCALL will then select
> SPLICE_SYSCALLS.

Just include sendfile with the splice syscalls - we don't really need a
config option for every obscure syscall.

^ permalink raw reply

* Re: [PATCH 2/2] fs: Support compiling out sendfile
From: Josh Triplett @ 2014-10-21  9:04 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Pieter Smith, Alexander Viro, Andrew Morton, Eric Paris,
	Matt Turner, Michal Hocko, Paul E. McKenney, Fabian Frederick,
	Tejun Heo, ?????????, Luis R. Rodriguez, Peter Foley,
	Konstantin Khlebnikov, Eric W. Biederman, H. Peter Anvin,
	Oleg Nesterov, Andy Lutomirski, David Herrmann, Kees Cook,
	linux-fsdevel, open list, open list:ABI/API
In-Reply-To: <20141021075154.GB10020@infradead.org>

On Tue, Oct 21, 2014 at 12:51:54AM -0700, Christoph Hellwig wrote:
> On Mon, Oct 20, 2014 at 03:24:22PM -0700, josh@joshtriplett.org wrote:
> > On Mon, Oct 20, 2014 at 11:48:37PM +0200, Pieter Smith wrote:
> > > Many embedded systems will not need this syscall, and omitting it
> > > saves space.  Add a new EXPERT config option CONFIG_SENDFILE_SYSCALL
> > > (default y) to support compiling it out.
> > 
> > Nice work, thanks!
> > 
> > If there are no objections, and nobody has a tree they'd rather carry
> > this through, I'll take the series through the tiny tree when it's ready
> > to merge.
> 
> I think it's rather pointless - there is very little sendfile code,
> so you'd rather want to disable splice.

That's the plan, but since sendfile depends on some of the splice bits,
sendfile needs to be optional as well; SENDFILE_SYSCALL will then select
SPLICE_SYSCALLS.

- Josh Triplett

^ permalink raw reply

* Re: [PATCH 2/2] fs: Support compiling out sendfile
From: Christoph Hellwig @ 2014-10-21  7:51 UTC (permalink / raw)
  To: josh-iaAMLnmF4UmaiuxdJuQwMA
  Cc: Pieter Smith, Alexander Viro, Andrew Morton, Eric Paris,
	Matt Turner, Michal Hocko, Paul E. McKenney, Fabian Frederick,
	Tejun Heo, ?????????, Luis R. Rodriguez, Peter Foley,
	Konstantin Khlebnikov, Eric W. Biederman, H. Peter Anvin,
	Oleg Nesterov, Andy Lutomirski, David Herrmann, Kees Cook,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, open list,
	open list:ABI/API
In-Reply-To: <20141020222421.GE8929@cloud>

On Mon, Oct 20, 2014 at 03:24:22PM -0700, josh-iaAMLnmF4UmaiuxdJuQwMA@public.gmane.org wrote:
> On Mon, Oct 20, 2014 at 11:48:37PM +0200, Pieter Smith wrote:
> > Many embedded systems will not need this syscall, and omitting it
> > saves space.  Add a new EXPERT config option CONFIG_SENDFILE_SYSCALL
> > (default y) to support compiling it out.
> 
> Nice work, thanks!
> 
> If there are no objections, and nobody has a tree they'd rather carry
> this through, I'll take the series through the tiny tree when it's ready
> to merge.

I think it's rather pointless - there is very little sendfile code,
so you'd rather want to disable splice.

^ permalink raw reply

* Re: preadv2/pwritev2 rename
From: Michael Kerrisk (man-pages) @ 2014-10-21  6:53 UTC (permalink / raw)
  To: Milosz Tanski; +Cc: Christoph Hellwig, Jeff Moyer, Linux API
In-Reply-To: <CAKgNAkjwJ6SH_j_Op+u+p4-0mN-cBVE24Wrt4SyhAiG1MMF9tA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Tue, Oct 21, 2014 at 8:42 AM, Michael Kerrisk (man-pages)
<mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> Hello Milosz,
>
> On Mon, Oct 20, 2014 at 11:52 PM, Milosz Tanski <milosz-B5zB6C1i6pkAvxtiuMwx3w@public.gmane.org> wrote:
>> Christoph and/or Jeff,
>>
>> I updated the patch for 3.18-rc1 and I'm going to resend it as non-RFC
>> as I didn't get comments last time.
>>
>> I only have one stupid question... I'm going to rename the calls to
>> preadv6 and pwritev6 (so it's more like the other syscalls: dup3,
>> accept4, eventfd2) but I'm not sure if i should call it preadv5 or
>> pwritev6 since the offset argument is split into two different
>> arguments (upper and lower part).
>
> It's points like this that show exactly why naming system calls after
> the number of their arguments is a very bad idea[1]. Please don't do
> it. pwritev2() and preadv2() are not pretty either, but are marginally
> better. pwritev_fl() and preadv_fl() (or simialr) might also be okay,
> I guess.
>
>> Also, In our application we were able to get about 20%-30% reduction
>> in response time when using this before queuing in a IO thread pool on
>> the read path. It's a pretty nice win in the real world.
>
> Cheers,
>
> Michael
>
> http://blog.man7.org/2014/02/system-call-naming-and-numbering.html

Also, please ensure that future iterations of these patches CC
linux-api@ as per Documentation/SubmitChecklist. The past ones did
not, and so this is the first mention of these system calls that I
happened to see.

Cheers,

Michael
-- 
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/

^ permalink raw reply

* Re: preadv2/pwritev2 rename
From: Michael Kerrisk (man-pages) @ 2014-10-21  6:42 UTC (permalink / raw)
  To: Milosz Tanski; +Cc: Christoph Hellwig, Jeff Moyer, Linux API
In-Reply-To: <CANP1eJFfGE53PTGR4XYGmr=HBLLdOByJj+GbkQB6+JSdrgtvFw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

Hello Milosz,

On Mon, Oct 20, 2014 at 11:52 PM, Milosz Tanski <milosz-B5zB6C1i6pkAvxtiuMwx3w@public.gmane.org> wrote:
> Christoph and/or Jeff,
>
> I updated the patch for 3.18-rc1 and I'm going to resend it as non-RFC
> as I didn't get comments last time.
>
> I only have one stupid question... I'm going to rename the calls to
> preadv6 and pwritev6 (so it's more like the other syscalls: dup3,
> accept4, eventfd2) but I'm not sure if i should call it preadv5 or
> pwritev6 since the offset argument is split into two different
> arguments (upper and lower part).

It's points like this that show exactly why naming system calls after
the number of their arguments is a very bad idea[1]. Please don't do
it. pwritev2() and preadv2() are not pretty either, but are marginally
better. pwritev_fl() and preadv_fl() (or simialr) might also be okay,
I guess.

> Also, In our application we were able to get about 20%-30% reduction
> in response time when using this before queuing in a IO thread pool on
> the read path. It's a pretty nice win in the real world.

Cheers,

Michael

http://blog.man7.org/2014/02/system-call-naming-and-numbering.html

^ permalink raw reply

* Re: [PATCHv1 7/8] cgroup: cgroup namespace setns support
From: Andy Lutomirski @ 2014-10-21  5:49 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Linux API, Linux Containers, Serge Hallyn,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Ingo Molnar,
	Tejun Heo, cgroups-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <87lhoayo59.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>

On Mon, Oct 20, 2014 at 10:42 PM, Eric W. Biederman
<ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org> wrote:
> Andy Lutomirski <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org> writes:
>
>> On Mon, Oct 20, 2014 at 9:49 PM, Eric W. Biederman
>> <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org> wrote:
>>> Andy Lutomirski <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org> writes:
>>>> Possible solution:
>>>>
>>>> Ditch the pinning.  That is, if you're outside a cgroupns (or you have
>>>> a non-ns-confined cgroupfs mounted), then you can move a task in a
>>>> cgroupns outside of its root cgroup.  If you do this, then the task
>>>> thinks its cgroup is something like "../foo" or "../../foo".
>>>
>>> Of the possible solutions that seems attractive to me, simply because
>>> we sometimes want to allow clever things to occur.
>>>
>>> Does anyone know of a reason (beyond pretty printing) why we need
>>> cgroupns to restrict the subset of cgroups processes can be in?
>>>
>>> I would expect permissions on the cgroup directories themselves, and
>>> limited visiblilty would be (in general) to achieve the desired
>>> visiblity.
>>
>> This makes the security impact of cgroupns very easy to understand,
>> right?  Because there really won't be any -- cgroupns only affects
>> reads from /proc and what cgroupfs shows, but it doesn't change any
>> actual cgroups, nor does it affect any cgroup *changes*.
>
> It seems like what we have described is chcgrouproot aka chroot for
> cgroups.  At which point I think there are potentially similar security
> issues as for chroot.  Can we confuse a setuid root process if we make
> it's cgroup names look different.
>
> Of course the confusing root concern is handled by the usual namespace
> security checks that are already present.

I think that the chroot issues are mostly in two categories: setuid
confusion (not an issue here as you described) and chroot escapes.
cgroupns escapes aren't a big deal, I think -- admins should deny the
confined task the right to write to cgroupfs outside its hierarchy, by
setting cgroupfs permissions appropriately and/or avoiding mounting
cgroupfs outside the hierarchy.

>
> I do wonder if we think of this as chcgrouproot if there is a simpler
> implementation.

Could be.  I'll defer to Aditya for that one.

>
>>>> While we're at it, consider making setns for a cgroupns *not* change
>>>> the caller's cgroup.  Is there any reason it really needs to?
>>>
>>> setns doesn't but nsenter is going to need to change the cgroup
>>> if the pinning requirement is kept.  nsenenter is going to want to
>>> change the cgroup if the pinning requirement is dropped.
>>>
>>
>> It seems easy enough for nsenter to change the cgroup all by itself.
>
> Again.  I don't think anyone has suggested or implemented anything
> different.

The current patchset seems to punt on this decision by just failing
the setns call if the caller is outside the cgroup in question.

--Andy

^ permalink raw reply

* Re: [PATCHv1 7/8] cgroup: cgroup namespace setns support
From: Eric W. Biederman @ 2014-10-21  5:42 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Serge E. Hallyn, Aditya Kali, Linux API, Linux Containers,
	Serge Hallyn, linux-kernel@vger.kernel.org, Tejun Heo,
	cgroups-u79uwXL29TY76Z2rM5mHXA, Ingo Molnar
In-Reply-To: <CALCETrVkMtsnEh57jFZrdx5vHbz97BdO7OuupT+xVNnWpJjxng-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

Andy Lutomirski <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org> writes:

> On Mon, Oct 20, 2014 at 9:49 PM, Eric W. Biederman
> <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org> wrote:
>> Andy Lutomirski <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org> writes:
>>
>>> On Sun, Oct 19, 2014 at 9:55 PM, Eric W.Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org> wrote:
>>>>
>>>>
>>>> On October 19, 2014 1:26:29 PM CDT, Andy Lutomirski <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org> wrote:
>>
>>>>> Is the idea
>>>>>that you want a privileged user wrt a cgroupns's userns to be able to
>>>>>use this?  If so:
>>>>>
>>>>>Yes, that current_cred() thing is bogus.  (Actually, this is probably
>>>>>exploitable right now if any cgroup.procs inode anywhere on the system
>>>>>lets non-root write.)  (Can we have some kernel debugging option that
>>>>>makes any use of current_cred() in write(2) warn?)
>>>>>
>>>>>We really need a weaker version of may_ptrace for this kind of stuff.
>>>>>Maybe the existing may_ptrace stuff is okay, actually.  But this is
>>>>>completely missing group checks, cap checks, capabilities wrt the
>>>>>userns, etc.
>>>>>
>>>>>Also, I think that, if this version of the patchset allows non-init
>>>>>userns to unshare cgroupns, then the issue of what permission is
>>>>>needed to lock the cgroup hierarchy like that needs to be addressed,
>>>>>because unshare(CLONE_NEWUSER|CLONE_NEWCGROUP) will effectively pin
>>>>>the calling task with no permission required.  Bolting on a fix later
>>>>>will be a mess.
>>>>
>>>> I imagine the pinning would be like the userns.
>>>>
>>>> Ah but there is a potentially serious issue with the pinning.
>>>> With pinning we can make it impossible for root to move us to a different cgroup.
>>>>
>>>> I am not certain how serious that is but it bears thinking about.
>>>> If we don't implement pinning we should be able to implent everything with just filesystem mount options, and no new namespace required.
>>>>
>>>> Sigh.
>>>>
>>>> I am too tired tonight to see the end game in this.
>>>
>>> Possible solution:
>>>
>>> Ditch the pinning.  That is, if you're outside a cgroupns (or you have
>>> a non-ns-confined cgroupfs mounted), then you can move a task in a
>>> cgroupns outside of its root cgroup.  If you do this, then the task
>>> thinks its cgroup is something like "../foo" or "../../foo".
>>
>> Of the possible solutions that seems attractive to me, simply because
>> we sometimes want to allow clever things to occur.
>>
>> Does anyone know of a reason (beyond pretty printing) why we need
>> cgroupns to restrict the subset of cgroups processes can be in?
>>
>> I would expect permissions on the cgroup directories themselves, and
>> limited visiblilty would be (in general) to achieve the desired
>> visiblity.
>
> This makes the security impact of cgroupns very easy to understand,
> right?  Because there really won't be any -- cgroupns only affects
> reads from /proc and what cgroupfs shows, but it doesn't change any
> actual cgroups, nor does it affect any cgroup *changes*.

It seems like what we have described is chcgrouproot aka chroot for
cgroups.  At which point I think there are potentially similar security
issues as for chroot.  Can we confuse a setuid root process if we make
it's cgroup names look different.

Of course the confusing root concern is handled by the usual namespace
security checks that are already present.

I do wonder if we think of this as chcgrouproot if there is a simpler
implementation.

>>> While we're at it, consider making setns for a cgroupns *not* change
>>> the caller's cgroup.  Is there any reason it really needs to?
>>
>> setns doesn't but nsenter is going to need to change the cgroup
>> if the pinning requirement is kept.  nsenenter is going to want to
>> change the cgroup if the pinning requirement is dropped.
>>
>
> It seems easy enough for nsenter to change the cgroup all by itself.

Again.  I don't think anyone has suggested or implemented anything
different.

Eric

^ permalink raw reply

* Re: [PATCHv1 7/8] cgroup: cgroup namespace setns support
From: Andy Lutomirski @ 2014-10-21  5:03 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Linux API, Linux Containers, Serge Hallyn,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Ingo Molnar,
	Tejun Heo, cgroups-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <87zjcq10ya.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>

On Mon, Oct 20, 2014 at 9:49 PM, Eric W. Biederman
<ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org> wrote:
> Andy Lutomirski <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org> writes:
>
>> On Sun, Oct 19, 2014 at 9:55 PM, Eric W.Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org> wrote:
>>>
>>>
>>> On October 19, 2014 1:26:29 PM CDT, Andy Lutomirski <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org> wrote:
>
>>>> Is the idea
>>>>that you want a privileged user wrt a cgroupns's userns to be able to
>>>>use this?  If so:
>>>>
>>>>Yes, that current_cred() thing is bogus.  (Actually, this is probably
>>>>exploitable right now if any cgroup.procs inode anywhere on the system
>>>>lets non-root write.)  (Can we have some kernel debugging option that
>>>>makes any use of current_cred() in write(2) warn?)
>>>>
>>>>We really need a weaker version of may_ptrace for this kind of stuff.
>>>>Maybe the existing may_ptrace stuff is okay, actually.  But this is
>>>>completely missing group checks, cap checks, capabilities wrt the
>>>>userns, etc.
>>>>
>>>>Also, I think that, if this version of the patchset allows non-init
>>>>userns to unshare cgroupns, then the issue of what permission is
>>>>needed to lock the cgroup hierarchy like that needs to be addressed,
>>>>because unshare(CLONE_NEWUSER|CLONE_NEWCGROUP) will effectively pin
>>>>the calling task with no permission required.  Bolting on a fix later
>>>>will be a mess.
>>>
>>> I imagine the pinning would be like the userns.
>>>
>>> Ah but there is a potentially serious issue with the pinning.
>>> With pinning we can make it impossible for root to move us to a different cgroup.
>>>
>>> I am not certain how serious that is but it bears thinking about.
>>> If we don't implement pinning we should be able to implent everything with just filesystem mount options, and no new namespace required.
>>>
>>> Sigh.
>>>
>>> I am too tired tonight to see the end game in this.
>>
>> Possible solution:
>>
>> Ditch the pinning.  That is, if you're outside a cgroupns (or you have
>> a non-ns-confined cgroupfs mounted), then you can move a task in a
>> cgroupns outside of its root cgroup.  If you do this, then the task
>> thinks its cgroup is something like "../foo" or "../../foo".
>
> Of the possible solutions that seems attractive to me, simply because
> we sometimes want to allow clever things to occur.
>
> Does anyone know of a reason (beyond pretty printing) why we need
> cgroupns to restrict the subset of cgroups processes can be in?
>
> I would expect permissions on the cgroup directories themselves, and
> limited visiblilty would be (in general) to achieve the desired
> visiblity.

This makes the security impact of cgroupns very easy to understand,
right?  Because there really won't be any -- cgroupns only affects
reads from /proc and what cgroupfs shows, but it doesn't change any
actual cgroups, nor does it affect any cgroup *changes*.

>
>> While we're at it, consider making setns for a cgroupns *not* change
>> the caller's cgroup.  Is there any reason it really needs to?
>
> setns doesn't but nsenter is going to need to change the cgroup
> if the pinning requirement is kept.  nsenenter is going to want to
> change the cgroup if the pinning requirement is dropped.
>

It seems easy enough for nsenter to change the cgroup all by itself.

--Andy

^ permalink raw reply

* Re: [PATCHv1 7/8] cgroup: cgroup namespace setns support
From: Eric W. Biederman @ 2014-10-21  4:49 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Serge E. Hallyn, Aditya Kali, Linux API, Linux Containers,
	Serge Hallyn, linux-kernel@vger.kernel.org, Tejun Heo,
	cgroups-u79uwXL29TY76Z2rM5mHXA, Ingo Molnar
In-Reply-To: <CALCETrXhGnBM_xx=Auz3WRQXkqhGGTWuZN=PU+A9HZ7Ek27FLA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

Andy Lutomirski <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org> writes:

> On Sun, Oct 19, 2014 at 9:55 PM, Eric W.Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org> wrote:
>>
>>
>> On October 19, 2014 1:26:29 PM CDT, Andy Lutomirski <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org> wrote:

>>> Is the idea
>>>that you want a privileged user wrt a cgroupns's userns to be able to
>>>use this?  If so:
>>>
>>>Yes, that current_cred() thing is bogus.  (Actually, this is probably
>>>exploitable right now if any cgroup.procs inode anywhere on the system
>>>lets non-root write.)  (Can we have some kernel debugging option that
>>>makes any use of current_cred() in write(2) warn?)
>>>
>>>We really need a weaker version of may_ptrace for this kind of stuff.
>>>Maybe the existing may_ptrace stuff is okay, actually.  But this is
>>>completely missing group checks, cap checks, capabilities wrt the
>>>userns, etc.
>>>
>>>Also, I think that, if this version of the patchset allows non-init
>>>userns to unshare cgroupns, then the issue of what permission is
>>>needed to lock the cgroup hierarchy like that needs to be addressed,
>>>because unshare(CLONE_NEWUSER|CLONE_NEWCGROUP) will effectively pin
>>>the calling task with no permission required.  Bolting on a fix later
>>>will be a mess.
>>
>> I imagine the pinning would be like the userns.
>>
>> Ah but there is a potentially serious issue with the pinning.
>> With pinning we can make it impossible for root to move us to a different cgroup.
>>
>> I am not certain how serious that is but it bears thinking about.
>> If we don't implement pinning we should be able to implent everything with just filesystem mount options, and no new namespace required.
>>
>> Sigh.
>>
>> I am too tired tonight to see the end game in this.
>
> Possible solution:
>
> Ditch the pinning.  That is, if you're outside a cgroupns (or you have
> a non-ns-confined cgroupfs mounted), then you can move a task in a
> cgroupns outside of its root cgroup.  If you do this, then the task
> thinks its cgroup is something like "../foo" or "../../foo".

Of the possible solutions that seems attractive to me, simply because
we sometimes want to allow clever things to occur.

Does anyone know of a reason (beyond pretty printing) why we need
cgroupns to restrict the subset of cgroups processes can be in?

I would expect permissions on the cgroup directories themselves, and
limited visiblilty would be (in general) to achieve the desired
visiblity.

> While we're at it, consider making setns for a cgroupns *not* change
> the caller's cgroup.  Is there any reason it really needs to?

setns doesn't but nsenter is going to need to change the cgroup 
if the pinning requirement is kept.  nsenenter is going to want to
change the cgroup if the pinning requirement is dropped.

Eric

^ permalink raw reply

* Re: [PATCHv4 RESEND 0/3] syscalls,x86: Add execveat() system call
From: Eric W. Biederman @ 2014-10-21  4:29 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: David Drysdale, Alexander Viro, Meredydd Luff,
	linux-kernel@vger.kernel.org, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, Andrew Morton, Kees Cook, Arnd Bergmann, X86 ML,
	linux-arch, Linux API
In-Reply-To: <CALCETrXBjLZTWVJfcsE4NA-JP9zSSgn=6ND0=cZ9BTy=CoN7pA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

Andy Lutomirski <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org> writes:

> On Mon, Oct 20, 2014 at 6:48 AM, David Drysdale <drysdale-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:
>> On Sun, Oct 19, 2014 at 1:20 AM, Eric W. Biederman
>> <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org> wrote:
>>> Andy Lutomirski <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org> writes:
>>>
>>>> [Added Eric Biederman, since I think your tree might be a reasonable
>>>> route forward for these patches.]
>>>>
>>>> On Thu, Jun 5, 2014 at 6:40 AM, David Drysdale <drysdale-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:
>>>>> Resending, adding cc:linux-api.
>>>>>
>>>>> Also, it may help to add a little more background -- this patch is
>>>>> needed as a (small) part of implementing Capsicum in the Linux kernel.
>>>>>
>>>>> Capsicum is a security framework that has been present in FreeBSD since
>>>>> version 9.0 (Jan 2012), and is based on concepts from object-capability
>>>>> security [1].
>>>>>
>>>>> One of the features of Capsicum is capability mode, which locks down
>>>>> access to global namespaces such as the filesystem hierarchy.  In
>>>>> capability mode, /proc is thus inaccessible and so fexecve(3) doesn't
>>>>> work -- hence the need for a kernel-space
>>>>
>>>> I just found myself wanting this syscall for another reason: injecting
>>>> programs into sandboxes or otherwise heavily locked-down namespaces.
>>>>
>>>> For example, I want to be able to reliably do something like nsenter
>>>> --namespace-flags-here toybox sh.  Toybox's shell is unusual in that
>>>> it is more or less fully functional, so this should Just Work (tm),
>>>> except that the toybox binary might not exist in the namespace being
>>>> entered.  If execveat were available, I could rig nsenter or a similar
>>>> tool to open it with O_CLOEXEC, enter the namespace, and then call
>>>> execveat.
>>>>
>>>> Is there any reason that these patches can't be merged more or less as
>>>> is for 3.19?
>>>
>>> Yes.  There is a silliness in how it implements fexecve.  The fexecve
>>> case should be use the empty string "" not a NULL pointer to indication
>>> that.  That change will then harmonize execveat with the other ...at
>>> system calls and simplify the code and remove a special case.  I believe
>>> using the empty string "" requires implementing the AT_EMPTY_PATH flag.
>>
>> Good point -- I'll shift to "" + AT_EMPTY_PATH.
>
> Pending a better idea, I would also see if the patches can be changed
> to return an error if d_path ends up with an "(unreachable)" thing
> rather than failing inexplicably later on.

For my reference we are talking about  

> @@ -1489,7 +1524,21 @@ static int do_execve_common(struct filename *filename,
>  	sched_exec();
> 
>  	bprm->file = file;
> -	bprm->filename = bprm->interp = filename->name;
> +	if (filename && fd == AT_FDCWD) {
> +		bprm->filename = filename->name;
> +	} else {
> +		pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
> +		if (!pathbuf) {
> +			retval = -ENOMEM;
> +			goto out_unmark;
> +		}
> +		bprm->filename = d_path(&file->f_path, pathbuf, PATH_MAX);
> +		if (IS_ERR(bprm->filename)) {
> +			retval = PTR_ERR(bprm->filename);
> +			goto out_unmark;
> +		}
> +	}
> +	bprm->interp = bprm->filename;
> 
>  	retval = bprm_mm_init(bprm);
>  	if (retval)

The interesting case for fexecve is when we either don't know what files
are present or we don't want to depend on which files are present.

As Al pointed out d_path really isn't the right solution.  It fails when
printing /proc/self/fd/${fd}/${filename->name} would work, and the
"(deleted)" or "(unreachable)" strings are wrong.

The test for today's cases should be:
if ((filename->name[0] == '/') || fd == AT_FDCWD) {
	bprm->filename = filename->name;
} 

To handle the case where the file descriptor is relevant.

For the case where the file descriptor is relevant let me suggest
setting bprm->filename and bprm->interp to:

/dev/fd/${fd}/${filename->name}

It is more a description of what we have done but as a magic string it
is descriptive.  Documetation/devices.txt documents that /dev/fd/ should
exist, making it an unambiguous path.  Further these days the kernel
sets the device naming policy in dev, so I think we are strongly safe in
using that path in any event.

I think execveat is interesting in the kernel because the motivating
cases are the cases where anything except a static executable is
uninteresting.

Now it has been suggested creating a dupfs or a mini-proc.  I think that
sounds like a nice companion, to the concept of a locked down root.
But I don't think it removes the need for execveat (because we still
have the case where we don't want to care what is mounted, and are happy
to use static executables). 

Eric

^ permalink raw reply

* [GIT PULL] core drm support for Rockchip SoCs
From: Mark yao @ 2014-10-21  0:43 UTC (permalink / raw)
  To: heiko, Boris BREZILLON, David Airlie, Rob Clark, Daniel Vetter,
	Rob Herring, Pawel Moll, Mark Rutland, Ian Campbell, Kumar Gala,
	Randy Dunlap, Grant Likely, Greg Kroah-Hartman, John Stultz,
	Rom Lemarchand
  Cc: devicetree, linux-doc, linux-kernel, dri-devel, linux-api,
	linux-rockchip, dianders, marcheu, dbehr, olof, djkurtz, xjq, kfx,
	cym, cf, zyw, xxm, huangtao, kever.yang, yxj, wxt, xw, Mark Yao
In-Reply-To: <1412763781-31533-1-git-send-email-mark.yao@rock-chips.com>

Hi, Dave

The following changes since commit bfe01a5ba2490f299e1d2d5508cbbbadd897bbe9:

    Linux 3.17 (2014-10-05 12:23:04 -0700)

are available in the git repository at:

    https://github.com/markyzq/kernel-drm-rockchip.git  drmrockchip

for you to fetch changes up to 45bb5f4e7e82b30e9e7069c73441413680c9a59f:

    dt-bindings: video: Add documentation for rockchip vop (2014-10-17
16:39:31 +0800)

----------------------------------------------------------------
Mark yao (3):
        drm: rockchip: Add basic drm driver
        dt-bindings: video: Add for rockchip display subsytem
        dt-bindings: video: Add documentation for rockchip vop

   .../devicetree/bindings/video/rockchip-drm.txt     |   19 +
   .../devicetree/bindings/video/rockchip-vop.txt     |   58 +
   drivers/gpu/drm/Kconfig                            |    2 +
   drivers/gpu/drm/Makefile                           |    1 +
   drivers/gpu/drm/rockchip/Kconfig                   |   17 +
   drivers/gpu/drm/rockchip/Makefile                  |    8 +
   drivers/gpu/drm/rockchip/rockchip_drm_drv.c        |  449 ++++++
   drivers/gpu/drm/rockchip/rockchip_drm_drv.h        |   54 +
   drivers/gpu/drm/rockchip/rockchip_drm_fb.c         |  200 +++
   drivers/gpu/drm/rockchip/rockchip_drm_fb.h         |   28 +
   drivers/gpu/drm/rockchip/rockchip_drm_fbdev.c      |  210 +++
   drivers/gpu/drm/rockchip/rockchip_drm_fbdev.h      |   20 +
   drivers/gpu/drm/rockchip/rockchip_drm_gem.c        |  293 ++++
   drivers/gpu/drm/rockchip/rockchip_drm_gem.h        |   54 +
   drivers/gpu/drm/rockchip/rockchip_drm_vop.c        | 1427 ++++++++++++++++++++
   drivers/gpu/drm/rockchip/rockchip_drm_vop.h        |  196 +++
   16 files changed, 3036 insertions(+)
   create mode 100644 Documentation/devicetree/bindings/video/rockchip-drm.txt
   create mode 100644 Documentation/devicetree/bindings/video/rockchip-vop.txt
   create mode 100644 drivers/gpu/drm/rockchip/Kconfig
   create mode 100644 drivers/gpu/drm/rockchip/Makefile
   create mode 100644 drivers/gpu/drm/rockchip/rockchip_drm_drv.c
   create mode 100644 drivers/gpu/drm/rockchip/rockchip_drm_drv.h
   create mode 100644 drivers/gpu/drm/rockchip/rockchip_drm_fb.c
   create mode 100644 drivers/gpu/drm/rockchip/rockchip_drm_fb.h
   create mode 100644 drivers/gpu/drm/rockchip/rockchip_drm_fbdev.c
   create mode 100644 drivers/gpu/drm/rockchip/rockchip_drm_fbdev.h
   create mode 100644 drivers/gpu/drm/rockchip/rockchip_drm_gem.c
   create mode 100644 drivers/gpu/drm/rockchip/rockchip_drm_gem.h
   create mode 100644 drivers/gpu/drm/rockchip/rockchip_drm_vop.c
   create mode 100644 drivers/gpu/drm/rockchip/rockchip_drm_vop.h



^ permalink raw reply

* Re: [PATCHv1 7/8] cgroup: cgroup namespace setns support
From: Andy Lutomirski @ 2014-10-21  0:20 UTC (permalink / raw)
  To: Eric W.Biederman
  Cc: Serge E. Hallyn, Aditya Kali, Linux API, Linux Containers,
	Serge Hallyn,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Tejun Heo,
	cgroups-u79uwXL29TY76Z2rM5mHXA, Ingo Molnar
In-Reply-To: <44072106-c0f3-46b8-b2b5-9b1cbd1b7d88-2ueSQiBKiTY7tOexoI0I+QC/G2K4zDHf@public.gmane.org>

On Sun, Oct 19, 2014 at 9:55 PM, Eric W.Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org> wrote:
>
>
> On October 19, 2014 1:26:29 PM CDT, Andy Lutomirski <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org> wrote:
>>On Sat, Oct 18, 2014 at 10:23 PM, Eric W. Biederman
>><ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org> wrote:
>>> "Serge E. Hallyn" <serge-A9i7LUbDfNHQT0dZR+AlfA@public.gmane.org> writes:
>>>
>>>> Quoting Aditya Kali (adityakali-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org):
>>>>> On Thu, Oct 16, 2014 at 2:12 PM, Serge E. Hallyn <serge-A9i7LUbDfNHQT0dZR+AlfA@public.gmane.org>
>>wrote:
>>>>> > Quoting Aditya Kali (adityakali-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org):
>>>>> >> setns on a cgroup namespace is allowed only if
>>>>> >> * task has CAP_SYS_ADMIN in its current user-namespace and
>>>>> >>   over the user-namespace associated with target cgroupns.
>>>>> >> * task's current cgroup is descendent of the target
>>cgroupns-root
>>>>> >>   cgroup.
>>>>> >
>>>>> > What is the point of this?
>>>>> >
>>>>> > If I'm a user logged into
>>>>> > /lxc/c1/user.slice/user-1000.slice/session-c12.scope and I start
>>>>> > a container which is in
>>>>> > /lxc/c1/user.slice/user-1000.slice/session-c12.scope/x1
>>>>> > then I will want to be able to enter the container's cgroup.
>>>>> > The container's cgroup root is under my own (satisfying the
>>>>> > below condition0 but my cgroup is not a descendent of the
>>>>> > container's cgroup.
>>>>> >
>>>>> This condition is there because we don't want to do implicit cgroup
>>>>> changes when a process attaches to another cgroupns. cgroupns tries
>>to
>>>>> preserve the invariant that at any point, your current cgroup is
>>>>> always under the cgroupns-root of your cgroup namespace. But in
>>your
>>>>> example, if we allow a process in "session-c12.scope" container to
>>>>> attach to cgroupns root'ed at "session-c12.scope/x1" container
>>>>> (without implicitly moving its cgroup), then this invariant won't
>>>>> hold.
>>>>
>>>> Oh, I see.  Guess that should be workable.  Thanks.
>>>
>>> Which has me looking at what the rules are for moving through
>>> the cgroup hierarchy.
>>>
>>> As long as we have write access to cgroup.procs and are allowed
>>> to open the file for write, we can move any of our own tasks
>>> into the cgroup.  So the cgroup namespace rules don't seem
>>> to be a problem.
>>>
>>> Andy can you please take a look at the permission checks in
>>> __cgroup_procs_write.
>>
>>The actual requirements for calling that function haven't changed,
>>right?  IOW, what does this have to do with cgroupns?
>
> Excluding user namespaces the requirements have not changed.
>
> The immediate correlation is that to enter a cgroupns you must first put your process in one of it's cgroups.
>
> So I was examining what it would take to enter the cgroup of cgroupns.
>
>> Is the idea
>>that you want a privileged user wrt a cgroupns's userns to be able to
>>use this?  If so:
>>
>>Yes, that current_cred() thing is bogus.  (Actually, this is probably
>>exploitable right now if any cgroup.procs inode anywhere on the system
>>lets non-root write.)  (Can we have some kernel debugging option that
>>makes any use of current_cred() in write(2) warn?)
>>
>>We really need a weaker version of may_ptrace for this kind of stuff.
>>Maybe the existing may_ptrace stuff is okay, actually.  But this is
>>completely missing group checks, cap checks, capabilities wrt the
>>userns, etc.
>>
>>Also, I think that, if this version of the patchset allows non-init
>>userns to unshare cgroupns, then the issue of what permission is
>>needed to lock the cgroup hierarchy like that needs to be addressed,
>>because unshare(CLONE_NEWUSER|CLONE_NEWCGROUP) will effectively pin
>>the calling task with no permission required.  Bolting on a fix later
>>will be a mess.
>
> I imagine the pinning would be like the userns.
>
> Ah but there is a potentially serious issue with the pinning.
> With pinning we can make it impossible for root to move us to a different cgroup.
>
> I am not certain how serious that is but it bears thinking about.
> If we don't implement pinning we should be able to implent everything with just filesystem mount options, and no new namespace required.
>
> Sigh.
>
> I am too tired tonight to see the end game in this.

Possible solution:

Ditch the pinning.  That is, if you're outside a cgroupns (or you have
a non-ns-confined cgroupfs mounted), then you can move a task in a
cgroupns outside of its root cgroup.  If you do this, then the task
thinks its cgroup is something like "../foo" or "../../foo".

While we're at it, consider making setns for a cgroupns *not* change
the caller's cgroup.  Is there any reason it really needs to?

Thoughts?

--Andy

^ permalink raw reply

* Re: [PATCH] staging: android: binder: move to the "real" part of the kernel
From: Arve Hjønnevåg @ 2014-10-20 23:32 UTC (permalink / raw)
  To: Dan Carpenter
  Cc: Greg Kroah-Hartman,
	devel-gWbeCf7V1WCQmaza687I9mD2FQJk+8+b@public.gmane.org,
	Anup Patel, linux-api-u79uwXL29TY76Z2rM5mHXA, LKML, John Stultz,
	Rebecca Schultz Zavin, Santosh Shilimkar, Sumit Semwal,
	Christoffer Dall
In-Reply-To: <20141020092049.GJ23154@mwanda>

On Mon, Oct 20, 2014 at 2:20 AM, Dan Carpenter <dan.carpenter-QHcLZuEGTsthl2p70BpVqQ@public.gmane.orgm> wrote:
> On Mon, Oct 20, 2014 at 06:05:47AM +0800, Greg Kroah-Hartman wrote:
>> On Fri, Oct 17, 2014 at 12:26:01PM +0300, Dan Carpenter wrote:
>> > The code isn't very beautiful and there are lots of details wrong like
>> > the error codes.
>>
>> Really, what is wrong with the existing error code usages?
>>
>
> I guess I was mostly looking at binder_ioctl(), the rest of the code
> seems better.
>
> I feel like we went directly from "This code is never going in so there
> is no need to look at it." to "This code is going in in one week so
> there is no time to look at it."
>
> How often do people rely on proc_no_lock=1 to make this work?  People
> are saying on the internet that you don't need acurate information so
> you should disable locking as a speed up.
> http://www.programdevelop.com/1821706/.  It seems like a bad idea to
> provide provide the "run fast and crash" option.

That is not what this switch is for. It is only there to debug the
driver if it gets stuck with the lock held. I would not object to
adding a config option to remove this param by default.

>
> Why is binder_set_nice needed?  Some comments would help here.

The driver has some support for priority inheritance.

>
>    434  static void binder_set_nice(long nice)
>    435  {
>    436          long min_nice;
>    437
>    438          if (can_nice(current, nice)) {
>    439                  set_user_nice(current, nice);
>    440                  return;
>    441          }
>    442          min_nice = rlimit_to_nice(current->signal->rlim[RLIMIT_NICE].rlim_cur);
>    443          binder_debug(BINDER_DEBUG_PRIORITY_CAP,
>    444                       "%d: nice value %ld not allowed use %ld instead\n",
>    445                        current->pid, nice, min_nice);
>    446          set_user_nice(current, min_nice);
>    447          if (min_nice <= MAX_NICE)
>    448                  return;
>
> It's harmless but wierd to check if min_nice is valid after we already
> called set_user_nice().

I don't remember why I did this, but I agree it is weird.

>
>    449          binder_user_error("%d RLIMIT_NICE not set\n", current->pid);
>    450  }
>
> Random comment:
>
>    709          has_page_addr =
>    710                  (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK);
>
> The casting on "buffer->data" ugly and inconsistent.  There should be
> a function:

Other code in the kernel seems to do this the same way (although most
cast to unsigned long instead of uintptr_t). This code rounds down to
the start of of the page, and needs to cast to an integer type for
this. Adding a global kernel helper for this would be better than a
binder specific one. The existing PAGE_ALIGN function, which rounds
up, still needs casts to use with pointer types though.

> void *buffer_data(struct binder_buffer *buffer)
> {
>         return buffer.data;
> }
>
> That way this becomes:
>                 has_page_addr = (buffer_data(buffer) + buffer_size) & PAGE_MASK);

I don't think this will compile.

>
> The "has_page_addr" variable name is misleading, because we're not
> storing true/false.  We're storing the last page address.

It is the address of a page we already have mapped, not the last
address of the new allocation.

>
>    711          if (n == NULL) {
>    712                  if (size + sizeof(struct binder_buffer) + 4 >= buffer_size)
>    713                          buffer_size = size; /* no room for other buffers */
>    714                  else
>    715                          buffer_size = size + sizeof(struct binder_buffer);
>    716          }
>    717          end_page_addr =
>    718                  (void *)PAGE_ALIGN((uintptr_t)buffer->data + buffer_size);
>    719          if (end_page_addr > has_page_addr)
>    720                  end_page_addr = has_page_addr;
>    721          if (binder_update_page_range(proc, 1,
>    722              (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL))
>    723                  return NULL;
>
> The alignment here is confusing because we don't align buffer->data
> below.

binder_update_page_range allocates and maps pages. buffer->data will
point to a range within the allocated pages.

>
>    724
>    725          rb_erase(best_fit, &proc->free_buffers);
>    726          buffer->free = 0;
>    727          binder_insert_allocated_buffer(proc, buffer);
>    728          if (buffer_size != size) {
>    729                  struct binder_buffer *new_buffer = (void *)buffer->data + size;
>                                                            ^^^^^^^^^^^^^^^^^^^^
> I don't really understand when buffer->data has to be page aligned and
> when not.  This code has very few comments.
>

buffer->data never needs to be page aligned. The code above rounds to
page boundaries to allocate new pages as needed.

>    730
>    731                  list_add(&new_buffer->entry, &buffer->entry);
>    732                  new_buffer->free = 1;
>    733                  binder_insert_free_buffer(proc, new_buffer);
>    734          }
>
> Does the stop_on_user_error option work?  There should be some
> documentation for this stuff.
>

Yes.

> regards,
> dan carpenter
>

-- 
Arve Hjønnevåg

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox