Linux userland API discussions
 help / color / mirror / Atom feed
* Re: [PATCH v1 2/5] IB/uverbs: ex_query_device: check request's comp_mask
From: Haggai Eran @ 2015-02-01  8:12 UTC (permalink / raw)
  To: Yann Droneaud, Sagi Grimberg, Shachar Raindel, Eli Cohen,
	Roland Dreier
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <335da45738872e446f63db338ca766a34608c90a.1422553023.git.ydroneaud-RlY5vtjFyJ3QT0dZR+AlfA@public.gmane.org>

On 29/01/2015 19:59, Yann Droneaud wrote:
> This patch ensures the extended QUERY_DEVICE uverbs request's
> comp_mask has only known and supported bits (currently none).
> 
> If userspace set unknown features bits, -EINVAL will be returned,
> ensuring current programs are not allowed to set random feature
> bits: such bits could enable new extended features in future kernel
> versions and those features can trigger a behavior not unsupported
> by the older programs or make the newer kernels return an error
> for a request which was valid on older kernels.
> 
> Additionally, returning an error for unsupported feature would
> allow userspace to probe/discover which extended features are
> currently supported by a kernel.

As I wrote before, I hope in the future we don't force userspace to
probe features this way, because it may be unnecessarily complex.

I agree though that we should have a way to extend this verb in the future.

Reviewed-by: Haggai Eran <haggaie-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

> 
> Link: http://mid.gmane.org/cover.1422553023.git.ydroneaud-RlY5vtjFyJ3QT0dZR+AlfA@public.gmane.org
> Cc: Sagi Grimberg <sagig-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> Cc: Shachar Raindel <raindel-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> Cc: Eli Cohen <eli-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> Cc: Haggai Eran <haggaie-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> Signed-off-by: Yann Droneaud <ydroneaud-RlY5vtjFyJ3QT0dZR+AlfA@public.gmane.org>
> ---
>  drivers/infiniband/core/uverbs_cmd.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
> index 6ef06a9b4362..fbcc54b86795 100644
> --- a/drivers/infiniband/core/uverbs_cmd.c
> +++ b/drivers/infiniband/core/uverbs_cmd.c
> @@ -3312,6 +3312,9 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
>  	if (err)
>  		return err;
>  
> +	if (cmd.comp_mask)
> +		return -EINVAL;
> +
>  	if (cmd.reserved)
>  		return -EINVAL;
>  
> 

^ permalink raw reply

* Re: [PATCH v1 1/5] IB/uverbs: ex_query_device: answer must not depend on request's comp_mask
From: Haggai Eran @ 2015-02-01  8:04 UTC (permalink / raw)
  To: Yann Droneaud, Jason Gunthorpe
  Cc: Sagi Grimberg, Shachar Raindel, Eli Cohen, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <1422568741.3133.247.camel-RlY5vtjFyJ3QT0dZR+AlfA@public.gmane.org>

On 29/01/2015 23:59, Yann Droneaud wrote:
> But I wonder about this part of commit 860f10a799c8:
> 
> diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
> index c7a43624c96b..f9326ccda4b5 100644
> --- a/drivers/infiniband/core/uverbs_cmd.c
> +++ b/drivers/infiniband/core/uverbs_cmd.c
> @@ -953,6 +953,18 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
>                 goto err_free;
>         }
>  
> +       if (cmd.access_flags & IB_ACCESS_ON_DEMAND) {
> +               struct ib_device_attr attr;
> +
> +               ret = ib_query_device(pd->device, &attr);
> +               if (ret || !(attr.device_cap_flags &
> +                               IB_DEVICE_ON_DEMAND_PAGING)) {
> +                       pr_debug("ODP support not available\n");
> +                       ret = -EINVAL;
> +                       goto err_put;
> +               }
> +       }
> +
> 
> AFAICT (1 << 6) bit from struct ib_uverbs_reg_mr access_flags field
> was not enforced to be 0 previously, as ib_check_mr_access() only check 
> for some coherency between a subset of the bits (it's not a function
> dedicated to check flags provided by userspace):
> 
> include/rdma/ib_verbs.h:
> 
>    1094 enum ib_access_flags {
>    1095         IB_ACCESS_LOCAL_WRITE   = 1,
>    1096         IB_ACCESS_REMOTE_WRITE  = (1<<1),
>    1097         IB_ACCESS_REMOTE_READ   = (1<<2),
>    1098         IB_ACCESS_REMOTE_ATOMIC = (1<<3),
>    1099         IB_ACCESS_MW_BIND       = (1<<4),
>    1100         IB_ZERO_BASED           = (1<<5),
>    1101         IB_ACCESS_ON_DEMAND     = (1<<6),
>    1102 };
> 
> drivers/infiniband/core/uverbs_cmd.c: ib_uverbs_reg_mr()
> 
>     961         ret = ib_check_mr_access(cmd.access_flags);
>     962         if (ret)
>     963                 return ret;
> 
> include/rdma/ib_verbs.h:
> 
>    2643 static inline int ib_check_mr_access(int flags)
>    2644 {
>    2645         /*
>    2646          * Local write permission is required if remote write or
>    2647          * remote atomic permission is also requested.
>    2648          */
>    2649         if (flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) &&
>    2650             !(flags & IB_ACCESS_LOCAL_WRITE))
>    2651                 return -EINVAL;
>    2652 
>    2653         return 0;
>    2654 }
> 
> drivers/infiniband/core/uverbs_cmd.c: ib_uverbs_reg_mr()
> 
>     990         mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
>     991                                      cmd.access_flags, &udata);
> 
> reg_user_mr() functions may call ib_umem_get() and pass access_flags to
> it:
> 
> drivers/infiniband/core/umem.c: ib_umem_get()
> 
>     114         /*
>     115          * We ask for writable memory if any of the following
>     116          * access flags are set.  "Local write" and "remote write"
>     117          * obviously require write access.  "Remote atomic" can do
>     118          * things like fetch and add, which will modify memory, and
>     119          * "MW bind" can change permissions by binding a window.
>     120          */
>     121         umem->writable  = !!(access &
>     122                 (IB_ACCESS_LOCAL_WRITE   | IB_ACCESS_REMOTE_WRITE |
>     123                  IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND));
>     124 
>     125         if (access & IB_ACCESS_ON_DEMAND) {
>     126                 ret = ib_umem_odp_get(context, umem);
>     127                 if (ret) {
>     128                         kfree(umem);
>     129                         return ERR_PTR(ret);
>     130                 }
>     131                 return umem;
>     132         }
> 
> 
> As you can see only a few bits in access_flags are checked in the end,
> so it may exist a very unlikely possibility that an existing userspace
> program is setting this IB_ACCESS_ON_DEMAND bit without the intention
> of enabling on demand paging as this would be unnoticed by older kernel.
>
> In the other hand, a newer program built with on-demand-paging in mind
> will set the bit, but when run on older kernel, it will be a no-op,
> allowing the program to continue, perhaps thinking on-demand-paging
> is available.

This is why we added a method for userspace to check the kernel
capabilities both when adding the memory windows support and with ODP.
User-space can still send garbage to the kernel, but if it does so
without checking the kernel supports its request, it's user-space's problem.

> That should be avoided as much as possible.
> 
> Unfortunately, I think this cannot be fixed as it's was long since 
> IB_ZERO_BASED was added by commit 7083e42ee2 ("IB/core: Add "type 2" 
> memory windows support").
> Anyway there was no check for IB_ACCESS_REMOTE_READ, nor 
> IB_ACCESS_MW_BIND in the uverb layer either.

I think it would be perfectly fine to add a check today that validates
the MR access flags input is known. Because the newer feature require
user-space to check capabilities, I don't think it would matter if we
returned an error on newer kernels that do not support these features.

Haggai

^ permalink raw reply

* Re: [PATCH v1 1/5] IB/uverbs: ex_query_device: answer must not depend on request's comp_mask
From: Haggai Eran @ 2015-02-01  7:39 UTC (permalink / raw)
  To: Jason Gunthorpe, Yann Droneaud
  Cc: Sagi Grimberg, Shachar Raindel, Eli Cohen, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <20150129182800.GH11842-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>

On 29/01/2015 20:28, Jason Gunthorpe wrote:
> On Thu, Jan 29, 2015 at 06:59:58PM +0100, Yann Droneaud wrote:
>> > -		resp.odp_caps.general_caps = attr.odp_caps.general_caps;
>> > -		resp.odp_caps.per_transport_caps.rc_odp_caps =
>> > -			attr.odp_caps.per_transport_caps.rc_odp_caps;
>> > -		resp.odp_caps.per_transport_caps.uc_odp_caps =
>> > -			attr.odp_caps.per_transport_caps.uc_odp_caps;
>> > -		resp.odp_caps.per_transport_caps.ud_odp_caps =
>> > -			attr.odp_caps.per_transport_caps.ud_odp_caps;
>> > -		resp.comp_mask |= IB_USER_VERBS_EX_QUERY_DEVICE_ODP;
>> > -	}
>> > +	resp.odp_caps.general_caps = attr.odp_caps.general_caps;
>> > +	resp.odp_caps.per_transport_caps.rc_odp_caps =
>> > +		attr.odp_caps.per_transport_caps.rc_odp_caps;
>> > +	resp.odp_caps.per_transport_caps.uc_odp_caps =
>> > +		attr.odp_caps.per_transport_caps.uc_odp_caps;
>> > +	resp.odp_caps.per_transport_caps.ud_odp_caps =
>> > +		attr.odp_caps.per_transport_caps.ud_odp_caps;
>> >  #endif
>> > +	resp.comp_mask |= IB_USER_VERBS_EX_QUERY_DEVICE_ODP;
> Not sure about this - if 0 is a valid null answer for all the _caps
> then it is fine, and the comp_mask bit should just be removed as the
> size alone should be enough.

Zero is indeed a valid answer. There the IB_ODP_SUPPORT bit in the
general_caps field that says whether or not ODP is supported in general.
The per transport capabilities are also default to not supported.

However, I think we should keep the comp_mask field for future
extensions. The current code doesn't report the size of the response to
user space, and in addition, comp_mask being a bit mask has the
advantage of allowing only part of the structure to be marked valid.

Regards,
Haggai

^ permalink raw reply

* Re: [PATCH v1 1/5] IB/uverbs: ex_query_device: answer must not depend on request's comp_mask
From: Or Gerlitz @ 2015-01-31 20:09 UTC (permalink / raw)
  To: Yann Droneaud
  Cc: Roland Dreier, Jason Gunthorpe, Sagi Grimberg, Shachar Raindel,
	Eli Cohen, Haggai Eran,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1422638760.3133.260.camel-RlY5vtjFyJ3QT0dZR+AlfA@public.gmane.org>

On Fri, Jan 30, 2015, Yann Droneaud <ydroneaud-RlY5vtjFyJ3QT0dZR+AlfA@public.gmane.org> wrote:
> [..] I have not the chance
> of owning HCA with the support for this feature, nor the patches
> libibverbs / libmlx5 ... (anyway I would not have the time to test). [...]

Yann, have you ever run Linux over RDMA capable HW (IB, RoCE, iWARP or alike?)

^ permalink raw reply

* Re: [RFC][PATCH v2] procfs: Always expose /proc/<pid>/map_files/ and make it readable
From: Calvin Owens @ 2015-01-31  1:58 UTC (permalink / raw)
  To: Kees Cook
  Cc: Andrew Morton, Cyrill Gorcunov, Kirill A. Shutemov,
	Alexey Dobriyan, Oleg Nesterov, Eric W. Biederman, Al Viro,
	Kirill A. Shutemov, Peter Feiner, Grant Likely,
	Siddhesh Poyarekar, LKML, kernel-team-b10kYP2dOMg,
	Pavel Emelyanov, Linux API
In-Reply-To: <CAGXu5j+wa2-CCGaktPzDec=HF0CizP__HVVjZKcjGuJJOvLFyg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Thursday 01/29 at 17:30 -0800, Kees Cook wrote:
> On Tue, Jan 27, 2015 at 8:38 PM, Calvin Owens <calvinowens-b10kYP2dOMg@public.gmane.org> wrote:
> > On Monday 01/26 at 15:43 -0800, Andrew Morton wrote:
> >> On Tue, 27 Jan 2015 00:00:54 +0300 Cyrill Gorcunov <gorcunov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> >>
> >> > On Mon, Jan 26, 2015 at 02:47:31PM +0200, Kirill A. Shutemov wrote:
> >> > > On Fri, Jan 23, 2015 at 07:15:44PM -0800, Calvin Owens wrote:
> >> > > > Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and
> >> > > > is only exposed if CONFIG_CHECKPOINT_RESTORE is set. This interface
> >> > > > is very useful for enumerating the files mapped into a process when
> >> > > > the more verbose information in /proc/<pid>/maps is not needed.
> >>
> >> This is the main (actually only) justification for the patch, and it it
> >> far too thin.  What does "not needed" mean.  Why can't people just use
> >> /proc/pid/maps?
> >
> > The biggest difference is that if you do something like this:
> >
> >         fd = open("/stuff", O_BLAH);
> >         map = mmap(NULL, 4096, PROT_BLAH, MAP_SHARED, fd, 0);
> >         close(fd);
> >         unlink("/stuff");
> >
> > ...then map_files/ gives you a way to get a file descriptor for
> > "/stuff", which you couldn't do with /proc/pid/maps.
> >
> > It's also something of a win if you just want to see what is mapped at a
> > specific address, since you can just readlink() the symlink for the
> > address range you care about and it will go grab the appropriate VMA and
> > give you the answer. /proc/pid/maps requires walking the VMA tree, which
> > is quite expensive for processes with many thousands of threads, even
> > without the O(N^2) issue.
> >
> > (You have to know what address range you want though, since readdir() on
> > map_files/ obviously has to walk the VMA tree just like /proc/N/maps.)
> >
> >> > > > This patch moves the folder out from behind CHECKPOINT_RESTORE, and
> >> > > > removes the CAP_SYS_ADMIN restrictions. Following the links requires
> >> > > > the ability to ptrace the process in question, so this doesn't allow
> >> > > > an attacker to do anything they couldn't already do before.
> >> > > >
> >> > > > Signed-off-by: Calvin Owens <calvinowens-b10kYP2dOMg@public.gmane.org>
> >> > >
> >> > > Cc +linux-api@
> >> >
> >> > Looks good to me, thanks! Though I would really appreciate if someone
> >> > from security camp take a look as well.
> >>
> >> hm, who's that.  Kees comes to mind.
> >>
> >> And reviewers' task would be a heck of a lot easier if they knew what
> >> /proc/pid/map_files actually does.  This:
> >>
> >> akpm3:/usr/src/25> grep -r map_files Documentation
> >> akpm3:/usr/src/25>
> >>
> >> does not help.
> >>
> >> The 640708a2cff7f81 changelog says:
> >>
> >> :     This one behaves similarly to the /proc/<pid>/fd/ one - it contains
> >> :     symlinks one for each mapping with file, the name of a symlink is
> >> :     "vma->vm_start-vma->vm_end", the target is the file.  Opening a symlink
> >> :     results in a file that point exactly to the same inode as them vma's one.
> >> :
> >> :     For example the ls -l of some arbitrary /proc/<pid>/map_files/
> >> :
> >> :      | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
> >> :      | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
> >> :      | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
> >> :      | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
> >> :      | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so
> >>
> >> afacit this info is also available in /proc/pid/maps, so things
> >> shouldn't get worse if the /proc/pid/map_files permissions are at least
> >> as restrictive as the /proc/pid/maps permissions.  Is that the case?
> >> (Please add to changelog).
> >
> > Yes, the only difference is that you can follow the link as per above.
> > I'll resend with a new message explaining that and the deletion thing.
> >
> >> There's one other problem here: we're assuming that the map_files
> >> implementation doesn't have bugs.  If it does have bugs then relaxing
> >> permissions like this will create new vulnerabilities.  And the
> >> map_files implementation is surprisingly complex.  Is it bug-free?
> >
> > While I was messing with it I used it a good bit and didn't see any
> > issues, although I didn't actively try to fuzz it or anything. I'd be
> > happy to write something to test hammering it in weird ways if you like.
> > I'm also happy to write testcases for namespaces.
> >
> > So far as security issues, as others have pointed out you can't follow
> > the links unless you can ptrace the process in question, which seems
> > like a pretty solid guarantee. As Cyrill pointed out in the discussion
> > about the documentation, that's the same protection as /proc/N/fd/*, and
> > those links function in the same way.
> 
> My concern here is that fd/* are connected as streams, and while that
> has a certain level of badness as an external-to-the-process attacker,
> PTRACE_MODE_READ is much weaker than PTRACE_MODE_ATTACH (which is
> required for access to /proc/N/mem). Since these fds are the things
> mapped into memory on a process, writing to them is a subset of access
> to /proc/N/mem, and I don't feel that PTRACE_MODE_READ is sufficient.

If you haven't done close() on a mmapped file, doesn't fd/* allow the
same access to the corresponding regions of memory? Or am I missing
something?
 
But that said, I can't think of any reason making it MODE_ATTACH would
be a problem. Would you rather that be enforced on follow_link() like
the original patch did, or enforce it for the whole directory?

Thanks,
Calvin
 
> -Kees
> 
> -- 
> Kees Cook
> Chrome OS Security

^ permalink raw reply

* Re: [PATCH v1 1/5] IB/uverbs: ex_query_device: answer must not depend on request's comp_mask
From: Yann Droneaud @ 2015-01-30 17:26 UTC (permalink / raw)
  To: Roland Dreier
  Cc: Jason Gunthorpe, Sagi Grimberg, Shachar Raindel, Eli Cohen,
	Haggai Eran, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <CAG4TOxN4DpTMMsCM-1oe6-w+5xYR4YHdJeL7p2nQpUy9gYCSjA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

Hi,

Le jeudi 29 janvier 2015 à 15:17 -0800, Roland Dreier a écrit :
> On Thu, Jan 29, 2015 at 1:59 PM, Yann Droneaud <ydroneaud-RlY5vtjFyJ3QT0dZR+AlfA@public.gmane.org> wrote:
> >> Roland: I agree with Yann, these patches need to go in, or the ODP
> >> patches reverted.
> 
> > Reverting all On Demand Paging patches seems overkill:
> > if something as to be reverted it should be commit 5a77abf9a97a
> > ("IB/core: Add support for extended query device caps") and the part of
> > commit 860f10a799c8 ("IB/core: Add flags for on demand paging support")
> > which modify ib_uverbs_ex_query_device().
> 
> Thank you and Jason for taking on this interface.
> 
> At this point I feel like I do about the IPoIB changes -- we should
> revert the broken stuff and get it right for 3.20.
> 
> If we revert the two things you describe above, is everything else OK
> to leave in 3.19 with respect to ABI?
> 

I've tried to review every changes since v3.18 on drivers/infiniband
include/rdma and include/uapi/rdma with respect to ABI issues.

I've noticed no other issue, but I have to admit I've not well reviewed
the drivers (hw/) internal changes.

If the IB_USER_VERBS_EX_CMD_QUERY_DEVICE and ib_uverbs_ex_query_device
changes are going to be reverted for v3.19, the on-demand-paging
feature will be available (IB_DEVICE_ON_DEMAND_PAGING will be set 
device_cap_flags in response to non extended QUERY_DEVICE for mlx5 HCA
and IB_ACCESS_ON_DEMAND access flag will be effective for REG_MR 
uverbs), but its parameters won't be. I don't know if it's a no-go for 
the usage of on-demand paging by userspace: I have not the chance
of owning HCA with the support for this feature, nor the patches
libibverbs / libmlx5 ... (anyway I would not have the time to test).
I've hoped people from Mellanox would have commented on the revert 
option too.

Regards.

-- 
Yann Droneaud
OPTEYA

^ permalink raw reply

* Re: [PATCH v10 2/2] tty/serial: Add Spreadtrum sc9836-uart driver support
From: Peter Hurley @ 2015-01-30 15:59 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: Thierry Reding, Varka Bhadram, Chunyan Zhang,
	gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r, mark.rutland-5wv7dgnIgG8,
	gnomes-qBU/x9rampVanCEyBjwyrvXRex20P6io,
	heiko-4mtYJXux2i+zQB+pC5nmwQ, andrew-g2DYL2Zd6BY,
	jslaby-AlSwsSmVLrQ, lanqing.liu-lxIno14LUO0EEoCn2XhGlw,
	pawel.moll-5wv7dgnIgG8, zhang.lyra-Re5JQEeQqe8AvxtiuMwx3w,
	zhizhou.zhang-lxIno14LUO0EEoCn2XhGlw,
	geng.ren-lxIno14LUO0EEoCn2XhGlw,
	antonynpavlov-Re5JQEeQqe8AvxtiuMwx3w,
	linux-serial-u79uwXL29TY76Z2rM5mHXA,
	grant.likely-QSEj5FYQhm4dnm+yROfE0A,
	orsonzhai-Re5JQEeQqe8AvxtiuMwx3w, florian.vaussard-p8DiymsW2f8,
	devicetree-u79uwXL29TY76Z2rM5mHXA, jason-NLaQJdtUoK4Be96aLqz0jA,
	arnd-r2nGTMty4D4, ijc+devicetree-KcIKpvwj1kUDXYZnReoRVg,
	hytszk-Re5JQEeQqe8AvxtiuMwx3w, robh+dt-DgEjT+Ai2ygdnm+yROfE0A,
	wei.qiao-lxIno14LUO0EEoCn2XhGlw,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, galak-sgV2jX0FEOL9JmXXK+q4OQ,
	shawn.guo-QSEj5FYQhm4dnm+yROfE0A
In-Reply-To: <20150130154926.GN26493-l+eeeJia6m9vn6HldHNs0ANdhmdF6hFW@public.gmane.org>

On 01/30/2015 10:49 AM, Russell King - ARM Linux wrote:
> On Fri, Jan 30, 2015 at 10:32:54AM -0500, Peter Hurley wrote:
>> Before you say consistency, I think you should look at the stats below.
>> IOW, if you want to change the error code return from probe() for
>> consistency's sake, a tree-wide patch would be the appropriate way.
> 
> Now look outside the serial driver sub-tree.
> 
> There are 1234 instances of platform_get_resource(, IORESOURCE_MEM, ) in
> the drivers/ sub-tree, with 700 instances of devm_ioremap_resource()
> being used there.  Of the devm_ioremap_resource() instances:
> 
> - 555 use platform_get_resource() in the preceding two lines - which is
>   not enough to do anything but rely on the -EINVAL return value.
> - 16 mention ENODEV in the preceding three lines.
> 
> There are 132 which use platform_get_resource() and return ENODEV within
> the following three lines (which may intersect with the above 16 number)
> and 88 which use EINVAL.
> 
> So, there are in total 643 instances where a missing resource returns
> EINVAL, and between 132 and 148 instances which return ENODEV.
> 
> Yes, 643 + 148 isn't 1234, but I'm not going to read through all 1234
> locations just for the sake of this thread.   What's clear though is that
> more than 50% of sites using platform_get_resource(, IORESOURCE_MEM, )
> return EINVAL for the lack of a resource.

Sure, now that they're using devm_ioremap_resource(). What about before
they were converted?

For example, of the 10 serial drivers now using devm_ioremap_resource(),
_not 1 returned EINVAL_ prior to using devm_ioremap_resource(). And of those
10 commits, only 1 mentions changing the return codes on purpose.

In fact, all of them but one returned ENODEV.

Regards,
Peter Hurley



--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v10 2/2] tty/serial: Add Spreadtrum sc9836-uart driver support
From: Russell King - ARM Linux @ 2015-01-30 15:49 UTC (permalink / raw)
  To: Peter Hurley
  Cc: Thierry Reding, Varka Bhadram, Chunyan Zhang,
	gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r, mark.rutland-5wv7dgnIgG8,
	gnomes-qBU/x9rampVanCEyBjwyrvXRex20P6io,
	heiko-4mtYJXux2i+zQB+pC5nmwQ, andrew-g2DYL2Zd6BY,
	jslaby-AlSwsSmVLrQ, lanqing.liu-lxIno14LUO0EEoCn2XhGlw,
	pawel.moll-5wv7dgnIgG8, zhang.lyra-Re5JQEeQqe8AvxtiuMwx3w,
	zhizhou.zhang-lxIno14LUO0EEoCn2XhGlw,
	geng.ren-lxIno14LUO0EEoCn2XhGlw,
	antonynpavlov-Re5JQEeQqe8AvxtiuMwx3w,
	linux-serial-u79uwXL29TY76Z2rM5mHXA,
	grant.likely-QSEj5FYQhm4dnm+yROfE0A,
	orsonzhai-Re5JQEeQqe8AvxtiuMwx3w, florian.vaussard-p8DiymsW2f8,
	devicetree-u79uwXL29TY76Z2rM5mHXA, jason-NLaQJdtUoK4Be96aLqz0jA,
	arnd-r2nGTMty4D4, ijc+devicetree-KcIKpvwj1kUDXYZnReoRVg,
	hytszk-Re5JQEeQqe8AvxtiuMwx3w, robh+dt-DgEjT+Ai2ygdnm+yROfE0A,
	wei.qiao-lxIno14LUO0EEoCn2XhGlw,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, galak-sgV2jX0FEOL9JmXXK+q4OQ,
	shawn.guo-QSEj5FYQhm4dnm+yROfE0A
In-Reply-To: <54CBA426.7050900-WaGBZJeGNqdsbIuE7sb01tBPR1lH4CV8@public.gmane.org>

On Fri, Jan 30, 2015 at 10:32:54AM -0500, Peter Hurley wrote:
> Before you say consistency, I think you should look at the stats below.
> IOW, if you want to change the error code return from probe() for
> consistency's sake, a tree-wide patch would be the appropriate way.

Now look outside the serial driver sub-tree.

There are 1234 instances of platform_get_resource(, IORESOURCE_MEM, ) in
the drivers/ sub-tree, with 700 instances of devm_ioremap_resource()
being used there.  Of the devm_ioremap_resource() instances:

- 555 use platform_get_resource() in the preceding two lines - which is
  not enough to do anything but rely on the -EINVAL return value.
- 16 mention ENODEV in the preceding three lines.

There are 132 which use platform_get_resource() and return ENODEV within
the following three lines (which may intersect with the above 16 number)
and 88 which use EINVAL.

So, there are in total 643 instances where a missing resource returns
EINVAL, and between 132 and 148 instances which return ENODEV.

Yes, 643 + 148 isn't 1234, but I'm not going to read through all 1234
locations just for the sake of this thread.   What's clear though is that
more than 50% of sites using platform_get_resource(, IORESOURCE_MEM, )
return EINVAL for the lack of a resource.

-- 
FTTC broadband for 0.8mile line: currently at 10.5Mbps down 400kbps up
according to speedtest.net.

^ permalink raw reply

* Re: [PATCH v10 2/2] tty/serial: Add Spreadtrum sc9836-uart driver support
From: Peter Hurley @ 2015-01-30 15:32 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: Thierry Reding, Varka Bhadram, Chunyan Zhang, gregkh,
	mark.rutland, gnomes, heiko, andrew, jslaby, lanqing.liu,
	pawel.moll, zhang.lyra, zhizhou.zhang, geng.ren, antonynpavlov,
	linux-serial, grant.likely, orsonzhai, florian.vaussard,
	devicetree, jason, arnd, ijc+devicetree, hytszk, robh+dt,
	wei.qiao, linux-arm-kernel, linux-api, linux-kernel, galak,
	shawn.guo
In-Reply-To: <20150130140854.GJ26493@n2100.arm.linux.org.uk>

On 01/30/2015 09:08 AM, Russell King - ARM Linux wrote:
> On Fri, Jan 30, 2015 at 07:03:03AM -0500, Peter Hurley wrote:
>> On 01/30/2015 05:18 AM, Thierry Reding wrote:
>>> -ENODEV is certainly not the correct return value if a resource is not
>>> available. It translates to "no such device", but the device must
>>> clearly be there, otherwise the ->probe() shouldn't have been called.
>>
>> -ENODEV is the appropriate return from the probe(); there is no device.
> 
> No it is not.  "no such device" means "the device is not present".  If
> the device is not present, we wouldn't have a struct device associated
> with it.
> 
> The missing resource is an error condition: what it's saying is that the
> device is there, but something has failed in providing the IO regions
> necessary to access it.  That's really something very different from
> "there is no device present".

This is masking behavior changes behind what is essentially a refactor.
For example, here's Felipe's changelog to omap-serial:

commit d044d2356f8dd18c755e13f34318bc03ef9c6887
Author: Felipe Balbi <balbi@ti.com>
Date:   Wed Apr 23 09:58:33 2014 -0500

    tty: serial: omap: switch over to devm_ioremap_resource
    
    just using helper function to remove some duplicated
    code a bit. While at that, also move allocation of
    struct uart_omap_port higher in the code so that
    we return much earlier in case of no memory.
    
    Signed-off-by: Felipe Balbi <balbi@ti.com>
    Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

No mention of correcting an error return value here.

Why change the error return from probe() now?

Before you say consistency, I think you should look at the stats below.
IOW, if you want to change the error code return from probe() for
consistency's sake, a tree-wide patch would be the appropriate way.


>>> Or if it really isn't there, then you'd at least need a memory region
>>> to probe, otherwise you can't determine whether it's there or not. So
>>> from the perspective of a device driver I think a missing, or NULL,
>>> resource, is indeed an "invalid argument".
>>
>> Trying to argue that a missing host bus window is an "invalid argument"
>> to probe() is just trying to make the shoe fit.
> 
> As is arguing that -ENODEV is an appropriate return value for the missing
> resource.
> 
> Moreover, returning -ENODEV is actually *bad* in this case - the kernel's
> generic probe handling does not report the failure of the driver to bind
> given this, so a missing resource potentially becomes a silent failure of
> a driver - leading users to wonder why their devices aren't found.
> 
> If we /really/ have a problem with the error code, then why not invent
> a new error code to cater for this condition - maybe, EBADRES (bad resource).
> 
>>> I understand that people might see some ambiguity there, and -EINVAL is
>>> certainly not a very accurate description, but such is the nature of
>>> error codes. You pick what fits best. -ENXIO and -EADDRNOTAVAIL had been
>>> under discussion as well if I remember correctly, but they both equally
>>> ambiguous. -ENODATA might have been a better fit, but that too has a
>>> different meaning in other contexts.
>>>
>>> Besides, there's an error message that goes along with the error code
>>> return, that should remove any ambiguity for people looking at dmesg.
>>>
>>> All of that said, the original assertion that the check is not required
>>> is still valid. We can argue at length about the specific error code but
>>> if we decide that it needs to change, then we should modify
>>> devm_ioremap_resource() rather than requiring all callers to perform the
>>> extra check again.
>>
>> None of the devm_ioremap_resource() changes have been submitted for
>> serial drivers.
> 
> $ grep devm_ioremap_resource drivers/tty/serial/ -r | wc -l
> 10

Ok, not 'none' but hardly tree-wide.

And of those 10 drivers now using devm_ioremap_resource(),
3 drivers still return ENODEV for a missing resource [1]. (FWIW, I wrote 'none'
on the basis of a grep of devm_ioremap_resource and looking at the last one,
serial-tegra.c, which has exactly the construct objected to in the Spreadtrum
driver).

Another 9 drivers still use plain devm_ioremap(), even those with
trivial conversions like samsung.c [2]

Of the serial drivers which use platform_get_resource(),
10 return  ENODEV
5  return  EINVAL (not including those converted to devm_ioremap_resource())
4  return  ENXIO
3  return  ENOMEM
2  return  ENOENT
1  returns EBUSY
1  returns EFAULT

So to recap, I see no reason to respin this driver submission when:
1. even drivers already using devm_ioremap_resource() aren't consistent
2. drivers which could trivially use devm_ioremap_resource, don't
3. there's no basis for requiring consistent return value _yet_
   (or even what that value should be)
4. the platform_get_resource()/devm_ioremap_resource is an awkward code construct

Regards,
Peter Hurley


[1]
drivers/tty/serial/bcm63xx_uart.c-	res_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
drivers/tty/serial/bcm63xx_uart.c-	if (!res_mem)
drivers/tty/serial/bcm63xx_uart.c-		return -ENODEV;
drivers/tty/serial/bcm63xx_uart.c-
drivers/tty/serial/bcm63xx_uart.c-	port->mapbase = res_mem->start;
drivers/tty/serial/bcm63xx_uart.c:	port->membase = devm_ioremap_resource(&pdev->dev, res_mem);

drivers/tty/serial/vt8500_serial.c-	mmres = platform_get_resource(pdev, IORESOURCE_MEM, 0);
drivers/tty/serial/vt8500_serial.c-	irqres = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
drivers/tty/serial/vt8500_serial.c-	if (!mmres || !irqres)
drivers/tty/serial/vt8500_serial.c-		return -ENODEV;
[...]
drivers/tty/serial/vt8500_serial.c:	vt8500_port->uart.membase = devm_ioremap_resource(&pdev->dev, mmres);

drivers/tty/serial/serial-tegra.c-	resource = platform_get_resource(pdev, IORESOURCE_MEM, 0);
drivers/tty/serial/serial-tegra.c-	if (!resource) {
drivers/tty/serial/serial-tegra.c-		dev_err(&pdev->dev, "No IO memory resource\n");
drivers/tty/serial/serial-tegra.c-		return -ENODEV;
drivers/tty/serial/serial-tegra.c-	}
drivers/tty/serial/serial-tegra.c-
drivers/tty/serial/serial-tegra.c-	u->mapbase = resource->start;
drivers/tty/serial/serial-tegra.c:	u->membase = devm_ioremap_resource(&pdev->dev, resource);


[2]
drivers/tty/serial/samsung.c:	res = platform_get_resource(platdev, IORESOURCE_MEM, 0);
drivers/tty/serial/samsung.c-	if (res == NULL) {
drivers/tty/serial/samsung.c-		dev_err(port->dev, "failed to find memory resource for uart\n");
drivers/tty/serial/samsung.c-		return -EINVAL;
drivers/tty/serial/samsung.c-	}
drivers/tty/serial/samsung.c-
drivers/tty/serial/samsung.c-	dbg("resource %pR)\n", res);
drivers/tty/serial/samsung.c-
drivers/tty/serial/samsung.c-	port->membase = devm_ioremap(port->dev, res->start, resource_size(res));
drivers/tty/serial/samsung.c-	if (!port->membase) {
drivers/tty/serial/samsung.c-		dev_err(port->dev, "failed to remap controller address\n");

^ permalink raw reply

* Re: [PATCH v10 2/2] tty/serial: Add Spreadtrum sc9836-uart driver support
From: Russell King - ARM Linux @ 2015-01-30 14:08 UTC (permalink / raw)
  To: Peter Hurley
  Cc: Thierry Reding, Varka Bhadram, Chunyan Zhang,
	gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r, mark.rutland-5wv7dgnIgG8,
	gnomes-qBU/x9rampVanCEyBjwyrvXRex20P6io,
	heiko-4mtYJXux2i+zQB+pC5nmwQ, andrew-g2DYL2Zd6BY,
	jslaby-AlSwsSmVLrQ, lanqing.liu-lxIno14LUO0EEoCn2XhGlw,
	pawel.moll-5wv7dgnIgG8, zhang.lyra-Re5JQEeQqe8AvxtiuMwx3w,
	zhizhou.zhang-lxIno14LUO0EEoCn2XhGlw,
	geng.ren-lxIno14LUO0EEoCn2XhGlw,
	antonynpavlov-Re5JQEeQqe8AvxtiuMwx3w,
	linux-serial-u79uwXL29TY76Z2rM5mHXA,
	grant.likely-QSEj5FYQhm4dnm+yROfE0A,
	orsonzhai-Re5JQEeQqe8AvxtiuMwx3w, florian.vaussard-p8DiymsW2f8,
	devicetree-u79uwXL29TY76Z2rM5mHXA, jason-NLaQJdtUoK4Be96aLqz0jA,
	arnd-r2nGTMty4D4, ijc+devicetree-KcIKpvwj1kUDXYZnReoRVg,
	hytszk-Re5JQEeQqe8AvxtiuMwx3w, robh+dt-DgEjT+Ai2ygdnm+yROfE0A,
	wei.qiao-lxIno14LUO0EEoCn2XhGlw,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, galak-sgV2jX0FEOL9JmXXK+q4OQ,
	shawn.guo-QSEj5FYQhm4dnm+yROfE0A
In-Reply-To: <54CB72F7.8060706-WaGBZJeGNqdsbIuE7sb01tBPR1lH4CV8@public.gmane.org>

On Fri, Jan 30, 2015 at 07:03:03AM -0500, Peter Hurley wrote:
> On 01/30/2015 05:18 AM, Thierry Reding wrote:
> > -ENODEV is certainly not the correct return value if a resource is not
> > available. It translates to "no such device", but the device must
> > clearly be there, otherwise the ->probe() shouldn't have been called.
> 
> -ENODEV is the appropriate return from the probe(); there is no device.

No it is not.  "no such device" means "the device is not present".  If
the device is not present, we wouldn't have a struct device associated
with it.

The missing resource is an error condition: what it's saying is that the
device is there, but something has failed in providing the IO regions
necessary to access it.  That's really something very different from
"there is no device present".

> > Or if it really isn't there, then you'd at least need a memory region
> > to probe, otherwise you can't determine whether it's there or not. So
> > from the perspective of a device driver I think a missing, or NULL,
> > resource, is indeed an "invalid argument".
> 
> Trying to argue that a missing host bus window is an "invalid argument"
> to probe() is just trying to make the shoe fit.

As is arguing that -ENODEV is an appropriate return value for the missing
resource.

Moreover, returning -ENODEV is actually *bad* in this case - the kernel's
generic probe handling does not report the failure of the driver to bind
given this, so a missing resource potentially becomes a silent failure of
a driver - leading users to wonder why their devices aren't found.

If we /really/ have a problem with the error code, then why not invent
a new error code to cater for this condition - maybe, EBADRES (bad resource).

> > I understand that people might see some ambiguity there, and -EINVAL is
> > certainly not a very accurate description, but such is the nature of
> > error codes. You pick what fits best. -ENXIO and -EADDRNOTAVAIL had been
> > under discussion as well if I remember correctly, but they both equally
> > ambiguous. -ENODATA might have been a better fit, but that too has a
> > different meaning in other contexts.
> > 
> > Besides, there's an error message that goes along with the error code
> > return, that should remove any ambiguity for people looking at dmesg.
> > 
> > All of that said, the original assertion that the check is not required
> > is still valid. We can argue at length about the specific error code but
> > if we decide that it needs to change, then we should modify
> > devm_ioremap_resource() rather than requiring all callers to perform the
> > extra check again.
> 
> None of the devm_ioremap_resource() changes have been submitted for
> serial drivers.

$ grep devm_ioremap_resource drivers/tty/serial/ -r | wc -l
10

It seems that statement is false.

-- 
FTTC broadband for 0.8mile line: currently at 10.5Mbps down 400kbps up
according to speedtest.net.

^ permalink raw reply

* [PATCH v13 02/18] vfio: platform: probe to devices on the platform bus
From: Baptiste Reynal @ 2015-01-30 13:46 UTC (permalink / raw)
  To: kvmarm-FPEHb7Xf0XXUo1n7N8X6UoWGPAHP3yOg,
	iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	alex.williamson-H+wXaHxf7aLQT0dZR+AlfA
  Cc: kim.phillips-KZfg59tc24xl57MIdRCFDg, open list:VFIO DRIVER,
	eric.auger-QSEj5FYQhm4dnm+yROfE0A, marc.zyngier-5wv7dgnIgG8,
	open list:ABI/API, will.deacon-5wv7dgnIgG8, open list,
	Antonios Motakis, tech-lrHrjnjw1UfHK3s98zE1ajGjJy/sRE9J,
	christoffer.dall-QSEj5FYQhm4dnm+yROfE0A
In-Reply-To: <1422625584-3741-1-git-send-email-b.reynal-lrHrjnjw1UfHK3s98zE1ajGjJy/sRE9J@public.gmane.org>

From: Antonios Motakis <a.motakis-lrHrjnjw1UfHK3s98zE1ajGjJy/sRE9J@public.gmane.org>

Driver to bind to Linux platform devices, and callbacks to discover their
resources to be used by the main VFIO PLATFORM code.

Signed-off-by: Antonios Motakis <a.motakis-lrHrjnjw1UfHK3s98zE1ajGjJy/sRE9J@public.gmane.org>
---
 drivers/vfio/platform/vfio_platform.c | 103 ++++++++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h             |   1 +
 2 files changed, 104 insertions(+)
 create mode 100644 drivers/vfio/platform/vfio_platform.c

diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c
new file mode 100644
index 0000000..cef645c
--- /dev/null
+++ b/drivers/vfio/platform/vfio_platform.c
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C) 2013 - Virtual Open Systems
+ * Author: Antonios Motakis <a.motakis-lrHrjnjw1UfHK3s98zE1ajGjJy/sRE9J@public.gmane.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vfio.h>
+#include <linux/platform_device.h>
+
+#include "vfio_platform_private.h"
+
+#define DRIVER_VERSION  "0.10"
+#define DRIVER_AUTHOR   "Antonios Motakis <a.motakis-lrHrjnjw1UfHK3s98zE1ajGjJy/sRE9J@public.gmane.org>"
+#define DRIVER_DESC     "VFIO for platform devices - User Level meta-driver"
+
+/* probing devices from the linux platform bus */
+
+static struct resource *get_platform_resource(struct vfio_platform_device *vdev,
+					      int num)
+{
+	struct platform_device *dev = (struct platform_device *) vdev->opaque;
+	int i;
+
+	for (i = 0; i < dev->num_resources; i++) {
+		struct resource *r = &dev->resource[i];
+
+		if (resource_type(r) & (IORESOURCE_MEM|IORESOURCE_IO)) {
+			if (!num)
+				return r;
+
+			num--;
+		}
+	}
+	return NULL;
+}
+
+static int get_platform_irq(struct vfio_platform_device *vdev, int i)
+{
+	struct platform_device *pdev = (struct platform_device *) vdev->opaque;
+
+	return platform_get_irq(pdev, i);
+}
+
+static int vfio_platform_probe(struct platform_device *pdev)
+{
+	struct vfio_platform_device *vdev;
+	int ret;
+
+	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
+	if (!vdev)
+		return -ENOMEM;
+
+	vdev->opaque = (void *) pdev;
+	vdev->name = pdev->name;
+	vdev->flags = VFIO_DEVICE_FLAGS_PLATFORM;
+	vdev->get_resource = get_platform_resource;
+	vdev->get_irq = get_platform_irq;
+
+	ret = vfio_platform_probe_common(vdev, &pdev->dev);
+	if (ret)
+		kfree(vdev);
+
+	return ret;
+}
+
+static int vfio_platform_remove(struct platform_device *pdev)
+{
+	struct vfio_platform_device *vdev;
+
+	vdev = vfio_platform_remove_common(&pdev->dev);
+	if (vdev) {
+		kfree(vdev);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+static struct platform_driver vfio_platform_driver = {
+	.probe		= vfio_platform_probe,
+	.remove		= vfio_platform_remove,
+	.driver	= {
+		.name	= "vfio-platform",
+		.owner	= THIS_MODULE,
+	},
+};
+
+module_platform_driver(vfio_platform_driver);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 9ade02b..4e93a97 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -159,6 +159,7 @@ struct vfio_device_info {
 	__u32	flags;
 #define VFIO_DEVICE_FLAGS_RESET	(1 << 0)	/* Device supports reset */
 #define VFIO_DEVICE_FLAGS_PCI	(1 << 1)	/* vfio-pci device */
+#define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)	/* vfio-platform device */
 	__u32	num_regions;	/* Max region index + 1 */
 	__u32	num_irqs;	/* Max IRQ index + 1 */
 };
-- 
2.2.2

^ permalink raw reply related

* Re: [PATCH v10 2/2] tty/serial: Add Spreadtrum sc9836-uart driver support
From: Peter Hurley @ 2015-01-30 12:03 UTC (permalink / raw)
  To: Thierry Reding, Russell King - ARM Linux
  Cc: Varka Bhadram, Chunyan Zhang,
	gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r, mark.rutland-5wv7dgnIgG8,
	gnomes-qBU/x9rampVanCEyBjwyrvXRex20P6io,
	heiko-4mtYJXux2i+zQB+pC5nmwQ, andrew-g2DYL2Zd6BY,
	jslaby-AlSwsSmVLrQ, lanqing.liu-lxIno14LUO0EEoCn2XhGlw,
	pawel.moll-5wv7dgnIgG8, zhang.lyra-Re5JQEeQqe8AvxtiuMwx3w,
	zhizhou.zhang-lxIno14LUO0EEoCn2XhGlw,
	geng.ren-lxIno14LUO0EEoCn2XhGlw,
	antonynpavlov-Re5JQEeQqe8AvxtiuMwx3w,
	linux-serial-u79uwXL29TY76Z2rM5mHXA,
	grant.likely-QSEj5FYQhm4dnm+yROfE0A,
	orsonzhai-Re5JQEeQqe8AvxtiuMwx3w, florian.vaussard-p8DiymsW2f8,
	devicetree-u79uwXL29TY76Z2rM5mHXA, jason-NLaQJdtUoK4Be96aLqz0jA,
	arnd-r2nGTMty4D4, ijc+devicetree-KcIKpvwj1kUDXYZnReoRVg,
	hytszk-Re5JQEeQqe8AvxtiuMwx3w, robh+dt-DgEjT+Ai2ygdnm+yROfE0A,
	wei.qiao-lxIno14LUO0EEoCn2XhGlw,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, galak-sgV2jX0FEOL9JmXXK+q4OQ,
	shawn.guo-QSEj5FYQhm4dnm+yROfE0A
In-Reply-To: <20150130101838.GC16744@ulmo>

On 01/30/2015 05:18 AM, Thierry Reding wrote:
> On Thu, Jan 29, 2015 at 04:05:53PM +0000, Russell King - ARM Linux wrote:
>> On Thu, Jan 29, 2015 at 10:49:34AM -0500, Peter Hurley wrote:
>>> Hi Varka,
>>>
>>> On 01/29/2015 10:26 AM, Varka Bhadram wrote:
>>>> This check is not required. It will be done by devm_ioremap_resource()
>>>
>>> I disagree. devm_ioremap_resource() interprets the NULL resource as
>>> a bad parameter and returns -EINVAL which is then forwarded as the
>>> return value from the probe.
>>>
>>> -ENODEV is the correct return value from the probe if the expected
>>> resource is not available (either because it doesn't exist or was already
>>> claimed by another driver).
>>
>> (Adding Thierry as the author of this function.)
>>
>> I believe devm_ioremap_resource() was explicitly designed to remove such
>> error handling from drivers, and to give drivers a unified error response
>> to such things as missing resources.
>>
>> See the comments for this function in lib/devres.c.
> 
> Right. Before the introduction of this function drivers would copy the
> same boilerplate but would use completely inconsistent return codes.
> Well, to be correct devm_request_and_ioremap() was introduced first to
> reduce the boilerplate, but one of the problems was that it returned a
> NULL pointer on failure, so it was impossible to distinguish between
> specific error conditions. It also had the negative side-effect of not
> giving drivers any directions on what to do with the NULL return value
> other than the example in the kerneldoc. But most people just didn't
> seem to look at that, so instead of using -EADDRNOTAVAIL they'd again
> go and do completely inconsistent things everywhere.
> 
> When we introduced devm_ioremap_resource(), the idea was to remove any
> of these inconsistencies once and for all. You call the function and if
> it fails you simply propagate the error code, so you get consistent
> behaviour everywhere.
> 
> If I remember correctly the error codes for the various conditions were
> discussed quite extensively, and what we currently have is what we got
> by concensus.
> 
> -ENODEV is certainly not the correct return value if a resource is not
> available. It translates to "no such device", but the device must
> clearly be there, otherwise the ->probe() shouldn't have been called.

-ENODEV is the appropriate return from the probe(); there is no device.
That returning that value doesn't make sense from devm_ioremap_resource
is simply a reflection of the awkward construct.

> Or if it really isn't there, then you'd at least need a memory region
> to probe, otherwise you can't determine whether it's there or not. So
> from the perspective of a device driver I think a missing, or NULL,
> resource, is indeed an "invalid argument".

Trying to argue that a missing host bus window is an "invalid argument"
to probe() is just trying to make the shoe fit.

> I understand that people might see some ambiguity there, and -EINVAL is
> certainly not a very accurate description, but such is the nature of
> error codes. You pick what fits best. -ENXIO and -EADDRNOTAVAIL had been
> under discussion as well if I remember correctly, but they both equally
> ambiguous. -ENODATA might have been a better fit, but that too has a
> different meaning in other contexts.
> 
> Besides, there's an error message that goes along with the error code
> return, that should remove any ambiguity for people looking at dmesg.
> 
> All of that said, the original assertion that the check is not required
> is still valid. We can argue at length about the specific error code but
> if we decide that it needs to change, then we should modify
> devm_ioremap_resource() rather than requiring all callers to perform the
> extra check again.

None of the devm_ioremap_resource() changes have been submitted for
serial drivers. I see no problem with accepting the Spreadtrum driver
as is, and if someone wants to do a massive changeset to rework the
platform_get_resource()/devm_ioremap_resource() idiom for serial drivers
for 3.21, then the Spreadtrum driver would be included then.

That said, I'd prefer to see that idiom wrapped in a single function
rather than what's being suggested.

Regards,
Peter Hurley


--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v10 2/2] tty/serial: Add Spreadtrum sc9836-uart driver support
From: Thierry Reding @ 2015-01-30 10:18 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: Peter Hurley, Varka Bhadram, Chunyan Zhang,
	gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r, mark.rutland-5wv7dgnIgG8,
	gnomes-qBU/x9rampVanCEyBjwyrvXRex20P6io,
	heiko-4mtYJXux2i+zQB+pC5nmwQ, andrew-g2DYL2Zd6BY,
	jslaby-AlSwsSmVLrQ, lanqing.liu-lxIno14LUO0EEoCn2XhGlw,
	pawel.moll-5wv7dgnIgG8, zhang.lyra-Re5JQEeQqe8AvxtiuMwx3w,
	zhizhou.zhang-lxIno14LUO0EEoCn2XhGlw,
	geng.ren-lxIno14LUO0EEoCn2XhGlw,
	antonynpavlov-Re5JQEeQqe8AvxtiuMwx3w,
	linux-serial-u79uwXL29TY76Z2rM5mHXA,
	grant.likely-QSEj5FYQhm4dnm+yROfE0A,
	orsonzhai-Re5JQEeQqe8AvxtiuMwx3w, florian.vaussard-p8DiymsW2f8,
	devicetree-u79uwXL29TY76Z2rM5mHXA, jason-NLaQJdtUoK4Be96aLqz0jA,
	arnd-r2nGTMty4D4, ijc+devicetree-KcIKpvwj1kUDXYZnReoRVg,
	hytszk-Re5JQEeQqe8AvxtiuMwx3w, robh+dt-DgEjT+Ai2ygdnm+yROfE0A,
	wei.qiao-lxIno14LUO0EEoCn2XhGlw,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, galak-sgV2jX0FEOL9JmXXK+q4OQ,
	shawn.guo-QSEj5FYQhm4dnm+yROfE0A
In-Reply-To: <20150129160553.GC26493-l+eeeJia6m9vn6HldHNs0ANdhmdF6hFW@public.gmane.org>

[-- Attachment #1: Type: text/plain, Size: 3400 bytes --]

On Thu, Jan 29, 2015 at 04:05:53PM +0000, Russell King - ARM Linux wrote:
> On Thu, Jan 29, 2015 at 10:49:34AM -0500, Peter Hurley wrote:
> > Hi Varka,
> > 
> > On 01/29/2015 10:26 AM, Varka Bhadram wrote:
> > > This check is not required. It will be done by devm_ioremap_resource()
> > 
> > I disagree. devm_ioremap_resource() interprets the NULL resource as
> > a bad parameter and returns -EINVAL which is then forwarded as the
> > return value from the probe.
> > 
> > -ENODEV is the correct return value from the probe if the expected
> > resource is not available (either because it doesn't exist or was already
> > claimed by another driver).
> 
> (Adding Thierry as the author of this function.)
> 
> I believe devm_ioremap_resource() was explicitly designed to remove such
> error handling from drivers, and to give drivers a unified error response
> to such things as missing resources.
> 
> See the comments for this function in lib/devres.c.

Right. Before the introduction of this function drivers would copy the
same boilerplate but would use completely inconsistent return codes.
Well, to be correct devm_request_and_ioremap() was introduced first to
reduce the boilerplate, but one of the problems was that it returned a
NULL pointer on failure, so it was impossible to distinguish between
specific error conditions. It also had the negative side-effect of not
giving drivers any directions on what to do with the NULL return value
other than the example in the kerneldoc. But most people just didn't
seem to look at that, so instead of using -EADDRNOTAVAIL they'd again
go and do completely inconsistent things everywhere.

When we introduced devm_ioremap_resource(), the idea was to remove any
of these inconsistencies once and for all. You call the function and if
it fails you simply propagate the error code, so you get consistent
behaviour everywhere.

If I remember correctly the error codes for the various conditions were
discussed quite extensively, and what we currently have is what we got
by concensus.

-ENODEV is certainly not the correct return value if a resource is not
available. It translates to "no such device", but the device must
clearly be there, otherwise the ->probe() shouldn't have been called.
Or if it really isn't there, then you'd at least need a memory region
to probe, otherwise you can't determine whether it's there or not. So
from the perspective of a device driver I think a missing, or NULL,
resource, is indeed an "invalid argument".

I understand that people might see some ambiguity there, and -EINVAL is
certainly not a very accurate description, but such is the nature of
error codes. You pick what fits best. -ENXIO and -EADDRNOTAVAIL had been
under discussion as well if I remember correctly, but they both equally
ambiguous. -ENODATA might have been a better fit, but that too has a
different meaning in other contexts.

Besides, there's an error message that goes along with the error code
return, that should remove any ambiguity for people looking at dmesg.

All of that said, the original assertion that the check is not required
is still valid. We can argue at length about the specific error code but
if we decide that it needs to change, then we should modify
devm_ioremap_resource() rather than requiring all callers to perform the
extra check again.

Thierry

[-- Attachment #2: Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* Re: [RFC][PATCH v2] procfs: Always expose /proc/<pid>/map_files/ and make it readable
From: Kees Cook @ 2015-01-30  1:30 UTC (permalink / raw)
  To: Calvin Owens
  Cc: Andrew Morton, Cyrill Gorcunov, Kirill A. Shutemov,
	Alexey Dobriyan, Oleg Nesterov, Eric W. Biederman, Al Viro,
	Kirill A. Shutemov, Peter Feiner, Grant Likely,
	Siddhesh Poyarekar, LKML, kernel-team-b10kYP2dOMg,
	Pavel Emelyanov, Linux API
In-Reply-To: <20150128043832.GA2266262-ZEWhMxyTXSP95iwofa7G/laTQe2KTcn/@public.gmane.org>

On Tue, Jan 27, 2015 at 8:38 PM, Calvin Owens <calvinowens-b10kYP2dOMg@public.gmane.org> wrote:
> On Monday 01/26 at 15:43 -0800, Andrew Morton wrote:
>> On Tue, 27 Jan 2015 00:00:54 +0300 Cyrill Gorcunov <gorcunov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
>>
>> > On Mon, Jan 26, 2015 at 02:47:31PM +0200, Kirill A. Shutemov wrote:
>> > > On Fri, Jan 23, 2015 at 07:15:44PM -0800, Calvin Owens wrote:
>> > > > Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and
>> > > > is only exposed if CONFIG_CHECKPOINT_RESTORE is set. This interface
>> > > > is very useful for enumerating the files mapped into a process when
>> > > > the more verbose information in /proc/<pid>/maps is not needed.
>>
>> This is the main (actually only) justification for the patch, and it it
>> far too thin.  What does "not needed" mean.  Why can't people just use
>> /proc/pid/maps?
>
> The biggest difference is that if you do something like this:
>
>         fd = open("/stuff", O_BLAH);
>         map = mmap(NULL, 4096, PROT_BLAH, MAP_SHARED, fd, 0);
>         close(fd);
>         unlink("/stuff");
>
> ...then map_files/ gives you a way to get a file descriptor for
> "/stuff", which you couldn't do with /proc/pid/maps.
>
> It's also something of a win if you just want to see what is mapped at a
> specific address, since you can just readlink() the symlink for the
> address range you care about and it will go grab the appropriate VMA and
> give you the answer. /proc/pid/maps requires walking the VMA tree, which
> is quite expensive for processes with many thousands of threads, even
> without the O(N^2) issue.
>
> (You have to know what address range you want though, since readdir() on
> map_files/ obviously has to walk the VMA tree just like /proc/N/maps.)
>
>> > > > This patch moves the folder out from behind CHECKPOINT_RESTORE, and
>> > > > removes the CAP_SYS_ADMIN restrictions. Following the links requires
>> > > > the ability to ptrace the process in question, so this doesn't allow
>> > > > an attacker to do anything they couldn't already do before.
>> > > >
>> > > > Signed-off-by: Calvin Owens <calvinowens-b10kYP2dOMg@public.gmane.org>
>> > >
>> > > Cc +linux-api@
>> >
>> > Looks good to me, thanks! Though I would really appreciate if someone
>> > from security camp take a look as well.
>>
>> hm, who's that.  Kees comes to mind.
>>
>> And reviewers' task would be a heck of a lot easier if they knew what
>> /proc/pid/map_files actually does.  This:
>>
>> akpm3:/usr/src/25> grep -r map_files Documentation
>> akpm3:/usr/src/25>
>>
>> does not help.
>>
>> The 640708a2cff7f81 changelog says:
>>
>> :     This one behaves similarly to the /proc/<pid>/fd/ one - it contains
>> :     symlinks one for each mapping with file, the name of a symlink is
>> :     "vma->vm_start-vma->vm_end", the target is the file.  Opening a symlink
>> :     results in a file that point exactly to the same inode as them vma's one.
>> :
>> :     For example the ls -l of some arbitrary /proc/<pid>/map_files/
>> :
>> :      | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
>> :      | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
>> :      | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
>> :      | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
>> :      | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so
>>
>> afacit this info is also available in /proc/pid/maps, so things
>> shouldn't get worse if the /proc/pid/map_files permissions are at least
>> as restrictive as the /proc/pid/maps permissions.  Is that the case?
>> (Please add to changelog).
>
> Yes, the only difference is that you can follow the link as per above.
> I'll resend with a new message explaining that and the deletion thing.
>
>> There's one other problem here: we're assuming that the map_files
>> implementation doesn't have bugs.  If it does have bugs then relaxing
>> permissions like this will create new vulnerabilities.  And the
>> map_files implementation is surprisingly complex.  Is it bug-free?
>
> While I was messing with it I used it a good bit and didn't see any
> issues, although I didn't actively try to fuzz it or anything. I'd be
> happy to write something to test hammering it in weird ways if you like.
> I'm also happy to write testcases for namespaces.
>
> So far as security issues, as others have pointed out you can't follow
> the links unless you can ptrace the process in question, which seems
> like a pretty solid guarantee. As Cyrill pointed out in the discussion
> about the documentation, that's the same protection as /proc/N/fd/*, and
> those links function in the same way.

My concern here is that fd/* are connected as streams, and while that
has a certain level of badness as an external-to-the-process attacker,
PTRACE_MODE_READ is much weaker than PTRACE_MODE_ATTACH (which is
required for access to /proc/N/mem). Since these fds are the things
mapped into memory on a process, writing to them is a subset of access
to /proc/N/mem, and I don't feel that PTRACE_MODE_READ is sufficient.

-Kees

-- 
Kees Cook
Chrome OS Security

^ permalink raw reply

* Re: [RESEND] [PATCH] block: create ioctl to discard-or-zeroout a range of blocks
From: Martin K. Petersen @ 2015-01-30  0:48 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Jens Axboe, Christoph Hellwig,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Jeff Layton, J. Bruce Fields
In-Reply-To: <20150129020025.GE9981-PTl6brltDGh4DFYR7WNSRA@public.gmane.org>

>>>>> "Darrick" == Darrick J Wong <darrick.wong-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org> writes:

Darrick> Create a new ioctl to expose the block layer's newfound ability
Darrick> to issue either a zeroing discard, a WRITE SAME with a zero
Darrick> page, or a regular write with the zero page.  This BLKZEROOUT2
Darrick> ioctl takes {start, length, flags} as parameters.  So far, the
Darrick> only flag available is to enable the zeroing discard part --
Darrick> without it, the call invokes the old BLKZEROOUT behavior.
Darrick> start and length have the same meaning as in BLKZEROOUT.

Darrick> Furthermore, because BLKZEROOUT2 issues commands directly to
Darrick> the storage device, we must invalidate the page cache (as a
Darrick> regular O_DIRECT write would do) to avoid returning stale cache
Darrick> contents at a later time.

Looks good to me, Darrick.

Reviewed-by: Martin K. Petersen <martin.petersen-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply

* Re: [PATCH v1 1/5] IB/uverbs: ex_query_device: answer must not depend on request's comp_mask
From: Roland Dreier @ 2015-01-29 23:17 UTC (permalink / raw)
  To: Yann Droneaud
  Cc: Jason Gunthorpe, Sagi Grimberg, Shachar Raindel, Eli Cohen,
	Haggai Eran, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1422568741.3133.247.camel-RlY5vtjFyJ3QT0dZR+AlfA@public.gmane.org>

On Thu, Jan 29, 2015 at 1:59 PM, Yann Droneaud <ydroneaud-RlY5vtjFyJ3QT0dZR+AlfA@public.gmane.org> wrote:
>> Roland: I agree with Yann, these patches need to go in, or the ODP
>> patches reverted.

> Reverting all On Demand Paging patches seems overkill:
> if something as to be reverted it should be commit 5a77abf9a97a
> ("IB/core: Add support for extended query device caps") and the part of
> commit 860f10a799c8 ("IB/core: Add flags for on demand paging support")
> which modify ib_uverbs_ex_query_device().

Thank you and Jason for taking on this interface.

At this point I feel like I do about the IPoIB changes -- we should
revert the broken stuff and get it right for 3.20.

If we revert the two things you describe above, is everything else OK
to leave in 3.19 with respect to ABI?

Thanks,
  Roland

^ permalink raw reply

* Re: [RESEND] [PATCH] block: create ioctl to discard-or-zeroout a range of blocks
From: Arnd Bergmann @ 2015-01-29 22:19 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Jens Axboe, Christoph Hellwig, linux-kernel, linux-fsdevel,
	linux-api, Jeff Layton, J. Bruce Fields
In-Reply-To: <20150129190102.GF9981@birch.djwong.org>

On Thursday 29 January 2015 11:01:02 Darrick J. Wong wrote:
> It won't do all-ones, because the underlying blkdev_issue_zeroout call only
> knows how to tell the device to write zeroes or perform a discard if the flag
> is set and the device is whitelisted.  This patch only exposes the existing
> kernel call to userspace.
>
> (All-ones could be plumbed into the storage stack, but that would have to be a
> separate patch.)

I see. I've just checked the code in the mmc core and it seems to
get the correct information for both mmc and sd devices (they use
different bits to announce the feature).

So as long as the devices follow the spec, everything is fine here.

Thanks Darrick and Robert for the quick reply!

	Arnd

^ permalink raw reply

* Re: [PATCH v1 1/5] IB/uverbs: ex_query_device: answer must not depend on request's comp_mask
From: Yann Droneaud @ 2015-01-29 21:59 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Sagi Grimberg, Shachar Raindel, Eli Cohen, Haggai Eran,
	Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20150129182800.GH11842-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>

Hi,

Le jeudi 29 janvier 2015 à 11:28 -0700, Jason Gunthorpe a écrit :
> On Thu, Jan 29, 2015 at 06:59:58PM +0100, Yann Droneaud wrote:
> > As specified in "Extending Verbs API" presentation [1] by Tzahi Oved
> > during OFA International Developer Workshop 2013, the request's
> > comp_mask should describe the request data: it's describe the
> > availability of extended fields in the request.
> > Conversely, the response's comp_mask should describe the presence
> > of extended fields in the response.
> 
> Roland: I agree with Yann, these patches need to go in, or the ODP
> patches reverted.
> 

Reverting all On Demand Paging patches seems overkill:
if something as to be reverted it should be commit 5a77abf9a97a
("IB/core: Add support for extended query device caps") and the part of
commit 860f10a799c8 ("IB/core: Add flags for on demand paging support")
which modify ib_uverbs_ex_query_device().

But I wonder about this part of commit 860f10a799c8:

diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index c7a43624c96b..f9326ccda4b5 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -953,6 +953,18 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
                goto err_free;
        }
 
+       if (cmd.access_flags & IB_ACCESS_ON_DEMAND) {
+               struct ib_device_attr attr;
+
+               ret = ib_query_device(pd->device, &attr);
+               if (ret || !(attr.device_cap_flags &
+                               IB_DEVICE_ON_DEMAND_PAGING)) {
+                       pr_debug("ODP support not available\n");
+                       ret = -EINVAL;
+                       goto err_put;
+               }
+       }
+

AFAICT (1 << 6) bit from struct ib_uverbs_reg_mr access_flags field
was not enforced to be 0 previously, as ib_check_mr_access() only check 
for some coherency between a subset of the bits (it's not a function
dedicated to check flags provided by userspace):

include/rdma/ib_verbs.h:

   1094 enum ib_access_flags {
   1095         IB_ACCESS_LOCAL_WRITE   = 1,
   1096         IB_ACCESS_REMOTE_WRITE  = (1<<1),
   1097         IB_ACCESS_REMOTE_READ   = (1<<2),
   1098         IB_ACCESS_REMOTE_ATOMIC = (1<<3),
   1099         IB_ACCESS_MW_BIND       = (1<<4),
   1100         IB_ZERO_BASED           = (1<<5),
   1101         IB_ACCESS_ON_DEMAND     = (1<<6),
   1102 };

drivers/infiniband/core/uverbs_cmd.c: ib_uverbs_reg_mr()

    961         ret = ib_check_mr_access(cmd.access_flags);
    962         if (ret)
    963                 return ret;

include/rdma/ib_verbs.h:

   2643 static inline int ib_check_mr_access(int flags)
   2644 {
   2645         /*
   2646          * Local write permission is required if remote write or
   2647          * remote atomic permission is also requested.
   2648          */
   2649         if (flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) &&
   2650             !(flags & IB_ACCESS_LOCAL_WRITE))
   2651                 return -EINVAL;
   2652 
   2653         return 0;
   2654 }

drivers/infiniband/core/uverbs_cmd.c: ib_uverbs_reg_mr()

    990         mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
    991                                      cmd.access_flags, &udata);

reg_user_mr() functions may call ib_umem_get() and pass access_flags to
it:

drivers/infiniband/core/umem.c: ib_umem_get()

    114         /*
    115          * We ask for writable memory if any of the following
    116          * access flags are set.  "Local write" and "remote write"
    117          * obviously require write access.  "Remote atomic" can do
    118          * things like fetch and add, which will modify memory, and
    119          * "MW bind" can change permissions by binding a window.
    120          */
    121         umem->writable  = !!(access &
    122                 (IB_ACCESS_LOCAL_WRITE   | IB_ACCESS_REMOTE_WRITE |
    123                  IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND));
    124 
    125         if (access & IB_ACCESS_ON_DEMAND) {
    126                 ret = ib_umem_odp_get(context, umem);
    127                 if (ret) {
    128                         kfree(umem);
    129                         return ERR_PTR(ret);
    130                 }
    131                 return umem;
    132         }


As you can see only a few bits in access_flags are checked in the end,
so it may exist a very unlikely possibility that an existing userspace
program is setting this IB_ACCESS_ON_DEMAND bit without the intention
of enabling on demand paging as this would be unnoticed by older kernel.

In the other hand, a newer program built with on-demand-paging in mind
will set the bit, but when run on older kernel, it will be a no-op,
allowing the program to continue, perhaps thinking on-demand-paging
is available.

That should be avoided as much as possible.

Unfortunately, I think this cannot be fixed as it's was long since 
IB_ZERO_BASED was added by commit 7083e42ee2 ("IB/core: Add "type 2" 
memory windows support").
Anyway there was no check for IB_ACCESS_REMOTE_READ, nor 
IB_ACCESS_MW_BIND in the uverb layer either.

So, just as the second argument of open() syscall (remember O_TMPFILE,
see http://lwn.net/Articles/562294/ ), we will have to live with and be
careful ...

Regards.

-- 
Yann Droneaud
OPTEYA

^ permalink raw reply related

* Re: [PATCH 3/4] IB/uverbs: ex_query_device: check request's comp_mask
From: Jason Gunthorpe @ 2015-01-29 21:34 UTC (permalink / raw)
  To: Yann Droneaud
  Cc: Haggai Eran, Sagi Grimberg, Shachar Raindel, Eli Cohen,
	Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Tzahi Oved, Or Gerlitz,
	Matan Barak
In-Reply-To: <1422566069.3133.218.camel-RlY5vtjFyJ3QT0dZR+AlfA@public.gmane.org>

On Thu, Jan 29, 2015 at 10:14:29PM +0100, Yann Droneaud wrote:
> > > Unfortunately, the userspace don't get the size of the returned data:
> > > it's only a single "write()" syscall after all.
> > 
> > A write syscall that behaves nothing like write() actually should, so
> > I don't see why we can't have
> > 
> > resp_len = write(fd,inout_buf,sizeof(input_len));
> > 
> > Returning 0 from write makes no sense at all.

Okay, yes, I see, the flow is different for the ex versions from the
non-ex versions.

> > In the fullness of your patchset it will maintain the invariant that
> > resp_len <= sizeof(input_len)

> I don't catch your point: the response can be bigger than the request.

Indeed, I forgot the scheme used that indirection buffer with
pointers. My comments on write() result are all wrong. Sorry

> > It was never the intent that the size should be computed from
> > comp_mask. If the size is necessary it must be explicit.
> > 
> > In this instance if the size is not returned then libibverbs will have
> > to zero the entire user buffer before passing it to the kernel,
> > because it has to ensure any tail for the user app is 0'd.
>
> The proposed patch ensure the integrity of the response regarding
> comp_mask: if a bit is set in response's comp_mask that means the
> related fields are presents (and valid).

The patch it is good, but sitting this into the larger framework where
we have libibverbs consumers compiled against arbitrary versions of
the structure creates a broader complexity that someone has to deal
with.

libibverbs needs to ensure the trailer portion of the structure from
the end user is 0'd.

> > Remember for santity we want comp mask bits for things that can't be 0
> 
> For me, it's better if a bit is set in response's comp_mask by the 
> kernel when the kernel have written something in the related fields
> even if the those fields are all 0.

Yes, but my point is that comp mask bit should only be used to protect
fields where 0 is not an acceptable 'do not support' answer.

Primarily we should rely on memset(0) to provide compatibility, and
only when really necessary introduce a comp mask bit.

This is why the size is important, we can't deduce the size from
comp_mask, and without the size we can't know where the kernel stopped
writing and the whole thing becomes a little more fragile.

Zeroing the response buffer before calling into the kernel is not a
great general pattern.

Perhaps having the kernel zero the trailing portion it doesn't write
too is okay, but it still feels like not telling user space the size
is asking for future trouble.

> > Ideally we want to minimize the number of COMP bits because it is a
> > huge burden on the end user to work with them.
> 
> Sure. So I think comp_mask is just an overly complicated way of
> expressing the version and the size of the response.

Yes. But understand it is selected to try and provide a good end user
experience, exposing structure versions or size directly to the user
is very difficult to work with.

Ideally the end user will not see many comp mask bits (ie I would drop
the ODP one) and things will 'just work' as compiled.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 3/4] IB/uverbs: ex_query_device: check request's comp_mask
From: Yann Droneaud @ 2015-01-29 21:14 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Haggai Eran, Sagi Grimberg, Shachar Raindel, Eli Cohen,
	Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Tzahi Oved, Or Gerlitz,
	Matan Barak
In-Reply-To: <20150129191423.GL11842-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>

Le jeudi 29 janvier 2015 à 12:14 -0700, Jason Gunthorpe a écrit :
> On Thu, Jan 29, 2015 at 07:35:14PM +0100, Yann Droneaud wrote:
> 
> > Unfortunately, the userspace don't get the size of the returned data:
> > it's only a single "write()" syscall after all.
> 
> A write syscall that behaves nothing like write() actually should, so
> I don't see why we can't have
> 
> resp_len = write(fd,inout_buf,sizeof(input_len));
> 
> Returning 0 from write makes no sense at all.
> 

0 is not the result of the write() syscall, as for extended uverbs
I've ensure the return value of the write() syscall was the size
it was given. See
http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/drivers/infiniband/core/uverbs_main.c?id=v3.19-rc6#n599

    705                 err = uverbs_ex_cmd_table[command](file,
    706                                                    &ucore,
    707                                                    &uhw);
    708 
    709                 if (err)
    710                         return err;
    711 
    712                 return written_count;

See commit f21519b23c1 ("IB/core: extended command: an improved
infrastructure for uverbs commands")

http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=f21519b23c1b6fa25366be4114ccf7fcf1c190f9

> In the fullness of your patchset it will maintain the invariant that
> resp_len <= sizeof(input_len)
> 

I don't catch your point: the response can be bigger than the request.

> Which seems OK to me considering what we have to work with, and a
> significantly better choice than 0.
> 
> > So the kernel is left with the comp_mask in the response to express
> > the returned size.
> 
> It was never the intent that the size should be computed from
> comp_mask. If the size is necessary it must be explicit.
> 
> In this instance if the size is not returned then libibverbs will have
> to zero the entire user buffer before passing it to the kernel,
> because it has to ensure any tail for the user app is 0'd.
> 

The proposed patch ensure the integrity of the response regarding
comp_mask: if a bit is set in response's comp_mask that means the
related fields are presents (and valid).

So before parsing the response fields, userspace have to check
response's comp_mask: fields access must be protected by correct
check on comp_mask ... but it might be useful for the userspace
developer to clear the response buffer just in case he/she decided
to be lazy with the check.

> Remember for santity we want comp mask bits for things that can't be 0

For me, it's better if a bit is set in response's comp_mask by the 
kernel when the kernel have written something in the related fields 
even if the those fields are all 0.

> and we want 0 for things that are not set.
> 
> struct ib_query_device_ex res;
> ibv_query_device_ex(..,res,sizeof(res));
> 
> printf("%u",res.foo_cap); // 0 if unsupported is OK
> if (res.comp_mask & COMP_BAR)
>   printf("%u",res.bar_thingy);  // 0 has meaning, must use COMP_BAR
> 
> Ideally we want to minimize the number of COMP bits because it is a
> huge burden on the end user to work with them.
> 

Sure. So I think comp_mask is just an overly complicated way of
expressing the version and the size of the response.

> > > The purpose of the output comp_mask is to allow drivers to declare
> > > they do not support the new structure members, and comp_mask bits
> > > should only be used with new structure members do not have a natural
> > > 'null' value.
> 
> > It's not (yet) about drivers as the output's comp_mask (in the 
> > response), is set by uverbs layer.
> > 
> > Do you think we have to enforce a 1 to 1 relation between the request 
> > comp_mask's bits and the response comp_mask's bits ?
> 
> In the query context I would be happy enough to just treat the in
> bound buffer as uninitialized buffer space, but certainly generally
> speaking the comp_mask+size should refer to the structure - so
> input/output are not directly related.
> 

OK.

> > > Further, we need to distinguish cases where additional data is
> > > mandatory or optional.
> > > 
> > > query_device is clearly optional, the only purpose the input comp mask
> > > serves is to reduce expensive work in the driver by indicating that
> > > some result bits are not needed.
> > 
> > That's not how I've understand the purpose of the request's comp_mask
> > after reading the presentation: request's comp_mask describe the content
> > of the request. Then that additional content can trigger the presence 
> > of additional fields in the response if needed.
> 
> Agreed - what I described was an abuse that some early Mellanox
> patches for a query_ex included and it was just a shortcut to avoid
> defining a dedicated input structure. It seems that same scheme was
> copied into these new patches.
> 

OK

> > >  It is perfectly OK for the kernel to
> > > ignore the input comp mask, and OK for userspace to typically request
> > > all bits. (indeed, I think this is a pretty silly optimization myself,
> > > and the original patch that motivated this was restructured so it is
> > > not needed)
> > > 
> > 
> > It's not at all OK to ignore request's comp_mask: it has to be checked
> > to find if there's a feature requested the kernel cannot fullfil, and 
> > any unknown bit is a possible feature request. So the kernel has to 
> > reject the request as a whole as it don't know how to process it.
> 
> In the context of the above scheme the input structure was just this:
> 
> struct query_input
> {
>   u64 requested_output;
> };
> 
> ie it wasn't actually a comp_mask, it just overlapped the comp_mask
> bytes on output.
> 
> Such a use was never explicitly documented, and IIRC was never
> actually included in libibverbs.
> 

OK

> Unless someone has a strong reason we need to do this I am inclined to
> think that your interpretation is the better one, and we could always
> add a requested_output to the input someday if it became urgent.
> 

Sure one field can be added later: in such case, one bit of request's
comp_mask will gain the meaning: "request_output field is present in the
request".

> In any event, you are right, we can't ingore the input bytes today and
> expect to give them meaning tomorrow.
> 

Sure.

> > As we don't know yet how we would extend the extended QUERY_DEVICE
> > uverbs, the kernel have to refuse any random value.
> > 
> > http://blog.ffwll.ch/2013/11/botching-up-ioctls.html
> 
> or the kernel treats QUERY_DEVICE as an output only function and never
> inspects the in/out buffer at all.
> 

It's something we could think of but the extended uverbs is already
merge with the comp_mask field in the request and it cost only 4 bytes,
while allowing us to make mistake on the first iteration of the extended
QUERY_DEVICE uverb (provided we manage to add the check for the field
being 0).

> > > > Thanks for the link to the presentation.
> > > 
> > > If I recall the presentation is old and had some flaws that were
> > > addressed before things made it into libibverbs..
> > 
> > I have to have a look to this part of libibverbs: I'm not sure
> > the extended QUERY_DEVICE is already implemented.
> 
> The patches turned out to be unnecessary and were dropped, IIRC.
>  

OK.

> > > Thank you for looking at this, it is very important that this scheme
> > > is used properly, and it is very easy to make mistakes. I haven't had
> > > time to review any of these new patches myself.
> 
> > I hope you would find some time to review my latest patchset so that
> > we don't miss a corner case. It's starting to become urgent.
> 
> I have and will, thank you
> 

:)

Thanks.

Regards.

-- 
Yann Droneaud
OPTEYA

^ permalink raw reply

* Re: [PATCH v1 1/5] IB/uverbs: ex_query_device: answer must not depend on request's comp_mask
From: Jason Gunthorpe @ 2015-01-29 21:12 UTC (permalink / raw)
  To: Yann Droneaud
  Cc: Sagi Grimberg, Shachar Raindel, Eli Cohen, Haggai Eran,
	Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1422564638.3133.198.camel-RlY5vtjFyJ3QT0dZR+AlfA@public.gmane.org>

On Thu, Jan 29, 2015 at 09:50:38PM +0100, Yann Droneaud wrote:

> Anyway, I recognize that uverb way of abusing write() syscall is 
> borderline (at best) regarding other Linux subsystems and Unix paradigm 
> in general. But it's not enough to screw it more.

Then we must return the correct output size explicitly in the struct.

Jason

^ permalink raw reply

* Re: [PATCH v1 1/5] IB/uverbs: ex_query_device: answer must not depend on request's comp_mask
From: Yann Droneaud @ 2015-01-29 20:50 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Sagi Grimberg, Shachar Raindel, Eli Cohen, Haggai Eran,
	Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20150129191801.GM11842-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>

Hi,

Le jeudi 29 janvier 2015 à 12:18 -0700, Jason Gunthorpe a écrit :
> On Thu, Jan 29, 2015 at 07:43:29PM +0100, Yann Droneaud wrote:
> 
> > The write() syscall must return the size buffer passed to it, or
> > less, but in such case it would ask for trouble as userspace would
> > be allowed to write() the remaining bytes.  Returning a size bigger
> > than the one passed to write() is not acceptable and would break any
> > expectation.
> 
> By that logic the 0 return is still wrong, and it should be ucore->in_len
> 

This is handled by ib_uverbs_write() in
drivers/infiniband/core/uverbs_main.c:

    709                 if (err)
    710                         return err;
    711 
    712                 return written_count;


> But I think we can return less without risking anything breaking, it
> already violates the invariant associated with write() - it mutates
> the buffer passed in!
> 

I don't think so, as only the response buffer is written to and the
response buffer pointer is provided in the buffer given to write().

AFAIK, no uverbs ever change the content of the input buffer (eg. the
request): I've managed to declare the various input buffers "const" so
it would surprising to find it use for writing to userspace.

Anyway, I recognize that uverb way of abusing write() syscall is 
borderline (at best) regarding other Linux subsystems and Unix paradigm 
in general. But it's not enough to screw it more.

Regards.

-- 
Yann Droneaud
OPTEYA

^ permalink raw reply

* Re: [PATCH v1 1/5] IB/uverbs: ex_query_device: answer must not depend on request's comp_mask
From: Yann Droneaud @ 2015-01-29 20:42 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Sagi Grimberg, Shachar Raindel, Eli Cohen, Haggai Eran,
	Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1422557009.3133.172.camel-RlY5vtjFyJ3QT0dZR+AlfA@public.gmane.org>

Le jeudi 29 janvier 2015 à 19:43 +0100, Yann Droneaud a écrit :
> Le jeudi 29 janvier 2015 à 11:28 -0700, Jason Gunthorpe a écrit :
> > On Thu, Jan 29, 2015 at 06:59:58PM +0100, Yann Droneaud wrote:

> > > -		resp.odp_caps.general_caps = attr.odp_caps.general_caps;
> > > -		resp.odp_caps.per_transport_caps.rc_odp_caps =
> > > -			attr.odp_caps.per_transport_caps.rc_odp_caps;
> > > -		resp.odp_caps.per_transport_caps.uc_odp_caps =
> > > -			attr.odp_caps.per_transport_caps.uc_odp_caps;
> > > -		resp.odp_caps.per_transport_caps.ud_odp_caps =
> > > -			attr.odp_caps.per_transport_caps.ud_odp_caps;
> > > -		resp.comp_mask |= IB_USER_VERBS_EX_QUERY_DEVICE_ODP;
> > > -	}
> > > +	resp.odp_caps.general_caps = attr.odp_caps.general_caps;
> > > +	resp.odp_caps.per_transport_caps.rc_odp_caps =
> > > +		attr.odp_caps.per_transport_caps.rc_odp_caps;
> > > +	resp.odp_caps.per_transport_caps.uc_odp_caps =
> > > +		attr.odp_caps.per_transport_caps.uc_odp_caps;
> > > +	resp.odp_caps.per_transport_caps.ud_odp_caps =
> > > +		attr.odp_caps.per_transport_caps.ud_odp_caps;
> > >  #endif
> > > +	resp.comp_mask |= IB_USER_VERBS_EX_QUERY_DEVICE_ODP;
> > 
> > Not sure about this - if 0 is a valid null answer for all the _caps
> > then it is fine, and the comp_mask bit should just be removed as the
> > size alone should be enough.
> > 
> 
> That's difficult to say. But I hope 0 are valids answers in this case.
> 

Hum, according to include/rdma/ib_verbs.h, there's a bit set
for ODP support:

    148 enum ib_odp_general_cap_bits {
    149         IB_ODP_SUPPORT = 1 << 0,
    150 };

So it should be safe to set everything to 0 in struct
ib_uverbs_odp_caps.

Regards.

-- 
Yann Droneaud
OPTEYA

^ permalink raw reply

* [PATCH v12 2/2] crypto: AF_ALG: enable AEAD interface compilation
From: Stephan Mueller @ 2015-01-29 20:26 UTC (permalink / raw)
  To: 'Herbert Xu'
  Cc: 'Quentin Gouchet', Daniel Borkmann, 'LKML',
	linux-crypto-u79uwXL29TY76Z2rM5mHXA, 'Linux API'
In-Reply-To: <1681008.mocmysOoQU-PJstQz4BMNNP20K/wil9xYQuADTiUCJX@public.gmane.org>

Enable compilation of the AEAD AF_ALG support and provide a Kconfig
option to compile the AEAD AF_ALG support.

Signed-off-by: Stephan Mueller <smueller-T9tCv8IpfcWELgA04lAiVw@public.gmane.org>
---
 crypto/Kconfig  | 9 +++++++++
 crypto/Makefile | 1 +
 2 files changed, 10 insertions(+)

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 50f4da4..41a3fc5 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1523,6 +1523,15 @@ config CRYPTO_USER_API_RNG
 	  This option enables the user-spaces interface for random
 	  number generator algorithms.
 
+config CRYPTO_USER_API_AEAD
+	tristate "User-space interface for AEAD cipher algorithms"
+	depends on NET
+	select CRYPTO_AEAD
+	select CRYPTO_USER_API
+	help
+	  This option enables the user-spaces interface for AEAD
+	  cipher algorithms.
+
 config CRYPTO_HASH_INFO
 	bool
 
diff --git a/crypto/Makefile b/crypto/Makefile
index ba19465..97b7d3a 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_CRYPTO_USER_API) += af_alg.o
 obj-$(CONFIG_CRYPTO_USER_API_HASH) += algif_hash.o
 obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o
 obj-$(CONFIG_CRYPTO_USER_API_RNG) += algif_rng.o
+obj-$(CONFIG_CRYPTO_USER_API_AEAD) += algif_aead.o
 
 #
 # generic algorithms and the async_tx api
-- 
2.1.0

^ permalink raw reply related

* [PATCH v12 1/2] crypto: AF_ALG: add AEAD support
From: Stephan Mueller @ 2015-01-29 20:24 UTC (permalink / raw)
  To: 'Herbert Xu'
  Cc: 'Quentin Gouchet', Daniel Borkmann, 'LKML',
	linux-crypto, 'Linux API'
In-Reply-To: <1681008.mocmysOoQU@tachyon.chronox.de>

This patch adds the AEAD support for AF_ALG.

The implementation is based on algif_skcipher, but contains heavy
modifications to streamline the interface for AEAD uses.

To use AEAD, the user space consumer has to use the salg_type named
"aead".

The AEAD implementation includes some overhead to calculate the size of
the ciphertext, because the AEAD implementation of the kernel crypto API
makes implied assumption on the location of the authentication tag. When
performing an encryption, the tag will be added to the created
ciphertext (note, the tag is placed adjacent to the ciphertext). For
decryption, the caller must hand in the ciphertext with the tag appended
to the ciphertext. Therefore, the selection of the used memory
needs to add/subtract the tag size from the source/destination buffers
depending on the encryption type. The code is provided with comments
explaining when and how that operation is performed.

A fully working example using all aspects of AEAD is provided at
http://www.chronox.de/libkcapi.html

Signed-off-by: Stephan Mueller <smueller@chronox.de>
---
 crypto/algif_aead.c | 666 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 666 insertions(+)
 create mode 100644 crypto/algif_aead.c

diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c
new file mode 100644
index 0000000..fe3dfdd
--- /dev/null
+++ b/crypto/algif_aead.c
@@ -0,0 +1,666 @@
+/*
+ * algif_aead: User-space interface for AEAD algorithms
+ *
+ * Copyright (C) 2014, Stephan Mueller <smueller@chronox.de>
+ *
+ * This file provides the user-space API for AEAD ciphers.
+ *
+ * This file is derived from algif_skcipher.c.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/scatterwalk.h>
+#include <crypto/if_alg.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <net/sock.h>
+
+struct aead_sg_list {
+	unsigned int cur;
+	struct scatterlist sg[ALG_MAX_PAGES];
+};
+
+struct aead_ctx {
+	struct aead_sg_list tsgl;
+	/*
+	 * RSGL_MAX_ENTRIES is an artificial limit where user space at maximum
+	 * can cause the kernel to allocate RSGL_MAX_ENTRIES * ALG_MAX_PAGES
+	 * bytes
+	 */
+#define RSGL_MAX_ENTRIES ALG_MAX_PAGES
+	struct af_alg_sgl rsgl[RSGL_MAX_ENTRIES];
+
+	void *iv;
+
+	struct af_alg_completion completion;
+
+	unsigned long used;
+
+	unsigned int len;
+	bool more;
+	bool merge;
+	bool enc;
+
+	size_t aead_assoclen;
+	struct aead_request aead_req;
+};
+
+static inline int aead_sndbuf(struct sock *sk)
+{
+	struct alg_sock *ask = alg_sk(sk);
+	struct aead_ctx *ctx = ask->private;
+
+	return max_t(int, max_t(int, sk->sk_sndbuf & PAGE_MASK, PAGE_SIZE) -
+			  ctx->used, 0);
+}
+
+static inline bool aead_writable(struct sock *sk)
+{
+	return PAGE_SIZE <= aead_sndbuf(sk);
+}
+
+static inline bool aead_sufficient_data(struct aead_ctx *ctx)
+{
+	unsigned as = crypto_aead_authsize(crypto_aead_reqtfm(&ctx->aead_req));
+
+	return (ctx->used >= (ctx->aead_assoclen + (ctx->enc ? 0 : as)));
+}
+
+static void aead_put_sgl(struct sock *sk)
+{
+	struct alg_sock *ask = alg_sk(sk);
+	struct aead_ctx *ctx = ask->private;
+	struct aead_sg_list *sgl = &ctx->tsgl;
+	struct scatterlist *sg = sgl->sg;
+	unsigned int i;
+
+	for (i = 0; i < sgl->cur; i++) {
+		if (!sg_page(sg + i))
+			continue;
+
+		put_page(sg_page(sg + i));
+		sg_assign_page(sg + i, NULL);
+	}
+	sgl->cur = 0;
+	ctx->used = 0;
+	ctx->more = 0;
+	ctx->merge = 0;
+}
+
+static void aead_wmem_wakeup(struct sock *sk)
+{
+	struct socket_wq *wq;
+
+	if (!aead_writable(sk))
+		return;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
+							   POLLRDNORM |
+							   POLLRDBAND);
+	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+	rcu_read_unlock();
+}
+
+static int aead_wait_for_data(struct sock *sk, unsigned flags)
+{
+	struct alg_sock *ask = alg_sk(sk);
+	struct aead_ctx *ctx = ask->private;
+	long timeout;
+	DEFINE_WAIT(wait);
+	int err = -ERESTARTSYS;
+
+	if (flags & MSG_DONTWAIT)
+		return -EAGAIN;
+
+	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+
+	for (;;) {
+		if (signal_pending(current))
+			break;
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		timeout = MAX_SCHEDULE_TIMEOUT;
+		if (sk_wait_event(sk, &timeout, !ctx->more)) {
+			err = 0;
+			break;
+		}
+	}
+	finish_wait(sk_sleep(sk), &wait);
+
+	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+
+	return err;
+}
+
+static void aead_data_wakeup(struct sock *sk)
+{
+	struct alg_sock *ask = alg_sk(sk);
+	struct aead_ctx *ctx = ask->private;
+	struct socket_wq *wq;
+
+	if (ctx->more)
+		return;
+	if (!ctx->used)
+		return;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
+							   POLLRDNORM |
+							   POLLRDBAND);
+	sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+	rcu_read_unlock();
+}
+
+static int aead_sendmsg(struct kiocb *unused, struct socket *sock,
+			struct msghdr *msg, size_t size)
+{
+	struct sock *sk = sock->sk;
+	struct alg_sock *ask = alg_sk(sk);
+	struct aead_ctx *ctx = ask->private;
+	unsigned ivsize =
+		crypto_aead_ivsize(crypto_aead_reqtfm(&ctx->aead_req));
+	struct aead_sg_list *sgl = &ctx->tsgl;
+	struct af_alg_control con = {};
+	long copied = 0;
+	bool enc = 0;
+	bool init = 0;
+	int err = -EINVAL;
+
+	if (msg->msg_controllen) {
+		err = af_alg_cmsg_send(msg, &con);
+		if (err)
+			return err;
+
+		init = 1;
+		switch (con.op) {
+		case ALG_OP_ENCRYPT:
+			enc = 1;
+			break;
+		case ALG_OP_DECRYPT:
+			enc = 0;
+			break;
+		default:
+			return -EINVAL;
+		}
+
+		if (con.iv && con.iv->ivlen != ivsize)
+			return -EINVAL;
+	}
+
+	lock_sock(sk);
+	if (!ctx->more && ctx->used)
+		goto unlock;
+
+	if (init) {
+		ctx->enc = enc;
+		if (con.iv)
+			memcpy(ctx->iv, con.iv->iv, ivsize);
+
+		ctx->aead_assoclen = con.aead_assoclen;
+	}
+
+	while (size) {
+		unsigned long len = size;
+		struct scatterlist *sg = NULL;
+
+		/* use the existing memory in an allocated page */
+		if (ctx->merge) {
+			sg = sgl->sg + sgl->cur - 1;
+			len = min_t(unsigned long, len,
+				    PAGE_SIZE - sg->offset - sg->length);
+			err = memcpy_from_msg(page_address(sg_page(sg)) +
+					      sg->offset + sg->length,
+					      msg, len);
+			if (err)
+				goto unlock;
+
+			sg->length += len;
+			ctx->merge = (sg->offset + sg->length) &
+				     (PAGE_SIZE - 1);
+
+			ctx->used += len;
+			copied += len;
+			size -= len;
+			continue;
+		}
+
+		if (!aead_writable(sk)) {
+			/* user space sent too much data */
+			aead_put_sgl(sk);
+			err = -EMSGSIZE;
+			goto unlock;
+		}
+
+		/* allocate a new page */
+		len = min_t(unsigned long, size, aead_sndbuf(sk));
+		while (len) {
+			int plen = 0;
+
+			if (sgl->cur >= ALG_MAX_PAGES) {
+				aead_put_sgl(sk);
+				err = -E2BIG;
+				goto unlock;
+			}
+
+			sg = sgl->sg + sgl->cur;
+			plen = min_t(int, len, PAGE_SIZE);
+
+			sg_assign_page(sg, alloc_page(GFP_KERNEL));
+			err = -ENOMEM;
+			if (!sg_page(sg))
+				goto unlock;
+
+			err = memcpy_from_msg(page_address(sg_page(sg)),
+					      msg, plen);
+			if (err) {
+				__free_page(sg_page(sg));
+				sg_assign_page(sg, NULL);
+				goto unlock;
+			}
+
+			sg->offset = 0;
+			sg->length = plen;
+			len -= plen;
+			ctx->used += plen;
+			copied += plen;
+			sgl->cur++;
+			size -= plen;
+			ctx->merge = plen & (PAGE_SIZE - 1);
+		}
+	}
+
+	err = 0;
+
+	ctx->more = msg->msg_flags & MSG_MORE;
+	if (!ctx->more && !aead_sufficient_data(ctx)) {
+		aead_put_sgl(sk);
+		err = -EMSGSIZE;
+	}
+
+unlock:
+	aead_data_wakeup(sk);
+	release_sock(sk);
+
+	return err ?: copied;
+}
+
+static ssize_t aead_sendpage(struct socket *sock, struct page *page,
+			     int offset, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct alg_sock *ask = alg_sk(sk);
+	struct aead_ctx *ctx = ask->private;
+	struct aead_sg_list *sgl = &ctx->tsgl;
+	int err = -EINVAL;
+
+	if (flags & MSG_SENDPAGE_NOTLAST)
+		flags |= MSG_MORE;
+
+	if (sgl->cur >= ALG_MAX_PAGES)
+		return -E2BIG;
+
+	lock_sock(sk);
+	if (!ctx->more && ctx->used)
+		goto unlock;
+
+	if (!size)
+		goto done;
+
+	if (!aead_writable(sk)) {
+		/* user space sent too much data */
+		aead_put_sgl(sk);
+		err = -EMSGSIZE;
+		goto unlock;
+	}
+
+	ctx->merge = 0;
+
+	get_page(page);
+	sg_set_page(sgl->sg + sgl->cur, page, size, offset);
+	sgl->cur++;
+	ctx->used += size;
+
+	err = 0;
+
+done:
+	ctx->more = flags & MSG_MORE;
+	if (!ctx->more && !aead_sufficient_data(ctx)) {
+		aead_put_sgl(sk);
+		err = -EMSGSIZE;
+	}
+
+unlock:
+	aead_data_wakeup(sk);
+	release_sock(sk);
+
+	return err ?: size;
+}
+
+static int aead_recvmsg(struct kiocb *unused, struct socket *sock,
+			struct msghdr *msg, size_t ignored, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct alg_sock *ask = alg_sk(sk);
+	struct aead_ctx *ctx = ask->private;
+	unsigned bs = crypto_aead_blocksize(crypto_aead_reqtfm(&ctx->aead_req));
+	unsigned as = crypto_aead_authsize(crypto_aead_reqtfm(&ctx->aead_req));
+	struct aead_sg_list *sgl = &ctx->tsgl;
+	struct scatterlist *sg = NULL;
+	struct scatterlist assoc[ALG_MAX_PAGES];
+	size_t assoclen = 0;
+	unsigned int i = 0;
+	int err = -EINVAL;
+	unsigned long used = 0;
+	unsigned long outlen = 0;
+	const struct iovec *iov;
+	unsigned long iovlen = 0;
+	unsigned long usedpages = 0;
+
+	/* Limit number of IOV blocks to be accessed below */
+	if (msg->msg_iter.nr_segs > RSGL_MAX_ENTRIES)
+		return -ENOMSG;
+
+	lock_sock(sk);
+
+	/*
+	 * AEAD memory structure: For encryption, the tag is appended to the
+	 * ciphertext which implies that the memory allocated for the ciphertext
+	 * must be increased by the tag length. For decryption, the tag
+	 * is expected to be concatenated to the ciphertext. The plaintext
+	 * therefore has a memory size of the ciphertext minus the tag length.
+	 *
+	 * The memory structure for cipher operation has the following
+	 * structure:
+	 *	AEAD encryption input:  assoc data || plaintext
+	 *	AEAD encryption output: cipherntext || auth tag
+	 *	AEAD decryption input:  assoc data || ciphertext || auth tag
+	 *	AEAD decryption output: plaintext
+	 */
+
+	if (ctx->more) {
+		err = aead_wait_for_data(sk, flags);
+		if (err)
+			goto unlock;
+	}
+
+	used = ctx->used;
+
+	/*
+	 * Make sure sufficient data is present -- note, the same check is
+	 * is also present in sendmsg/sendpage. The checks in sendpage/sendmsg
+	 * shall provide an information to the data sender that something is
+	 * wrong, but they are irrelevant to maintain the kernel integrity.
+	 * We need this check here too in case user space decides to not honor
+	 * the error message in sendmsg/sendpage and still call recvmsg. This
+	 * check here protects the kernel integrity.
+	 */
+	if (!aead_sufficient_data(ctx))
+		goto unlock;
+
+	/*
+	 * The cipher operation input data is reduced by the associated data
+	 * length as this data is processed separately later on.
+	 */
+	used -= ctx->aead_assoclen;
+
+	if (ctx->enc) {
+		/* round up output buffer to multiple of block size */
+		outlen = ((used + bs - 1) / bs * bs);
+		/* add the size needed for the auth tag to be created */
+		outlen += as;
+	} else {
+		/* output data size is input without the authentication tag */
+		outlen = used - as;
+		/* round up output buffer to multiple of block size */
+		outlen = ((outlen + bs - 1) / bs * bs);
+	}
+
+	/* convert iovecs of output buffers into scatterlists */
+	for (iov = msg->msg_iter.iov, iovlen = 0;
+	     iovlen < msg->msg_iter.nr_segs; iovlen++, iov++) {
+		char __user *from = iov->iov_base;
+		unsigned long seglen = min_t(unsigned long, iov->iov_len,
+					     (outlen - usedpages));
+
+		/* make one iovec available as scatterlist */
+		err = af_alg_make_sg(&ctx->rsgl[iovlen], from, seglen, 1);
+		if (err < 0)
+			goto unlock;
+		usedpages += err;
+		/* chain the new scatterlist with initial list */
+		if (iovlen)
+			scatterwalk_crypto_chain(ctx->rsgl[0].sg,
+					ctx->rsgl[iovlen].sg, 1,
+					sg_nents(ctx->rsgl[iovlen-1].sg));
+		/* we do not need more iovecs as we have sufficient memory */
+		if (outlen <= usedpages)
+			break;
+	}
+
+	err = -EINVAL;
+	/* ensure output buffer is sufficiently large */
+	if (usedpages < outlen)
+		goto unlock;
+
+	sg_init_table(assoc, ALG_MAX_PAGES);
+	assoclen = ctx->aead_assoclen;
+	/*
+	 * Split scatterlist into two: first part becomes AD, second part
+	 * is plaintext / ciphertext. The first part is assigned to assoc
+	 * scatterlist. When this loop finishes, sg points to the start of the
+	 * plaintext / ciphertext.
+	 */
+	for (i = 0; i < ctx->tsgl.cur; i++) {
+		sg = sgl->sg + i;
+		if (sg->length <= assoclen) {
+			/* AD is larger than one page */
+			sg_set_page(assoc + i, sg_page(sg),
+				    sg->length, sg->offset);
+			assoclen -= sg->length;
+			if (i >= ctx->tsgl.cur)
+				goto unlock;
+		} else if (!assoclen) {
+			/* current page is to start of plaintext / ciphertext */
+			if (i)
+				/* AD terminates at page boundary */
+				sg_mark_end(assoc + i - 1);
+			else
+				/* AD size is zero */
+				sg_mark_end(assoc);
+			break;
+		} else {
+			/* AD does not terminate at page boundary */
+			sg_set_page(assoc + i, sg_page(sg),
+				    assoclen, sg->offset);
+			sg_mark_end(assoc + i);
+			/* plaintext / ciphertext starts after AD */
+			sg->length -= assoclen;
+			sg->offset += assoclen;
+			break;
+		}
+	}
+
+	aead_request_set_assoc(&ctx->aead_req, assoc, ctx->aead_assoclen);
+	aead_request_set_crypt(&ctx->aead_req, sg, ctx->rsgl[0].sg, used,
+			       ctx->iv);
+
+	err = af_alg_wait_for_completion(ctx->enc ?
+					 crypto_aead_encrypt(&ctx->aead_req) :
+					 crypto_aead_decrypt(&ctx->aead_req),
+					 &ctx->completion);
+
+	if (err) {
+		/* EBADMSG implies a valid cipher operation took place */
+		if (err == -EBADMSG)
+			aead_put_sgl(sk);
+		goto unlock;
+	}
+
+	aead_put_sgl(sk);
+
+	err = 0;
+
+unlock:
+	for (i = 0; i < iovlen; i++)
+		af_alg_free_sg(&ctx->rsgl[i]);
+
+	aead_wmem_wakeup(sk);
+	release_sock(sk);
+
+	return err ? err : outlen;
+}
+
+static unsigned int aead_poll(struct file *file, struct socket *sock,
+			      poll_table *wait)
+{
+	struct sock *sk = sock->sk;
+	struct alg_sock *ask = alg_sk(sk);
+	struct aead_ctx *ctx = ask->private;
+	unsigned int mask;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+	mask = 0;
+
+	if (!ctx->more)
+		mask |= POLLIN | POLLRDNORM;
+
+	if (aead_writable(sk))
+		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+
+	return mask;
+}
+
+static struct proto_ops algif_aead_ops = {
+	.family		=	PF_ALG,
+
+	.connect	=	sock_no_connect,
+	.socketpair	=	sock_no_socketpair,
+	.getname	=	sock_no_getname,
+	.ioctl		=	sock_no_ioctl,
+	.listen		=	sock_no_listen,
+	.shutdown	=	sock_no_shutdown,
+	.getsockopt	=	sock_no_getsockopt,
+	.mmap		=	sock_no_mmap,
+	.bind		=	sock_no_bind,
+	.accept		=	sock_no_accept,
+	.setsockopt	=	sock_no_setsockopt,
+
+	.release	=	af_alg_release,
+	.sendmsg	=	aead_sendmsg,
+	.sendpage	=	aead_sendpage,
+	.recvmsg	=	aead_recvmsg,
+	.poll		=	aead_poll,
+};
+
+static void *aead_bind(const char *name, u32 type, u32 mask)
+{
+	return crypto_alloc_aead(name, type, mask);
+}
+
+static void aead_release(void *private)
+{
+	crypto_free_aead(private);
+}
+
+static int aead_setauthsize(void *private, unsigned int authsize)
+{
+	return crypto_aead_setauthsize(private, authsize);
+}
+
+static int aead_setkey(void *private, const u8 *key, unsigned int keylen)
+{
+	return crypto_aead_setkey(private, key, keylen);
+}
+
+static void aead_sock_destruct(struct sock *sk)
+{
+	struct alg_sock *ask = alg_sk(sk);
+	struct aead_ctx *ctx = ask->private;
+	unsigned int ivlen = crypto_aead_ivsize(
+				crypto_aead_reqtfm(&ctx->aead_req));
+
+	aead_put_sgl(sk);
+	sock_kzfree_s(sk, ctx->iv, ivlen);
+	sock_kfree_s(sk, ctx, ctx->len);
+	af_alg_release_parent(sk);
+}
+
+static int aead_accept_parent(void *private, struct sock *sk)
+{
+	struct aead_ctx *ctx;
+	struct alg_sock *ask = alg_sk(sk);
+	unsigned int len = sizeof(*ctx) + crypto_aead_reqsize(private);
+	unsigned int ivlen = crypto_aead_ivsize(private);
+
+	ctx = sock_kmalloc(sk, len, GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+	memset(ctx, 0, len);
+
+	ctx->iv = sock_kmalloc(sk, ivlen, GFP_KERNEL);
+	if (!ctx->iv) {
+		sock_kfree_s(sk, ctx, len);
+		return -ENOMEM;
+	}
+	memset(ctx->iv, 0, ivlen);
+
+	ctx->len = len;
+	ctx->used = 0;
+	ctx->more = 0;
+	ctx->merge = 0;
+	ctx->enc = 0;
+	ctx->tsgl.cur = 0;
+	ctx->aead_assoclen = 0;
+	af_alg_init_completion(&ctx->completion);
+	sg_init_table(ctx->tsgl.sg, ALG_MAX_PAGES);
+
+	ask->private = ctx;
+
+	aead_request_set_tfm(&ctx->aead_req, private);
+	aead_request_set_callback(&ctx->aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+				  af_alg_complete, &ctx->completion);
+
+	sk->sk_destruct = aead_sock_destruct;
+
+	return 0;
+}
+
+static const struct af_alg_type algif_type_aead = {
+	.bind		=	aead_bind,
+	.release	=	aead_release,
+	.setkey		=	aead_setkey,
+	.setauthsize	=	aead_setauthsize,
+	.accept		=	aead_accept_parent,
+	.ops		=	&algif_aead_ops,
+	.name		=	"aead",
+	.owner		=	THIS_MODULE
+};
+
+static int __init algif_aead_init(void)
+{
+	return af_alg_register_type(&algif_type_aead);
+}
+
+static void __exit algif_aead_exit(void)
+{
+	int err = af_alg_unregister_type(&algif_type_aead);
+	BUG_ON(err);
+}
+
+module_init(algif_aead_init);
+module_exit(algif_aead_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Stephan Mueller <smueller@chronox.de>");
+MODULE_DESCRIPTION("AEAD kernel crypto API user space interface");
-- 
2.1.0

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox