Linux userland API discussions

Linux userland API discussions
 help / color / mirror / Atom feed

* Re: [v9 4/5] ext4: adds FS_IOC_FSSETXATTR/FS_IOC_FSGETXATTR interface support
From: Jan Kara @ 2015-03-16 15:26 UTC (permalink / raw)
  To: Li Xi
  Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-ext4-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, tytso-3s7WtUTddSA,
	adilger-m1MBpc4rdrD3fQ9qLvQP4Q, jack-AlSwsSmVLrQ,
	viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn, hch-wEGCiKHe2LqWVfeAwA7xHQ,
	dmonakhov-GEFAQzZX7r8dnm+yROfE0A
In-Reply-To: <1426043003-31043-5-git-send-email-lixi-LfVdkaOWEx8@public.gmane.org>

On Wed 11-03-15 12:03:22, Li Xi wrote:
> This patch adds FS_IOC_FSSETXATTR/FS_IOC_FSGETXATTR ioctl interface
> support for ext4. The interface is kept consistent with
> XFS_IOC_FSSETXATTR/XFS_IOC_FSGETXATTR.
  Thanks for the patch! I think we are getting to a working solution :) Apart
from the bug Konstantin pointed out, I have a few comments below.
 
> Signed-off-by: Li Xi <lixi-LfVdkaOWEx8@public.gmane.org>
> ---
>  fs/ext4/ext4.h          |   47 +++++++
>  fs/ext4/ioctl.c         |  341 +++++++++++++++++++++++++++++++++--------------
>  fs/xfs/xfs_fs.h         |   47 +++----
>  include/uapi/linux/fs.h |   32 +++++
>  4 files changed, 338 insertions(+), 129 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 3443456..2f4b9ba 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -385,6 +385,51 @@ struct flex_groups {
>  #define EXT4_FL_USER_VISIBLE		0x204BDFFF /* User visible flags */
>  #define EXT4_FL_USER_MODIFIABLE		0x204380FF /* User modifiable flags */
>  
> +#define EXT4_FL_XFLAG_VISIBLE		(EXT4_SYNC_FL | \
> +					 EXT4_IMMUTABLE_FL | \
> +					 EXT4_APPEND_FL | \
> +					 EXT4_NOATIME_FL | \
> +					 EXT4_PROJINHERIT_FL)
> +
> +/* Transfer internal flags to xflags */
> +static inline __u32 ext4_iflags_to_xflags(unsigned long iflags)
> +{
> +	__u32 xflags = 0;
> +
> +	if (iflags & EXT4_SYNC_FL)
> +		xflags |= FS_XFLAG_SYNC;
> +	if (iflags & EXT4_IMMUTABLE_FL)
> +		xflags |= FS_XFLAG_IMMUTABLE;
> +	if (iflags & EXT4_APPEND_FL)
> +		xflags |= FS_XFLAG_APPEND;
> +	if (iflags & EXT4_NOATIME_FL)
> +		xflags |= FS_XFLAG_NOATIME;
> +	if (iflags & EXT4_PROJINHERIT_FL)
> +		xflags |= FS_XFLAG_PROJINHERIT;
> +	return xflags;
> +}
  I think EXT4_NODUMP_FL is missing in EXT4_FL_XFLAG_VISIBLE and isn't
handled in ext4_iflags_to_xflags().

> +/* Transfer xflags flags to internal */
> +static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
> +{
> +	unsigned long iflags = 0;
> +
> +	if (xflags & FS_XFLAG_SYNC)
> +		iflags |= EXT4_SYNC_FL;
> +	if (xflags & FS_XFLAG_IMMUTABLE)
> +		iflags |= EXT4_IMMUTABLE_FL;
> +	if (xflags & FS_XFLAG_APPEND)
> +		iflags |= EXT4_APPEND_FL;
> +	if (xflags & FS_XFLAG_NODUMP)
> +		iflags |= EXT4_NODUMP_FL;
> +	if (xflags & FS_XFLAG_NOATIME)
> +		iflags |= EXT4_NOATIME_FL;
> +	if (xflags & FS_XFLAG_PROJINHERIT)
> +		iflags |= EXT4_PROJINHERIT_FL;
> +
> +	return iflags;
> +}
  These two functions are only used in fs/ext4/ioctl.c. So just move their
definition there.

...
> diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
> index f58a0d1..20a6337 100644
> --- a/fs/ext4/ioctl.c
> +++ b/fs/ext4/ioctl.c
> @@ -14,6 +14,8 @@
>  #include <linux/compat.h>
>  #include <linux/mount.h>
>  #include <linux/file.h>
> +#include <linux/quotaops.h>
> +#include <linux/quota.h>
>  #include <asm/uaccess.h>
>  #include "ext4_jbd2.h"
>  #include "ext4.h"
> @@ -196,126 +198,229 @@ journal_err_out:
>  	return err;
>  }
>  
> -long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> +static int ext4_ioctl_setflags(struct file *filp,
> +			       unsigned int flags, int is_from_xflags)
>  {
  Hum, I think we can get rid of the is_from_xflags argument. As I'm
looking into the code below the only real reason why we have is_from_xflags
is that we need to compare old value of flags against new value of flags
and see what has changed. So what we can do is that FSSETXATTR ioctl will
construct flags to pass to ext4_ioctl_setflags() like:
	flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) |
		(flags & EXT4_FL_XFLAG_VISIBLE);

That way we don't change any flags outside of EXT4_FL_XFLAG_VISIBLE and
ext4_ioctl_setflags() doesn't have to be aware about different callers. The
only downside is that we need to hold i_mutex to reliably create 'flags'
value which requires moving mnt_want_write_file() and
mutex_lock(&inode->i_mutex) into the caller but that's pretty simple.

>  	struct inode *inode = file_inode(filp);
> -	struct super_block *sb = inode->i_sb;
>  	struct ext4_inode_info *ei = EXT4_I(inode);
> -	unsigned int flags;
> +	handle_t *handle = NULL;
> +	int err, migrate = 0;
> +	struct ext4_iloc iloc;
> +	unsigned int oldflags, mask, i;
> +	unsigned int jflag;
>  
> -	ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
> +	if (!inode_owner_or_capable(inode))
> +		return -EACCES;
>  
> -	switch (cmd) {
> -	case EXT4_IOC_GETFLAGS:
> -		ext4_get_inode_flags(ei);
> -		flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
> -		return put_user(flags, (int __user *) arg);
> -	case EXT4_IOC_SETFLAGS: {
> -		handle_t *handle = NULL;
> -		int err, migrate = 0;
> -		struct ext4_iloc iloc;
> -		unsigned int oldflags, mask, i;
> -		unsigned int jflag;
> +	err = mnt_want_write_file(filp);
> +	if (err)
> +		return err;
>  
> -		if (!inode_owner_or_capable(inode))
> -			return -EACCES;
> +	flags = ext4_mask_flags(inode->i_mode, flags);
> +	if (is_from_xflags)
> +		flags &= EXT4_FL_XFLAG_VISIBLE;
> +
> +	err = -EPERM;
> +	mutex_lock(&inode->i_mutex);
> +	/* Is it quota file? Do not allow user to mess with it */
> +	if (IS_NOQUOTA(inode))
> +		goto flags_out;
> +
> +	oldflags = ei->i_flags;
> +	if (is_from_xflags)
> +		oldflags &= EXT4_FL_XFLAG_VISIBLE;
> +
> +	/* The JOURNAL_DATA flag is modifiable only by root */
> +	jflag = flags & EXT4_JOURNAL_DATA_FL;
> +	if (is_from_xflags)
> +		jflag &= EXT4_FL_XFLAG_VISIBLE;
> +
> +	/*
> +	 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
> +	 * the relevant capability.
> +	 *
> +	 * This test looks nicer. Thanks to Pauline Middelink
> +	 */
> +	if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
> +		if (!capable(CAP_LINUX_IMMUTABLE))
> +			goto flags_out;
> +	}
>  
> -		if (get_user(flags, (int __user *) arg))
> -			return -EFAULT;
> +	/*
> +	 * The JOURNAL_DATA flag can only be changed by
> +	 * the relevant capability.
> +	 */
> +	if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
> +		if (!capable(CAP_SYS_RESOURCE))
> +			goto flags_out;
> +	}
> +	if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
> +		migrate = 1;
> +	if (flags & EXT4_EOFBLOCKS_FL) {
> +		/* we don't support adding EOFBLOCKS flag */
> +		if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
> +			err = -EOPNOTSUPP;
> +			goto flags_out;
> +		}
> +	} else if (oldflags & EXT4_EOFBLOCKS_FL)
> +		ext4_truncate(inode);
>  
> -		err = mnt_want_write_file(filp);
> -		if (err)
> -			return err;
> +	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
> +	if (IS_ERR(handle)) {
> +		err = PTR_ERR(handle);
> +		goto flags_out;
> +	}
> +	if (IS_SYNC(inode))
> +		ext4_handle_sync(handle);
> +	err = ext4_reserve_inode_write(handle, inode, &iloc);
> +	if (err)
> +		goto flags_err;
> +
> +	for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
> +		if (!(mask & EXT4_FL_USER_MODIFIABLE))
> +			continue;
> +		if (is_from_xflags && !(mask & EXT4_FL_XFLAG_VISIBLE))
> +			continue;
> +		if (mask & flags)
> +			ext4_set_inode_flag(inode, i);
> +		else
> +			ext4_clear_inode_flag(inode, i);
> +	}
>  
> -		flags = ext4_mask_flags(inode->i_mode, flags);
> +	ext4_set_inode_flags(inode);
> +	inode->i_ctime = ext4_current_time(inode);
>  
> -		err = -EPERM;
> -		mutex_lock(&inode->i_mutex);
> -		/* Is it quota file? Do not allow user to mess with it */
> -		if (IS_NOQUOTA(inode))
> -			goto flags_out;
> +	err = ext4_mark_iloc_dirty(handle, inode, &iloc);
> +flags_err:
> +	ext4_journal_stop(handle);
> +	if (err)
> +		goto flags_out;
> +
> +	if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
> +		err = ext4_change_inode_journal_flag(inode, jflag);
> +	if (err)
> +		goto flags_out;
> +	if (migrate) {
> +		if (flags & EXT4_EXTENTS_FL)
> +			err = ext4_ext_migrate(inode);
> +		else
> +			err = ext4_ind_migrate(inode);
> +	}
>  
> -		oldflags = ei->i_flags;
> +flags_out:
> +	mutex_unlock(&inode->i_mutex);
> +	mnt_drop_write_file(filp);
> +	return err;
> +}
>  
> -		/* The JOURNAL_DATA flag is modifiable only by root */
> -		jflag = flags & EXT4_JOURNAL_DATA_FL;
> +static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
> +{
> +	struct inode *inode = file_inode(filp);
> +	struct super_block *sb = inode->i_sb;
> +	struct ext4_inode_info *ei = EXT4_I(inode);
> +	int err;
> +	handle_t *handle;
> +	kprojid_t kprojid;
> +	struct ext4_iloc iloc;
> +	struct ext4_inode *raw_inode;
> +
> +	struct dquot *transfer_to[EXT4_MAXQUOTAS] = { };
> +
> +	/* Make sure caller can change project. */
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EACCES;
> +
> +	if (projid != EXT4_DEF_PROJID
> +	    && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
> +			EXT4_FEATURE_RO_COMPAT_PROJECT))
> +		return -EOPNOTSUPP;
> +
> +	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
> +			EXT4_FEATURE_RO_COMPAT_PROJECT)) {
> +		BUG_ON(__kprojid_val(EXT4_I(inode)->i_projid)
> +		       != EXT4_DEF_PROJID);
> +		if (projid != EXT4_DEF_PROJID)
> +			return -EOPNOTSUPP;
> +		else
> +			return 0;
> +	}
  Why is the test here twice? The first one seems to be redundant...

								Honza
-- 
Jan Kara <jack-AlSwsSmVLrQ@public.gmane.org>
SUSE Labs, CR

^ permalink raw reply

* [PATCH 1/2] drm_modes: videomode: add pos/neg pixel clock polarity flag
From: Sébastien Szymanski @ 2015-03-16 17:29 UTC (permalink / raw)
  To: David Airlie, Philipp Zabel, dri-devel, linux-kernel, linux-api
  Cc: Sébastien Szymanski, Steve Longerbeam

From: Steve Longerbeam <steve_longerbeam@mentor.com>

[Sébastien - rebase, update drm_display_mode_to_videomode function]

Signed-off-by: Steve Longerbeam <steve_longerbeam@mentor.com>
Signed-off-by: Sébastien Szymanski <sebastien.szymanski@armadeus.com>
---
 drivers/gpu/drm/drm_modes.c | 8 ++++++++
 include/uapi/drm/drm_mode.h | 4 ++++
 2 files changed, 12 insertions(+)

diff --git a/drivers/gpu/drm/drm_modes.c b/drivers/gpu/drm/drm_modes.c
index 487d0e3..464828f 100644
--- a/drivers/gpu/drm/drm_modes.c
+++ b/drivers/gpu/drm/drm_modes.c
@@ -611,6 +611,10 @@ void drm_display_mode_from_videomode(const struct videomode *vm,
 		dmode->flags |= DRM_MODE_FLAG_DBLSCAN;
 	if (vm->flags & DISPLAY_FLAGS_DOUBLECLK)
 		dmode->flags |= DRM_MODE_FLAG_DBLCLK;
+	if (vm->flags & DISPLAY_FLAGS_PIXDATA_POSEDGE)
+		dmode->flags |= DRM_MODE_FLAG_PCLK;
+	else if (vm->flags & DISPLAY_FLAGS_PIXDATA_NEGEDGE)
+		dmode->flags |= DRM_MODE_FLAG_NCLK;
 	drm_mode_set_name(dmode);
 }
 EXPORT_SYMBOL_GPL(drm_display_mode_from_videomode);
@@ -652,6 +656,10 @@ void drm_display_mode_to_videomode(const struct drm_display_mode *dmode,
 		vm->flags |= DISPLAY_FLAGS_DOUBLESCAN;
 	if (dmode->flags & DRM_MODE_FLAG_DBLCLK)
 		vm->flags |= DISPLAY_FLAGS_DOUBLECLK;
+	if (dmode->flags & DRM_MODE_FLAG_PCLK)
+		vm->flags |= DISPLAY_FLAGS_PIXDATA_POSEDGE;
+	else if (dmode->flags & DRM_MODE_FLAG_NCLK)
+		vm->flags |= DISPLAY_FLAGS_PIXDATA_NEGEDGE;
 }
 EXPORT_SYMBOL_GPL(drm_display_mode_to_videomode);
 
diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h
index ca788e0..1abb2fc 100644
--- a/include/uapi/drm/drm_mode.h
+++ b/include/uapi/drm/drm_mode.h
@@ -72,6 +72,10 @@
 #define  DRM_MODE_FLAG_3D_L_DEPTH_GFX_GFX_DEPTH	(6<<14)
 #define  DRM_MODE_FLAG_3D_TOP_AND_BOTTOM	(7<<14)
 #define  DRM_MODE_FLAG_3D_SIDE_BY_SIDE_HALF	(8<<14)
+/* drive data on rising pixclk edge */
+#define DRM_MODE_FLAG_PCLK			(1<<19)
+/* drive data on falling pixclk edge */
+#define DRM_MODE_FLAG_NCLK			(1<<20)
 
 
 /* DPMS flags */
-- 
2.0.5

^ permalink raw reply related

* [PATCH 2/2] imx-drm: ipuv3-crtc: Use DRM mode flags to configure pixel clock polarity
From: Sébastien Szymanski @ 2015-03-16 17:29 UTC (permalink / raw)
  To: David Airlie, Philipp Zabel,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
  Cc: Sébastien Szymanski, Mohsin Kazmi, Steve Longerbeam
In-Reply-To: <1426526944-16762-1-git-send-email-sebastien.szymanski-d2DlULPkwbNWk0Htik3J/w@public.gmane.org>

From: Steve Longerbeam <steve_longerbeam-nmGgyN9QBj3QT0dZR+AlfA@public.gmane.org>

Previously, pixel clock polarity was hardcoded and wasn't configurable.
This patch adds support to configure the pixel clock polarity from the
DRM mode flags.

[Sébastien - rebase]

Signed-off-by: Mohsin Kazmi <mohsin_kazmi-nmGgyN9QBj3QT0dZR+AlfA@public.gmane.org>
Signed-off-by: Steve Longerbeam <steve_longerbeam-nmGgyN9QBj3QT0dZR+AlfA@public.gmane.org>
Signed-off-by: Sébastien Szymanski <sebastien.szymanski@armadeus.com>
---
 drivers/gpu/drm/imx/ipuv3-crtc.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/imx/ipuv3-crtc.c b/drivers/gpu/drm/imx/ipuv3-crtc.c
index 98551e3..71f888b 100644
--- a/drivers/gpu/drm/imx/ipuv3-crtc.c
+++ b/drivers/gpu/drm/imx/ipuv3-crtc.c
@@ -171,10 +171,12 @@ static int ipu_crtc_mode_set(struct drm_crtc *crtc,
 	else
 		sig_cfg.clkflags = 0;
 
+	if (mode->flags & DRM_MODE_FLAG_PCLK)
+		sig_cfg.clk_pol = 1;
+
 	out_pixel_fmt = ipu_crtc->interface_pix_fmt;
 
 	sig_cfg.enable_pol = 1;
-	sig_cfg.clk_pol = 0;
 	sig_cfg.pixel_fmt = out_pixel_fmt;
 	sig_cfg.v_to_h_sync = 0;
 	sig_cfg.hsync_pin = ipu_crtc->di_hsync_pin;
-- 
2.0.5

^ permalink raw reply related

* RE: [PATCH RFCv2 00/21] Drivers: hv: utils: re-implement the kernel/userspace communication layer
From: KY Srinivasan @ 2015-03-16 18:15 UTC (permalink / raw)
  To: Vitaly Kuznetsov,
	devel-tBiZLqfeLfOHmIFyCCdPziST3g8Odh+X@public.gmane.org
  Cc: Haiyang Zhang,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Dexuan Cui,
	Radim Krcmar, Greg Kroah-Hartman,
	linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <1426080574-9011-1-git-send-email-vkuznets-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>



> -----Original Message-----
> From: Vitaly Kuznetsov [mailto:vkuznets-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org]
> Sent: Wednesday, March 11, 2015 6:29 AM
> To: KY Srinivasan; devel-tBiZLqfeLfOHmIFyCCdPziST3g8Odh+X@public.gmane.org
> Cc: Haiyang Zhang; linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; Dexuan Cui; Radim Krcmar;
> Greg Kroah-Hartman; linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Subject: [PATCH RFCv2 00/21] Drivers: hv: utils: re-implement the
> kernel/userspace communication layer
> 
> Changes in RFCv2:
> - Preserve backwards compatibility with netlink-speaking daemons. [K. Y.
> Srinivasan]
> - Introduce transport abstraction layer. [K. Y. Srinivasan]
> - Get rid of ioctls [Radim Krcmar]
> - Make the series reviewable by splitting it into smaller patches.
> 
> Anatomy of the series:
> Patches 01 - 07 are cleanup with minor functional change.
> Patch 08 defines the state machine.
> Patches 09-11 convert all 3 drivers to using the state machine.
> Patch 12 fixes a bug in fcopy. This change is going away in Patch 15,  I just
> want to highlight the fix.
> Patch 13 introduces a transport abstraction.
> Patch 14-16 convert all drivers to using the transport abstraction.
> Patches 17-18 switch KVP and VSS daemon to using char devices.
> Patches 19-20 convert FCOPY and VSS to full handshake (the same we have
> in  KVP). These two can be postponed till we really need to distinguish
> between  different kernels in the daemon code.
> Patch 21 unifies log messages on daemons connect across all drivers and
> moves  these messages to debug level.
> 
> I smoke-tested this series with both old (netlink) and new (char devices)
> daemons and tested the daemon upgrade procedure.
> 
> Original description:
> This series converts kvp/vss daemons to use misc char devices instead of
> netlink for userspace/kernel communication and then updates fcopy to be
> consistent with kvp/vss.
> 
> Userspace/kernel communication via netlink has a number of issues:
> - It is hard for userspace to figure out if the kernel part was loaded or not
>   and this fact can change as there is a way to enable/disable the service from
>   host side. Racy daemon startup is also a problem.
> - When the userspace daemon restarts/dies kernel part doesn't receive a
>   notification.
> - Netlink communication is not stable under heavy load.
> - ...
> 
> Vitaly Kuznetsov (21):
>   Drivers: hv: util: move kvp/vss function declarations to
>     hyperv_vmbus.h
>   Drivers: hv: kvp: reset kvp_context
>   Drivers: hv: kvp: move poll_channel() to hyperv_vmbus.h
>   Drivers: hv: fcopy: process deferred messages when we complete the
>     transaction
>   Drivers: hv: vss: process deferred messages when we complete the
>     transaction
>   Drivers: hv: kvp: rename kvp_work -> kvp_timeout_work
>   Drivers: hv: fcopy: rename fcopy_work -> fcopy_timeout_work
>   Drivers: hv: util: introduce state machine for util drivers
>   Drivers: hv: kvp: switch to using the hvutil_device_state state
>     machine
>   Drivers: hv: vss: switch to using the hvutil_device_state state
>     machine
>   Drivers: hv: fcopy: switch to using the hvutil_device_state state
>     machine
>   Drivers: hv: fcopy: set .owner reference for file operations
>   Drivers: hv: util: introduce hv_utils_transport abstraction
>   Drivers: hv: vss: convert to hv_utils_transport
>   Drivers: hv: fcopy: convert to hv_utils_transport
>   Drivers: hv: kvp: convert to hv_utils_transport
>   Tools: hv: kvp: use misc char device to communicate with kernel
>   Tools: hv: vss: use misc char device to communicate with kernel
>   Drivers: hv: vss: full handshake support
>   Drivers: hv: fcopy: full handshake support
>   Drivers: hv: utils: unify driver registration reporting
> 
>  drivers/hv/Makefile             |   2 +-
>  drivers/hv/hv_fcopy.c           | 287 ++++++++++++++--------------------------
>  drivers/hv/hv_kvp.c             | 192 +++++++++++++--------------
>  drivers/hv/hv_snapshot.c        | 168 +++++++++++++++--------
>  drivers/hv/hv_utils_transport.c | 276
> ++++++++++++++++++++++++++++++++++++++
>  drivers/hv/hv_utils_transport.h |  51 +++++++
>  drivers/hv/hyperv_vmbus.h       |  29 ++++
>  include/linux/hyperv.h          |   8 --
>  include/uapi/linux/hyperv.h     |   8 +-
>  tools/hv/hv_fcopy_daemon.c      |  15 +++
>  tools/hv/hv_kvp_daemon.c        | 166 +++++------------------
>  tools/hv/hv_vss_daemon.c        | 149 ++++++---------------
>  12 files changed, 752 insertions(+), 599 deletions(-)  create mode 100644
> drivers/hv/hv_utils_transport.c  create mode 100644
> drivers/hv/hv_utils_transport.h

Vitaly,

Thank you very much for taking on this project; very well done. I have mostly reviewed the code and I should be
done shortly. Also, I am going to test this code as well. If there are no issues, I will send this out to Greg in my next 
installment of patches.

Regards,

K. Y 
> 
> --
> 1.9.3

^ permalink raw reply

* [PATCH v7 0/5] vfs: Non-blocking buffered fs read (page cache only)
From: Milosz Tanski @ 2015-03-16 18:27 UTC (permalink / raw)
  To: linux-kernel
  Cc: Christoph Hellwig, linux-fsdevel, linux-aio, Mel Gorman,
	Volker Lendecke, Tejun Heo, Jeff Moyer, Theodore Ts'o,
	Al Viro, linux-api, Michael Kerrisk, linux-arch, Dave Chinner,
	Andrew Morton

This patchset introduces two new syscalls preadv2 and pwritev2. They are the
same syscalls as preadv and pwritev but with a flag argument. Additionally,
preadv2 implements an extra RWF_NONBLOCK flag. 

The RWF_NONBLOCK flag in preadv2 introduces an ability to perform a
non-blocking read from regular files in buffered IO mode. This only works
for those filesystems that have data in the page cache.

We discussed these changes at this year's LSF/MM summit in Boston. More details
on the Samba use case, the numbers, and presentation is available at this link:
https://lists.samba.org/archive/samba-technical/2015-March/106290.html

Please stay tuned for man page patches and xfstests patches. They will be sent
as In-Reply-To.

Latest changes highlight:
 - Drops RWF_DSYNC from pwritev2, per Christoph and Andrew
 - Updated man pages
 - Added tests for this functionality to xfstests, per Dave Chinner
 - Based on top of 4.1-rc3
 - Tests / numbers using samba and a CIFS client FIO engine

Forward looking:

 Christoph committed to sending a separate patch series for the RWF_DSYNC for
 pwritev2 implementation so it can be evaluated independently. This helps
 with implementing userspace file servers for protocols that have a per operation
 sync flag (CIFS).

 Additionally, Christoph committed to implementing RWF_NONBLOCK for the write
 case as well (in pwritev2) at a later date.

Background:

 Using a threadpool to emulate non-blocking operations on regular buffered
 files is a common pattern today (samba, libuv, etc...) Applications split the
 work between network bound threads (epoll) and IO threadpool. Not every
 application can use sendfile syscall (TLS / post-processing).

 This common pattern leads to increased request latency. Latency can be due to
 additional synchronization between the threads or fast (cached data) request
 stuck behind slow request (large / uncached data).

 The preadv2 syscall with RWF_NONBLOCK lets userspace applications bypass
 enqueuing operation in the threadpool if it's already available in the
 pagecache.

Performance numbers (newer Samba):

 https://drive.google.com/file/d/0B3maCn0jCvYncndGbXJKbGlhejQ/view?usp=sharing
 https://docs.google.com/spreadsheets/d/1GGTivi-MfZU0doMzomG4XUo9ioWtRvOGQ5FId042L6s/edit?usp=sharing

Performance number (older):

 Some perf data generated using fio comparing the posix aio engine to a version
 of the posix AIO engine that attempts to perform "fast" reads before
 submitting the operations to the queue. This workflow is on ext4 partition on
 raid0 (test / build-rig.) Simulating our database access pattern workload using
 16kb read accesses. Our database uses a home-spun posix aio like queue (samba
 does the same thing.)

 f1: ~73% rand read over mostly cached data (zipf med-size dataset)
 f2: ~18% rand read over mostly un-cached data (uniform large-dataset)
 f3: ~9% seq-read over large dataset

 before:

 f1:
     bw (KB  /s): min=   11, max= 9088, per=0.56%, avg=969.54, stdev=827.99
     lat (msec) : 50=0.01%, 100=1.06%, 250=5.88%, 500=4.08%, 750=12.48%
     lat (msec) : 1000=17.27%, 2000=49.86%, >=2000=9.42%
 f2:
     bw (KB  /s): min=    2, max= 1882, per=0.16%, avg=273.28, stdev=220.26
     lat (msec) : 250=5.65%, 500=3.31%, 750=15.64%, 1000=24.59%, 2000=46.56%
     lat (msec) : >=2000=4.33%
 f3:
     bw (KB  /s): min=    0, max=265568, per=99.95%, avg=174575.10,
                  stdev=34526.89
     lat (usec) : 2=0.01%, 4=0.01%, 10=0.02%, 20=0.27%, 50=10.82%
     lat (usec) : 100=50.34%, 250=5.05%, 500=7.12%, 750=6.60%, 1000=4.55%
     lat (msec) : 2=8.73%, 4=3.49%, 10=1.83%, 20=0.89%, 50=0.22%
     lat (msec) : 100=0.05%, 250=0.02%, 500=0.01%
 total:
    READ: io=102365MB, aggrb=174669KB/s, minb=240KB/s, maxb=173599KB/s,
          mint=600001msec, maxt=600113msec

 after (with fast read using preadv2 before submit):

 f1:
     bw (KB  /s): min=    3, max=14897, per=1.28%, avg=2276.69, stdev=2930.39
     lat (usec) : 2=70.63%, 4=0.01%
     lat (msec) : 250=0.20%, 500=2.26%, 750=1.18%, 2000=0.22%, >=2000=25.53%
 f2:
     bw (KB  /s): min=    2, max= 2362, per=0.14%, avg=249.83, stdev=222.00
     lat (msec) : 250=6.35%, 500=1.78%, 750=9.29%, 1000=20.49%, 2000=52.18%
     lat (msec) : >=2000=9.99%
 f3:
     bw (KB  /s): min=    1, max=245448, per=100.00%, avg=177366.50,
                  stdev=35995.60
     lat (usec) : 2=64.04%, 4=0.01%, 10=0.01%, 20=0.06%, 50=0.43%
     lat (usec) : 100=0.20%, 250=1.27%, 500=2.93%, 750=3.93%, 1000=7.35%
     lat (msec) : 2=14.27%, 4=2.88%, 10=1.54%, 20=0.81%, 50=0.22%
     lat (msec) : 100=0.05%, 250=0.02%
 total:
    READ: io=103941MB, aggrb=177339KB/s, minb=213KB/s, maxb=176375KB/s,
          mint=600020msec, maxt=600178msec

 Interpreting the results you can see total bandwidth stays the same but overall
 request latency is decreased in f1 (random, mostly cached) and f3 (sequential)
 workloads. There is a slight bump in latency for f2 since it's random data that's
 unlikely to be cached but we're always trying "fast read".

 In our application we have started keeping track of "fast read" hits/misses
 and for files / requests that have a low hit ratio we don't do "fast reads"
 mostly getting rid of extra latency in the uncached cases. In our real world
 work load we were able to reduce average response time by 20 to 30% (depends
 on amount of IO done by request).

 I've performed other benchmarks and I have not observed any perf regressions in
 any of the normal (old) code paths.

Full change log:

 Version 7 highlight:
  - Drops RWF_DSYNC from pwritev2, per Christoph and Andrew
  - Updated man pages
  - Added tests for this functionality to xfstests, per Dave Chinner
  - Based on top of 4.1-rc3
  - Tests / numbers using samba and a CIFS client FIO engine

 Version 6 highlight:
  - Compat syscall flag checks, per. Jeff.
  - Minor stylistic suggestions.

 Version 5 highlight:
  - XFS support for RWF_NONBLOCK. from Christoph.
  - RWF_DSYNC flag and support for pwritev2, from Christoph.
  - Implemented compat syscalls, per. Jeff.
  - Missing nfs, ceph changes from older patchset.

 Version 4 highlight:
  - Updated for 3.18-rc1.
  - Performance data from our application.
  - First stab at man page with Jeff's help. Patch is in-reply to.

 RFC Version 3 highlights:
  - Down to 2 syscalls from 4; can use fp or argument position.
  - RWF_NONBLOCK value flag is not the same O_NONBLOCK, per Jeff.

 RFC Version 2 highlights:
  - Put the flags argument into kiocb (less noise), per. Al Viro
  - O_DIRECT checking early in the process, per. Jeff Moyer
  - Resolved duplicate (c&p) code in syscall code, per. Jeff
  - Included perf data in thread cover letter, per. Jeff
  - Created a new flag (not O_NONBLOCK) for readv2, per Jeff

I have co-developed these changes with Christoph Hellwig.

Christoph Hellwig (1):
  xfs: add RWF_NONBLOCK support

Milosz Tanski (4):
  vfs: Prepare for adding a new preadv/pwritev with user flags.
  vfs: Define new syscalls preadv2,pwritev2
  x86: wire up preadv2 and pwritev2
  vfs: RWF_NONBLOCK flag for preadv2

 arch/x86/syscalls/syscall_32.tbl  |   2 +
 arch/x86/syscalls/syscall_64.tbl  |   2 +
 drivers/target/target_core_file.c |   6 +-
 fs/ceph/file.c                    |   2 +
 fs/cifs/file.c                    |   6 +
 fs/nfs/file.c                     |   5 +-
 fs/nfsd/vfs.c                     |   4 +-
 fs/ocfs2/file.c                   |   6 +
 fs/pipe.c                         |   3 +-
 fs/read_write.c                   | 229 +++++++++++++++++++++++++++++---------
 fs/splice.c                       |   2 +-
 fs/xfs/xfs_file.c                 |  28 ++++-
 include/linux/aio.h               |   2 +
 include/linux/compat.h            |   6 +
 include/linux/fs.h                |   6 +-
 include/linux/syscalls.h          |   6 +
 include/uapi/asm-generic/unistd.h |   6 +-
 mm/filemap.c                      |  23 +++-
 mm/shmem.c                        |   4 +
 19 files changed, 279 insertions(+), 69 deletions(-)

-- 
1.9.1

^ permalink raw reply

* [PATCH v7 1/5] vfs: Prepare for adding a new preadv/pwritev with user flags.
From: Milosz Tanski @ 2015-03-16 18:27 UTC (permalink / raw)
  To: linux-kernel
  Cc: Christoph Hellwig, linux-fsdevel, linux-aio, Mel Gorman,
	Volker Lendecke, Tejun Heo, Jeff Moyer, Theodore Ts'o,
	Al Viro, linux-api, Michael Kerrisk, linux-arch, Dave Chinner,
	Andrew Morton
In-Reply-To: <cover.1426528417.git.milosz@adfin.com>

Plumbing the flags argument through the vfs code so they can be passed down to
__generic_file_(read/write)_iter functions that do the actual work.

Signed-off-by: Milosz Tanski <milosz@adfin.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
---
 drivers/target/target_core_file.c |  6 +++---
 fs/nfsd/vfs.c                     |  4 ++--
 fs/read_write.c                   | 27 +++++++++++++++------------
 fs/splice.c                       |  2 +-
 include/linux/aio.h               |  2 ++
 include/linux/fs.h                |  4 ++--
 6 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c
index 44620fb..fdd0a10 100644
--- a/drivers/target/target_core_file.c
+++ b/drivers/target/target_core_file.c
@@ -351,9 +351,9 @@ static int fd_do_rw(struct se_cmd *cmd, struct scatterlist *sgl,
 	set_fs(get_ds());
 
 	if (is_write)
-		ret = vfs_writev(fd, &iov[0], sgl_nents, &pos);
+		ret = vfs_writev(fd, &iov[0], sgl_nents, &pos, 0);
 	else
-		ret = vfs_readv(fd, &iov[0], sgl_nents, &pos);
+		ret = vfs_readv(fd, &iov[0], sgl_nents, &pos, 0);
 
 	set_fs(old_fs);
 
@@ -534,7 +534,7 @@ fd_execute_write_same(struct se_cmd *cmd)
 
 	old_fs = get_fs();
 	set_fs(get_ds());
-	rc = vfs_writev(f, &iov[0], iov_num, &pos);
+	rc = vfs_writev(f, &iov[0], iov_num, &pos, 0);
 	set_fs(old_fs);
 
 	vfree(iov);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 3685265..1c6faaa 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -893,7 +893,7 @@ __be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen,
 
 	oldfs = get_fs();
 	set_fs(KERNEL_DS);
-	host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset);
+	host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset, 0);
 	set_fs(oldfs);
 	return nfsd_finish_read(file, count, host_err);
 }
@@ -980,7 +980,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 
 	/* Write the data. */
 	oldfs = get_fs(); set_fs(KERNEL_DS);
-	host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos);
+	host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos, 0);
 	set_fs(oldfs);
 	if (host_err < 0)
 		goto out_nfserr;
diff --git a/fs/read_write.c b/fs/read_write.c
index 8e1b687..b53bb59 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -711,7 +711,8 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
 EXPORT_SYMBOL(iov_shorten);
 
 static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iovec *iov,
-		unsigned long nr_segs, size_t len, loff_t *ppos, iter_fn_t fn)
+		unsigned long nr_segs, size_t len, loff_t *ppos, iter_fn_t fn,
+		int flags)
 {
 	struct kiocb kiocb;
 	struct iov_iter iter;
@@ -720,6 +721,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iove
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
 	kiocb.ki_nbytes = len;
+	kiocb.ki_rwflags = flags;
 
 	iov_iter_init(&iter, rw, iov, nr_segs, len);
 	ret = fn(&kiocb, &iter);
@@ -858,7 +860,8 @@ out:
 
 static ssize_t do_readv_writev(int type, struct file *file,
 			       const struct iovec __user * uvector,
-			       unsigned long nr_segs, loff_t *pos)
+			       unsigned long nr_segs, loff_t *pos,
+			       int flags)
 {
 	size_t tot_len;
 	struct iovec iovstack[UIO_FASTIOV];
@@ -892,7 +895,7 @@ static ssize_t do_readv_writev(int type, struct file *file,
 
 	if (iter_fn)
 		ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
-						pos, iter_fn);
+						pos, iter_fn, flags);
 	else if (fnv)
 		ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
 						pos, fnv);
@@ -915,27 +918,27 @@ out:
 }
 
 ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
-		  unsigned long vlen, loff_t *pos)
+		  unsigned long vlen, loff_t *pos, int flags)
 {
 	if (!(file->f_mode & FMODE_READ))
 		return -EBADF;
 	if (!(file->f_mode & FMODE_CAN_READ))
 		return -EINVAL;
 
-	return do_readv_writev(READ, file, vec, vlen, pos);
+	return do_readv_writev(READ, file, vec, vlen, pos, flags);
 }
 
 EXPORT_SYMBOL(vfs_readv);
 
 ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
-		   unsigned long vlen, loff_t *pos)
+		   unsigned long vlen, loff_t *pos, int flags)
 {
 	if (!(file->f_mode & FMODE_WRITE))
 		return -EBADF;
 	if (!(file->f_mode & FMODE_CAN_WRITE))
 		return -EINVAL;
 
-	return do_readv_writev(WRITE, file, vec, vlen, pos);
+	return do_readv_writev(WRITE, file, vec, vlen, pos, flags);
 }
 
 EXPORT_SYMBOL(vfs_writev);
@@ -948,7 +951,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
 
 	if (f.file) {
 		loff_t pos = file_pos_read(f.file);
-		ret = vfs_readv(f.file, vec, vlen, &pos);
+		ret = vfs_readv(f.file, vec, vlen, &pos, 0);
 		if (ret >= 0)
 			file_pos_write(f.file, pos);
 		fdput_pos(f);
@@ -968,7 +971,7 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
 
 	if (f.file) {
 		loff_t pos = file_pos_read(f.file);
-		ret = vfs_writev(f.file, vec, vlen, &pos);
+		ret = vfs_writev(f.file, vec, vlen, &pos, 0);
 		if (ret >= 0)
 			file_pos_write(f.file, pos);
 		fdput_pos(f);
@@ -1000,7 +1003,7 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
 	if (f.file) {
 		ret = -ESPIPE;
 		if (f.file->f_mode & FMODE_PREAD)
-			ret = vfs_readv(f.file, vec, vlen, &pos);
+			ret = vfs_readv(f.file, vec, vlen, &pos, 0);
 		fdput(f);
 	}
 
@@ -1024,7 +1027,7 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
 	if (f.file) {
 		ret = -ESPIPE;
 		if (f.file->f_mode & FMODE_PWRITE)
-			ret = vfs_writev(f.file, vec, vlen, &pos);
+			ret = vfs_writev(f.file, vec, vlen, &pos, 0);
 		fdput(f);
 	}
 
@@ -1072,7 +1075,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
 
 	if (iter_fn)
 		ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
-						pos, iter_fn);
+						pos, iter_fn, 0);
 	else if (fnv)
 		ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
 						pos, fnv);
diff --git a/fs/splice.c b/fs/splice.c
index 7968da9..ee3fd4c 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -576,7 +576,7 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
 	old_fs = get_fs();
 	set_fs(get_ds());
 	/* The cast to a user pointer is valid due to the set_fs() */
-	res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos);
+	res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0);
 	set_fs(old_fs);
 
 	return res;
diff --git a/include/linux/aio.h b/include/linux/aio.h
index d9c92da..9c1d499 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -52,6 +52,8 @@ struct kiocb {
 	 * this is the underlying eventfd context to deliver events to.
 	 */
 	struct eventfd_ctx	*ki_eventfd;
+
+	int			ki_rwflags;
 };
 
 static inline bool is_sync_kiocb(struct kiocb *kiocb)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b4d71b5..c018335 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1619,9 +1619,9 @@ extern ssize_t __vfs_read(struct file *, char __user *, size_t, loff_t *);
 extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
 extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
 extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
-		unsigned long, loff_t *);
+		unsigned long, loff_t *, int);
 extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
-		unsigned long, loff_t *);
+		unsigned long, loff_t *, int);
 
 struct super_operations {
    	struct inode *(*alloc_inode)(struct super_block *sb);
-- 
1.9.1

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH v7 2/5] vfs: Define new syscalls preadv2,pwritev2
From: Milosz Tanski @ 2015-03-16 18:27 UTC (permalink / raw)
  To: linux-kernel
  Cc: Christoph Hellwig, linux-fsdevel, linux-aio, Mel Gorman,
	Volker Lendecke, Tejun Heo, Jeff Moyer, Theodore Ts'o,
	Al Viro, linux-api, Michael Kerrisk, linux-arch, Dave Chinner,
	Andrew Morton
In-Reply-To: <cover.1426528417.git.milosz@adfin.com>

New syscalls that take a flags argument. This change does not add any specific
flags.

Signed-off-by: Milosz Tanski <milosz@adfin.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/read_write.c                   | 172 ++++++++++++++++++++++++++++++--------
 include/linux/compat.h            |   6 ++
 include/linux/syscalls.h          |   6 ++
 include/uapi/asm-generic/unistd.h |   6 +-
 mm/filemap.c                      |   5 +-
 5 files changed, 156 insertions(+), 39 deletions(-)

diff --git a/fs/read_write.c b/fs/read_write.c
index b53bb59..e91f46e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -924,6 +924,8 @@ ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
 		return -EBADF;
 	if (!(file->f_mode & FMODE_CAN_READ))
 		return -EINVAL;
+	if (flags & ~0)
+		return -EINVAL;
 
 	return do_readv_writev(READ, file, vec, vlen, pos, flags);
 }
@@ -937,21 +939,23 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 		return -EBADF;
 	if (!(file->f_mode & FMODE_CAN_WRITE))
 		return -EINVAL;
+	if (flags & ~0)
+		return -EINVAL;
 
 	return do_readv_writev(WRITE, file, vec, vlen, pos, flags);
 }
 
 EXPORT_SYMBOL(vfs_writev);
 
-SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
-		unsigned long, vlen)
+static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
+			unsigned long vlen, int flags)
 {
 	struct fd f = fdget_pos(fd);
 	ssize_t ret = -EBADF;
 
 	if (f.file) {
 		loff_t pos = file_pos_read(f.file);
-		ret = vfs_readv(f.file, vec, vlen, &pos, 0);
+		ret = vfs_readv(f.file, vec, vlen, &pos, flags);
 		if (ret >= 0)
 			file_pos_write(f.file, pos);
 		fdput_pos(f);
@@ -963,15 +967,15 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
 	return ret;
 }
 
-SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
-		unsigned long, vlen)
+static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
+			 unsigned long vlen, int flags)
 {
 	struct fd f = fdget_pos(fd);
 	ssize_t ret = -EBADF;
 
 	if (f.file) {
 		loff_t pos = file_pos_read(f.file);
-		ret = vfs_writev(f.file, vec, vlen, &pos, 0);
+		ret = vfs_writev(f.file, vec, vlen, &pos, flags);
 		if (ret >= 0)
 			file_pos_write(f.file, pos);
 		fdput_pos(f);
@@ -989,10 +993,9 @@ static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
 	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
 }
 
-SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
-		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
+			 unsigned long vlen, loff_t pos, int flags)
 {
-	loff_t pos = pos_from_hilo(pos_h, pos_l);
 	struct fd f;
 	ssize_t ret = -EBADF;
 
@@ -1003,7 +1006,7 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
 	if (f.file) {
 		ret = -ESPIPE;
 		if (f.file->f_mode & FMODE_PREAD)
-			ret = vfs_readv(f.file, vec, vlen, &pos, 0);
+			ret = vfs_readv(f.file, vec, vlen, &pos, flags);
 		fdput(f);
 	}
 
@@ -1013,10 +1016,9 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
 	return ret;
 }
 
-SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
-		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
+			  unsigned long vlen, loff_t pos, int flags)
 {
-	loff_t pos = pos_from_hilo(pos_h, pos_l);
 	struct fd f;
 	ssize_t ret = -EBADF;
 
@@ -1027,7 +1029,7 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
 	if (f.file) {
 		ret = -ESPIPE;
 		if (f.file->f_mode & FMODE_PWRITE)
-			ret = vfs_writev(f.file, vec, vlen, &pos, 0);
+			ret = vfs_writev(f.file, vec, vlen, &pos, flags);
 		fdput(f);
 	}
 
@@ -1037,11 +1039,63 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
 	return ret;
 }
 
+SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
+		unsigned long, vlen)
+{
+	return do_readv(fd, vec, vlen, 0);
+}
+
+SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
+		unsigned long, vlen)
+{
+	return do_writev(fd, vec, vlen, 0);
+}
+
+SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
+		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+{
+	loff_t pos = pos_from_hilo(pos_h, pos_l);
+
+	return do_preadv(fd, vec, vlen, pos, 0);
+}
+
+SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
+		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
+		int, flags)
+{
+	loff_t pos = pos_from_hilo(pos_h, pos_l);
+
+	if (pos == -1)
+		return do_readv(fd, vec, vlen, flags);
+
+	return do_preadv(fd, vec, vlen, pos, flags);
+}
+
+SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
+		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+{
+	loff_t pos = pos_from_hilo(pos_h, pos_l);
+
+	return do_pwritev(fd, vec, vlen, pos, 0);
+}
+
+SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
+		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
+		int, flags)
+{
+	loff_t pos = pos_from_hilo(pos_h, pos_l);
+
+	if (pos == -1)
+		return do_writev(fd, vec, vlen, flags);
+
+	return do_pwritev(fd, vec, vlen, pos, flags);
+}
+
 #ifdef CONFIG_COMPAT
 
 static ssize_t compat_do_readv_writev(int type, struct file *file,
 			       const struct compat_iovec __user *uvector,
-			       unsigned long nr_segs, loff_t *pos)
+			       unsigned long nr_segs, loff_t *pos, int flags)
 {
 	compat_ssize_t tot_len;
 	struct iovec iovstack[UIO_FASTIOV];
@@ -1075,7 +1129,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
 
 	if (iter_fn)
 		ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
-						pos, iter_fn, 0);
+						pos, iter_fn, flags);
 	else if (fnv)
 		ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
 						pos, fnv);
@@ -1099,7 +1153,7 @@ out:
 
 static size_t compat_readv(struct file *file,
 			   const struct compat_iovec __user *vec,
-			   unsigned long vlen, loff_t *pos)
+			   unsigned long vlen, loff_t *pos, int flags)
 {
 	ssize_t ret = -EBADF;
 
@@ -1109,8 +1163,10 @@ static size_t compat_readv(struct file *file,
 	ret = -EINVAL;
 	if (!(file->f_mode & FMODE_CAN_READ))
 		goto out;
+	if (flags & ~0)
+		goto out;
 
-	ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
+	ret = compat_do_readv_writev(READ, file, vec, vlen, pos, flags);
 
 out:
 	if (ret > 0)
@@ -1119,9 +1175,9 @@ out:
 	return ret;
 }
 
-COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
-		const struct compat_iovec __user *,vec,
-		compat_ulong_t, vlen)
+static size_t __compat_sys_readv(compat_ulong_t fd,
+				 const struct compat_iovec __user *vec,
+				 compat_ulong_t vlen, int flags)
 {
 	struct fd f = fdget_pos(fd);
 	ssize_t ret;
@@ -1130,16 +1186,24 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
 	if (!f.file)
 		return -EBADF;
 	pos = f.file->f_pos;
-	ret = compat_readv(f.file, vec, vlen, &pos);
+	ret = compat_readv(f.file, vec, vlen, &pos, flags);
 	if (ret >= 0)
 		f.file->f_pos = pos;
 	fdput_pos(f);
 	return ret;
+
+}
+
+COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
+		const struct compat_iovec __user *,vec,
+		compat_ulong_t, vlen)
+{
+	return __compat_sys_readv(fd, vec, vlen, 0);
 }
 
 static long __compat_sys_preadv64(unsigned long fd,
 				  const struct compat_iovec __user *vec,
-				  unsigned long vlen, loff_t pos)
+				  unsigned long vlen, loff_t pos, int flags)
 {
 	struct fd f;
 	ssize_t ret;
@@ -1151,7 +1215,7 @@ static long __compat_sys_preadv64(unsigned long fd,
 		return -EBADF;
 	ret = -ESPIPE;
 	if (f.file->f_mode & FMODE_PREAD)
-		ret = compat_readv(f.file, vec, vlen, &pos);
+		ret = compat_readv(f.file, vec, vlen, &pos, flags);
 	fdput(f);
 	return ret;
 }
@@ -1161,7 +1225,7 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
 		const struct compat_iovec __user *,vec,
 		unsigned long, vlen, loff_t, pos)
 {
-	return __compat_sys_preadv64(fd, vec, vlen, pos);
+	return __compat_sys_preadv64(fd, vec, vlen, pos, 0);
 }
 #endif
 
@@ -1171,12 +1235,25 @@ COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
 {
 	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
 
-	return __compat_sys_preadv64(fd, vec, vlen, pos);
+	return __compat_sys_preadv64(fd, vec, vlen, pos, 0);
+}
+
+COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
+		const struct compat_iovec __user *,vec,
+		compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
+		int, flags)
+{
+	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
+
+	if (pos == -1)
+		return __compat_sys_readv(fd, vec, vlen, flags);
+
+	return __compat_sys_preadv64(fd, vec, vlen, pos, flags);
 }
 
 static size_t compat_writev(struct file *file,
 			    const struct compat_iovec __user *vec,
-			    unsigned long vlen, loff_t *pos)
+			    unsigned long vlen, loff_t *pos, int flags)
 {
 	ssize_t ret = -EBADF;
 
@@ -1186,8 +1263,10 @@ static size_t compat_writev(struct file *file,
 	ret = -EINVAL;
 	if (!(file->f_mode & FMODE_CAN_WRITE))
 		goto out;
+	if (flags & ~0)
+		goto out;
 
-	ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
+	ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos, flags);
 
 out:
 	if (ret > 0)
@@ -1196,9 +1275,9 @@ out:
 	return ret;
 }
 
-COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
-		const struct compat_iovec __user *, vec,
-		compat_ulong_t, vlen)
+static size_t __compat_sys_writev(compat_ulong_t fd,
+				  const struct compat_iovec __user* vec,
+				  compat_ulong_t vlen, int flags)
 {
 	struct fd f = fdget_pos(fd);
 	ssize_t ret;
@@ -1207,28 +1286,36 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
 	if (!f.file)
 		return -EBADF;
 	pos = f.file->f_pos;
-	ret = compat_writev(f.file, vec, vlen, &pos);
+	ret = compat_writev(f.file, vec, vlen, &pos, flags);
 	if (ret >= 0)
 		f.file->f_pos = pos;
 	fdput_pos(f);
 	return ret;
 }
 
+COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
+		const struct compat_iovec __user *, vec,
+		compat_ulong_t, vlen)
+{
+	return __compat_sys_writev(fd, vec, vlen, 0);
+}
+
 static long __compat_sys_pwritev64(unsigned long fd,
 				   const struct compat_iovec __user *vec,
-				   unsigned long vlen, loff_t pos)
+				   unsigned long vlen, loff_t pos, int flags)
 {
 	struct fd f;
 	ssize_t ret;
 
 	if (pos < 0)
 		return -EINVAL;
+
 	f = fdget(fd);
 	if (!f.file)
 		return -EBADF;
 	ret = -ESPIPE;
 	if (f.file->f_mode & FMODE_PWRITE)
-		ret = compat_writev(f.file, vec, vlen, &pos);
+		ret = compat_writev(f.file, vec, vlen, &pos, flags);
 	fdput(f);
 	return ret;
 }
@@ -1238,7 +1325,7 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
 		const struct compat_iovec __user *,vec,
 		unsigned long, vlen, loff_t, pos)
 {
-	return __compat_sys_pwritev64(fd, vec, vlen, pos);
+	return __compat_sys_pwritev64(fd, vec, vlen, pos, 0);
 }
 #endif
 
@@ -1248,8 +1335,21 @@ COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
 {
 	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
 
-	return __compat_sys_pwritev64(fd, vec, vlen, pos);
+	return __compat_sys_pwritev64(fd, vec, vlen, pos, 0);
+}
+
+COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
+		const struct compat_iovec __user *,vec,
+		compat_ulong_t, vlen, u32, pos_low, u32, pos_high, int, flags)
+{
+	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
+
+	if (pos == -1)
+		return __compat_sys_writev(fd, vec, vlen, flags);
+
+	return __compat_sys_pwritev64(fd, vec, vlen, pos, flags);
 }
+
 #endif
 
 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
diff --git a/include/linux/compat.h b/include/linux/compat.h
index ab25814..6e4be9e 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -340,6 +340,12 @@ asmlinkage ssize_t compat_sys_preadv(compat_ulong_t fd,
 asmlinkage ssize_t compat_sys_pwritev(compat_ulong_t fd,
 		const struct compat_iovec __user *vec,
 		compat_ulong_t vlen, u32 pos_low, u32 pos_high);
+asmlinkage ssize_t compat_sys_preadv2(compat_ulong_t fd,
+		const struct compat_iovec __user *vec,
+		compat_ulong_t vlen, u32 pos_low, u32 pos_high, int flags);
+asmlinkage ssize_t compat_sys_pwritev2(compat_ulong_t fd,
+		const struct compat_iovec __user *vec,
+		compat_ulong_t vlen, u32 pos_low, u32 pos_high, int flags);
 
 #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
 asmlinkage long compat_sys_preadv64(unsigned long fd,
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 76d1e38..f25ed7b 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -575,8 +575,14 @@ asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf,
 			     size_t count, loff_t pos);
 asmlinkage long sys_preadv(unsigned long fd, const struct iovec __user *vec,
 			   unsigned long vlen, unsigned long pos_l, unsigned long pos_h);
+asmlinkage long sys_preadv2(unsigned long fd, const struct iovec __user *vec,
+			    unsigned long vlen, unsigned long pos_l, unsigned long pos_h,
+			    int flags);
 asmlinkage long sys_pwritev(unsigned long fd, const struct iovec __user *vec,
 			    unsigned long vlen, unsigned long pos_l, unsigned long pos_h);
+asmlinkage long sys_pwritev2(unsigned long fd, const struct iovec __user *vec,
+			    unsigned long vlen, unsigned long pos_l, unsigned long pos_h,
+			    int flags);
 asmlinkage long sys_getcwd(char __user *buf, unsigned long size);
 asmlinkage long sys_mkdir(const char __user *pathname, umode_t mode);
 asmlinkage long sys_chdir(const char __user *filename);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index e016bd9..4d2c4c5 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -213,6 +213,10 @@ __SC_COMP(__NR_pwrite64, sys_pwrite64, compat_sys_pwrite64)
 __SC_COMP(__NR_preadv, sys_preadv, compat_sys_preadv)
 #define __NR_pwritev 70
 __SC_COMP(__NR_pwritev, sys_pwritev, compat_sys_pwritev)
+#define __NR_preadv2 282
+__SC_COMP(__NR_preadv2, sys_preadv2, compat_sys_preadv2)
+#define __NR_pwritev2 283
+__SC_COMP(__NR_pwritev2, sys_pwritev2, compat_sys_pwritev2)
 
 /* fs/sendfile.c */
 #define __NR3264_sendfile 71
@@ -711,7 +715,7 @@ __SYSCALL(__NR_bpf, sys_bpf)
 __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
 
 #undef __NR_syscalls
-#define __NR_syscalls 282
+#define __NR_syscalls 284
 
 /*
  * All syscalls below here should go away really,
diff --git a/mm/filemap.c b/mm/filemap.c
index ad72420..7865f64 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1453,6 +1453,7 @@ static void shrink_readahead_size_eio(struct file *filp,
  * @ppos:	current file position
  * @iter:	data destination
  * @written:	already copied
+ * @flags:	optional flags
  *
  * This is a generic file read routine, and uses the
  * mapping->a_ops->readpage() function for the actual low-level stuff.
@@ -1461,7 +1462,7 @@ static void shrink_readahead_size_eio(struct file *filp,
  * of the logic when it comes to error handling etc.
  */
 static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
-		struct iov_iter *iter, ssize_t written)
+		struct iov_iter *iter, ssize_t written, int flags)
 {
 	struct address_space *mapping = filp->f_mapping;
 	struct inode *inode = mapping->host;
@@ -1732,7 +1733,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 		}
 	}
 
-	retval = do_generic_file_read(file, ppos, iter, retval);
+	retval = do_generic_file_read(file, ppos, iter, retval, iocb->ki_rwflags);
 out:
 	return retval;
 }
-- 
1.9.1

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH v7 3/5] x86: wire up preadv2 and pwritev2
From: Milosz Tanski @ 2015-03-16 18:27 UTC (permalink / raw)
  To: linux-kernel
  Cc: Christoph Hellwig, linux-fsdevel, linux-aio, Mel Gorman,
	Volker Lendecke, Tejun Heo, Jeff Moyer, Theodore Ts'o,
	Al Viro, linux-api, Michael Kerrisk, linux-arch, Dave Chinner,
	Andrew Morton
In-Reply-To: <cover.1426528417.git.milosz@adfin.com>

Signed-off-by: Milosz Tanski <milosz@adfin.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 arch/x86/syscalls/syscall_32.tbl | 2 ++
 arch/x86/syscalls/syscall_64.tbl | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index b3560ec..b37aa9c 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -365,3 +365,5 @@
 356	i386	memfd_create		sys_memfd_create
 357	i386	bpf			sys_bpf
 358	i386	execveat		sys_execveat			stub32_execveat
+359	i386	preadv2			sys_preadv2
+360	i386	pwritev2		sys_pwritev2
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 8d656fb..3802ebf 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -329,6 +329,8 @@
 320	common	kexec_file_load		sys_kexec_file_load
 321	common	bpf			sys_bpf
 322	64	execveat		stub_execveat
+323	64	preadv2			sys_preadv2
+324	64	pwritev2		sys_pwritev2
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
-- 
1.9.1

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH v7 4/5] vfs: RWF_NONBLOCK flag for preadv2
From: Milosz Tanski @ 2015-03-16 18:27 UTC (permalink / raw)
  To: linux-kernel
  Cc: Christoph Hellwig, linux-fsdevel, linux-aio, Mel Gorman,
	Volker Lendecke, Tejun Heo, Jeff Moyer, Theodore Ts'o,
	Al Viro, linux-api, Michael Kerrisk, linux-arch, Dave Chinner,
	Andrew Morton
In-Reply-To: <cover.1426528417.git.milosz@adfin.com>

generic_file_read_iter() supports a new flag RWF_NONBLOCK which says that we
only want to read the data if it's already in the page cache.

Additionally, a few filesystems have to bail out early when RWF_NONBLOCK is
set, because the operation would block. Christoph Hellwig contributed this
code.

Signed-off-by: Milosz Tanski <milosz@adfin.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Acked-by: Sage Weil <sage@redhat.com>
---
 fs/ceph/file.c     |  2 ++
 fs/cifs/file.c     |  6 ++++++
 fs/nfs/file.c      |  5 ++++-
 fs/ocfs2/file.c    |  6 ++++++
 fs/pipe.c          |  3 ++-
 fs/read_write.c    | 44 ++++++++++++++++++++++++++++++--------------
 fs/xfs/xfs_file.c  |  4 ++++
 include/linux/fs.h |  2 ++
 mm/filemap.c       | 18 ++++++++++++++++++
 mm/shmem.c         |  4 ++++
 10 files changed, 78 insertions(+), 16 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d533075..78bdde3 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -831,6 +831,8 @@ again:
 	if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
 	    (iocb->ki_filp->f_flags & O_DIRECT) ||
 	    (fi->flags & CEPH_F_SYNC)) {
+		if (iocb->ki_rwflags & O_NONBLOCK)
+			return -EAGAIN;
 
 		dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n",
 		     inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index a94b3e6..1d16b5a 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3003,6 +3003,9 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
 	struct cifs_readdata *rdata, *tmp;
 	struct list_head rdata_list;
 
+	if (iocb->ki_rwflags & RWF_NONBLOCK)
+		return -EAGAIN;
+
 	len = iov_iter_count(to);
 	if (!len)
 		return 0;
@@ -3121,6 +3124,9 @@ cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to)
 	    ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
 		return generic_file_read_iter(iocb, to);
 
+	if (iocb->ki_rwflags & RWF_NONBLOCK)
+		return -EAGAIN;
+
 	/*
 	 * We need to hold the sem to be sure nobody modifies lock list
 	 * with a brlock that prevents reading.
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index e679d24..58c21d7 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -171,8 +171,11 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
 	struct inode *inode = file_inode(iocb->ki_filp);
 	ssize_t result;
 
-	if (iocb->ki_filp->f_flags & O_DIRECT)
+	if (iocb->ki_filp->f_flags & O_DIRECT) {
+		if (iocb->ki_rwflags & O_NONBLOCK)
+			return -EAGAIN;
 		return nfs_file_direct_read(iocb, to, iocb->ki_pos);
+	}
 
 	dprintk("NFS: read(%pD2, %zu@%lu)\n",
 		iocb->ki_filp,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 46e0d4e..c155752 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2536,6 +2536,12 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
 			filp->f_path.dentry->d_name.name,
 			to->nr_segs);	/* GRRRRR */
 
+	/*
+	 * No non-blocking reads for ocfs2 for now.  Might be doable with
+	 * non-blocking cluster lock helpers.
+	 */
+	if (iocb->ki_rwflags & RWF_NONBLOCK)
+		return -EAGAIN;
 
 	if (!inode) {
 		ret = -EINVAL;
diff --git a/fs/pipe.c b/fs/pipe.c
index 21981e5..212bf68 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -302,7 +302,8 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
 			 */
 			if (ret)
 				break;
-			if (filp->f_flags & O_NONBLOCK) {
+			if ((filp->f_flags & O_NONBLOCK) ||
+			    (iocb->ki_rwflags & RWF_NONBLOCK)) {
 				ret = -EAGAIN;
 				break;
 			}
diff --git a/fs/read_write.c b/fs/read_write.c
index e91f46e..339477b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -893,14 +893,19 @@ static ssize_t do_readv_writev(int type, struct file *file,
 		file_start_write(file);
 	}
 
-	if (iter_fn)
+	if (iter_fn) {
 		ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
 						pos, iter_fn, flags);
-	else if (fnv)
-		ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
-						pos, fnv);
-	else
-		ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
+	} else {
+		if (type == READ && (flags & RWF_NONBLOCK))
+			return -EAGAIN;
+
+		if (fnv)
+			ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
+							pos, fnv);
+		else
+			ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
+	}
 
 	if (type != READ)
 		file_end_write(file);
@@ -924,8 +929,10 @@ ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
 		return -EBADF;
 	if (!(file->f_mode & FMODE_CAN_READ))
 		return -EINVAL;
-	if (flags & ~0)
+	if (flags & ~RWF_NONBLOCK)
 		return -EINVAL;
+	if ((file->f_flags & O_DIRECT) && (flags & RWF_NONBLOCK))
+		return -EAGAIN;
 
 	return do_readv_writev(READ, file, vec, vlen, pos, flags);
 }
@@ -1127,14 +1134,19 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
 		file_start_write(file);
 	}
 
-	if (iter_fn)
+	if (iter_fn) {
 		ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
 						pos, iter_fn, flags);
-	else if (fnv)
-		ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
-						pos, fnv);
-	else
-		ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
+	} else {
+		if (type == READ && (flags & RWF_NONBLOCK))
+			return -EAGAIN;
+
+		if (fnv)
+			ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
+							pos, fnv);
+		else
+			ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
+	}
 
 	if (type != READ)
 		file_end_write(file);
@@ -1163,7 +1175,11 @@ static size_t compat_readv(struct file *file,
 	ret = -EINVAL;
 	if (!(file->f_mode & FMODE_CAN_READ))
 		goto out;
-	if (flags & ~0)
+	if (flags & ~RWF_NONBLOCK)
+		goto out;
+
+	ret = -EAGAIN;
+	if ((file->f_flags & O_DIRECT) && (flags & RWF_NONBLOCK))
 		goto out;
 
 	ret = compat_do_readv_writev(READ, file, vec, vlen, pos, flags);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a2e1cb8..a38ddc1 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -280,6 +280,10 @@ xfs_file_read_iter(
 
 	XFS_STATS_INC(xs_read_calls);
 
+	/* XXX: need a non-blocking iolock helper, shouldn't be too hard */
+	if (iocb->ki_rwflags & RWF_NONBLOCK)
+		return -EAGAIN;
+
 	if (unlikely(file->f_flags & O_DIRECT))
 		ioflags |= XFS_IO_ISDIRECT;
 	if (file->f_mode & FMODE_NOCMTIME)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c018335..fb2de58 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1531,6 +1531,8 @@ struct block_device_operations;
 #define NOMMU_VMFLAGS \
 	(NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC)
 
+/* These flags are used for the readv/writev syscalls with flags. */
+#define RWF_NONBLOCK 0x00000001
 
 struct iov_iter;
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 7865f64..ad789e0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1490,6 +1490,8 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
 find_page:
 		page = find_get_page(mapping, index);
 		if (!page) {
+			if (flags & RWF_NONBLOCK)
+				goto would_block;
 			page_cache_sync_readahead(mapping,
 					ra, filp,
 					index, last_index - index);
@@ -1581,6 +1583,11 @@ page_ok:
 		continue;
 
 page_not_up_to_date:
+		if (flags & RWF_NONBLOCK) {
+			page_cache_release(page);
+			goto would_block;
+		}
+
 		/* Get exclusive access to the page ... */
 		error = lock_page_killable(page);
 		if (unlikely(error))
@@ -1600,6 +1607,12 @@ page_not_up_to_date_locked:
 			goto page_ok;
 		}
 
+		if (flags & RWF_NONBLOCK) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto would_block;
+		}
+
 readpage:
 		/*
 		 * A previous I/O error may have been due to temporary
@@ -1670,6 +1683,8 @@ no_cached_page:
 		goto readpage;
 	}
 
+would_block:
+	error = -EAGAIN;
 out:
 	ra->prev_pos = prev_index;
 	ra->prev_pos <<= PAGE_CACHE_SHIFT;
@@ -1702,6 +1717,9 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 		size_t count = iov_iter_count(iter);
 		loff_t size;
 
+		if (iocb->ki_rwflags & RWF_NONBLOCK)
+			return -EAGAIN;
+
 		if (!count)
 			goto out; /* skip atime */
 		size = i_size_read(inode);
diff --git a/mm/shmem.c b/mm/shmem.c
index cf2d0ca..c5b78f8 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1528,6 +1528,10 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	ssize_t retval = 0;
 	loff_t *ppos = &iocb->ki_pos;
 
+	/* XXX: should be easily supportable */
+	if (iocb->ki_rwflags & RWF_NONBLOCK)
+		return -EAGAIN;
+
 	/*
 	 * Might this read be for a stacking filesystem?  Then when reading
 	 * holes of a sparse file, we actually need to allocate those pages,
-- 
1.9.1

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH v7 5/5] xfs: add RWF_NONBLOCK support
From: Milosz Tanski @ 2015-03-16 18:27 UTC (permalink / raw)
  To: linux-kernel
  Cc: Christoph Hellwig, Christoph Hellwig, linux-fsdevel, linux-aio,
	Mel Gorman, Volker Lendecke, Tejun Heo, Jeff Moyer,
	Theodore Ts'o, Al Viro, linux-api, Michael Kerrisk,
	linux-arch, Dave Chinner, Andrew Morton
In-Reply-To: <cover.1426528417.git.milosz@adfin.com>

From: Christoph Hellwig <hch@lst.de>

Add support for non-blocking reads.  The guts are handled by the generic
code, the only addition is a non-blocking variant of xfs_rw_ilock.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_file.c | 32 +++++++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a38ddc1..69333a7 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -59,6 +59,25 @@ xfs_rw_ilock(
 	xfs_ilock(ip, type);
 }
 
+static inline bool
+xfs_rw_ilock_nowait(
+	struct xfs_inode	*ip,
+	int			type)
+{
+	if (type & XFS_IOLOCK_EXCL) {
+		if (!mutex_trylock(&VFS_I(ip)->i_mutex))
+			return false;
+		if (!xfs_ilock_nowait(ip, type)) {
+			mutex_unlock(&VFS_I(ip)->i_mutex);
+			return false;
+		}
+	} else {
+		if (!xfs_ilock_nowait(ip, type))
+			return false;
+	}
+	return true;
+}
+
 static inline void
 xfs_rw_iunlock(
 	struct xfs_inode	*ip,
@@ -280,10 +299,6 @@ xfs_file_read_iter(
 
 	XFS_STATS_INC(xs_read_calls);
 
-	/* XXX: need a non-blocking iolock helper, shouldn't be too hard */
-	if (iocb->ki_rwflags & RWF_NONBLOCK)
-		return -EAGAIN;
-
 	if (unlikely(file->f_flags & O_DIRECT))
 		ioflags |= XFS_IO_ISDIRECT;
 	if (file->f_mode & FMODE_NOCMTIME)
@@ -321,7 +336,14 @@ xfs_file_read_iter(
 	 * This allows the normal direct IO case of no page cache pages to
 	 * proceeed concurrently without serialisation.
 	 */
-	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+	if (iocb->ki_rwflags & RWF_NONBLOCK) {
+		if (ioflags & XFS_IO_ISDIRECT)
+			return -EAGAIN;
+		if (!xfs_rw_ilock_nowait(ip, XFS_IOLOCK_SHARED))
+			return -EAGAIN;
+	} else {
+		xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+	}
 	if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
 		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
-- 
1.9.1

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH] Add preadv2/pwritev2 documentation.
From: Milosz Tanski @ 2015-03-16 18:32 UTC (permalink / raw)
  To: linux-kernel
  Cc: Christoph Hellwig, linux-fsdevel, linux-aio, Mel Gorman,
	Volker Lendecke, Tejun Heo, Jeff Moyer, Theodore Ts'o,
	Al Viro, linux-api, Michael Kerrisk, linux-arch, Dave Chinner,
	Andrew Morton
In-Reply-To: <cover.1426528417.git.milosz@adfin.com>

New syscalls that are a variation on the preadv/pwritev but support an extra
flag argument.

Signed-off-by: Milosz Tanski <milosz@adfin.com>
Suggested-by: Jeff Moyer <jmoyer@redhat.com>
Fixes: Jeff Moyer <jmoyer@redhat.com>
---
 man2/readv.2 | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 61 insertions(+), 10 deletions(-)

diff --git a/man2/readv.2 b/man2/readv.2
index 756a23f..83265c6 100644
--- a/man2/readv.2
+++ b/man2/readv.2
@@ -45,6 +45,12 @@ readv, writev, preadv, pwritev \- read or write data into multiple buffers
 .sp
 .BI "ssize_t pwritev(int " fd ", const struct iovec *" iov ", int " iovcnt ,
 .BI "                off_t " offset );
+.sp
+.BI "ssize_t preadv2(int " fd ", const struct iovec *" iov ", int " iovcnt ,
+.BI "                off_t " offset ", int " flags );
+.sp
+.BI "ssize_t pwritev2(int " fd ", const struct iovec *" iov ", int " iovcnt ,
+.BI "                 off_t " offset ", int " flags );
 .fi
 .sp
 .in -4n
@@ -162,9 +168,9 @@ The
 system call combines the functionality of
 .BR writev ()
 and
-.BR pwrite (2).
+.BR pwrite (2) "."
 It performs the same task as
-.BR writev (),
+.BR writev () ","
 but adds a fourth argument,
 .IR offset ,
 which specifies the file offset at which the output operation
@@ -174,15 +180,41 @@ The file offset is not changed by these system calls.
 The file referred to by
 .I fd
 must be capable of seeking.
+.SS preadv2() and pwritev2()
+
+This pair of system calls has similar functionality to the
+.BR preadv ()
+and
+.BR pwritev ()
+calls, but adds a fifth argument, \fIflags\fP, which modifies the behavior on a per call basis.
+
+Like the
+.BR preadv ()
+and
+.BR pwritev ()
+calls, they accept an \fIoffset\fP argument. Unlike those calls, if the \fIoffset\fP argument is set to -1 then the current file offset is used and updated.
+
+The \fIflags\fP argument to
+.BR preadv2 ()
+and
+.BR pwritev2 ()
+contains a bitwise OR of one or more of the following flags:
+.TP
+.BR RWF_NONBLOCK " (only " preadv2() " since Linux 3.19)"
+Performs a non-blocking operation for regular files (not sockets) opened in buffered mode (not
+.BR O_DIRECT ")."
+
 .SH RETURN VALUE
 On success,
-.BR readv ()
-and
+.BR readv () ","
 .BR preadv ()
-return the number of bytes read;
-.BR writev ()
 and
+.BR preadv2 ()
+return the number of bytes read;
+.BR writev () ","
 .BR pwritev ()
+and
+.BR pwritev2 ()
 return the number of bytes written.
 On error, \-1 is returned, and \fIerrno\fP is set appropriately.
 .SH ERRORS
@@ -191,12 +223,22 @@ The errors are as given for
 and
 .BR write (2).
 Furthermore,
-.BR preadv ()
-and
+.BR preadv () ","
+.BR preadv2 () ","
 .BR pwritev ()
+and
+.BR pwritev2 ()
 can also fail for the same reasons as
 .BR lseek (2).
-Additionally, the following error is defined:
+Additionally, the following errors are defined:
+.TP
+.B EAGAIN
+The operation would block. This is possible if the file descriptor \fIfd\fP refers to a socket and has been marked nonblocking
+.RB ( O_NONBLOCK ),
+or the operation is a
+.BR preadv2 ()
+and the \fIflags\fP argument is set to
+.BR RWF_NONBLOCK .
 .TP
 .B EINVAL
 The sum of the
@@ -207,12 +249,17 @@ value.
 .TP
 .B EINVAL
 The vector count \fIiovcnt\fP is less than zero or greater than the
-permitted maximum.
+permitted maximum. Or, an unknown flag is specified in \fIflags\fP.
 .SH VERSIONS
 .BR preadv ()
 and
 .BR pwritev ()
 first appeared in Linux 2.6.30; library support was added in glibc 2.10.
+.sp
+.BR preadv2 ()
+and
+.BR pwritev2 ()
+first appeared in Linux 4.1.
 .SH CONFORMING TO
 .BR readv (),
 .BR writev ():
@@ -225,6 +272,10 @@ first appeared in Linux 2.6.30; library support was added in glibc 2.10.
 .BR preadv (),
 .BR pwritev ():
 nonstandard, but present also on the modern BSDs.
+.sp
+.BR preadv2 (),
+.BR pwritev2 ():
+nonstandard, Linux extension.
 .SH NOTES
 POSIX.1-2001 allows an implementation to place a limit on
 the number of items that can be passed in
-- 
1.9.1


^ permalink raw reply related

* [PATCH] fstests: generic test for preadv2 behavior on linux
From: Milosz Tanski @ 2015-03-16 18:34 UTC (permalink / raw)
  To: linux-kernel-u79uwXL29TY76Z2rM5mHXA
  Cc: Christoph Hellwig, linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-aio-Bw31MaZKKs3YtjvyW6yDsg, Mel Gorman, Volker Lendecke,
	Tejun Heo, Jeff Moyer, Theodore Ts'o, Al Viro,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Michael Kerrisk,
	linux-arch-u79uwXL29TY76Z2rM5mHXA, Dave Chinner, Andrew Morton
In-Reply-To: <cover.1426528417.git.milosz-B5zB6C1i6pkAvxtiuMwx3w@public.gmane.org>

preadv2 is a new syscall that is like preadv but with a flags
argument. The first use case of this is to let us add a flag to perform a
non-blocking file read using the page cache.
---
 src/Makefile           |   2 +-
 src/preadv2-pwritev2.h |  52 +++++++++++++++++
 src/preadv2.c          | 150 +++++++++++++++++++++++++++++++++++++++++++++++++
 tests/generic/067      |  85 ++++++++++++++++++++++++++++
 tests/generic/067.out  |   9 +++
 tests/generic/group    |   1 +
 6 files changed, 298 insertions(+), 1 deletion(-)
 create mode 100644 src/preadv2-pwritev2.h
 create mode 100644 src/preadv2.c
 create mode 100755 tests/generic/067
 create mode 100644 tests/generic/067.out

diff --git a/src/Makefile b/src/Makefile
index 4781736..f7d3681 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -19,7 +19,7 @@ LINUX_TARGETS = xfsctl bstat t_mtab getdevicesize preallo_rw_pattern_reader \
 	bulkstat_unlink_test_modified t_dir_offset t_futimens t_immutable \
 	stale_handle pwrite_mmap_blocked t_dir_offset2 seek_sanity_test \
 	seek_copy_test t_readdir_1 t_readdir_2 fsync-tester nsexec cloner \
-	renameat2 t_getcwd e4compact
+	renameat2 t_getcwd e4compact preadv2
 
 SUBDIRS =
 
diff --git a/src/preadv2-pwritev2.h b/src/preadv2-pwritev2.h
new file mode 100644
index 0000000..786e524
--- /dev/null
+++ b/src/preadv2-pwritev2.h
@@ -0,0 +1,52 @@
+#ifndef PREADV2_PWRITEV2_H
+#define PREADV2_PWRITEV2_H
+
+#include "global.h"
+
+#ifndef HAVE_PREADV2
+#include <sys/syscall.h>
+
+#if !defined(SYS_preadv2) && defined(__x86_64__)
+#define SYS_preadv2 323
+#define SYS_pwritev2 324
+#endif
+
+#if !defined (SYS_preadv2) && defined(__i386__)
+#define SYS_preadv2 359
+#define SYS_pwritev2 360
+#endif
+
+/* LO_HI_LONG taken from glibc */
+#define LO_HI_LONG(val)							\
+  (off_t) val,                                                          \
+  (off_t) ((((uint64_t) (val)) >> (sizeof (long) * 4)) >> (sizeof (long) * 4))
+
+static inline ssize_t
+preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags)
+{
+#ifdef SYS_preadv2
+        return syscall(SYS_preadv2, fd, iov, iovcnt, LO_HI_LONG(offset),
+		       flags);
+#else
+	errno = ENOSYS;
+	return -1;
+#endif
+}
+
+static inline ssize_t
+pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags)
+{
+#ifdef SYS_pwritev2
+        return syscall(SYS_pwritev2, fd, iov, iovcnt, LO_HI_LONG(offset),
+		       flags);
+#else
+	errno = ENOSYS;
+	return -1;
+#endif
+}
+
+#define RWF_NONBLOCK	0x00000001
+#define RWF_DSYNC	0x00000002
+
+#endif /* HAVE_PREADV2 */
+#endif /* PREADV2_PWRITEV2_H */
diff --git a/src/preadv2.c b/src/preadv2.c
new file mode 100644
index 0000000..a4f89b5
--- /dev/null
+++ b/src/preadv2.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright 2014 Red Hat, Inc.  All rights reserved.
+ * Copyright 2015 Milosz Tanski
+ *
+ * License: GPLv2
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <linux/fs.h> /* for RWF_NONBLOCK */
+
+/*
+ * Once preadv2 is part of the upstream kernel and there is glibc support for
+ * it. We'll add support for preadv2 to xfs_io and this will be unnecessary.
+ */
+#include "preadv2-pwritev2.h"
+
+/*
+ * Test to see if the system call is implemented.  If -EINVAL or -ENOSYS
+ * are returned, consider the call unimplemented.  All other errors are
+ * considered success.
+ *
+ * Returns: 0 if the system call is implemented, 1 if the system call
+ * is not implemented.
+ */
+int
+preadv2_check(int fd)
+{
+	int ret;
+	struct iovec iov[] = {};
+
+	/* 0 length read; just check iof the syscall is there.
+         *
+         * - 0 length iovec
+         * - Position is -1 (eg. use current position)
+         */
+	ret = preadv2(fd, iov, 0, -1, 0);
+
+	if (ret < 0) {
+		if (errno == ENOSYS || errno == EINVAL)
+			return 1;
+	}
+
+	return 0;
+}
+
+void
+usage(char *prog)
+{
+	fprintf(stderr, "Usage: %s [-v] [-ctdw] [-n] -p POS -l LEN <filename>\n\n", prog);
+	fprintf(stderr, "General arguments:\n");
+	fprintf(stderr, "  -v Verify that the syscall is supported and quit:\n");
+	fprintf(stderr, "\n");
+	fprintf(stderr, "Open arguments:\n");
+	fprintf(stderr, "  -c Open file with O_CREAT flag\n");
+	fprintf(stderr, "  -t Open file with O_TRUNC flag\n");
+	fprintf(stderr, "  -d Open file with O_DIRECT flag\n");
+	fprintf(stderr, "  -w Open file with O_RDWR flag vs O_RDONLY (default)\n");
+	fprintf(stderr, "\n");
+	fprintf(stderr, "preadv2 arguments:\n");
+	fprintf(stderr, "  -n use RWF_NONBLOCK when performing read\n");
+	fprintf(stderr, "  -p POS offset file to read at\n");
+	fprintf(stderr, "  -l LEN length of file data to read\n");
+	fprintf(stderr, "\n");
+	fflush(stderr);
+}
+
+int
+main(int argc, char **argv)
+{
+	int fd;
+	int ret;
+	int opt;
+	off_t pos = -1;
+	struct iovec iov = { NULL, 0 };
+	int o_flags = 0;
+	int r_flags = 0;
+	char *filename;
+
+	while ((opt = getopt(argc, argv, "vctdwnp:l:")) != -1) {
+		switch (opt) {
+		case 'v':
+			/*
+			 * See if we were called to check for availability of
+			 * sys_preadv2. STDIN is okay, since we do a zero
+			 * length read (see man 2 read).
+			 */
+			ret = preadv2_check(STDIN_FILENO);
+			exit(ret);
+		case 'c':
+			o_flags |= O_CREAT;
+			break;
+		case 't':
+			o_flags |= O_TRUNC;
+			break;
+		case 'd':
+			o_flags |= O_DIRECT;
+			break;
+		case 'w':
+			o_flags |= O_RDWR;
+			break;
+		case 'n':
+			r_flags |= RWF_NONBLOCK;
+			break;
+		case 'p':
+			pos = atoll(optarg);
+			break;
+		case 'l':
+			iov.iov_len = atoll(optarg);
+			break;
+		default:
+			fprintf(stderr, "invalid option: %c\n", opt);
+			usage(argv[0]);
+			exit(1);
+		}
+	}
+
+	if (optind >= argc) {
+		usage(argv[0]);
+		exit(1);
+	}
+
+	if ((o_flags & O_RDWR) != O_RDWR)
+		o_flags |= O_RDONLY;
+
+	if ((iov.iov_base = malloc(iov.iov_len)) == NULL) {
+		perror("malloc");
+		exit(1);
+	}
+
+	filename = argv[optind];
+	fd = open(filename, o_flags);
+
+	if (fd < 0) {
+		perror("open");
+		exit(1);
+	}
+
+	if ((ret = preadv2(fd, &iov, 1, pos, r_flags)) == -1) {
+		perror("preadv2");
+		exit(ret);
+	}
+
+	free(iov.iov_base);
+	exit(0);
+}
diff --git a/tests/generic/067 b/tests/generic/067
new file mode 100755
index 0000000..4cc58f8
--- /dev/null
+++ b/tests/generic/067
@@ -0,0 +1,85 @@
+#! /bin/bash
+# FS QA Test No. 067
+#
+# Test for the preadv2 syscall
+#
+#-----------------------------------------------------------------------
+# Copyright (c) 2015 Milosz Tanski <mtanski-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#-----------------------------------------------------------------------
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1	# failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+    cd /
+    rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# real QA test starts here
+
+# Modify as appropriate.
+_supported_fs generic
+_supported_os Linux
+_require_test
+
+# test file we'll be using
+file=$SCRATCH_MNT/067.preadv2.$$
+
+# Create a file:
+# two regions of data and a hole in the middle
+# use O_DIRECT so it's not in the page cache
+echo "create file"
+$XFS_IO_PROG -t -f -d \
+	-c "pwrite 0 1024" \
+	-c "pwrite 2048 1024" \
+	$file > /dev/null
+
+# Make sure it returns EAGAIN on uncached data
+echo "uncached"
+$here/src/preadv2 -n -p 0 -l 1024 $file
+
+# Make sure we read in the whole file, after that RWF_NONBLOCK should return us all the data
+echo "cached"
+$XFS_IO_PROG -f $file -c "pread 0 4096" $file > /dev/null
+$here/src/preadv2 -n -p 0 -l 1024 $file
+
+# O_DIRECT and RWF_NONBLOCK should return EAGAIN always
+echo "O_DIRECT"
+$here/src/preadv2 -d -n -p 0 -l 1024 $file
+
+# Holes do not block
+echo "holes"
+$here/src/preadv2 -n -p 2048 -l 1024 $file
+
+# EOF behavior (no EAGAIN)
+echo "EOF"
+$here/src/preadv2 -n -p 3072 -l 1 $file
+
+# success, all done
+status=0
+exit
diff --git a/tests/generic/067.out b/tests/generic/067.out
new file mode 100644
index 0000000..6e3740f
--- /dev/null
+++ b/tests/generic/067.out
@@ -0,0 +1,9 @@
+QA output created by 067
+create file
+uncached
+preadv2: Resource temporarily unavailable
+cached
+O_DIRECT
+preadv2: Resource temporarily unavailable
+holes
+EOF
diff --git a/tests/generic/group b/tests/generic/group
index e5db772..91c5870 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -69,6 +69,7 @@
 064 auto quick prealloc
 065 metadata auto quick
 066 metadata auto quick
+067 auto quick rw
 068 other auto freeze dangerous stress
 069 rw udf auto quick
 070 attr udf auto quick stress
-- 
1.9.1

^ permalink raw reply related

* [PATCH v2 tip/core/rcu 01/22] smpboot: Add common code for notification from dying CPU
From: Paul E. McKenney @ 2015-03-16 18:37 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, tglx,
	peterz, rostedt, dhowells, edumazet, dvhart, fweisbec, oleg,
	bobby.prani, Paul E. McKenney, linux-api, linux-arch
In-Reply-To: <20150316183743.GA21453@linux.vnet.ibm.com>

From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>

RCU ignores offlined CPUs, so they cannot safely run RCU read-side code.
(They -can- use SRCU, but not RCU.)  This means that any use of RCU
during or after the call to arch_cpu_idle_dead() is unsafe.  Unfortunately,
commit 2ed53c0d6cc99 added a complete() call, which will contain RCU
read-side critical sections if there is a task waiting to be awakened.

Which, as it turns out, there almost never is.  In my qemu/KVM testing,
the to-be-awakened task is not yet asleep more than 99.5% of the time.
In current mainline, failure is even harder to reproduce, requiring a
virtualized environment that delays the outgoing CPU by at least three
jiffies between the time it exits its stop_machine() task at CPU_DYING
time and the time it calls arch_cpu_idle_dead() from the idle loop.
However, this problem really can occur, especially in virtualized
environments, and therefore really does need to be fixed.

This suggests moving back to the polling loop, but using a much shorter
wait, with gentle exponential backoff instead of the old 100-millisecond
wait.  Most of the time, the loop will exit without waiting at all,
and almost all of the remaining uses will wait only five microseconds.
If the outgoing CPU is preempted, a loop will wait one jiffy, then
increase the wait by a factor of 11/10ths, rounding up.  As before, there
is a five-second timeout.

This commit therefore provides common-code infrastructure to do the
dying-to-surviving CPU handoff in a safe manner.  This code also
provides an indication at CPU-online of whether the CPU to be onlined
previously timed out on offline.  The new cpu_check_up_prepare() function
returns -EBUSY if this CPU previously took more than five seconds to
go offline, or -EAGAIN if it has not yet managed to go offline.  The
rationale for -EAGAIN is that it might still be preempted, so an additional
wait might well find it correctly offlined.  Architecture-specific code
can decide how to handle these conditions.  Systems in which CPUs take
themselves completely offline might respond to an -EBUSY return as if
it was a zero (success) return.  Systems in which the surviving CPU must
take some action might take it at this time, or might simply mark the
other CPU as unusable.

Note that architectures that take the easy way out and simply pass the
-EBUSY and -EAGAIN upwards will change the sysfs API.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: <linux-api@vger.kernel.org>
Cc: <linux-arch@vger.kernel.org>
[ paulmck: Fixed state machine for architectures that don't check earlier
  CPU-hotplug results as suggested by James Hogan. ]
---
 include/linux/cpu.h |  12 ++++
 kernel/smpboot.c    | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 168 insertions(+)

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 4260e8594bd7..4744ef915acd 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -95,6 +95,8 @@ enum {
 					* Called on the new cpu, just before
 					* enabling interrupts. Must not sleep,
 					* must not fail */
+#define CPU_BROKEN		0x000C /* CPU (unsigned)v did not die properly,
+					* perhaps due to preemption. */
 
 /* Used for CPU hotplug events occurring while tasks are frozen due to a suspend
  * operation in progress
@@ -271,4 +273,14 @@ void arch_cpu_idle_enter(void);
 void arch_cpu_idle_exit(void);
 void arch_cpu_idle_dead(void);
 
+DECLARE_PER_CPU(bool, cpu_dead_idle);
+
+int cpu_report_state(int cpu);
+int cpu_check_up_prepare(int cpu);
+void cpu_set_state_online(int cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+bool cpu_wait_death(unsigned int cpu, int seconds);
+bool cpu_report_death(void);
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+
 #endif /* _LINUX_CPU_H_ */
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 40190f28db35..c697f73d82d6 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -4,6 +4,7 @@
 #include <linux/cpu.h>
 #include <linux/err.h>
 #include <linux/smp.h>
+#include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/slab.h>
@@ -314,3 +315,158 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
 	put_online_cpus();
 }
 EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
+
+static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
+
+/*
+ * Called to poll specified CPU's state, for example, when waiting for
+ * a CPU to come online.
+ */
+int cpu_report_state(int cpu)
+{
+	return atomic_read(&per_cpu(cpu_hotplug_state, cpu));
+}
+
+/*
+ * If CPU has died properly, set its state to CPU_UP_PREPARE and
+ * return success.  Otherwise, return -EBUSY if the CPU died after
+ * cpu_wait_death() timed out.  And yet otherwise again, return -EAGAIN
+ * if cpu_wait_death() timed out and the CPU still hasn't gotten around
+ * to dying.  In the latter two cases, the CPU might not be set up
+ * properly, but it is up to the arch-specific code to decide.
+ * Finally, -EIO indicates an unanticipated problem.
+ *
+ * Note that it is permissible to omit this call entirely, as is
+ * done in architectures that do no CPU-hotplug error checking.
+ */
+int cpu_check_up_prepare(int cpu)
+{
+	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
+		atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
+		return 0;
+	}
+
+	switch (atomic_read(&per_cpu(cpu_hotplug_state, cpu))) {
+
+	case CPU_POST_DEAD:
+
+		/* The CPU died properly, so just start it up again. */
+		atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
+		return 0;
+
+	case CPU_DEAD_FROZEN:
+
+		/*
+		 * Timeout during CPU death, so let caller know.
+		 * The outgoing CPU completed its processing, but after
+		 * cpu_wait_death() timed out and reported the error. The
+		 * caller is free to proceed, in which case the state
+		 * will be reset properly by cpu_set_state_online().
+		 * Proceeding despite this -EBUSY return makes sense
+		 * for systems where the outgoing CPUs take themselves
+		 * offline, with no post-death manipulation required from
+		 * a surviving CPU.
+		 */
+		return -EBUSY;
+
+	case CPU_BROKEN:
+
+		/*
+		 * The most likely reason we got here is that there was
+		 * a timeout during CPU death, and the outgoing CPU never
+		 * did complete its processing.  This could happen on
+		 * a virtualized system if the outgoing VCPU gets preempted
+		 * for more than five seconds, and the user attempts to
+		 * immediately online that same CPU.  Trying again later
+		 * might return -EBUSY above, hence -EAGAIN.
+		 */
+		return -EAGAIN;
+
+	default:
+
+		/* Should not happen.  Famous last words. */
+		return -EIO;
+	}
+}
+
+/*
+ * Mark the specified CPU online.
+ *
+ * Note that it is permissible to omit this call entirely, as is
+ * done in architectures that do no CPU-hotplug error checking.
+ */
+void cpu_set_state_online(int cpu)
+{
+	(void)atomic_xchg(&per_cpu(cpu_hotplug_state, cpu), CPU_ONLINE);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Wait for the specified CPU to exit the idle loop and die.
+ */
+bool cpu_wait_death(unsigned int cpu, int seconds)
+{
+	int jf_left = seconds * HZ;
+	int oldstate;
+	bool ret = true;
+	int sleep_jf = 1;
+
+	might_sleep();
+
+	/* The outgoing CPU will normally get done quite quickly. */
+	if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD)
+		goto update_state;
+	udelay(5);
+
+	/* But if the outgoing CPU dawdles, wait increasingly long times. */
+	while (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) != CPU_DEAD) {
+		schedule_timeout_uninterruptible(sleep_jf);
+		jf_left -= sleep_jf;
+		if (jf_left <= 0)
+			break;
+		sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10);
+	}
+update_state:
+	oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
+	if (oldstate == CPU_DEAD) {
+		/* Outgoing CPU died normally, update state. */
+		smp_mb(); /* atomic_read() before update. */
+		atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD);
+	} else {
+		/* Outgoing CPU still hasn't died, set state accordingly. */
+		if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
+				   oldstate, CPU_BROKEN) != oldstate)
+			goto update_state;
+		ret = false;
+	}
+	return ret;
+}
+
+/*
+ * Called by the outgoing CPU to report its successful death.  Return
+ * false if this report follows the surviving CPU's timing out.
+ *
+ * A separate "CPU_DEAD_FROZEN" is used when the surviving CPU
+ * timed out.  This approach allows architectures to omit calls to
+ * cpu_check_up_prepare() and cpu_set_state_online() without defeating
+ * the next cpu_wait_death()'s polling loop.
+ */
+bool cpu_report_death(void)
+{
+	int oldstate;
+	int newstate;
+	int cpu = smp_processor_id();
+
+	do {
+		oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
+		if (oldstate != CPU_BROKEN)
+			newstate = CPU_DEAD;
+		else
+			newstate = CPU_DEAD_FROZEN;
+	} while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
+				oldstate, newstate) != oldstate);
+	return newstate == CPU_DEAD;
+}
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-- 
1.8.1.5

^ permalink raw reply related

* [Patch] firmware: dmi_scan: split dmi subsystem from dmi-sysfs
From: Ivan Khoronzhuk @ 2015-03-16 20:57 UTC (permalink / raw)
  To: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	matt.fleming-ral2JQCrhuEAvxtiuMwx3w, jdelvare-l3A5Bk7waGM,
	ard.biesheuvel-QSEj5FYQhm4dnm+yROfE0A,
	grant.likely-QSEj5FYQhm4dnm+yROfE0A,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-doc-u79uwXL29TY76Z2rM5mHXA, mikew-hpIqsD4AKlfQT0dZR+AlfA
  Cc: dmidecode-devel-qX2TKyscuCcdnm+yROfE0A,
	leif.lindholm-QSEj5FYQhm4dnm+yROfE0A,
	msalter-H+wXaHxf7aLQT0dZR+AlfA, Ivan Khoronzhuk

Some utils, like dmidecode and smbios, need to access the SMBIOS entry
table area in order to get information like SMBIOS version, size, etc.
Currently this is done via /dev/mem. But in situations where /dev/mem
usage is disabled, the utils have to use dmi sysfs instead, which
doesn't expose the SMBIOS entry point and adds code/delay redundancy when
direct access to the table is needed.

So this patch creates a dmi subsystem and adds the SMBIOS entry point to
allow the utils in question to work correctly without /dev/mem. The patch
also adds the raw dmi table to simplify dmi table processing in user
space, as was proposed by Jean Delvare.

Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk-hExfYMNmJl/Cnp4W7fqMDg@public.gmane.org>
---

This patch is logical continuation of
"[dmidecode] [Patch v4] firmware: dmi-sysfs: add SMBIOS entry point area attribute"
https://lkml.org/lkml/2015/2/4/475

Note that this uses /sys/firmware/dmi for holding the tables instead of
/sys/firmware/dmi/table as was proposed.

 Documentation/ABI/testing/sysfs-firmware-dmi       | 122 +++------------------
 .../ABI/testing/sysfs-firmware-dmi-entries         | 110 +++++++++++++++++++
 drivers/firmware/dmi-sysfs.c                       |  12 +-
 drivers/firmware/dmi_scan.c                        | 115 +++++++++++++++++--
 include/linux/dmi.h                                |   2 +
 5 files changed, 238 insertions(+), 123 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-firmware-dmi-entries

diff --git a/Documentation/ABI/testing/sysfs-firmware-dmi b/Documentation/ABI/testing/sysfs-firmware-dmi
index c78f9ab..6413128 100644
--- a/Documentation/ABI/testing/sysfs-firmware-dmi
+++ b/Documentation/ABI/testing/sysfs-firmware-dmi
@@ -1,110 +1,16 @@
 What:		/sys/firmware/dmi/
-Date:		February 2011
-Contact:	Mike Waychison <mikew-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
+Date:		March 2015
+Contact:	Ivan Khoronzhuk <ivan.khoronzhuk-hExfYMNmJl/Cnp4W7fqMDg@public.gmane.org>
 Description:
-		Many machines' firmware (x86 and ia64) export DMI /
-		SMBIOS tables to the operating system.  Getting at this
-		information is often valuable to userland, especially in
-		cases where there are OEM extensions used.
-
-		The kernel itself does not rely on the majority of the
-		information in these tables being correct.  It equally
-		cannot ensure that the data as exported to userland is
-		without error either.
-
-		DMI is structured as a large table of entries, where
-		each entry has a common header indicating the type and
-		length of the entry, as well as a firmware-provided
-		'handle' that is supposed to be unique amongst all
-		entries.
-
-		Some entries are required by the specification, but many
-		others are optional.  In general though, users should
-		never expect to find a specific entry type on their
-		system unless they know for certain what their firmware
-		is doing.  Machine to machine experiences will vary.
-
-		Multiple entries of the same type are allowed.  In order
-		to handle these duplicate entry types, each entry is
-		assigned by the operating system an 'instance', which is
-		derived from an entry type's ordinal position.  That is
-		to say, if there are 'N' multiple entries with the same type
-		'T' in the DMI tables (adjacent or spread apart, it
-		doesn't matter), they will be represented in sysfs as
-		entries "T-0" through "T-(N-1)":
-
-		Example entry directories:
-
-			/sys/firmware/dmi/entries/17-0
-			/sys/firmware/dmi/entries/17-1
-			/sys/firmware/dmi/entries/17-2
-			/sys/firmware/dmi/entries/17-3
-			...
-
-		Instance numbers are used in lieu of the firmware
-		assigned entry handles as the kernel itself makes no
-		guarantees that handles as exported are unique, and
-		there are likely firmware images that get this wrong in
-		the wild.
-
-		Each DMI entry in sysfs has the common header values
-		exported as attributes:
-
-		handle	: The 16bit 'handle' that is assigned to this
-			  entry by the firmware.  This handle may be
-			  referred to by other entries.
-		length	: The length of the entry, as presented in the
-			  entry itself.  Note that this is _not the
-			  total count of bytes associated with the
-			  entry_.  This value represents the length of
-			  the "formatted" portion of the entry.  This
-			  "formatted" region is sometimes followed by
-			  the "unformatted" region composed of nul
-			  terminated strings, with termination signalled
-			  by a two nul characters in series.
-		raw	: The raw bytes of the entry. This includes the
-			  "formatted" portion of the entry, the
-			  "unformatted" strings portion of the entry,
-			  and the two terminating nul characters.
-		type	: The type of the entry.  This value is the same
-			  as found in the directory name.  It indicates
-			  how the rest of the entry should be interpreted.
-		instance: The instance ordinal of the entry for the
-			  given type.  This value is the same as found
-			  in the parent directory name.
-		position: The ordinal position (zero-based) of the entry
-			  within the entirety of the DMI entry table.
-
-		=== Entry Specialization ===
-
-		Some entry types may have other information available in
-		sysfs.  Not all types are specialized.
-
-		--- Type 15 - System Event Log ---
-
-		This entry allows the firmware to export a log of
-		events the system has taken.  This information is
-		typically backed by nvram, but the implementation
-		details are abstracted by this table.  This entry's data
-		is exported in the directory:
-
-		/sys/firmware/dmi/entries/15-0/system_event_log
-
-		and has the following attributes (documented in the
-		SMBIOS / DMI specification under "System Event Log (Type 15)":
-
-		area_length
-		header_start_offset
-		data_start_offset
-		access_method
-		status
-		change_token
-		access_method_address
-		header_format
-		per_log_type_descriptor_length
-		type_descriptors_supported_count
-
-		As well, the kernel exports the binary attribute:
-
-		raw_event_log	: The raw binary bits of the event log
-				  as described by the DMI entry.
+		The firmware provides DMI structures as a packed list of
+		data referenced by a SMBIOS table entry point. The SMBIOS
+		entry point contains general information, like SMBIOS
+		version, DMI table size, etc. The structure, content and
+		size of the SMBIOS entry point depend on the SMBIOS version.
+		That's why the SMBIOS entry point is exposed in dmi sysfs
+		as a raw attribute and is accessible via
+		/sys/firmware/dmi/smbios_entry_point. The format of the SMBIOS
+		entry point header is described in the SMBIOS specification.
+		To simplify access and reduce processing delay in user
+		space, the subsystem also provides the raw DMI table under
+		/sys/firmware/dmi/dmi_table.
diff --git a/Documentation/ABI/testing/sysfs-firmware-dmi-entries b/Documentation/ABI/testing/sysfs-firmware-dmi-entries
new file mode 100644
index 0000000..c3b4d4c
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-firmware-dmi-entries
@@ -0,0 +1,110 @@
+What:		/sys/firmware/dmi/entries
+Date:		February 2011
+Contact:	Mike Waychison <mikew-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
+Description:
+		Many machines' firmware (x86 and ia64) export DMI /
+		SMBIOS tables to the operating system.  Getting at this
+		information is often valuable to userland, especially in
+		cases where there are OEM extensions used.
+
+		The kernel itself does not rely on the majority of the
+		information in these tables being correct.  It equally
+		cannot ensure that the data as exported to userland is
+		without error either.
+
+		DMI is structured as a large table of entries, where
+		each entry has a common header indicating the type and
+		length of the entry, as well as a firmware-provided
+		'handle' that is supposed to be unique amongst all
+		entries.
+
+		Some entries are required by the specification, but many
+		others are optional.  In general though, users should
+		never expect to find a specific entry type on their
+		system unless they know for certain what their firmware
+		is doing.  Machine to machine experiences will vary.
+
+		Multiple entries of the same type are allowed.  In order
+		to handle these duplicate entry types, each entry is
+		assigned by the operating system an 'instance', which is
+		derived from an entry type's ordinal position.  That is
+		to say, if there are 'N' multiple entries with the same type
+		'T' in the DMI tables (adjacent or spread apart, it
+		doesn't matter), they will be represented in sysfs as
+		entries "T-0" through "T-(N-1)":
+
+		Example entry directories:
+
+			/sys/firmware/dmi/entries/17-0
+			/sys/firmware/dmi/entries/17-1
+			/sys/firmware/dmi/entries/17-2
+			/sys/firmware/dmi/entries/17-3
+			...
+
+		Instance numbers are used in lieu of the firmware
+		assigned entry handles as the kernel itself makes no
+		guarantees that handles as exported are unique, and
+		there are likely firmware images that get this wrong in
+		the wild.
+
+		Each DMI entry in sysfs has the common header values
+		exported as attributes:
+
+		handle	: The 16bit 'handle' that is assigned to this
+			  entry by the firmware.  This handle may be
+			  referred to by other entries.
+		length	: The length of the entry, as presented in the
+			  entry itself.  Note that this is _not the
+			  total count of bytes associated with the
+			  entry_.  This value represents the length of
+			  the "formatted" portion of the entry.  This
+			  "formatted" region is sometimes followed by
+			  the "unformatted" region composed of nul
+			  terminated strings, with termination signalled
+			  by two nul characters in series.
+		raw	: The raw bytes of the entry. This includes the
+			  "formatted" portion of the entry, the
+			  "unformatted" strings portion of the entry,
+			  and the two terminating nul characters.
+		type	: The type of the entry.  This value is the same
+			  as found in the directory name.  It indicates
+			  how the rest of the entry should be interpreted.
+		instance: The instance ordinal of the entry for the
+			  given type.  This value is the same as found
+			  in the parent directory name.
+		position: The ordinal position (zero-based) of the entry
+			  within the entirety of the DMI entry table.
+
+		=== Entry Specialization ===
+
+		Some entry types may have other information available in
+		sysfs.  Not all types are specialized.
+
+		--- Type 15 - System Event Log ---
+
+		This entry allows the firmware to export a log of
+		events the system has taken.  This information is
+		typically backed by nvram, but the implementation
+		details are abstracted by this table.  This entry's data
+		is exported in the directory:
+
+		/sys/firmware/dmi/entries/15-0/system_event_log
+
+		and has the following attributes (documented in the
+		SMBIOS / DMI specification under "System Event Log (Type 15)"):
+
+		area_length
+		header_start_offset
+		data_start_offset
+		access_method
+		status
+		change_token
+		access_method_address
+		header_format
+		per_log_type_descriptor_length
+		type_descriptors_supported_count
+
+		As well, the kernel exports the binary attribute:
+
+		raw_event_log	: The raw binary bits of the event log
+				  as described by the DMI entry.
diff --git a/drivers/firmware/dmi-sysfs.c b/drivers/firmware/dmi-sysfs.c
index e0f1cb3..390067d 100644
--- a/drivers/firmware/dmi-sysfs.c
+++ b/drivers/firmware/dmi-sysfs.c
@@ -566,7 +566,6 @@ static struct kobj_type dmi_sysfs_entry_ktype = {
 	.default_attrs = dmi_sysfs_entry_attrs,
 };
 
-static struct kobject *dmi_kobj;
 static struct kset *dmi_kset;
 
 /* Global count of all instances seen.  Only for setup */
@@ -651,10 +650,10 @@ static int __init dmi_sysfs_init(void)
 	int error = -ENOMEM;
 	int val;
 
-	/* Set up our directory */
-	dmi_kobj = kobject_create_and_add("dmi", firmware_kobj);
-	if (!dmi_kobj)
-		goto err;
+	if (!dmi_kobj) {
+		pr_err("dmi-sysfs: dmi subsysterm is absent.\n");
+		return -EINVAL;
+	}
 
 	dmi_kset = kset_create_and_add("entries", NULL, dmi_kobj);
 	if (!dmi_kset)
@@ -675,7 +674,6 @@ static int __init dmi_sysfs_init(void)
 err:
 	cleanup_entry_list();
 	kset_unregister(dmi_kset);
-	kobject_put(dmi_kobj);
 	return error;
 }
 
@@ -685,8 +683,6 @@ static void __exit dmi_sysfs_exit(void)
 	pr_debug("dmi-sysfs: unloading.\n");
 	cleanup_entry_list();
 	kset_unregister(dmi_kset);
-	kobject_del(dmi_kobj);
-	kobject_put(dmi_kobj);
 }
 
 module_init(dmi_sysfs_init);
diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c
index c9cb725..3fca52a 100644
--- a/drivers/firmware/dmi_scan.c
+++ b/drivers/firmware/dmi_scan.c
@@ -10,6 +10,9 @@
 #include <asm/dmi.h>
 #include <asm/unaligned.h>
 
+struct kobject *dmi_kobj;
+EXPORT_SYMBOL_GPL(dmi_kobj);
+
 /*
  * DMI stands for "Desktop Management Interface".  It is part
  * of and an antecedent to, SMBIOS, which stands for System
@@ -20,6 +23,9 @@ static const char dmi_empty_string[] = "        ";
 static u32 dmi_ver __initdata;
 static u32 dmi_len;
 static u16 dmi_num;
+static u8 smbios_entry_point[32];
+static int smbios_entry_point_size;
+
 /*
  * Catch too early calls to dmi_check_system():
  */
@@ -118,6 +124,7 @@ static void dmi_table(u8 *buf,
 }
 
 static phys_addr_t dmi_base;
+static u8 *dmi_tb;
 
 static int __init dmi_walk_early(void (*decode)(const struct dmi_header *,
 		void *))
@@ -476,6 +483,8 @@ static int __init dmi_present(const u8 *buf)
 	if (memcmp(buf, "_SM_", 4) == 0 &&
 	    buf[5] < 32 && dmi_checksum(buf, buf[5])) {
 		smbios_ver = get_unaligned_be16(buf + 6);
+		smbios_entry_point_size = buf[5];
+		memcpy(smbios_entry_point, buf, smbios_entry_point_size);
 
 		/* Some BIOS report weird SMBIOS version, fix that up */
 		switch (smbios_ver) {
@@ -508,6 +517,8 @@ static int __init dmi_present(const u8 *buf)
 					dmi_ver >> 8, dmi_ver & 0xFF,
 					(dmi_ver < 0x0300) ? "" : ".x");
 			} else {
+				smbios_entry_point_size = 15;
+				memcpy(smbios_entry_point, buf, 15);
 				dmi_ver = (buf[14] & 0xF0) << 4 |
 					   (buf[14] & 0x0F);
 				pr_info("Legacy DMI %d.%d present.\n",
@@ -535,6 +546,8 @@ static int __init dmi_smbios3_present(const u8 *buf)
 		dmi_ver &= 0xFFFFFF;
 		dmi_len = get_unaligned_le32(buf + 12);
 		dmi_base = get_unaligned_le64(buf + 16);
+		smbios_entry_point_size = buf[6];
+		memcpy(smbios_entry_point, buf, smbios_entry_point_size);
 
 		/*
 		 * The 64-bit SMBIOS 3.0 entry point no longer has a field
@@ -638,6 +651,95 @@ void __init dmi_scan_machine(void)
 	dmi_initialized = 1;
 }
 
+static ssize_t smbios_entry_point_read(struct file *filp,
+				       struct kobject *kobj,
+				       struct bin_attribute *bin_attr,
+				       char *buf, loff_t pos, size_t count)
+{
+	ssize_t size;
+
+	size = bin_attr->size;
+
+	if (size > pos)
+		size -= pos;
+	else
+		return 0;
+
+	if (count < size)
+		size = count;
+
+	memcpy(buf, &smbios_entry_point[pos], size);
+
+	return size;
+}
+
+static ssize_t dmi_table_read(struct file *filp,
+			      struct kobject *kobj,
+			      struct bin_attribute *bin_attr,
+			      char *buf, loff_t pos, size_t count)
+{
+	ssize_t size;
+
+	size = bin_attr->size;
+
+	if (size > pos)
+		size -= pos;
+	else
+		return 0;
+
+	if (count < size)
+		size = count;
+
+	memcpy(buf, &dmi_tb[pos], size);
+
+	return size;
+}
+
+BIN_ATTR_RO(dmi_table, 0);
+BIN_ATTR_RO(smbios_entry_point, 0);
+
+/*
+ * Register the dmi subsystem under the firmware subsystem
+ */
+static int __init dmisubsys_init(void)
+{
+	int ret = -ENOMEM;
+
+	if (!smbios_entry_point_size || !dmi_available) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	/* Set up dmi directory at /sys/firmware/dmi */
+	dmi_kobj = kobject_create_and_add("dmi", firmware_kobj);
+	if (!dmi_kobj)
+		goto err;
+
+	bin_attr_smbios_entry_point.size = smbios_entry_point_size;
+	ret = sysfs_create_bin_file(dmi_kobj, &bin_attr_smbios_entry_point);
+	if (ret)
+		goto err;
+
+	if (!dmi_tb) {
+		dmi_tb = dmi_remap(dmi_base, dmi_len);
+		if (!dmi_tb)
+			goto err;
+	}
+
+	bin_attr_dmi_table.size = dmi_len;
+	ret = sysfs_create_bin_file(dmi_kobj, &bin_attr_dmi_table);
+	if (ret)
+		goto err;
+
+	return 0;
+err:
+	pr_err("dmi: Firmware registration failed.\n");
+	kobject_del(dmi_kobj);
+	kobject_put(dmi_kobj);
+	return ret;
+}
+subsys_initcall(dmisubsys_init);
+
 /**
  * dmi_set_dump_stack_arch_desc - set arch description for dump_stack()
  *
@@ -897,18 +999,17 @@ EXPORT_SYMBOL(dmi_get_date);
 int dmi_walk(void (*decode)(const struct dmi_header *, void *),
 	     void *private_data)
 {
-	u8 *buf;
-
 	if (!dmi_available)
 		return -1;
 
-	buf = dmi_remap(dmi_base, dmi_len);
-	if (buf == NULL)
-		return -1;
+	if (!dmi_tb) {
+		dmi_tb = dmi_remap(dmi_base, dmi_len);
+		if (!dmi_tb)
+			return -1;
+	}
 
-	dmi_table(buf, decode, private_data);
+	dmi_table(dmi_tb, decode, private_data);
 
-	dmi_unmap(buf);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(dmi_walk);
diff --git a/include/linux/dmi.h b/include/linux/dmi.h
index f820f0a..316293e 100644
--- a/include/linux/dmi.h
+++ b/include/linux/dmi.h
@@ -93,6 +93,7 @@ struct dmi_dev_onboard {
 	int devfn;
 };
 
+extern struct kobject *dmi_kobj;
 extern int dmi_check_system(const struct dmi_system_id *list);
 const struct dmi_system_id *dmi_first_match(const struct dmi_system_id *list);
 extern const char * dmi_get_system_info(int field);
@@ -112,6 +113,7 @@ extern void dmi_memdev_name(u16 handle, const char **bank, const char **device);
 
 #else
 
+extern struct kobject *dmi_kobj;
 static inline int dmi_check_system(const struct dmi_system_id *list) { return 0; }
 static inline const char * dmi_get_system_info(int field) { return NULL; }
 static inline const struct dmi_device * dmi_find_device(int type, const char *name,
-- 
1.9.1

^ permalink raw reply related

* Re: [dmidecode] [Patch v4] firmware: dmi-sysfs: add SMBIOS entry point area attribute
From: Ivan.khoronzhuk @ 2015-03-16 21:02 UTC (permalink / raw)
  To: Matt Fleming, Jean Delvare
  Cc: Ivan Khoronzhuk, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	ard.biesheuvel-QSEj5FYQhm4dnm+yROfE0A,
	grant.likely-QSEj5FYQhm4dnm+yROfE0A,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-doc-u79uwXL29TY76Z2rM5mHXA,
	dmidecode-devel-qX2TKyscuCcdnm+yROfE0A,
	leif.lindholm-QSEj5FYQhm4dnm+yROfE0A,
	msalter-H+wXaHxf7aLQT0dZR+AlfA
In-Reply-To: <1426162975.2784.31.camel-ZqTwcBeJ+wsBof6jY8KHXm7IUlhRatedral2JQCrhuEAvxtiuMwx3w@public.gmane.org>



On 12.03.15 14:22, Matt Fleming wrote:
> On Tue, 2015-03-10 at 10:13 +0100, Jean Delvare wrote:
>>> If Matt is OK to get another version,
>>> Let it be smbios_entry_point.
>>> If it's more convenient, it should be changed while it's possible.
>> Great, thanks.
> Ivan, please go ahead and submit a new version, we've got time to get
> this right for v4.1.
>

Matt, I've sent new patch that replaces this one.
"[Patch] firmware: dmi_scan: split dmisubsystem from dmi-sysfs"
It can take a while to go via review.

-- 
Regards,
Ivan Khoronzhuk

^ permalink raw reply

* Re: [PATCH v7 1/5] vfs: Prepare for adding a new preadv/pwritev with user flags.
From: Andreas Dilger @ 2015-03-16 21:05 UTC (permalink / raw)
  To: Milosz Tanski
  Cc: linux-kernel, Christoph Hellwig, linux-fsdevel, linux-aio,
	Mel Gorman, Volker Lendecke, Tejun Heo, Jeff Moyer,
	Theodore Ts'o, Al Viro, linux-api, Michael Kerrisk,
	linux-arch, Dave Chinner, Andrew Morton
In-Reply-To: <cbe6f9910ed4603916c1082c47944c96330d4a15.1426528417.git.milosz@adfin.com>

On Mar 16, 2015, at 12:27 PM, Milosz Tanski <milosz@adfin.com> wrote:
> 
> Plumbing the flags argument through the vfs code so they can be passed
> down to __generic_file_(read/write)_iter functions that do the actual work.
> 
> Signed-off-by: Milosz Tanski <milosz@adfin.com>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
> 
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 8e1b687..b53bb59 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -711,7 +711,8 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
> EXPORT_SYMBOL(iov_shorten);
> 
> static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iovec *iov,
> -		unsigned long nr_segs, size_t len, loff_t *ppos, iter_fn_t fn)
> +		unsigned long nr_segs, size_t len, loff_t *ppos, iter_fn_t fn,
> +		int flags)

Using "int flags" as an argument is too generic IMHO.  We have sooo many
different "int flags" arguments, but there is no easy way to figure out
which flags are being used.  A better solution is to declare a named enum:

enum iov_iter {
	RWF_NONBLOCK = 0x00000001,	/* only access pages in cache */
};

and use "enum iov_iter flags" as the function argument (or even "iter_flags"
if you wanted to make it that much easier to understand).  That makes
it immediately clear to the reader and the compiler what the valid flag
values are here, and it works with tags, etc.

Thoughts?

Cheers, Andreas


> {
> 	struct kiocb kiocb;
> 	struct iov_iter iter;
> @@ -720,6 +721,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iove
> 	init_sync_kiocb(&kiocb, filp);
> 	kiocb.ki_pos = *ppos;
> 	kiocb.ki_nbytes = len;
> +	kiocb.ki_rwflags = flags;
> 
> 	iov_iter_init(&iter, rw, iov, nr_segs, len);
> 	ret = fn(&kiocb, &iter);
> @@ -858,7 +860,8 @@ out:
> 
> static ssize_t do_readv_writev(int type, struct file *file,
> 			       const struct iovec __user * uvector,
> -			       unsigned long nr_segs, loff_t *pos)
> +			       unsigned long nr_segs, loff_t *pos,
> +			       int flags)
> {
> 	size_t tot_len;
> 	struct iovec iovstack[UIO_FASTIOV];
> @@ -892,7 +895,7 @@ static ssize_t do_readv_writev(int type, struct file *file,
> 
> 	if (iter_fn)
> 		ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
> -						pos, iter_fn);
> +						pos, iter_fn, flags);
> 	else if (fnv)
> 		ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
> 						pos, fnv);
> @@ -915,27 +918,27 @@ out:
> }
> 
> ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
> -		  unsigned long vlen, loff_t *pos)
> +		  unsigned long vlen, loff_t *pos, int flags)
> {
> 	if (!(file->f_mode & FMODE_READ))
> 		return -EBADF;
> 	if (!(file->f_mode & FMODE_CAN_READ))
> 		return -EINVAL;
> 
> -	return do_readv_writev(READ, file, vec, vlen, pos);
> +	return do_readv_writev(READ, file, vec, vlen, pos, flags);
> }
> 
> EXPORT_SYMBOL(vfs_readv);
> 
> ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
> -		   unsigned long vlen, loff_t *pos)
> +		   unsigned long vlen, loff_t *pos, int flags)
> {
> 	if (!(file->f_mode & FMODE_WRITE))
> 		return -EBADF;
> 	if (!(file->f_mode & FMODE_CAN_WRITE))
> 		return -EINVAL;
> 
> -	return do_readv_writev(WRITE, file, vec, vlen, pos);
> +	return do_readv_writev(WRITE, file, vec, vlen, pos, flags);
> }
> 
> EXPORT_SYMBOL(vfs_writev);
> @@ -948,7 +951,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
> 
> 	if (f.file) {
> 		loff_t pos = file_pos_read(f.file);
> -		ret = vfs_readv(f.file, vec, vlen, &pos);
> +		ret = vfs_readv(f.file, vec, vlen, &pos, 0);
> 		if (ret >= 0)
> 			file_pos_write(f.file, pos);
> 		fdput_pos(f);
> @@ -968,7 +971,7 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
> 
> 	if (f.file) {
> 		loff_t pos = file_pos_read(f.file);
> -		ret = vfs_writev(f.file, vec, vlen, &pos);
> +		ret = vfs_writev(f.file, vec, vlen, &pos, 0);
> 		if (ret >= 0)
> 			file_pos_write(f.file, pos);
> 		fdput_pos(f);
> @@ -1000,7 +1003,7 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
> 	if (f.file) {
> 		ret = -ESPIPE;
> 		if (f.file->f_mode & FMODE_PREAD)
> -			ret = vfs_readv(f.file, vec, vlen, &pos);
> +			ret = vfs_readv(f.file, vec, vlen, &pos, 0);
> 		fdput(f);
> 	}
> 
> @@ -1024,7 +1027,7 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
> 	if (f.file) {
> 		ret = -ESPIPE;
> 		if (f.file->f_mode & FMODE_PWRITE)
> -			ret = vfs_writev(f.file, vec, vlen, &pos);
> +			ret = vfs_writev(f.file, vec, vlen, &pos, 0);
> 		fdput(f);
> 	}
> 
> @@ -1072,7 +1075,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
> 
> 	if (iter_fn)
> 		ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
> -						pos, iter_fn);
> +						pos, iter_fn, 0);
> 	else if (fnv)
> 		ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
> 						pos, fnv);
> diff --git a/fs/splice.c b/fs/splice.c
> index 7968da9..ee3fd4c 100644
> --- a/fs/splice.c
> +++ b/fs/splice.c
> @@ -576,7 +576,7 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
> 	old_fs = get_fs();
> 	set_fs(get_ds());
> 	/* The cast to a user pointer is valid due to the set_fs() */
> -	res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos);
> +	res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0);
> 	set_fs(old_fs);
> 
> 	return res;
> diff --git a/include/linux/aio.h b/include/linux/aio.h
> index d9c92da..9c1d499 100644
> --- a/include/linux/aio.h
> +++ b/include/linux/aio.h
> @@ -52,6 +52,8 @@ struct kiocb {
> 	 * this is the underlying eventfd context to deliver events to.
> 	 */
> 	struct eventfd_ctx	*ki_eventfd;
> +
> +	int			ki_rwflags;
> };
> 
> static inline bool is_sync_kiocb(struct kiocb *kiocb)
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index b4d71b5..c018335 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1619,9 +1619,9 @@ extern ssize_t __vfs_read(struct file *, char __user *, size_t, loff_t *);
> extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
> extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
> extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
> -		unsigned long, loff_t *);
> +		unsigned long, loff_t *, int);
> extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
> -		unsigned long, loff_t *);
> +		unsigned long, loff_t *, int);
> 
> struct super_operations {
>    	struct inode *(*alloc_inode)(struct super_block *sb);
> -- 
> 1.9.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Cheers, Andreas





--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply

* Re: [PATCH] fstests: generic test for preadv2 behavior on linux
From: Andreas Dilger @ 2015-03-16 21:07 UTC (permalink / raw)
  To: Milosz Tanski
  Cc: linux-kernel, Christoph Hellwig, linux-fsdevel, linux-aio,
	Mel Gorman, Volker Lendecke, Tejun Heo, Jeff Moyer,
	Theodore Ts'o, Al Viro, linux-api, Michael Kerrisk,
	linux-arch, Dave Chinner, Andrew Morton
In-Reply-To: <1426530862-32276-1-git-send-email-milosz@adfin.com>


> On Mar 16, 2015, at 12:34 PM, Milosz Tanski <milosz@adfin.com> wrote:
> 
> preadv2 is a new syscall introduced that is like preadv2 but with flag

Sorry, "preadv2 ... is like preadv2"?

> argument. The first use case of this is to let us add a flag to perform a
> non-blocking file using the page cache.

This is also missing a Signed-off-by: line.

Cheers, Andreas
> ---
> src/Makefile           |   2 +-
> src/preadv2-pwritev2.h |  52 +++++++++++++++++
> src/preadv2.c          | 150 +++++++++++++++++++++++++++++++++++++++++++++++++
> tests/generic/067      |  85 ++++++++++++++++++++++++++++
> tests/generic/067.out  |   9 +++
> tests/generic/group    |   1 +
> 6 files changed, 298 insertions(+), 1 deletion(-)
> create mode 100644 src/preadv2-pwritev2.h
> create mode 100644 src/preadv2.c
> create mode 100755 tests/generic/067
> create mode 100644 tests/generic/067.out
> 
> diff --git a/src/Makefile b/src/Makefile
> index 4781736..f7d3681 100644
> --- a/src/Makefile
> +++ b/src/Makefile
> @@ -19,7 +19,7 @@ LINUX_TARGETS = xfsctl bstat t_mtab getdevicesize preallo_rw_pattern_reader \
> 	bulkstat_unlink_test_modified t_dir_offset t_futimens t_immutable \
> 	stale_handle pwrite_mmap_blocked t_dir_offset2 seek_sanity_test \
> 	seek_copy_test t_readdir_1 t_readdir_2 fsync-tester nsexec cloner \
> -	renameat2 t_getcwd e4compact
> +	renameat2 t_getcwd e4compact preadv2
> 
> SUBDIRS =
> 
> diff --git a/src/preadv2-pwritev2.h b/src/preadv2-pwritev2.h
> new file mode 100644
> index 0000000..786e524
> --- /dev/null
> +++ b/src/preadv2-pwritev2.h
> @@ -0,0 +1,52 @@
> +#ifndef PREADV2_PWRITEV2_H
> +#define PREADV2_PWRITEV2_H
> +
> +#include "global.h"
> +
> +#ifndef HAVE_PREADV2
> +#include <sys/syscall.h>
> +
> +#if !defined(SYS_preadv2) && defined(__x86_64__)
> +#define SYS_preadv2 323
> +#define SYS_pwritev2 324
> +#endif
> +
> +#if !defined (SYS_preadv2) && defined(__i386__)
> +#define SYS_preadv2 359
> +#define SYS_pwritev2 360
> +#endif
> +
> +/* LO_HI_LONG taken from glibc */
> +#define LO_HI_LONG(val)							\
> +  (off_t) val,                                                          \
> +  (off_t) ((((uint64_t) (val)) >> (sizeof (long) * 4)) >> (sizeof (long) * 4))
> +
> +static inline ssize_t
> +preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags)
> +{
> +#ifdef SYS_preadv2
> +        return syscall(SYS_preadv2, fd, iov, iovcnt, LO_HI_LONG(offset),
> +		       flags);
> +#else
> +	errno = ENOSYS;
> +	return -1;
> +#endif
> +}
> +
> +static inline ssize_t
> +pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags)
> +{
> +#ifdef SYS_pwritev2
> +        return syscall(SYS_pwritev2, fd, iov, iovcnt, LO_HI_LONG(offset),
> +		       flags);
> +#else
> +	errno = ENOSYS;
> +	return -1;
> +#endif
> +}
> +
> +#define RWF_NONBLOCK	0x00000001
> +#define RWF_DSYNC	0x00000002
> +
> +#endif /* HAVE_PREADV2 */
> +#endif /* PREADV2_PWRITEV2_H */
> diff --git a/src/preadv2.c b/src/preadv2.c
> new file mode 100644
> index 0000000..a4f89b5
> --- /dev/null
> +++ b/src/preadv2.c
> @@ -0,0 +1,150 @@
> +/*
> + * Copyright 2014 Red Hat, Inc.  All rights reserved.
> + * Copyright 2015 Milosz Tanski
> + *
> + * License: GPLv2
> + *
> + */
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <getopt.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <errno.h>
> +#include <linux/fs.h> /* for RWF_NONBLOCK */
> +
> +/*
> + * Once preadv2 is part of the upstream kernel and there is glibc support for
> + * it. We'll add support for preadv2 to xfs_io and this will be unnecessary.
> + */
> +#include "preadv2-pwritev2.h"
> +
> +/*
> + * Test to see if the system call is implemented.  If -EINVAL or -ENOSYS
> + * are returned, consider the call unimplemented.  All other errors are
> + * considered success.
> + *
> + * Returns: 0 if the system call is implemented, 1 if the system call
> + * is not implemented.
> + */
> +int
> +preadv2_check(int fd)
> +{
> +	int ret;
> +	struct iovec iov[] = {};
> +
> +	/* 0 length read; just check if the syscall is there.
> +         *
> +         * - 0 length iovec
> +         * - Position is -1 (eg. use current position)
> +         */
> +	ret = preadv2(fd, iov, 0, -1, 0);
> +
> +	if (ret < 0) {
> +		if (errno == ENOSYS || errno == EINVAL)
> +			return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +void
> +usage(char *prog)
> +{
> +	fprintf(stderr, "Usage: %s [-v] [-ctdw] [-n] -p POS -l LEN <filename>\n\n", prog);
> +	fprintf(stderr, "General arguments:\n");
> +	fprintf(stderr, "  -v Verify that the syscall is supported and quit:\n");
> +	fprintf(stderr, "\n");
> +	fprintf(stderr, "Open arguments:\n");
> +	fprintf(stderr, "  -c Open file with O_CREAT flag\n");
> +	fprintf(stderr, "  -t Open file with O_TRUNC flag\n");
> +	fprintf(stderr, "  -d Open file with O_DIRECT flag\n");
> +	fprintf(stderr, "  -w Open file with O_RDWR flag vs O_RDONLY (default)\n");
> +	fprintf(stderr, "\n");
> +	fprintf(stderr, "preadv2 arguments:\n");
> +	fprintf(stderr, "  -n use RWF_NONBLOCK when performing read\n");
> +	fprintf(stderr, "  -p POS offset file to read at\n");
> +	fprintf(stderr, "  -l LEN length of file data to read\n");
> +	fprintf(stderr, "\n");
> +	fflush(stderr);
> +}
> +
> +int
> +main(int argc, char **argv)
> +{
> +	int fd;
> +	int ret;
> +	int opt;
> +	off_t pos = -1;
> +	struct iovec iov = { NULL, 0 };
> +	int o_flags = 0;
> +	int r_flags = 0;
> +	char *filename;
> +
> +	while ((opt = getopt(argc, argv, "vctdwnp:l:")) != -1) {
> +		switch (opt) {
> +		case 'v':
> +			/*
> +			 * See if we were called to check for availability of
> +			 * sys_preadv2. STDIN is okay, since we do a zero
> +			 * length read (see man 2 read).
> +			 */
> +			ret = preadv2_check(STDIN_FILENO);
> +			exit(ret);
> +		case 'c':
> +			o_flags |= O_CREAT;
> +			break;
> +		case 't':
> +			o_flags |= O_TRUNC;
> +			break;
> +		case 'd':
> +			o_flags |= O_DIRECT;
> +			break;
> +		case 'w':
> +			o_flags |= O_RDWR;
> +			break;
> +		case 'n':
> +			r_flags |= RWF_NONBLOCK;
> +			break;
> +		case 'p':
> +			pos = atoll(optarg);
> +			break;
> +		case 'l':
> +			iov.iov_len = atoll(optarg);
> +			break;
> +		default:
> +			fprintf(stderr, "invalid option: %c\n", opt);
> +			usage(argv[0]);
> +			exit(1);
> +		}
> +	}
> +
> +	if (optind >= argc) {
> +		usage(argv[0]);
> +		exit(1);
> +	}
> +
> +	if ((o_flags & O_RDWR) != O_RDWR)
> +		o_flags |= O_RDONLY;
> +
> +	if ((iov.iov_base = malloc(iov.iov_len)) == NULL) {
> +		perror("malloc");
> +		exit(1);
> +	}
> +
> +	filename = argv[optind];
> +	fd = open(filename, o_flags);
> +
> +	if (fd < 0) {
> +		perror("open");
> +		exit(1);
> +	}
> +
> +	if ((ret = preadv2(fd, &iov, 1, pos, r_flags)) == -1) {
> +		perror("preadv2");
> +		exit(ret);
> +	}
> +
> +	free(iov.iov_base);
> +	exit(0);
> +}
> diff --git a/tests/generic/067 b/tests/generic/067
> new file mode 100755
> index 0000000..4cc58f8
> --- /dev/null
> +++ b/tests/generic/067
> @@ -0,0 +1,85 @@
> +#! /bin/bash
> +# FS QA Test No. 067
> +#
> +# Test for the preadv2 syscall
> +#
> +#-----------------------------------------------------------------------
> +# Copyright (c) 2015 Milosz Tanski <mtanski@gmail.com>.  All Rights Reserved.
> +#
> +# This program is free software; you can redistribute it and/or
> +# modify it under the terms of the GNU General Public License as
> +# published by the Free Software Foundation.
> +#
> +# This program is distributed in the hope that it would be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +# GNU General Public License for more details.
> +#
> +# You should have received a copy of the GNU General Public License
> +# along with this program; if not, write the Free Software Foundation,
> +# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
> +#-----------------------------------------------------------------------
> +#
> +
> +seq=`basename $0`
> +seqres=$RESULT_DIR/$seq
> +echo "QA output created by $seq"
> +
> +here=`pwd`
> +tmp=/tmp/$$
> +status=1	# failure is the default!
> +trap "_cleanup; exit \$status" 0 1 2 3 15
> +
> +_cleanup()
> +{
> +    cd /
> +    rm -f $tmp.*
> +}
> +
> +# get standard environment, filters and checks
> +. ./common/rc
> +. ./common/filter
> +
> +# real QA test starts here
> +
> +# Modify as appropriate.
> +_supported_fs generic
> +_supported_os Linux
> +_require_test
> +
> +# test file we'll be using
> +file=$SCRATCH_MNT/067.preadv2.$$
> +
> +# Create a file:
> +# two regions of data and a hole in the middle
> +# use O_DIRECT so it's not in the page cache
> +echo "create file"
> +$XFS_IO_PROG -t -f -d \
> +	-c "pwrite 0 1024" \
> +	-c "pwrite 2048 1024" \
> +	$file > /dev/null
> +
> +# Make sure it returns EAGAIN on uncached data
> +echo "uncached"
> +$here/src/preadv2 -n -p 0 -l 1024 $file
> +
> +# Make sure we read in the whole file, after that RWF_NONBLOCK should return us all the data
> +echo "cached"
> +$XFS_IO_PROG -f $file -c "pread 0 4096" $file > /dev/null
> +$here/src/preadv2 -n -p 0 -l 1024 $file
> +
> +# O_DIRECT and RWF_NONBLOCK should return EAGAIN always
> +echo "O_DIRECT"
> +$here/src/preadv2 -d -n -p 0 -l 1024 $file
> +
> +# Holes do not block
> +echo "holes"
> +$here/src/preadv2 -n -p 2048 -l 1024 $file
> +
> +# EOF behavior (no EAGAIN)
> +echo "EOF"
> +$here/src/preadv2 -n -p 3072 -l 1 $file
> +
> +# success, all done
> +status=0
> +exit
> diff --git a/tests/generic/067.out b/tests/generic/067.out
> new file mode 100644
> index 0000000..6e3740f
> --- /dev/null
> +++ b/tests/generic/067.out
> @@ -0,0 +1,9 @@
> +QA output created by 067
> +create file
> +uncached
> +preadv2: Resource temporarily unavailable
> +cached
> +O_DIRECT
> +preadv2: Resource temporarily unavailable
> +holes
> +EOF
> diff --git a/tests/generic/group b/tests/generic/group
> index e5db772..91c5870 100644
> --- a/tests/generic/group
> +++ b/tests/generic/group
> @@ -69,6 +69,7 @@
> 064 auto quick prealloc
> 065 metadata auto quick
> 066 metadata auto quick
> +067 auto quick rw
> 068 other auto freeze dangerous stress
> 069 rw udf auto quick
> 070 attr udf auto quick stress
> -- 
> 1.9.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Cheers, Andreas





--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply

* Re: [PATCH v2 0/7] CLONE_FD: Task exit notification via file descriptor
From: Kees Cook @ 2015-03-16 21:44 UTC (permalink / raw)
  To: Josh Triplett
  Cc: Al Viro, Andrew Morton, Andy Lutomirski, Ingo Molnar,
	Oleg Nesterov, Paul E. McKenney, H. Peter Anvin, Rik van Riel,
	Thomas Gleixner, Michael Kerrisk, Thiago Macieira, LKML,
	Linux API, linux-fsdevel@vger.kernel.org, x86@kernel.org
In-Reply-To: <cover.1426376419.git.josh@joshtriplett.org>

On Sun, Mar 15, 2015 at 12:59 AM, Josh Triplett <josh@joshtriplett.org> wrote:
> This patch series introduces a new clone flag, CLONE_FD, which lets the caller
> receive child process exit notification via a file descriptor rather than
> SIGCHLD.  CLONE_FD makes it possible for libraries to safely launch and manage
> child processes on behalf of their caller, *without* taking over process-wide
> SIGCHLD handling (either via signal handler or signalfd).
>
> Note that signalfd for SIGCHLD does not suffice here, because that still
> receives notification for all child processes, and interferes with process-wide
> signal handling.
>
> The CLONE_FD file descriptor uniquely identifies a process on the system in a
> race-free way, by holding a reference to the task_struct.  In the future, we
> may introduce APIs that support using process file descriptors instead of PIDs.
>
> This patch series also introduces a clone flag CLONE_AUTOREAP, which causes the
> kernel to automatically reap the child process when it exits, just as it does
> for processes using SIGCHLD when the parent has SIGCHLD ignored or marked as
> SA_NOCLDWAIT.
>
> Taken together, a library can launch a process with CLONE_FD, CLONE_AUTOREAP,
> and no exit signal, and completely avoid affecting either process-wide signal
> handling or an existing child wait loop.
>
> Introducing CLONE_FD and CLONE_AUTOREAP required two additional bits of yak
> shaving: Since clone has no more usable flags (with the three currently unused
> flags unusable because old kernels ignore them without EINVAL), also introduce
> a new clone4 system call with more flag bits and an extensible argument
> structure.  And since the magic pt_regs-based syscall argument processing for
> clone's tls argument would otherwise prevent introducing a sane clone4 system
> call, fix that too.
>
> I tested the CLONE_SETTLS changes with a thread-local storage test program (two
> threads independently reading and writing a __thread variable), on both 32-bit
> and 64-bit, and I observed no issues there.
>
> I tested clone4 and the new flags with several additional test programs,
> launching either a process or thread (in the former case using syscall(), in
> the latter case by calling clone4 via assembly and returning to C), sleeping in
> parent and child to test the case of either exiting first, and then printing
> the received clone4_info structure.
>
> Changes in v2:
> - Split out autoreaping into a separate CLONE_AUTOREAP.  CLONE_FD no longer
>   implies autoreaping and no exit signal, and CLONE_AUTOREAP does not affect
>   ptracers or signal handling.  Thanks to Oleg Nesterov for careful
>   investigation and discussion on v1.
> - Accept O_CLOEXEC and O_NONBLOCK via a clonefd_flags parameter in clone4_args.
>   Stop overloading the low byte of the main clone flags, since CLONE_FD now
>   works with a non-zero signal.
> - Return the file descriptor via an out parameter in clone4_args.
> - Drop patch to export alloc_fd; CLONE_FD now uses the next available file
>   descriptor, even if that's 0-2, since clone4 no longer needs to avoid
>   ambiguity with the 0 return indicating the child process.
> - Make poll on a CLONE_FD for an exited task also return POLLHUP, for
>   compatibility with FreeBSD's pdfork.  Thanks to David Drysdale for calling
>   attention to pdfork.

I think POLLHUP should be mentioned in the manpage (now it only
mentions POLLIN).

> - Fix typo in squelch_clone_flags.
> - Pass arguments to _do_fork and copy_process as a structure.
> - Construct the 64-bit flags in a separate variable, rather than inline in the
>   call to do_fork.
> - Fix error return for copy_from_user faults.
> - Add the new syscall to asm-generic.
> - Add ack from Andy Lutomirski to patches 1 and 2.
>
> I've included the manpages patch at the end of this series.  (Note that the
> manpage documents the behavior of the future glibc wrapper as well as the raw
> syscall.)  Here's a formatted plain-text version of the manpage for reference:
>
> CLONE4(2)                  Linux Programmer's Manual                 CLONE4(2)
>
>
>
> NAME
>        clone4 - create a child process
>
> SYNOPSIS
>        /* Prototype for the glibc wrapper function */
>
>        #define _GNU_SOURCE
>        #include <sched.h>
>
>        int clone4(uint64_t flags,
>                   size_t args_size,
>                   struct clone4_args *args,
>                   int (*fn)(void *), void *arg);
>
>        /* Prototype for the raw system call */
>
>        int clone4(unsigned flags_high, unsigned flags_low,
>                   unsigned long args_size,
>                   struct clone4_args *args);
>
>        struct clone4_args {
>            pid_t *ptid;
>            pid_t *ctid;
>            unsigned long stack_start;
>            unsigned long stack_size;
>            unsigned long tls;
>            int *clonefd;
>            unsigned clonefd_flags;
>        };
>
>
> DESCRIPTION
>        clone4()  creates  a  new  process,  similar  to  clone(2) and fork(2).
>        clone4() supports additional flags that clone(2) does not, and  accepts
>        arguments via an extensible structure.
>
>        args  points to a clone4_args structure, and args_size must contain the
>        size of that structure, as understood by the  caller.   If  the  caller
>        passes  a  shorter  structure  than  the  kernel expects, the remaining
>        fields will default to 0.  If the caller passes a larger structure than
>        the  kernel  expects  (such  as one from a newer kernel), clone4() will
>        return EINVAL.  The clone4_args structure may gain additional fields at
>        the  end  in  the future, and callers must only pass a size that encom‐
>        passes the number of fields they understand.  If the  caller  passes  0
>        for args_size, args is ignored and may be NULL.
>
>        In  the clone4_args structure, ptid, ctid, stack_start, stack_size, and
>        tls have the same semantics as they do with clone(2) and clone2(2).
>
>        In the glibc wrapper, fn and arg have the same  semantics  as  they  do
>        with clone(2).  As with clone(2), the underlying system call works more
>        like fork(2), returning 0 in the child process; the glibc wrapper  sim‐
>        plifies  thread execution by calling fn(arg) and exiting the child when
>        that function exits.
>
>        The 64-bit  flags  argument  (split  into  the  32-bit  flags_high  and
>        flags_low  arguments  in  the  kernel  interface for portability across
>        architectures) accepts all the same flags as clone(2), with the  excep‐
>        tion  of the obsolete CLONE_PID, CLONE_DETACHED, and CLONE_STOPPED.  In
>        addition, flags accepts the following flags:
>
>
>        CLONE_AUTOREAP
>               When the new process exits, immediately  reap  it,  rather  than
>               keeping  it  around  as a "zombie" until a call to waitpid(2) or
>               similar.  Without this flag, the kernel will automatically  reap
>               a  process if its exit signal is set to SIGCHLD, and if the par‐
>               ent process has SIGCHLD set to SIG_IGN or has a SIGCHLD  handler
>               installed  with SA_NOCLDWAIT (see sigaction(2)).  CLONE_AUTOREAP
>               allows the calling process to enable automatic reaping  with  an
>               exit  signal other than SIGCHLD (including 0 to disable the exit
>               signal), and does not depend on the  configuration  of  process-
>               wide signal handling.
>
>
>        CLONE_FD
>               Return  a file descriptor associated with the new process, stor‐
>               ing it in location clonefd in the parent's address space.   When
>               the new process exits, the file descriptor will become available
>               for reading.
>
>               Unlike using  signalfd(2)  for  the  SIGCHLD  signal,  the  file
>               descriptor  returned  by  clone4()  with the CLONE_FD flag works
>               even with SIGCHLD unblocked in one or more threads of the parent
>               process,  allowing  the  process  to have different handlers for
>               different child processes, such as those created by  a  library,
>               without  introducing  race conditions around process-wide signal
>               handling.
>
>               clonefd_flags may contain the following additional flags for use
>               with CLONE_FD:
>
>
>               O_CLOEXEC
>                      Set  the  close-on-exec  flag on the new file descriptor.
>                      See the description of the O_CLOEXEC flag in open(2)  for
>                      reasons why this may be useful.

This begs the question: what happens when all CLONE_FD fds for a
process are closed? Will the parent get SIGCHLD instead, will it
auto-reap, or will it be un-wait-able (I assume not this...)

>
>
>               O_NONBLOCK
>                      Set  the  O_NONBLOCK  flag  on  the  new file descriptor.
>                      Using this flag saves extra calls to fcntl(2) to  achieve
>                      the same result.
>
>
>               The returned file descriptor supports the following operations:
>
>               read(2) (and similar)
>                      When  the  new  process  exits,  reading  from  the  file
>                      descriptor produces a single clonefd_info structure:
>
>                      struct clonefd_info {
>                          uint32_t code;   /* Signal code */
>                          uint32_t status; /* Exit status or signal */
>                          uint64_t utime;  /* User CPU time */
>                          uint64_t stime;  /* System CPU time */
>                      };
>
>
>                      If the new process has not  yet  exited,  read(2)  either
>                      blocks  until  it does, or fails with the error EAGAIN if
>                      the file descriptor has O_NONBLOCK set.
>
>                      Future kernels may extend clonefd_info by appending addi‐
>                      tional  fields  to  the end.  Callers should read as many
>                      bytes as they understand; unread data will be  discarded,
>                      and  subsequent  reads  after  the first will return 0 to
>                      indicate end-of-file.  Callers requesting more bytes than
>                      the  kernel  provides  (such as callers expecting a newer
>                      clonefd_info structure) will receive a shorter  structure
>                      from older kernels.
>
>               poll(2), select(2), epoll(7) (and similar)
>                      The  file  descriptor  is readable (the select(2) readfds
>                      argument; the poll(2) POLLIN flag) if the new process has
>                      exited.
>
>               close(2)
>                      When  the file descriptor is no longer required it should
>                      be closed.
>
>
>    C library/kernel ABI differences
>        As with clone(2), the raw clone4() system call corresponds more closely
>        to  fork(2)  in that execution in the child continues from the point of
>        the call.
>
>        Unlike clone(2), the raw system call  interface  for  clone4()  accepts
>        arguments in the same order on all architectures.
>
>        The  raw  system call accepts flags as two 32-bit arguments, flags_high
>        and flags_low, to simplify portability across 32-bit and 64-bit  archi‐
>        tectures and calling conventions.  The glibc wrapper accepts flags as a
>        single 64-bit argument for convenience.
>
>
> RETURN VALUE
>        For the glibc wrapper, on success, clone4() returns the new process  ID
>        to the calling process, and the new process begins running at the spec‐
>        ified function.
>
>        For the raw syscall, on success, clone4() returns the new process ID to
>        the calling process, and returns 0 in the new process.
>
>        On failure, clone4() returns -1 and sets errno accordingly.
>
>
> ERRORS
>        clone4()  can  return any error from clone(2), as well as the following
>        additional errors:
>
>        EFAULT args is outside your accessible address space.
>
>        EINVAL flags contained an unknown flag.
>
>        EINVAL flags included CLONE_FD and clonefd_flags contained  an  unknown
>               flag.
>
>        EINVAL flags  included  CLONE_FD, but the kernel configuration does not
>               have the CONFIG_CLONEFD option enabled.
>
>        EMFILE flags included CLONE_FD,  but  the  new  file  descriptor  would
>               exceed the process limit on open file descriptors.
>
>        ENFILE flags  included  CLONE_FD,  but  the  new  file descriptor would
>               exceed the system-wide limit on open file descriptors.
>
>        ENODEV flags included  CLONE_FD,  but  clone4()  could  not  mount  the
>               (internal) anonymous inode device.
>
>
> CONFORMING TO
>        clone4()  is Linux-specific and should not be used in programs intended
>        to be portable.
>
>
> SEE ALSO
>        clone(2), epoll(7), poll(2), pthreads(7), read(2), select(2)
>
>
>
> Linux                             2015-03-14                         CLONE4(2)
>
>
> Josh Triplett and Thiago Macieira (7):
>   clone: Support passing tls argument via C rather than pt_regs magic
>   x86: Opt into HAVE_COPY_THREAD_TLS, for both 32-bit and 64-bit
>   Introduce a new clone4 syscall with more flag bits and extensible arguments
>   kernel/fork.c: Pass arguments to _do_fork and copy_process using clone4_args
>   clone4: Add a CLONE_AUTOREAP flag to automatically reap the child process
>   signal: Factor out a helper function to process task_struct exit_code
>   clone4: Add a CLONE_FD flag to get task exit notification via fd
>
>  arch/Kconfig                      |   7 ++
>  arch/x86/Kconfig                  |   1 +
>  arch/x86/ia32/ia32entry.S         |   3 +-
>  arch/x86/kernel/entry_64.S        |   1 +
>  arch/x86/kernel/process_32.c      |   6 +-
>  arch/x86/kernel/process_64.c      |   8 +--
>  arch/x86/syscalls/syscall_32.tbl  |   1 +
>  arch/x86/syscalls/syscall_64.tbl  |   2 +
>  include/linux/compat.h            |  14 ++++
>  include/linux/sched.h             |  22 ++++++
>  include/linux/syscalls.h          |   6 +-
>  include/uapi/asm-generic/unistd.h |   4 +-
>  include/uapi/linux/sched.h        |  55 ++++++++++++++-
>  init/Kconfig                      |  21 ++++++
>  kernel/Makefile                   |   1 +
>  kernel/clonefd.c                  | 121 ++++++++++++++++++++++++++++++++
>  kernel/clonefd.h                  |  32 +++++++++
>  kernel/exit.c                     |   4 ++
>  kernel/fork.c                     | 142 ++++++++++++++++++++++++++++++--------
>  kernel/signal.c                   |  26 ++++---
>  kernel/sys_ni.c                   |   1 +
>  21 files changed, 426 insertions(+), 52 deletions(-)
>  create mode 100644 kernel/clonefd.c
>  create mode 100644 kernel/clonefd.h
>
> --
> 2.1.4
>

Looks promising!

-Kees

-- 
Kees Cook
Chrome OS Security
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [v9 1/5] vfs: adds general codes to enforces project quota limits
From: Dave Chinner @ 2015-03-16 21:49 UTC (permalink / raw)
  To: Jan Kara
  Cc: Li Xi, linux-fsdevel, linux-ext4, linux-api, tytso, adilger, viro,
	hch, dmonakhov, dchinner
In-Reply-To: <20150316142944.GN4934@quack.suse.cz>

On Mon, Mar 16, 2015 at 03:29:44PM +0100, Jan Kara wrote:
> On Wed 11-03-15 12:03:19, Li Xi wrote:
> > This patch adds support for a new quota type PRJQUOTA for project quota
> > enforcement. Also a new method get_projid() is added into dquot_operations
> > structure.
> > 
> > Signed-off-by: Li Xi <lixi@ddn.com>
> > Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
> > Reviewed-by: Jan Kara <jack@suse.cz>
> ...
> > diff --git a/fs/quota/quota.c b/fs/quota/quota.c
> > index 2aa4151..c76b350 100644
> > --- a/fs/quota/quota.c
> > +++ b/fs/quota/quota.c
> > @@ -30,11 +30,15 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
> >  	case Q_XGETQSTATV:
> >  	case Q_XQUOTASYNC:
> >  		break;
> > -	/* allow to query information for dquots we "own" */
> > +	/*
> > +	 * allow to query information for dquots we "own"
> > +	 * always allow querying project quota
> > +	 */
> >  	case Q_GETQUOTA:
> >  	case Q_XGETQUOTA:
> >  		if ((type == USRQUOTA && uid_eq(current_euid(), make_kuid(current_user_ns(), id))) ||
> > -		    (type == GRPQUOTA && in_egroup_p(make_kgid(current_user_ns(), id))))
> > +		    (type == GRPQUOTA && in_egroup_p(make_kgid(current_user_ns(), id))) ||
> > +		    (type == PRJQUOTA))
> >  			break;
>   I wanted to merge this patch but this hunk caught my eye. Why do we
> suddenly allow querying of project quotas? Traditionally that has been
> allowed only with CAP_SYS_ADMIN. I agree it looks too restrictive to me but
> unless that's a bug, I think we have to adhere to original behavior and
> drop this hunk. Dave, was that behavior of project quotas intended? 

This is for quota reports, right?

Project quotas are managed by the administrator as individual users
may not even have access to all the files under a project and hence
often cannot do anything about running out of quota space. i.e. users
don't own project quotas like they "own" user and group quotas.
user/group quotas imply the user has permission to access/modify the
files within the quota, whereas that is not true of project quotas.

e.g. Think about a project that compartmentalises information along
user access bounds: even if a user can't access parts of the project
quota space, allowing them to query the accounting of space used by
the project is leaking information about how much data there is in
the project they can't access....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply

* [PATCH v7 tip 0/8] tracing: attach eBPF programs to kprobes
From: Alexei Starovoitov @ 2015-03-16 21:49 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api, netdev, linux-kernel

Hi Steven, Peter,

since there were no more comments after last _notrace thread. I'm assuming
the rest looks ok? Please ack. Thanks!

V6->V7:
- rebase and remove confusing _notrace suffix from preempt_disable/enable
  everything else unchanged

V5->V6:
- added simple recursion check to trace_call_bpf()
- added tracex4 example that does kmem_cache_alloc/free tracking.
  It remembers every allocated object in a map and user space periodically
  prints a set of old objects. With more work it can be made into
  simple kmemleak detector.
  It was used as a test of recursive kmalloc/kfree: attached to
  kprobe/__kmalloc and let the program call kmalloc again.

V4->V5:
- switched to ktime_get_mono_fast_ns() as suggested by Peter
- in libbpf.c fixed zero init of 'union bpf_attr' padding
- fresh rebase on tip/master

This is targeting 'tip' tree, since most of the changes are perf_event related.
There will be a small conflict between net-next and tip, since they both
add new bpf_prog_type (BPF_PROG_TYPE_SCHED_CLS and BPF_PROG_TYPE_KPROBE).

V3 discussion:
https://lkml.org/lkml/2015/2/9/738

V3->V4:
- since the boundary of stable ABI in bpf+tracepoints is not clear yet,
  I've dropped them for now.
- bpf+syscalls are ok from stable ABI point of view, but bpf+seccomp
  would want to do very similar analysis of syscalls, so I've dropped
  them as well to take time and define common bpf+syscalls and bpf+seccomp
  infra in the future.
- so only bpf+kprobes left. kprobes by definition is not a stable ABI,
  so bpf+kprobe is not stable ABI either. To stress on that point added
  kernel version attribute that user space must pass along with the program
  and kernel will reject programs when version code doesn't match.
  So bpf+kprobe is very similar to kernel modules, but unlike modules
  version check is not used for safety, but for enforcing 'non-ABI-ness'.
  (version check doesn't apply to bpf+sockets which are stable)

Patch 1 is in net-next and needs to be in tip too, since patch 2 depends on it.

Patch 2 actually adds bpf+kprobe infra:
programs receive 'struct pt_regs' on input and can walk data structures
using bpf_probe_read() helper which is a wrapper of probe_kernel_read()

Programs are attached to kprobe events via API:

prog_fd = bpf_prog_load(...);
struct perf_event_attr attr = {
  .type = PERF_TYPE_TRACEPOINT,
  .config = event_id, /* ID of just created kprobe event */
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);

Patch 3 adds bpf_ktime_get_ns() helper function, so that bpf programs can
measure time delta between events to compute disk io latency, etc.

Patch 4 adds bpf_trace_printk() helper that is used to debug programs.
When bpf verifier sees that program is calling bpf_trace_printk() it inits
trace_printk buffers which emits nasty 'this is debug only' banner.
That's exactly what we want. bpf_trace_printk() is for debugging only.

Patch 5 sample code that shows how to use bpf_probe_read/bpf_trace_printk

Patch 6 sample code - combination of kfree_skb and sys_write tracing.

Patch 7 sample code that computes disk io latency and prints it as 'heatmap'

Interesting bit is that patch 6 has log2() function implemented in C
and patch 7 has another log2() function using different algorithm in C.
In the future if 'log2' usage becomes common, we can add it as in-kernel
helper function, but for now bpf programs can implement them on bpf side.

Another interesting bit from patch 7 is that it does approximation of
floating point log10(X)*10 using integer arithmetic, which demonstrates
the power of C->BPF vs traditional tracing language alternatives,
where one would need to introduce new helper functions to add functionality,
whereas bpf can just implement such things in C as part of the program.

Next step is to prototype TCP stack instrumentation (like web10g) using
bpf+kprobe, but without adding any new code to the tcp stack.
Though kprobes are slow compared to tracepoints, they are good enough
for prototyping and trace_marker/debug_tracepoint ideas can accelerate
them in the future.

Alexei Starovoitov (7):
  tracing: attach BPF programs to kprobes
  tracing: allow BPF programs to call bpf_ktime_get_ns()
  tracing: allow BPF programs to call bpf_trace_printk()
  samples: bpf: simple non-portable kprobe filter example
  samples: bpf: counting example for kfree_skb and write syscall
  samples: bpf: IO latency analysis (iosnoop/heatmap)
  samples: bpf: kmem_alloc/free tracker

Daniel Borkmann (1):
  bpf: make internal bpf API independent of CONFIG_BPF_SYSCALL ifdefs

 include/linux/bpf.h             |   20 +++-
 include/linux/ftrace_event.h    |   14 +++
 include/uapi/linux/bpf.h        |    5 +
 include/uapi/linux/perf_event.h |    1 +
 kernel/bpf/syscall.c            |    7 +-
 kernel/events/core.c            |   59 ++++++++++++
 kernel/trace/Makefile           |    1 +
 kernel/trace/bpf_trace.c        |  198 +++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_kprobe.c     |   10 +-
 samples/bpf/Makefile            |   16 ++++
 samples/bpf/bpf_helpers.h       |    6 ++
 samples/bpf/bpf_load.c          |  125 ++++++++++++++++++++++--
 samples/bpf/bpf_load.h          |    3 +
 samples/bpf/libbpf.c            |   14 ++-
 samples/bpf/libbpf.h            |    5 +-
 samples/bpf/sock_example.c      |    2 +-
 samples/bpf/test_verifier.c     |    2 +-
 samples/bpf/tracex1_kern.c      |   50 ++++++++++
 samples/bpf/tracex1_user.c      |   25 +++++
 samples/bpf/tracex2_kern.c      |   86 +++++++++++++++++
 samples/bpf/tracex2_user.c      |   95 +++++++++++++++++++
 samples/bpf/tracex3_kern.c      |   89 ++++++++++++++++++
 samples/bpf/tracex3_user.c      |  150 +++++++++++++++++++++++++++++
 samples/bpf/tracex4_kern.c      |   54 +++++++++++
 samples/bpf/tracex4_user.c      |   69 ++++++++++++++
 25 files changed, 1088 insertions(+), 18 deletions(-)
 create mode 100644 kernel/trace/bpf_trace.c
 create mode 100644 samples/bpf/tracex1_kern.c
 create mode 100644 samples/bpf/tracex1_user.c
 create mode 100644 samples/bpf/tracex2_kern.c
 create mode 100644 samples/bpf/tracex2_user.c
 create mode 100644 samples/bpf/tracex3_kern.c
 create mode 100644 samples/bpf/tracex3_user.c
 create mode 100644 samples/bpf/tracex4_kern.c
 create mode 100644 samples/bpf/tracex4_user.c

-- 
1.7.9.5

^ permalink raw reply

* [PATCH v7 tip 1/8] bpf: make internal bpf API independent of CONFIG_BPF_SYSCALL ifdefs
From: Alexei Starovoitov @ 2015-03-16 21:49 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1426542584-9406-1-git-send-email-ast@plumgrid.com>

From: Daniel Borkmann <daniel@iogearbox.net>

Socket filter code and other subsystems with upcoming eBPF support should
not need to deal with the fact that we have CONFIG_BPF_SYSCALL defined or
not.

Having the bpf syscall as a config option is a nice thing and I'd expect
it to stay that way for expert users (I presume one day the default setting
of it might change, though), but code making use of it should not care if
it's actually enabled or not.

Instead, hide this via header files and let the rest deal with it.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 include/linux/bpf.h |   20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bbfceb756452..c2e21113ecc0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -113,8 +113,6 @@ struct bpf_prog_type_list {
 	enum bpf_prog_type type;
 };
 
-void bpf_register_prog_type(struct bpf_prog_type_list *tl);
-
 struct bpf_prog;
 
 struct bpf_prog_aux {
@@ -129,11 +127,25 @@ struct bpf_prog_aux {
 };
 
 #ifdef CONFIG_BPF_SYSCALL
+void bpf_register_prog_type(struct bpf_prog_type_list *tl);
+
 void bpf_prog_put(struct bpf_prog *prog);
+struct bpf_prog *bpf_prog_get(u32 ufd);
 #else
-static inline void bpf_prog_put(struct bpf_prog *prog) {}
+static inline void bpf_register_prog_type(struct bpf_prog_type_list *tl)
+{
+}
+
+static inline struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void bpf_prog_put(struct bpf_prog *prog)
+{
+}
 #endif
-struct bpf_prog *bpf_prog_get(u32 ufd);
+
 /* verify correctness of eBPF program */
 int bpf_check(struct bpf_prog *fp, union bpf_attr *attr);
 
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v7 tip 2/8] tracing: attach BPF programs to kprobes
From: Alexei Starovoitov @ 2015-03-16 21:49 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1426542584-9406-1-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>

User interface:
struct perf_event_attr attr = {.type = PERF_TYPE_TRACEPOINT, .config = event_id, ...};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);

prog_fd is a file descriptor associated with BPF program previously loaded.
event_id is an ID of created kprobe

close(event_fd) - automatically detaches BPF program from it

BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any kernel
  data structures

BPF programs receive 'struct pt_regs *' as an input
('struct pt_regs' is architecture dependent)

Note, kprobes are _not_ a stable kernel ABI, so bpf programs attached to
kprobes must be recompiled for every kernel version and user must supply correct
LINUX_VERSION_CODE in attr.kern_version during bpf_prog_load() call.

Signed-off-by: Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
---
 include/linux/ftrace_event.h    |   14 +++++
 include/uapi/linux/bpf.h        |    3 +
 include/uapi/linux/perf_event.h |    1 +
 kernel/bpf/syscall.c            |    7 ++-
 kernel/events/core.c            |   59 +++++++++++++++++++
 kernel/trace/Makefile           |    1 +
 kernel/trace/bpf_trace.c        |  119 +++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_kprobe.c     |   10 +++-
 8 files changed, 212 insertions(+), 2 deletions(-)
 create mode 100644 kernel/trace/bpf_trace.c

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index c674ee8f7fca..0aa535bc9f05 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -13,6 +13,7 @@ struct trace_array;
 struct trace_buffer;
 struct tracer;
 struct dentry;
+struct bpf_prog;
 
 struct trace_print_flags {
 	unsigned long		mask;
@@ -252,6 +253,7 @@ enum {
 	TRACE_EVENT_FL_WAS_ENABLED_BIT,
 	TRACE_EVENT_FL_USE_CALL_FILTER_BIT,
 	TRACE_EVENT_FL_TRACEPOINT_BIT,
+	TRACE_EVENT_FL_KPROBE_BIT,
 };
 
 /*
@@ -265,6 +267,7 @@ enum {
  *                     it is best to clear the buffers that used it).
  *  USE_CALL_FILTER - For ftrace internal events, don't use file filter
  *  TRACEPOINT    - Event is a tracepoint
+ *  KPROBE        - Event is a kprobe
  */
 enum {
 	TRACE_EVENT_FL_FILTERED		= (1 << TRACE_EVENT_FL_FILTERED_BIT),
@@ -274,6 +277,7 @@ enum {
 	TRACE_EVENT_FL_WAS_ENABLED	= (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT),
 	TRACE_EVENT_FL_USE_CALL_FILTER	= (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT),
 	TRACE_EVENT_FL_TRACEPOINT	= (1 << TRACE_EVENT_FL_TRACEPOINT_BIT),
+	TRACE_EVENT_FL_KPROBE		= (1 << TRACE_EVENT_FL_KPROBE_BIT),
 };
 
 struct ftrace_event_call {
@@ -303,6 +307,7 @@ struct ftrace_event_call {
 #ifdef CONFIG_PERF_EVENTS
 	int				perf_refcount;
 	struct hlist_head __percpu	*perf_events;
+	struct bpf_prog			*prog;
 
 	int	(*perf_perm)(struct ftrace_event_call *,
 			     struct perf_event *);
@@ -548,6 +553,15 @@ event_trigger_unlock_commit_regs(struct ftrace_event_file *file,
 		event_triggers_post_call(file, tt);
 }
 
+#ifdef CONFIG_BPF_SYSCALL
+unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx);
+#else
+static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+{
+	return 1;
+}
+#endif
+
 enum {
 	FILTER_OTHER = 0,
 	FILTER_STATIC_STRING,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 45da7ec7d274..4486d36d2e9e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -118,6 +118,7 @@ enum bpf_map_type {
 enum bpf_prog_type {
 	BPF_PROG_TYPE_UNSPEC,
 	BPF_PROG_TYPE_SOCKET_FILTER,
+	BPF_PROG_TYPE_KPROBE,
 };
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
@@ -151,6 +152,7 @@ union bpf_attr {
 		__u32		log_level;	/* verbosity level of verifier */
 		__u32		log_size;	/* size of user buffer */
 		__aligned_u64	log_buf;	/* user supplied buffer */
+		__u32		kern_version;	/* checked when type=kprobe */
 	};
 } __attribute__((aligned(8)));
 
@@ -162,6 +164,7 @@ enum bpf_func_id {
 	BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */
 	BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
 	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
+	BPF_FUNC_probe_read,      /* int bpf_probe_read(void *dst, int size, void *src) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 3c8b45de57ec..ad4dade2a502 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -382,6 +382,7 @@ struct perf_event_attr {
 #define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
 #define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)
 #define PERF_EVENT_IOC_ID		_IOR('$', 7, __u64 *)
+#define PERF_EVENT_IOC_SET_BPF		_IOW('$', 8, __u32)
 
 enum perf_event_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 536edc2be307..504c10b990ef 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -16,6 +16,7 @@
 #include <linux/file.h>
 #include <linux/license.h>
 #include <linux/filter.h>
+#include <linux/version.h>
 
 static LIST_HEAD(bpf_map_types);
 
@@ -467,7 +468,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD log_buf
+#define	BPF_PROG_LOAD_LAST_FIELD kern_version
 
 static int bpf_prog_load(union bpf_attr *attr)
 {
@@ -492,6 +493,10 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (attr->insn_cnt >= BPF_MAXINSNS)
 		return -EINVAL;
 
+	if (type == BPF_PROG_TYPE_KPROBE &&
+	    attr->kern_version != LINUX_VERSION_CODE)
+		return -EINVAL;
+
 	/* plain bpf_prog allocation */
 	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
 	if (!prog)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2709063eb26b..3a45e7f6b2df 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -42,6 +42,8 @@
 #include <linux/module.h>
 #include <linux/mman.h>
 #include <linux/compat.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
 
 #include "internal.h"
 
@@ -3402,6 +3404,7 @@ errout:
 }
 
 static void perf_event_free_filter(struct perf_event *event);
+static void perf_event_free_bpf_prog(struct perf_event *event);
 
 static void free_event_rcu(struct rcu_head *head)
 {
@@ -3411,6 +3414,7 @@ static void free_event_rcu(struct rcu_head *head)
 	if (event->ns)
 		put_pid_ns(event->ns);
 	perf_event_free_filter(event);
+	perf_event_free_bpf_prog(event);
 	kfree(event);
 }
 
@@ -3923,6 +3927,7 @@ static inline int perf_fget_light(int fd, struct fd *p)
 static int perf_event_set_output(struct perf_event *event,
 				 struct perf_event *output_event);
 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
 
 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
 {
@@ -3976,6 +3981,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
 	case PERF_EVENT_IOC_SET_FILTER:
 		return perf_event_set_filter(event, (void __user *)arg);
 
+	case PERF_EVENT_IOC_SET_BPF:
+		return perf_event_set_bpf_prog(event, arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -6436,6 +6444,49 @@ static void perf_event_free_filter(struct perf_event *event)
 	ftrace_profile_free_filter(event);
 }
 
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+	struct bpf_prog *prog;
+
+	if (event->attr.type != PERF_TYPE_TRACEPOINT)
+		return -EINVAL;
+
+	if (event->tp_event->prog)
+		return -EEXIST;
+
+	if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
+		/* bpf programs can only be attached to kprobes */
+		return -EINVAL;
+
+	prog = bpf_prog_get(prog_fd);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	if (prog->aux->prog_type != BPF_PROG_TYPE_KPROBE) {
+		/* valid fd, but invalid bpf program type */
+		bpf_prog_put(prog);
+		return -EINVAL;
+	}
+
+	event->tp_event->prog = prog;
+
+	return 0;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+	struct bpf_prog *prog;
+
+	if (!event->tp_event)
+		return;
+
+	prog = event->tp_event->prog;
+	if (prog) {
+		event->tp_event->prog = NULL;
+		bpf_prog_put(prog);
+	}
+}
+
 #else
 
 static inline void perf_tp_register(void)
@@ -6451,6 +6502,14 @@ static void perf_event_free_filter(struct perf_event *event)
 {
 }
 
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+	return -ENOENT;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+}
 #endif /* CONFIG_EVENT_TRACING */
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 98f26588255e..c575a300103b 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_trace.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
 obj-$(CONFIG_TRACEPOINTS) += power-traces.o
 ifeq ($(CONFIG_PM),y)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
new file mode 100644
index 000000000000..ba95b131082c
--- /dev/null
+++ b/kernel/trace/bpf_trace.c
@@ -0,0 +1,119 @@
+/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/uaccess.h>
+#include "trace.h"
+
+static DEFINE_PER_CPU(int, bpf_prog_active);
+
+unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+{
+	unsigned int ret;
+	int cpu;
+
+	if (in_nmi()) /* not supported yet */
+		return 1;
+
+	preempt_disable();
+
+	cpu = raw_smp_processor_id();
+	if (unlikely(per_cpu(bpf_prog_active, cpu)++ != 0)) {
+		/* since some bpf program is already running on this cpu,
+		 * don't call into another bpf program (same or different)
+		 * and don't send kprobe event into ring-buffer,
+		 * so return zero here
+		 */
+		ret = 0;
+		goto out;
+	}
+
+	rcu_read_lock();
+	ret = BPF_PROG_RUN(prog, ctx);
+	rcu_read_unlock();
+
+ out:
+	per_cpu(bpf_prog_active, cpu)--;
+	preempt_enable();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(trace_call_bpf);
+
+static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	void *dst = (void *) (long) r1;
+	int size = (int) r2;
+	void *unsafe_ptr = (void *) (long) r3;
+
+	return probe_kernel_read(dst, unsafe_ptr, size);
+}
+
+static struct bpf_func_proto kprobe_prog_funcs[] = {
+	[BPF_FUNC_probe_read] = {
+		.func = bpf_probe_read,
+		.gpl_only = true,
+		.ret_type = RET_INTEGER,
+		.arg1_type = ARG_PTR_TO_STACK,
+		.arg2_type = ARG_CONST_STACK_SIZE,
+		.arg3_type = ARG_ANYTHING,
+	},
+};
+
+static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	default:
+		if (func_id < 0 || func_id >= ARRAY_SIZE(kprobe_prog_funcs))
+			return NULL;
+		return &kprobe_prog_funcs[func_id];
+	}
+}
+
+/* bpf+kprobe programs can access fields of 'struct pt_regs' */
+static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+	/* check bounds */
+	if (off < 0 || off >= sizeof(struct pt_regs))
+		return false;
+
+	/* only read is allowed */
+	if (type != BPF_READ)
+		return false;
+
+	/* disallow misaligned access */
+	if (off % size != 0)
+		return false;
+
+	return true;
+}
+
+static struct bpf_verifier_ops kprobe_prog_ops = {
+	.get_func_proto = kprobe_prog_func_proto,
+	.is_valid_access = kprobe_prog_is_valid_access,
+};
+
+static struct bpf_prog_type_list kprobe_tl = {
+	.ops = &kprobe_prog_ops,
+	.type = BPF_PROG_TYPE_KPROBE,
+};
+
+static int __init register_kprobe_prog_ops(void)
+{
+	bpf_register_prog_type(&kprobe_tl);
+	return 0;
+}
+late_initcall(register_kprobe_prog_ops);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index d73f565b4e06..dc3462507d7c 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1134,11 +1134,15 @@ static void
 kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 {
 	struct ftrace_event_call *call = &tk->tp.call;
+	struct bpf_prog *prog = call->prog;
 	struct kprobe_trace_entry_head *entry;
 	struct hlist_head *head;
 	int size, __size, dsize;
 	int rctx;
 
+	if (prog && !trace_call_bpf(prog, regs))
+		return;
+
 	head = this_cpu_ptr(call->perf_events);
 	if (hlist_empty(head))
 		return;
@@ -1165,11 +1169,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 		    struct pt_regs *regs)
 {
 	struct ftrace_event_call *call = &tk->tp.call;
+	struct bpf_prog *prog = call->prog;
 	struct kretprobe_trace_entry_head *entry;
 	struct hlist_head *head;
 	int size, __size, dsize;
 	int rctx;
 
+	if (prog && !trace_call_bpf(prog, regs))
+		return;
+
 	head = this_cpu_ptr(call->perf_events);
 	if (hlist_empty(head))
 		return;
@@ -1286,7 +1294,7 @@ static int register_kprobe_event(struct trace_kprobe *tk)
 		kfree(call->print_fmt);
 		return -ENODEV;
 	}
-	call->flags = 0;
+	call->flags = TRACE_EVENT_FL_KPROBE;
 	call->class->reg = kprobe_register;
 	call->data = tk;
 	ret = trace_add_event_call(call);
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v7 tip 3/8] tracing: allow BPF programs to call bpf_ktime_get_ns()
From: Alexei Starovoitov @ 2015-03-16 21:49 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1426542584-9406-1-git-send-email-ast@plumgrid.com>

bpf_ktime_get_ns() is used by programs to compute the time delta between
events or as a timestamp.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 include/uapi/linux/bpf.h |    1 +
 kernel/trace/bpf_trace.c |   11 +++++++++++
 2 files changed, 12 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4486d36d2e9e..101e509d1001 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -165,6 +165,7 @@ enum bpf_func_id {
 	BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
 	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
 	BPF_FUNC_probe_read,      /* int bpf_probe_read(void *dst, int size, void *src) */
+	BPF_FUNC_ktime_get_ns,    /* u64 bpf_ktime_get_ns(void) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ba95b131082c..74eb6abda6a1 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -56,6 +56,12 @@ static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 	return probe_kernel_read(dst, unsafe_ptr, size);
 }
 
+static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	/* NMI safe access to clock monotonic */
+	return ktime_get_mono_fast_ns();
+}
+
 static struct bpf_func_proto kprobe_prog_funcs[] = {
 	[BPF_FUNC_probe_read] = {
 		.func = bpf_probe_read,
@@ -65,6 +71,11 @@ static struct bpf_func_proto kprobe_prog_funcs[] = {
 		.arg2_type = ARG_CONST_STACK_SIZE,
 		.arg3_type = ARG_ANYTHING,
 	},
+	[BPF_FUNC_ktime_get_ns] = {
+		.func = bpf_ktime_get_ns,
+		.gpl_only = true,
+		.ret_type = RET_INTEGER,
+	},
 };
 
 static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v7 tip 4/8] tracing: allow BPF programs to call bpf_trace_printk()
From: Alexei Starovoitov @ 2015-03-16 21:49 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1426542584-9406-1-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>

Debugging of BPF programs needs some form of printk from the program,
so let programs call a limited trace_printk() that accepts only the
%d %u %x %p conversion specifiers (optionally with an 'l' or 'll' length
modifier on the integer ones).

Similar to kernel modules, during program load the verifier checks whether the
program is calling bpf_trace_printk() and, if so, the kernel allocates
trace_printk buffers and emits a big 'this is debug only' banner.

Signed-off-by: Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
---
 include/uapi/linux/bpf.h |    1 +
 kernel/trace/bpf_trace.c |   68 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 101e509d1001..4095f3d9a716 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -166,6 +166,7 @@ enum bpf_func_id {
 	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
 	BPF_FUNC_probe_read,      /* int bpf_probe_read(void *dst, int size, void *src) */
 	BPF_FUNC_ktime_get_ns,    /* u64 bpf_ktime_get_ns(void) */
+	BPF_FUNC_trace_printk,    /* int bpf_trace_printk(const char *fmt, int fmt_size, ...) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 74eb6abda6a1..a22763a4d2e2 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -62,6 +62,60 @@ static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 	return ktime_get_mono_fast_ns();
 }
 
+/* limited trace_printk()
+ * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed
+ */
+static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
+{
+	char *fmt = (char *) (long) r1;
+	int fmt_cnt = 0;
+	bool mod_l[3] = {};
+	int i;
+
+	/* bpf_check() guarantees that fmt points to bpf program stack and
+	 * fmt_size bytes of it were initialized by bpf program
+	 */
+	if (fmt[fmt_size - 1] != 0)
+		return -EINVAL;
+
+	/* check format string for allowed specifiers */
+	for (i = 0; i < fmt_size; i++)
+		if (fmt[i] == '%') {
+			if (fmt_cnt >= 3)
+				return -EINVAL;
+			i++;
+			if (i >= fmt_size)
+				return -EINVAL;
+
+			if (fmt[i] == 'l') {
+				mod_l[fmt_cnt] = true;
+				i++;
+				if (i >= fmt_size)
+					return -EINVAL;
+			} else if (fmt[i] == 'p') {
+				mod_l[fmt_cnt] = true;
+				fmt_cnt++;
+				continue;
+			}
+
+			if (fmt[i] == 'l') {
+				mod_l[fmt_cnt] = true;
+				i++;
+				if (i >= fmt_size)
+					return -EINVAL;
+			}
+
+			if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x')
+				return -EINVAL;
+			fmt_cnt++;
+		}
+
+	return __trace_printk(1/* fake ip will not be printed */, fmt,
+			      mod_l[0] ? r3 : (u32) r3,
+			      mod_l[1] ? r4 : (u32) r4,
+			      mod_l[2] ? r5 : (u32) r5);
+}
+
 static struct bpf_func_proto kprobe_prog_funcs[] = {
 	[BPF_FUNC_probe_read] = {
 		.func = bpf_probe_read,
@@ -76,6 +130,13 @@ static struct bpf_func_proto kprobe_prog_funcs[] = {
 		.gpl_only = true,
 		.ret_type = RET_INTEGER,
 	},
+	[BPF_FUNC_trace_printk] = {
+		.func = bpf_trace_printk,
+		.gpl_only = true,
+		.ret_type = RET_INTEGER,
+		.arg1_type = ARG_PTR_TO_STACK,
+		.arg2_type = ARG_CONST_STACK_SIZE,
+	},
 };
 
 static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
@@ -90,6 +151,13 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 	default:
 		if (func_id < 0 || func_id >= ARRAY_SIZE(kprobe_prog_funcs))
 			return NULL;
+
+		if (func_id == BPF_FUNC_trace_printk)
+			/* this program might be calling bpf_trace_printk,
+			 * so allocate per-cpu printk buffers
+			 */
+			trace_printk_init_buffers();
+
 		return &kprobe_prog_funcs[func_id];
 	}
 }
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v7 tip 5/8] samples: bpf: simple non-portable kprobe filter example
From: Alexei Starovoitov @ 2015-03-16 21:49 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1426542584-9406-1-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>

tracex1_kern.c - C program compiled into BPF.
It attaches to kprobe:__netif_receive_skb_core.
When skb->dev->name == "lo", it prints a sample debug message into trace_pipe
via the bpf_trace_printk() helper function.

tracex1_user.c - corresponding user space component that:
- loads bpf program via bpf() syscall
- opens kprobes:__netif_receive_skb_core event via perf_event_open() syscall
- attaches the program to event via ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
- prints from trace_pipe

Note, this bpf program is completely non-portable. It must be recompiled
with current kernel headers. kprobe is not a stable ABI and bpf+kprobe scripts
may stop working any time.

bpf verifier will detect that it's using bpf_trace_printk() and kernel will
print warning banner:
** trace_printk() being used. Allocating extra memory.  **
**                                                      **
** This means that this is a DEBUG kernel and it is     **
** unsafe for production use.                           **

bpf_trace_printk() should be used for debugging of bpf program only.

Usage:
$ sudo tracex1
            ping-19826 [000] d.s2 63103.382648: : skb ffff880466b1ca00 len 84
            ping-19826 [000] d.s2 63103.382684: : skb ffff880466b1d300 len 84

            ping-19826 [000] d.s2 63104.382533: : skb ffff880466b1ca00 len 84
            ping-19826 [000] d.s2 63104.382594: : skb ffff880466b1d300 len 84

Signed-off-by: Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
---
 samples/bpf/Makefile        |    4 ++
 samples/bpf/bpf_helpers.h   |    6 +++
 samples/bpf/bpf_load.c      |  125 ++++++++++++++++++++++++++++++++++++++++---
 samples/bpf/bpf_load.h      |    3 ++
 samples/bpf/libbpf.c        |   14 ++++-
 samples/bpf/libbpf.h        |    5 +-
 samples/bpf/sock_example.c  |    2 +-
 samples/bpf/test_verifier.c |    2 +-
 samples/bpf/tracex1_kern.c  |   50 +++++++++++++++++
 samples/bpf/tracex1_user.c  |   25 +++++++++
 10 files changed, 224 insertions(+), 12 deletions(-)
 create mode 100644 samples/bpf/tracex1_kern.c
 create mode 100644 samples/bpf/tracex1_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index b5b3600dcdf5..51f6f01e5a3a 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -6,23 +6,27 @@ hostprogs-y := test_verifier test_maps
 hostprogs-y += sock_example
 hostprogs-y += sockex1
 hostprogs-y += sockex2
+hostprogs-y += tracex1
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
 sock_example-objs := sock_example.o libbpf.o
 sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
 sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
+tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
 always += sockex1_kern.o
 always += sockex2_kern.o
+always += tracex1_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
 HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
 HOSTLOADLIBES_sockex1 += -lelf
 HOSTLOADLIBES_sockex2 += -lelf
+HOSTLOADLIBES_tracex1 += -lelf
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index ca0333146006..1c872bcf5a80 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -15,6 +15,12 @@ static int (*bpf_map_update_elem)(void *map, void *key, void *value,
 	(void *) BPF_FUNC_map_update_elem;
 static int (*bpf_map_delete_elem)(void *map, void *key) =
 	(void *) BPF_FUNC_map_delete_elem;
+static int (*bpf_probe_read)(void *dst, int size, void *unsafe_ptr) =
+	(void *) BPF_FUNC_probe_read;
+static unsigned long long (*bpf_ktime_get_ns)(void) =
+	(void *) BPF_FUNC_ktime_get_ns;
+static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
+	(void *) BPF_FUNC_trace_printk;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 1831d236382b..95c106e4bcdb 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -8,29 +8,70 @@
 #include <unistd.h>
 #include <string.h>
 #include <stdbool.h>
+#include <stdlib.h>
 #include <linux/bpf.h>
 #include <linux/filter.h>
+#include <linux/perf_event.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <poll.h>
 #include "libbpf.h"
 #include "bpf_helpers.h"
 #include "bpf_load.h"
 
+#define DEBUGFS "/sys/kernel/debug/tracing/"
+
 static char license[128];
+static int kern_version;
 static bool processed_sec[128];
 int map_fd[MAX_MAPS];
 int prog_fd[MAX_PROGS];
+int event_fd[MAX_PROGS];
 int prog_cnt;
 
 static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 {
-	int fd;
 	bool is_socket = strncmp(event, "socket", 6) == 0;
-
-	if (!is_socket)
-		/* tracing events tbd */
+	bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
+	bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
+	enum bpf_prog_type prog_type;
+	char buf[256];
+	int fd, efd, err, id;
+	struct perf_event_attr attr = {};
+
+	attr.type = PERF_TYPE_TRACEPOINT;
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+
+	if (is_socket) {
+		prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+	} else if (is_kprobe || is_kretprobe) {
+		prog_type = BPF_PROG_TYPE_KPROBE;
+	} else {
+		printf("Unknown event '%s'\n", event);
 		return -1;
+	}
+
+	if (is_kprobe || is_kretprobe) {
+		if (is_kprobe)
+			event += 7;
+		else
+			event += 10;
+
+		snprintf(buf, sizeof(buf),
+			 "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events",
+			 is_kprobe ? 'p' : 'r', event, event);
+		err = system(buf);
+		if (err < 0) {
+			printf("failed to create kprobe '%s' error '%s'\n",
+			       event, strerror(errno));
+			return -1;
+		}
+	}
 
-	fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER,
-			   prog, size, license);
+	fd = bpf_prog_load(prog_type, prog, size, license, kern_version);
 
 	if (fd < 0) {
 		printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf);
@@ -39,6 +80,41 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 
 	prog_fd[prog_cnt++] = fd;
 
+	if (is_socket)
+		return 0;
+
+	strcpy(buf, DEBUGFS);
+	strcat(buf, "events/kprobes/");
+	strcat(buf, event);
+	strcat(buf, "/id");
+
+	efd = open(buf, O_RDONLY, 0);
+	if (efd < 0) {
+		printf("failed to open event %s\n", event);
+		return -1;
+	}
+
+	err = read(efd, buf, sizeof(buf));
+	if (err < 0 || err >= sizeof(buf)) {
+		printf("read from '%s' failed '%s'\n", event, strerror(errno));
+		return -1;
+	}
+
+	close(efd);
+
+	buf[err] = 0;
+	id = atoi(buf);
+	attr.config = id;
+
+	efd = perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
+	if (efd < 0) {
+		printf("event %d fd %d err %s\n", id, efd, strerror(errno));
+		return -1;
+	}
+	event_fd[prog_cnt - 1] = efd;
+	ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
+	ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
+
 	return 0;
 }
 
@@ -135,6 +211,9 @@ int load_bpf_file(char *path)
 	if (gelf_getehdr(elf, &ehdr) != &ehdr)
 		return 1;
 
+	/* clear all kprobes */
+	i = system("echo \"\" > /sys/kernel/debug/tracing/kprobe_events");
+
 	/* scan over all elf sections to get license and map info */
 	for (i = 1; i < ehdr.e_shnum; i++) {
 
@@ -149,6 +228,14 @@ int load_bpf_file(char *path)
 		if (strcmp(shname, "license") == 0) {
 			processed_sec[i] = true;
 			memcpy(license, data->d_buf, data->d_size);
+		} else if (strcmp(shname, "version") == 0) {
+			processed_sec[i] = true;
+			if (data->d_size != sizeof(int)) {
+				printf("invalid size of version section %zd\n",
+				       data->d_size);
+				return 1;
+			}
+			memcpy(&kern_version, data->d_buf, sizeof(int));
 		} else if (strcmp(shname, "maps") == 0) {
 			processed_sec[i] = true;
 			if (load_maps(data->d_buf, data->d_size))
@@ -178,7 +265,8 @@ int load_bpf_file(char *path)
 			if (parse_relo_and_apply(data, symbols, &shdr, insns))
 				continue;
 
-			if (memcmp(shname_prog, "events/", 7) == 0 ||
+			if (memcmp(shname_prog, "kprobe/", 7) == 0 ||
+			    memcmp(shname_prog, "kretprobe/", 10) == 0 ||
 			    memcmp(shname_prog, "socket", 6) == 0)
 				load_and_attach(shname_prog, insns, data_prog->d_size);
 		}
@@ -193,7 +281,8 @@ int load_bpf_file(char *path)
 		if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
 			continue;
 
-		if (memcmp(shname, "events/", 7) == 0 ||
+		if (memcmp(shname, "kprobe/", 7) == 0 ||
+		    memcmp(shname, "kretprobe/", 10) == 0 ||
 		    memcmp(shname, "socket", 6) == 0)
 			load_and_attach(shname, data->d_buf, data->d_size);
 	}
@@ -201,3 +290,23 @@ int load_bpf_file(char *path)
 	close(fd);
 	return 0;
 }
+
+void read_trace_pipe(void)
+{
+	int trace_fd;
+
+	trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
+	if (trace_fd < 0)
+		return;
+
+	while (1) {
+		static char buf[4096];
+		ssize_t sz;
+
+		sz = read(trace_fd, buf, sizeof(buf));
+		if (sz) {
+			buf[sz] = 0;
+			puts(buf);
+		}
+	}
+}
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 27789a34f5e6..cbd7c2b532b9 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -6,6 +6,7 @@
 
 extern int map_fd[MAX_MAPS];
 extern int prog_fd[MAX_PROGS];
+extern int event_fd[MAX_PROGS];
 
 /* parses elf file compiled by llvm .c->.o
  * . parses 'maps' section and creates maps via BPF syscall
@@ -21,4 +22,6 @@ extern int prog_fd[MAX_PROGS];
  */
 int load_bpf_file(char *path);
 
+void read_trace_pipe(void);
+
 #endif
diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c
index 46d50b7ddf79..7e1efa7e2ed7 100644
--- a/samples/bpf/libbpf.c
+++ b/samples/bpf/libbpf.c
@@ -81,7 +81,7 @@ char bpf_log_buf[LOG_BUF_SIZE];
 
 int bpf_prog_load(enum bpf_prog_type prog_type,
 		  const struct bpf_insn *insns, int prog_len,
-		  const char *license)
+		  const char *license, int kern_version)
 {
 	union bpf_attr attr = {
 		.prog_type = prog_type,
@@ -93,6 +93,11 @@ int bpf_prog_load(enum bpf_prog_type prog_type,
 		.log_level = 1,
 	};
 
+	/* assign one field outside of struct init to make sure any
+	 * padding is zero initialized
+	 */
+	attr.kern_version = kern_version;
+
 	bpf_log_buf[0] = 0;
 
 	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
@@ -121,3 +126,10 @@ int open_raw_sock(const char *name)
 
 	return sock;
 }
+
+int perf_event_open(struct perf_event_attr *attr, int pid, int cpu,
+		    int group_fd, unsigned long flags)
+{
+	return syscall(__NR_perf_event_open, attr, pid, cpu,
+		       group_fd, flags);
+}
diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
index 58c5fe1bdba1..ac7b09672b46 100644
--- a/samples/bpf/libbpf.h
+++ b/samples/bpf/libbpf.h
@@ -13,7 +13,7 @@ int bpf_get_next_key(int fd, void *key, void *next_key);
 
 int bpf_prog_load(enum bpf_prog_type prog_type,
 		  const struct bpf_insn *insns, int insn_len,
-		  const char *license);
+		  const char *license, int kern_version);
 
 #define LOG_BUF_SIZE 65536
 extern char bpf_log_buf[LOG_BUF_SIZE];
@@ -182,4 +182,7 @@ extern char bpf_log_buf[LOG_BUF_SIZE];
 /* create RAW socket and bind to interface 'name' */
 int open_raw_sock(const char *name);
 
+struct perf_event_attr;
+int perf_event_open(struct perf_event_attr *attr, int pid, int cpu,
+		    int group_fd, unsigned long flags);
 #endif
diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c
index c8ad0404416f..a0ce251c5390 100644
--- a/samples/bpf/sock_example.c
+++ b/samples/bpf/sock_example.c
@@ -56,7 +56,7 @@ static int test_sock(void)
 	};
 
 	prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, sizeof(prog),
-				"GPL");
+				"GPL", 0);
 	if (prog_fd < 0) {
 		printf("failed to load prog '%s'\n", strerror(errno));
 		goto cleanup;
diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c
index b96175e90363..740ce97cda5e 100644
--- a/samples/bpf/test_verifier.c
+++ b/samples/bpf/test_verifier.c
@@ -689,7 +689,7 @@ static int test(void)
 
 		prog_fd = bpf_prog_load(BPF_PROG_TYPE_UNSPEC, prog,
 					prog_len * sizeof(struct bpf_insn),
-					"GPL");
+					"GPL", 0);
 
 		if (tests[i].result == ACCEPT) {
 			if (prog_fd < 0) {
diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c
new file mode 100644
index 000000000000..42176fce4847
--- /dev/null
+++ b/samples/bpf/tracex1_kern.c
@@ -0,0 +1,50 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <linux/version.h>
+#include "bpf_helpers.h"
+
+#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;})
+
+/* kprobe is NOT a stable ABI
+ * kernel functions can be removed, renamed or completely change semantics.
+ * Number of arguments and their positions can change, etc.
+ * This bpf+kprobe example can stop working any time.
+ */
+SEC("kprobe/__netif_receive_skb_core")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	/* attaches to kprobe netif_receive_skb,
+	 * looks for packets on loopback device and prints them
+	 */
+	char devname[IFNAMSIZ] = {};
+	struct net_device *dev;
+	struct sk_buff *skb;
+	int len;
+
+	/* non-portable! works for the given kernel only */
+	skb = (struct sk_buff *) ctx->di;
+
+	dev = _(skb->dev);
+
+	len = _(skb->len);
+
+	bpf_probe_read(devname, sizeof(devname), dev->name);
+
+	if (devname[0] == 'l' && devname[1] == 'o') {
+		char fmt[] = "skb %p len %d\n";
+		/* using bpf_trace_printk() for DEBUG ONLY */
+		bpf_trace_printk(fmt, sizeof(fmt), skb, len);
+	}
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c
new file mode 100644
index 000000000000..31a48183beea
--- /dev/null
+++ b/samples/bpf/tracex1_user.c
@@ -0,0 +1,25 @@
+#include <stdio.h>
+#include <linux/bpf.h>
+#include <unistd.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+int main(int ac, char **argv)
+{
+	FILE *f;
+	char filename[256];
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	f = popen("taskset 1 ping -c5 localhost", "r");
+	(void) f;
+
+	read_trace_pipe();
+
+	return 0;
+}
-- 
1.7.9.5

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox