Linux userland API discussions

Linux userland API discussions
 help / color / mirror / Atom feed

* [PATCH v4 07/16] fs-verity: add the hook for file ->open()
From: Eric Biggers @ 2019-06-06 15:51 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Theodore Y . Ts'o, Darrick J . Wong, linux-api, Dave Chinner,
	linux-f2fs-devel, linux-fsdevel, Jaegeuk Kim, linux-integrity,
	linux-ext4, Linus Torvalds, Christoph Hellwig, Victor Hsieh
In-Reply-To: <20190606155205.2872-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Add the fsverity_file_open() function, which prepares an fs-verity file
to be read from.  If not already done, it loads the fs-verity descriptor
from the filesystem and sets up an fsverity_info structure for the inode
which describes the Merkle tree and contains the file measurement.  It
also denies all attempts to open verity files for writing.

This commit also begins the include/linux/fsverity.h header, which
declares the interface between fs/verity/ and filesystems.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/verity/Makefile           |   3 +-
 fs/verity/fsverity_private.h |  54 +++++-
 fs/verity/init.c             |   6 +
 fs/verity/open.c             | 325 +++++++++++++++++++++++++++++++++++
 include/linux/fsverity.h     |  71 ++++++++
 5 files changed, 456 insertions(+), 3 deletions(-)
 create mode 100644 fs/verity/open.c
 create mode 100644 include/linux/fsverity.h

diff --git a/fs/verity/Makefile b/fs/verity/Makefile
index 398f3f85fa18..e6a8951c493a 100644
--- a/fs/verity/Makefile
+++ b/fs/verity/Makefile
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
 
 obj-$(CONFIG_FS_VERITY) += hash_algs.o \
-			   init.o
+			   init.o \
+			   open.o
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index c879189bfd61..c7fc3c7fc714 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -15,8 +15,7 @@
 #define pr_fmt(fmt) "fs-verity: " fmt
 
 #include <crypto/sha.h>
-#include <linux/fs.h>
-#include <uapi/linux/fsverity.h>
+#include <linux/fsverity.h>
 
 struct ahash_request;
 
@@ -62,6 +61,40 @@ struct merkle_tree_params {
 	u64 level_start[FS_VERITY_MAX_LEVELS];
 };
 
+/**
+ * fsverity_info - cached verity metadata for an inode
+ *
+ * When a verity file is first opened, an instance of this struct is allocated
+ * and stored in ->i_verity_info; it remains until the inode is evicted.  It
+ * caches information about the Merkle tree that's needed to efficiently verify
+ * data read from the file.  It also caches the file measurement.  The Merkle
+ * tree pages themselves are not cached here, but the filesystem may cache them.
+ */
+struct fsverity_info {
+	struct merkle_tree_params tree_params;
+	u8 root_hash[FS_VERITY_MAX_DIGEST_SIZE];
+	u8 measurement[FS_VERITY_MAX_DIGEST_SIZE];
+	const struct inode *inode;
+};
+
+/*
+ * Merkle tree properties.  The file measurement is the hash of this structure.
+ */
+struct fsverity_descriptor {
+	__u8 version;		/* must be 1 */
+	__u8 hash_algorithm;	/* Merkle tree hash algorithm */
+	__u8 log_blocksize;	/* log2 of size of data and tree blocks */
+	__u8 salt_size;		/* size of salt in bytes; 0 if none */
+	__le32 sig_size;	/* reserved, must be 0 */
+	__le64 data_size;	/* size of file the Merkle tree is built over */
+	__u8 root_hash[64];	/* Merkle tree root hash */
+	__u8 salt[32];		/* salt prepended to each hashed block */
+	__u8 __reserved[144];	/* must be 0's */
+};
+
+/* Arbitrary limit to bound the kmalloc() size.  Can be changed. */
+#define FS_VERITY_MAX_DESCRIPTOR_SIZE	16384
+
 /* hash_algs.c */
 
 extern struct fsverity_hash_alg fsverity_hash_algs[];
@@ -88,4 +121,21 @@ fsverity_msg(const struct inode *inode, const char *level,
 #define fsverity_err(inode, fmt, ...)		\
 	fsverity_msg((inode), KERN_ERR, fmt, ##__VA_ARGS__)
 
+/* open.c */
+
+int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
+				     const struct inode *inode,
+				     unsigned int hash_algorithm,
+				     unsigned int log_blocksize,
+				     const u8 *salt, size_t salt_size);
+
+struct fsverity_info *fsverity_create_info(const struct inode *inode,
+					   const void *desc, size_t desc_size);
+
+void fsverity_set_info(struct inode *inode, struct fsverity_info *vi);
+
+void fsverity_free_info(struct fsverity_info *vi);
+
+int __init fsverity_init_info_cache(void);
+
 #endif /* _FSVERITY_PRIVATE_H */
diff --git a/fs/verity/init.c b/fs/verity/init.c
index 40076bbe452a..fff1fd634335 100644
--- a/fs/verity/init.c
+++ b/fs/verity/init.c
@@ -33,8 +33,14 @@ void fsverity_msg(const struct inode *inode, const char *level,
 
 static int __init fsverity_init(void)
 {
+	int err;
+
 	fsverity_check_hash_algs();
 
+	err = fsverity_init_info_cache();
+	if (err)
+		return err;
+
 	pr_debug("Initialized fs-verity\n");
 	return 0;
 }
diff --git a/fs/verity/open.c b/fs/verity/open.c
new file mode 100644
index 000000000000..a8a1261f0e30
--- /dev/null
+++ b/fs/verity/open.c
@@ -0,0 +1,325 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/verity/open.c: opening fs-verity files
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <linux/slab.h>
+
+static struct kmem_cache *fsverity_info_cachep;
+
+/**
+ * fsverity_init_merkle_tree_params() - initialize Merkle tree parameters
+ * @params: the parameters struct to initialize
+ * @inode: the inode for which the Merkle tree is being built
+ * @hash_algorithm: number of hash algorithm to use
+ * @log_blocksize: log base 2 of block size to use
+ * @salt: pointer to salt (optional)
+ * @salt_size: size of salt, possibly 0
+ *
+ * Validate the hash algorithm and block size, then compute the tree topology
+ * (num levels, num blocks in each level, etc.) and initialize @params.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
+				     const struct inode *inode,
+				     unsigned int hash_algorithm,
+				     unsigned int log_blocksize,
+				     const u8 *salt, size_t salt_size)
+{
+	const struct fsverity_hash_alg *hash_alg;
+	int err;
+	u64 blocks;
+	u64 offset;
+	int level;
+
+	memset(params, 0, sizeof(*params));
+
+	if (inode->i_size <= 0) {
+		fsverity_warn(inode, "File is empty.  This is not supported.");
+		return -EINVAL;
+	}
+
+	hash_alg = fsverity_get_hash_alg(inode, hash_algorithm);
+	if (IS_ERR(hash_alg))
+		return PTR_ERR(hash_alg);
+	params->hash_alg = hash_alg;
+	params->digest_size = hash_alg->digest_size;
+
+	params->hashstate = fsverity_prepare_hash_state(hash_alg, salt,
+							salt_size);
+	if (IS_ERR(params->hashstate)) {
+		err = PTR_ERR(params->hashstate);
+		params->hashstate = NULL;
+		fsverity_err(inode, "Error %d preparing hash state", err);
+		goto out_err;
+	}
+
+	if (log_blocksize != PAGE_SHIFT) {
+		fsverity_warn(inode, "Unsupported log_blocksize: %u",
+			      log_blocksize);
+		err = -EINVAL;
+		goto out_err;
+	}
+	params->log_blocksize = log_blocksize;
+	params->block_size = 1 << log_blocksize;
+
+	if (WARN_ON(!is_power_of_2(params->digest_size))) {
+		err = -EINVAL;
+		goto out_err;
+	}
+	if (params->block_size < 2 * params->digest_size) {
+		fsverity_warn(inode,
+			      "Merkle tree block size (%u) too small for hash algorithm \"%s\"",
+			      params->block_size, hash_alg->name);
+		err = -EINVAL;
+		goto out_err;
+	}
+	params->log_arity = params->log_blocksize - ilog2(params->digest_size);
+	params->hashes_per_block = 1 << params->log_arity;
+
+	pr_debug("Merkle tree uses %s with %u-byte blocks (%u hashes/block), salt=%*phN\n",
+		 hash_alg->name, params->block_size, params->hashes_per_block,
+		 (int)salt_size, salt);
+
+	/*
+	 * Compute the number of levels in the Merkle tree and create a map from
+	 * level to the starting block of that level.  Level 'num_levels - 1' is
+	 * the root and is stored first.  Level 0 is the level directly "above"
+	 * the data blocks and is stored last.
+	 */
+
+	/* Compute number of levels and the number of blocks in each level */
+	blocks = (inode->i_size + params->block_size - 1) >> log_blocksize;
+	pr_debug("Data is %lld bytes (%llu blocks)\n", inode->i_size, blocks);
+	while (blocks > 1) {
+		if (params->num_levels >= FS_VERITY_MAX_LEVELS) {
+			fsverity_err(inode, "Too many levels in Merkle tree");
+			err = -EINVAL;
+			goto out_err;
+		}
+		blocks = (blocks + params->hashes_per_block - 1) >>
+			 params->log_arity;
+		/* temporarily using level_start[] to store blocks in level */
+		params->level_start[params->num_levels++] = blocks;
+	}
+
+	/* Compute the starting block of each level */
+	offset = 0;
+	for (level = (int)params->num_levels - 1; level >= 0; level--) {
+		blocks = params->level_start[level];
+		params->level_start[level] = offset;
+		pr_debug("Level %d is %llu blocks starting at index %llu\n",
+			 level, blocks, offset);
+		offset += blocks;
+	}
+
+	params->data_size = inode->i_size;
+	params->tree_size = offset << log_blocksize;
+	return 0;
+
+out_err:
+	kfree(params->hashstate);
+	memset(params, 0, sizeof(*params));
+	return err;
+}
+
+/* Compute the file measurement by hashing the fsverity_descriptor. */
+static int compute_file_measurement(const struct fsverity_hash_alg *hash_alg,
+				    const struct fsverity_descriptor *desc,
+				    u8 *measurement)
+{
+	return fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), measurement);
+}
+
+/*
+ * Validate the given fsverity_descriptor and create a new fsverity_info from
+ * it.  The signature (if present) is also checked.
+ */
+struct fsverity_info *fsverity_create_info(const struct inode *inode,
+					   const void *_desc, size_t desc_size)
+{
+	const struct fsverity_descriptor *desc = _desc;
+	struct fsverity_info *vi;
+	int err;
+
+	if (desc_size < sizeof(*desc)) {
+		fsverity_err(inode, "Unrecognized descriptor size (%zu)",
+			     desc_size);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (desc->version != 1) {
+		fsverity_err(inode, "Unrecognized descriptor version: %u",
+			     desc->version);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (desc->sig_size ||
+	    memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) {
+		fsverity_err(inode, "Reserved bits set in descriptor");
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (desc->salt_size > sizeof(desc->salt)) {
+		fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (le64_to_cpu(desc->data_size) != inode->i_size) {
+		fsverity_err(inode,
+			     "Wrong data_size: %llu (desc) != %lld (inode)",
+			     le64_to_cpu(desc->data_size), inode->i_size);
+		return ERR_PTR(-EINVAL);
+	}
+
+	vi = kmem_cache_zalloc(fsverity_info_cachep, GFP_KERNEL);
+	if (!vi)
+		return ERR_PTR(-ENOMEM);
+	vi->inode = inode;
+
+	err = fsverity_init_merkle_tree_params(&vi->tree_params, inode,
+					       desc->hash_algorithm,
+					       desc->log_blocksize,
+					       desc->salt, desc->salt_size);
+	if (err) {
+		fsverity_err(inode,
+			     "Error %d initializing Merkle tree parameters",
+			     err);
+		goto out;
+	}
+
+	memcpy(vi->root_hash, desc->root_hash, vi->tree_params.digest_size);
+
+	err = compute_file_measurement(vi->tree_params.hash_alg, desc,
+				       vi->measurement);
+	if (err) {
+		fsverity_err(vi->inode, "Error %d computing file measurement",
+			     err);
+		goto out;
+	}
+	pr_debug("Computed file measurement: %s:%*phN\n",
+		 vi->tree_params.hash_alg->name,
+		 vi->tree_params.digest_size, vi->measurement);
+out:
+	if (err) {
+		fsverity_free_info(vi);
+		vi = ERR_PTR(err);
+	}
+	return vi;
+}
+
+void fsverity_set_info(struct inode *inode, struct fsverity_info *vi)
+{
+	/*
+	 * Multiple processes may race to set ->i_verity_info, so use cmpxchg.
+	 * This pairs with the READ_ONCE() in fsverity_get_info().
+	 */
+	if (cmpxchg_release(&inode->i_verity_info, NULL, vi) != NULL)
+		fsverity_free_info(vi);
+}
+
+void fsverity_free_info(struct fsverity_info *vi)
+{
+	if (!vi)
+		return;
+	kfree(vi->tree_params.hashstate);
+	kmem_cache_free(fsverity_info_cachep, vi);
+}
+
+/* Ensure the inode has an ->i_verity_info */
+static int ensure_verity_info(struct inode *inode)
+{
+	struct fsverity_info *vi = fsverity_get_info(inode);
+	struct fsverity_descriptor *desc;
+	int res;
+
+	if (vi)
+		return 0;
+
+	res = inode->i_sb->s_vop->get_verity_descriptor(inode, NULL, 0);
+	if (res < 0) {
+		fsverity_err(inode,
+			     "Error %d getting verity descriptor size", res);
+		return res;
+	}
+	if (res > FS_VERITY_MAX_DESCRIPTOR_SIZE) {
+		fsverity_err(inode, "Verity descriptor is too large (%d bytes)",
+			     res);
+		return -EMSGSIZE;
+	}
+	desc = kmalloc(res, GFP_KERNEL);
+	if (!desc)
+		return -ENOMEM;
+	res = inode->i_sb->s_vop->get_verity_descriptor(inode, desc, res);
+	if (res < 0) {
+		fsverity_err(inode, "Error %d reading verity descriptor", res);
+		goto out_free_desc;
+	}
+
+	vi = fsverity_create_info(inode, desc, res);
+	if (IS_ERR(vi)) {
+		res = PTR_ERR(vi);
+		goto out_free_desc;
+	}
+
+	fsverity_set_info(inode, vi);
+	res = 0;
+out_free_desc:
+	kfree(desc);
+	return res;
+}
+
+/**
+ * fsverity_file_open - prepare to open a verity file
+ * @inode: the inode being opened
+ * @filp: the struct file being set up
+ *
+ * When opening a verity file, deny the open if it is for writing.  Otherwise,
+ * set up the inode's ->i_verity_info if not already done.
+ *
+ * When combined with fscrypt, this must be called after fscrypt_file_open().
+ * Otherwise, we won't have the key set up to decrypt the verity metadata.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_file_open(struct inode *inode, struct file *filp)
+{
+	if (!IS_VERITY(inode))
+		return 0;
+
+	if (filp->f_mode & FMODE_WRITE) {
+		pr_debug("Denying opening verity file (ino %lu) for write\n",
+			 inode->i_ino);
+		return -EPERM;
+	}
+
+	return ensure_verity_info(inode);
+}
+EXPORT_SYMBOL_GPL(fsverity_file_open);
+
+/**
+ * fsverity_cleanup_inode - free the inode's verity info, if present
+ *
+ * Filesystems must call this on inode eviction to free ->i_verity_info.
+ */
+void fsverity_cleanup_inode(struct inode *inode)
+{
+	fsverity_free_info(inode->i_verity_info);
+	inode->i_verity_info = NULL;
+}
+EXPORT_SYMBOL_GPL(fsverity_cleanup_inode);
+
+int __init fsverity_init_info_cache(void)
+{
+	fsverity_info_cachep = KMEM_CACHE_USERCOPY(fsverity_info,
+						   SLAB_RECLAIM_ACCOUNT,
+						   measurement);
+	if (!fsverity_info_cachep)
+		return -ENOMEM;
+	return 0;
+}
diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
new file mode 100644
index 000000000000..1372c236c877
--- /dev/null
+++ b/include/linux/fsverity.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * fs-verity: read-only file-based authenticity protection
+ *
+ * This header declares the interface between the fs/verity/ support layer and
+ * filesystems that support fs-verity.
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#ifndef _LINUX_FSVERITY_H
+#define _LINUX_FSVERITY_H
+
+#include <linux/fs.h>
+#include <uapi/linux/fsverity.h>
+
+/* Verity operations for filesystems */
+struct fsverity_operations {
+
+	/**
+	 * Get the verity descriptor of the given inode.
+	 *
+	 * @inode: an inode with the S_VERITY flag set
+	 * @buf: buffer in which to place the verity descriptor
+	 * @bufsize: size of @buf, or 0 to retrieve the size only
+	 *
+	 * If bufsize == 0, then the size of the verity descriptor is returned.
+	 * Otherwise the verity descriptor is written to 'buf' and its actual
+	 * size is returned; -ERANGE is returned if it's too large.  This may be
+	 * called by multiple processes concurrently on the same inode.
+	 *
+	 * Return: the size on success, -errno on failure
+	 */
+	int (*get_verity_descriptor)(struct inode *inode, void *buf,
+				     size_t bufsize);
+};
+
+#ifdef CONFIG_FS_VERITY
+
+static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
+{
+	/* pairs with the cmpxchg_release() in fsverity_set_info() */
+	return READ_ONCE(inode->i_verity_info);
+}
+
+/* open.c */
+
+extern int fsverity_file_open(struct inode *inode, struct file *filp);
+extern void fsverity_cleanup_inode(struct inode *inode);
+
+#else /* !CONFIG_FS_VERITY */
+
+static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
+{
+	return NULL;
+}
+
+/* open.c */
+
+static inline int fsverity_file_open(struct inode *inode, struct file *filp)
+{
+	return IS_VERITY(inode) ? -EOPNOTSUPP : 0;
+}
+
+static inline void fsverity_cleanup_inode(struct inode *inode)
+{
+}
+
+#endif	/* !CONFIG_FS_VERITY */
+
+#endif	/* _LINUX_FSVERITY_H */
-- 
2.21.0

^ permalink raw reply related

* [PATCH v4 06/16] fs-verity: add inode and superblock fields
From: Eric Biggers @ 2019-06-06 15:51 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Theodore Y . Ts'o, Darrick J . Wong, linux-api, Dave Chinner,
	linux-f2fs-devel, linux-fsdevel, Jaegeuk Kim, linux-integrity,
	linux-ext4, Linus Torvalds, Christoph Hellwig, Victor Hsieh
In-Reply-To: <20190606155205.2872-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Analogous to fs/crypto/, add fields to the VFS inode and superblock for
use by the fs/verity/ support layer:

- ->s_vop: points to the fsverity_operations if the filesystem supports
  fs-verity, otherwise is NULL.

- ->i_verity_info: points to cached fs-verity information for the inode
  after someone opens it, otherwise is NULL.

- S_VERITY: bit in ->i_flags that identifies verity inodes, even when
  they haven't been opened yet and thus still have NULL ->i_verity_info.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 include/linux/fs.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index f7fdfe93e25d..a80a192cdcf2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -64,6 +64,8 @@ struct workqueue_struct;
 struct iov_iter;
 struct fscrypt_info;
 struct fscrypt_operations;
+struct fsverity_info;
+struct fsverity_operations;
 struct fs_context;
 struct fs_parameter_description;
 
@@ -723,6 +725,10 @@ struct inode {
 	struct fscrypt_info	*i_crypt_info;
 #endif
 
+#ifdef CONFIG_FS_VERITY
+	struct fsverity_info	*i_verity_info;
+#endif
+
 	void			*i_private; /* fs or device private pointer */
 } __randomize_layout;
 
@@ -1429,6 +1435,9 @@ struct super_block {
 	const struct xattr_handler **s_xattr;
 #ifdef CONFIG_FS_ENCRYPTION
 	const struct fscrypt_operations	*s_cop;
+#endif
+#ifdef CONFIG_FS_VERITY
+	const struct fsverity_operations *s_vop;
 #endif
 	struct hlist_bl_head	s_roots;	/* alternate root dentries for NFS */
 	struct list_head	s_mounts;	/* list of mounts; _not_ for fs use */
@@ -1964,6 +1973,7 @@ struct super_operations {
 #endif
 #define S_ENCRYPTED	16384	/* Encrypted file (using fs/crypto/) */
 #define S_CASEFOLD	32768	/* Casefolded file */
+#define S_VERITY	65536	/* Verity file (using fs/verity/) */
 
 /*
  * Note that nosuid etc flags are inode-specific: setting some file-system
@@ -2005,6 +2015,7 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags
 #define IS_DAX(inode)		((inode)->i_flags & S_DAX)
 #define IS_ENCRYPTED(inode)	((inode)->i_flags & S_ENCRYPTED)
 #define IS_CASEFOLDED(inode)	((inode)->i_flags & S_CASEFOLD)
+#define IS_VERITY(inode)	((inode)->i_flags & S_VERITY)
 
 #define IS_WHITEOUT(inode)	(S_ISCHR(inode->i_mode) && \
 				 (inode)->i_rdev == WHITEOUT_DEV)
-- 
2.21.0

^ permalink raw reply related

* [PATCH v4 05/16] fs-verity: add Kconfig and the helper functions for hashing
From: Eric Biggers @ 2019-06-06 15:51 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Theodore Y . Ts'o, Darrick J . Wong, linux-api, Dave Chinner,
	linux-f2fs-devel, linux-fsdevel, Jaegeuk Kim, linux-integrity,
	linux-ext4, Linus Torvalds, Christoph Hellwig, Victor Hsieh
In-Reply-To: <20190606155205.2872-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Add the beginnings of the fs/verity/ support layer, including the
Kconfig option and various helper functions for hashing.  To start, only
SHA-256 is supported, but other hash algorithms can easily be added.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/Kconfig                   |   2 +
 fs/Makefile                  |   1 +
 fs/verity/Kconfig            |  38 +++++
 fs/verity/Makefile           |   4 +
 fs/verity/fsverity_private.h |  91 ++++++++++++
 fs/verity/hash_algs.c        | 274 +++++++++++++++++++++++++++++++++++
 fs/verity/init.c             |  41 ++++++
 7 files changed, 451 insertions(+)
 create mode 100644 fs/verity/Kconfig
 create mode 100644 fs/verity/Makefile
 create mode 100644 fs/verity/fsverity_private.h
 create mode 100644 fs/verity/hash_algs.c
 create mode 100644 fs/verity/init.c

diff --git a/fs/Kconfig b/fs/Kconfig
index f1046cf6ad85..4b66dafbdc7b 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -113,6 +113,8 @@ config MANDATORY_FILE_LOCKING
 
 source "fs/crypto/Kconfig"
 
+source "fs/verity/Kconfig"
+
 source "fs/notify/Kconfig"
 
 source "fs/quota/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index c9aea23aba56..fe7f2c07f482 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_AIO)               += aio.o
 obj-$(CONFIG_IO_URING)		+= io_uring.o
 obj-$(CONFIG_FS_DAX)		+= dax.o
 obj-$(CONFIG_FS_ENCRYPTION)	+= crypto/
+obj-$(CONFIG_FS_VERITY)		+= verity/
 obj-$(CONFIG_FILE_LOCKING)      += locks.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
 obj-$(CONFIG_BINFMT_AOUT)	+= binfmt_aout.o
diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig
new file mode 100644
index 000000000000..c2bca0b01ecf
--- /dev/null
+++ b/fs/verity/Kconfig
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: GPL-2.0
+
+config FS_VERITY
+	bool "FS Verity (read-only file-based authenticity protection)"
+	select CRYPTO
+	# SHA-256 is selected as it's intended to be the default hash algorithm.
+	# To avoid bloat, other wanted algorithms must be selected explicitly.
+	select CRYPTO_SHA256
+	help
+	  This option enables fs-verity.  fs-verity is the dm-verity
+	  mechanism implemented at the file level.  On supported
+	  filesystems (currently EXT4 and F2FS), userspace can use an
+	  ioctl to enable verity for a file, which causes the filesystem
+	  to build a Merkle tree for the file.  The filesystem will then
+	  transparently verify any data read from the file against the
+	  Merkle tree.  The file is also made read-only.
+
+	  This serves as an integrity check, but the availability of the
+	  Merkle tree root hash also allows efficiently supporting
+	  various use cases where normally the whole file would need to
+	  be hashed at once, such as: (a) auditing (logging the file's
+	  hash), or (b) authenticity verification (comparing the hash
+	  against a known good value, e.g. from a digital signature).
+
+	  fs-verity is especially useful on large files where not all
+	  the contents may actually be needed.  Also, fs-verity verifies
+	  data each time it is paged back in, which provides better
+	  protection against malicious disks vs. an ahead-of-time hash.
+
+	  If unsure, say N.
+
+config FS_VERITY_DEBUG
+	bool "FS Verity debugging"
+	depends on FS_VERITY
+	help
+	  Enable debugging messages related to fs-verity by default.
+
+	  Say N unless you are an fs-verity developer.
diff --git a/fs/verity/Makefile b/fs/verity/Makefile
new file mode 100644
index 000000000000..398f3f85fa18
--- /dev/null
+++ b/fs/verity/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_FS_VERITY) += hash_algs.o \
+			   init.o
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
new file mode 100644
index 000000000000..c879189bfd61
--- /dev/null
+++ b/fs/verity/fsverity_private.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * fs-verity: read-only file-based authenticity protection
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#ifndef _FSVERITY_PRIVATE_H
+#define _FSVERITY_PRIVATE_H
+
+#ifdef CONFIG_FS_VERITY_DEBUG
+#define DEBUG
+#endif
+
+#define pr_fmt(fmt) "fs-verity: " fmt
+
+#include <crypto/sha.h>
+#include <linux/fs.h>
+#include <uapi/linux/fsverity.h>
+
+struct ahash_request;
+
+/*
+ * Maximum depth of the Merkle tree.  Up to 64 levels are theoretically possible
+ * with a very small block size, but we'd like to limit stack usage during
+ * verification, and in practice this is plenty.  E.g., with SHA-256 and 4K
+ * blocks, a file with size UINT64_MAX bytes needs just 8 levels.
+ */
+#define FS_VERITY_MAX_LEVELS		16
+
+/*
+ * Largest digest size among all hash algorithms supported by fs-verity.
+ * Currently assumed to be <= size of fsverity_descriptor::root_hash.
+ */
+#define FS_VERITY_MAX_DIGEST_SIZE	SHA256_DIGEST_SIZE
+
+/* A hash algorithm supported by fs-verity */
+struct fsverity_hash_alg {
+	struct crypto_ahash *tfm; /* hash tfm, allocated on demand */
+	const char *name;	  /* crypto API name, e.g. sha256 */
+	unsigned int digest_size; /* digest size in bytes, e.g. 32 for SHA-256 */
+	unsigned int block_size;  /* block size in bytes, e.g. 64 for SHA-256 */
+};
+
+/* Merkle tree parameters: hash algorithm, initial hash state, and topology */
+struct merkle_tree_params {
+	const struct fsverity_hash_alg *hash_alg; /* the hash algorithm */
+	const u8 *hashstate;		/* initial hash state or NULL */
+	unsigned int digest_size;	/* same as hash_alg->digest_size */
+	unsigned int block_size;	/* size of data and tree blocks */
+	unsigned int hashes_per_block;	/* number of hashes per tree block */
+	unsigned int log_blocksize;	/* log2(block_size) */
+	unsigned int log_arity;		/* log2(hashes_per_block) */
+	unsigned int num_levels;	/* number of levels in Merkle tree */
+	u64 data_size;			/* data size in bytes */
+	u64 tree_size;			/* Merkle tree size in bytes */
+
+	/*
+	 * Starting block index for each tree level, ordered from leaf level (0)
+	 * to root level ('num_levels - 1')
+	 */
+	u64 level_start[FS_VERITY_MAX_LEVELS];
+};
+
+/* hash_algs.c */
+
+extern struct fsverity_hash_alg fsverity_hash_algs[];
+
+const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
+						      unsigned int num);
+const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
+				      const u8 *salt, size_t salt_size);
+int fsverity_hash_page(const struct merkle_tree_params *params,
+		       const struct inode *inode,
+		       struct ahash_request *req, struct page *page, u8 *out);
+int fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
+			 const void *data, size_t size, u8 *out);
+void __init fsverity_check_hash_algs(void);
+
+/* init.c */
+
+extern void __printf(3, 4) __cold
+fsverity_msg(const struct inode *inode, const char *level,
+	     const char *fmt, ...);
+
+#define fsverity_warn(inode, fmt, ...)		\
+	fsverity_msg((inode), KERN_WARNING, fmt, ##__VA_ARGS__)
+#define fsverity_err(inode, fmt, ...)		\
+	fsverity_msg((inode), KERN_ERR, fmt, ##__VA_ARGS__)
+
+#endif /* _FSVERITY_PRIVATE_H */
diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c
new file mode 100644
index 000000000000..46df17094fc2
--- /dev/null
+++ b/fs/verity/hash_algs.c
@@ -0,0 +1,274 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/verity/hash_algs.c: fs-verity hash algorithms
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <crypto/hash.h>
+#include <linux/scatterlist.h>
+
+/* The hash algorithms supported by fs-verity */
+struct fsverity_hash_alg fsverity_hash_algs[] = {
+	[FS_VERITY_HASH_ALG_SHA256] = {
+		.name = "sha256",
+		.digest_size = SHA256_DIGEST_SIZE,
+		.block_size = SHA256_BLOCK_SIZE,
+	},
+};
+
+/**
+ * fsverity_get_hash_alg() - validate and prepare a hash algorithm
+ * @inode: optional inode for logging purposes
+ * @num: the hash algorithm number
+ *
+ * Get the struct fsverity_hash_alg for the given hash algorithm number, and
+ * ensure it has a hash transform ready to go.  The hash transforms are
+ * allocated on-demand so that we don't waste resources unnecessarily, and
+ * because the crypto modules may be initialized later than fs/verity/.
+ *
+ * Return: pointer to the hash alg on success, else an ERR_PTR()
+ */
+const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
+						      unsigned int num)
+{
+	struct fsverity_hash_alg *alg;
+	struct crypto_ahash *tfm;
+	int err;
+
+	if (num >= ARRAY_SIZE(fsverity_hash_algs) ||
+	    !fsverity_hash_algs[num].name) {
+		fsverity_warn(inode, "Unknown hash algorithm number: %u", num);
+		return ERR_PTR(-EINVAL);
+	}
+	alg = &fsverity_hash_algs[num];
+
+	/* pairs with cmpxchg() below */
+	tfm = READ_ONCE(alg->tfm);
+	if (likely(tfm != NULL))
+		return alg;
+	/*
+	 * Using the shash API would make things a bit simpler, but the ahash
+	 * API is preferable as it allows the use of crypto accelerators.
+	 */
+	tfm = crypto_alloc_ahash(alg->name, 0, 0);
+	if (IS_ERR(tfm)) {
+		if (PTR_ERR(tfm) == -ENOENT)
+			fsverity_warn(inode,
+				      "Missing crypto API support for hash algorithm \"%s\"",
+				      alg->name);
+		else
+			fsverity_err(inode,
+				     "Error allocating hash algorithm \"%s\": %ld",
+				     alg->name, PTR_ERR(tfm));
+		return ERR_CAST(tfm);
+	}
+
+	err = -EINVAL;
+	if (WARN_ON(alg->digest_size != crypto_ahash_digestsize(tfm)))
+		goto err_free_tfm;
+	if (WARN_ON(alg->block_size != crypto_ahash_blocksize(tfm)))
+		goto err_free_tfm;
+
+	pr_info("%s using implementation \"%s\"\n",
+		alg->name, crypto_ahash_driver_name(tfm));
+
+	/* pairs with READ_ONCE() above */
+	if (cmpxchg(&alg->tfm, NULL, tfm) != NULL)
+		crypto_free_ahash(tfm);
+
+	return alg;
+
+err_free_tfm:
+	crypto_free_ahash(tfm);
+	return ERR_PTR(err);
+}
+
+/**
+ * fsverity_prepare_hash_state() - precompute the initial hash state
+ * @alg: hash algorithm
+ * @salt: a salt which is to be prepended to all data to be hashed
+ * @salt_size: salt size in bytes, possibly 0
+ *
+ * Return: NULL if the salt is empty, otherwise the kmalloc()'ed precomputed
+ *	   initial hash state on success or an ERR_PTR() on failure.
+ */
+const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
+				      const u8 *salt, size_t salt_size)
+{
+	u8 *hashstate = NULL;
+	struct ahash_request *req = NULL;
+	u8 *padded_salt = NULL;
+	size_t padded_salt_size;
+	struct scatterlist sg;
+	DECLARE_CRYPTO_WAIT(wait);
+	int err;
+
+	if (salt_size == 0)
+		return NULL;
+
+	hashstate = kmalloc(crypto_ahash_statesize(alg->tfm), GFP_KERNEL);
+	if (!hashstate)
+		return ERR_PTR(-ENOMEM);
+
+	req = ahash_request_alloc(alg->tfm, GFP_KERNEL);
+	if (!req) {
+		err = -ENOMEM;
+		goto err_free;
+	}
+
+	/*
+	 * Zero-pad the salt to the next multiple of the input size of the hash
+	 * algorithm's compression function, e.g. 64 bytes for SHA-256 or 128
+	 * bytes for SHA-512.  This ensures that the hash algorithm won't have
+	 * any bytes buffered internally after processing the salt, thus making
+	 * salted hashing just as fast as unsalted hashing.
+	 */
+	padded_salt_size = round_up(salt_size, alg->block_size);
+	padded_salt = kzalloc(padded_salt_size, GFP_KERNEL);
+	if (!padded_salt) {
+		err = -ENOMEM;
+		goto err_free;
+	}
+	memcpy(padded_salt, salt, salt_size);
+
+	sg_init_one(&sg, padded_salt, padded_salt_size);
+	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
+					CRYPTO_TFM_REQ_MAY_BACKLOG,
+				   crypto_req_done, &wait);
+	ahash_request_set_crypt(req, &sg, NULL, padded_salt_size);
+
+	err = crypto_wait_req(crypto_ahash_init(req), &wait);
+	if (err)
+		goto err_free;
+
+	err = crypto_wait_req(crypto_ahash_update(req), &wait);
+	if (err)
+		goto err_free;
+
+	err = crypto_ahash_export(req, hashstate);
+	if (err)
+		goto err_free;
+out:
+	kfree(padded_salt);
+	ahash_request_free(req);
+	return hashstate;
+
+err_free:
+	kfree(hashstate);
+	hashstate = ERR_PTR(err);
+	goto out;
+}
+
+/**
+ * fsverity_hash_page() - hash a single data or hash page
+ * @params: the Merkle tree's parameters
+ * @inode: inode for which the hashing is being done
+ * @req: preallocated hash request
+ * @page: the page to hash
+ * @out: output digest, size 'params->digest_size' bytes
+ *
+ * Hash a single data or hash block, assuming block_size == PAGE_SIZE.
+ * The hash is salted if a salt is specified in the Merkle tree parameters.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_hash_page(const struct merkle_tree_params *params,
+		       const struct inode *inode,
+		       struct ahash_request *req, struct page *page, u8 *out)
+{
+	struct scatterlist sg;
+	DECLARE_CRYPTO_WAIT(wait);
+	int err;
+
+	if (WARN_ON(params->block_size != PAGE_SIZE))
+		return -EINVAL;
+
+	sg_init_table(&sg, 1);
+	sg_set_page(&sg, page, PAGE_SIZE, 0);
+	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
+					CRYPTO_TFM_REQ_MAY_BACKLOG,
+				   crypto_req_done, &wait);
+	ahash_request_set_crypt(req, &sg, out, PAGE_SIZE);
+
+	if (params->hashstate) {
+		err = crypto_ahash_import(req, params->hashstate);
+		if (err) {
+			fsverity_err(inode,
+				     "Error %d importing hash state", err);
+			return err;
+		}
+		err = crypto_ahash_finup(req);
+	} else {
+		err = crypto_ahash_digest(req);
+	}
+
+	err = crypto_wait_req(err, &wait);
+	if (err)
+		fsverity_err(inode, "Error %d computing page hash", err);
+	return err;
+}
+
+/**
+ * fsverity_hash_buffer() - hash some data
+ * @alg: the hash algorithm to use
+ * @data: the data to hash
+ * @size: size of data to hash
+ * @out: output digest, size 'alg->digest_size' bytes
+ *
+ * Hash some data which is located in physically contiguous memory (i.e. memory
+ * allocated by kmalloc(), not by vmalloc()).  No salt is used.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
+			 const void *data, size_t size, u8 *out)
+{
+	struct ahash_request *req;
+	struct scatterlist sg;
+	DECLARE_CRYPTO_WAIT(wait);
+	int err;
+
+	req = ahash_request_alloc(alg->tfm, GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	sg_init_one(&sg, data, size);
+	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
+					CRYPTO_TFM_REQ_MAY_BACKLOG,
+				   crypto_req_done, &wait);
+	ahash_request_set_crypt(req, &sg, out, size);
+
+	err = crypto_wait_req(crypto_ahash_digest(req), &wait);
+
+	ahash_request_free(req);
+	return err;
+}
+
+void __init fsverity_check_hash_algs(void)
+{
+	size_t i;
+
+	/*
+	 * Sanity check the hash algorithms (could be a build-time check, but
+	 * they're in an array)
+	 */
+	for (i = 0; i < ARRAY_SIZE(fsverity_hash_algs); i++) {
+		const struct fsverity_hash_alg *alg = &fsverity_hash_algs[i];
+
+		if (!alg->name)
+			continue;
+
+		BUG_ON(alg->digest_size > FS_VERITY_MAX_DIGEST_SIZE);
+
+		/*
+		 * For efficiency, the implementation currently assumes the
+		 * digest and block sizes are powers of 2.  This limitation can
+		 * be lifted if the code is updated to handle other values.
+		 */
+		BUG_ON(!is_power_of_2(alg->digest_size));
+		BUG_ON(!is_power_of_2(alg->block_size));
+	}
+}
diff --git a/fs/verity/init.c b/fs/verity/init.c
new file mode 100644
index 000000000000..40076bbe452a
--- /dev/null
+++ b/fs/verity/init.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/verity/init.c: fs-verity module initialization and logging
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <linux/ratelimit.h>
+
+void fsverity_msg(const struct inode *inode, const char *level,
+		  const char *fmt, ...)
+{
+	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+	struct va_format vaf;
+	va_list args;
+
+	if (!__ratelimit(&rs))
+		return;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	if (inode)
+		printk("%sfs-verity (%s, inode %lu): %pV\n",
+		       level, inode->i_sb->s_id, inode->i_ino, &vaf);
+	else
+		printk("%sfs-verity: %pV\n", level, &vaf);
+	va_end(args);
+}
+
+static int __init fsverity_init(void)
+{
+	fsverity_check_hash_algs();
+
+	pr_debug("Initialized fs-verity\n");
+	return 0;
+}
+late_initcall(fsverity_init)
-- 
2.21.0

^ permalink raw reply related

* [PATCH v4 04/16] fs: uapi: define verity bit for FS_IOC_GETFLAGS
From: Eric Biggers @ 2019-06-06 15:51 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Theodore Y . Ts'o, Darrick J . Wong, linux-api, Dave Chinner,
	linux-f2fs-devel, linux-fsdevel, Jaegeuk Kim, linux-integrity,
	linux-ext4, Linus Torvalds, Christoph Hellwig, Victor Hsieh
In-Reply-To: <20190606155205.2872-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Add FS_VERITY_FL to the flags for FS_IOC_GETFLAGS, so that applications
can easily determine whether a file is a verity file at the same time as
they're checking other file flags.  This flag will be gettable only;
FS_IOC_SETFLAGS won't allow setting it, since an ioctl must be used
instead to provide more parameters.

This flag matches the on-disk bit that was already allocated for ext4.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 include/uapi/linux/fs.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 59c71fa8c553..df261b7e0587 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -306,6 +306,7 @@ struct fscrypt_key {
 #define FS_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
 #define FS_HUGE_FILE_FL			0x00040000 /* Reserved for ext4 */
 #define FS_EXTENT_FL			0x00080000 /* Extents */
+#define FS_VERITY_FL			0x00100000 /* Verity protected inode */
 #define FS_EA_INODE_FL			0x00200000 /* Inode used for large EA */
 #define FS_EOFBLOCKS_FL			0x00400000 /* Reserved for ext4 */
 #define FS_NOCOW_FL			0x00800000 /* Do not cow file */
-- 
2.21.0

^ permalink raw reply related

* [PATCH v4 03/16] fs-verity: add UAPI header
From: Eric Biggers @ 2019-06-06 15:51 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Theodore Y . Ts'o, Darrick J . Wong, linux-api, Dave Chinner,
	linux-f2fs-devel, linux-fsdevel, Jaegeuk Kim, linux-integrity,
	linux-ext4, Linus Torvalds, Christoph Hellwig, Victor Hsieh
In-Reply-To: <20190606155205.2872-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Add the UAPI header for fs-verity, including two ioctls:

- FS_IOC_ENABLE_VERITY
- FS_IOC_MEASURE_VERITY

These ioctls are documented in the "User API" section of
Documentation/filesystems/fsverity.rst.

Examples of using these ioctls can be found in fsverity-utils
(https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/fsverity-utils.git).

I've also written xfstests that test these ioctls
(https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/xfstests-dev.git/log/?h=fsverity).

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 Documentation/ioctl/ioctl-number.txt |  1 +
 include/uapi/linux/fsverity.h        | 39 ++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+)
 create mode 100644 include/uapi/linux/fsverity.h

diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index c9558146ac58..21767c81e86d 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -225,6 +225,7 @@ Code  Seq#(hex)	Include File		Comments
 'f'	00-0F	fs/ext4/ext4.h		conflict!
 'f'	00-0F	linux/fs.h		conflict!
 'f'	00-0F	fs/ocfs2/ocfs2_fs.h	conflict!
+'f'	81-8F	linux/fsverity.h
 'g'	00-0F	linux/usb/gadgetfs.h
 'g'	20-2F	linux/usb/g_printer.h
 'h'	00-7F				conflict! Charon filesystem
diff --git a/include/uapi/linux/fsverity.h b/include/uapi/linux/fsverity.h
new file mode 100644
index 000000000000..57d1d7fc0c34
--- /dev/null
+++ b/include/uapi/linux/fsverity.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * fs-verity user API
+ *
+ * These ioctls can be used on filesystems that support fs-verity.  See the
+ * "User API" section of Documentation/filesystems/fsverity.rst.
+ *
+ * Copyright 2019 Google LLC
+ */
+#ifndef _UAPI_LINUX_FSVERITY_H
+#define _UAPI_LINUX_FSVERITY_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#define FS_VERITY_HASH_ALG_SHA256	1
+
+struct fsverity_enable_arg {
+	__u32 version;
+	__u32 hash_algorithm;
+	__u32 block_size;
+	__u32 salt_size;
+	__u64 salt_ptr;
+	__u32 sig_size;
+	__u32 __reserved1;
+	__u64 sig_ptr;
+	__u64 __reserved2[11];
+};
+
+struct fsverity_digest {
+	__u16 digest_algorithm;
+	__u16 digest_size; /* input/output */
+	__u8 digest[];
+};
+
+#define FS_IOC_ENABLE_VERITY	_IOW('f', 133, struct fsverity_enable_arg)
+#define FS_IOC_MEASURE_VERITY	_IOWR('f', 134, struct fsverity_digest)
+
+#endif /* _UAPI_LINUX_FSVERITY_H */
-- 
2.21.0

^ permalink raw reply related

* [PATCH v4 02/16] fs-verity: add MAINTAINERS file entry
From: Eric Biggers @ 2019-06-06 15:51 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Theodore Y . Ts'o, Darrick J . Wong, linux-api, Dave Chinner,
	linux-f2fs-devel, linux-fsdevel, Jaegeuk Kim, linux-integrity,
	linux-ext4, Linus Torvalds, Christoph Hellwig, Victor Hsieh
In-Reply-To: <20190606155205.2872-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

fs-verity will be jointly maintained by Eric Biggers and Theodore Ts'o.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 MAINTAINERS | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index a6954776a37e..655065116f92 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6505,6 +6505,18 @@ S:	Maintained
 F:	fs/notify/
 F:	include/linux/fsnotify*.h
 
+FSVERITY: READ-ONLY FILE-BASED AUTHENTICITY PROTECTION
+M:	Eric Biggers <ebiggers@kernel.org>
+M:	Theodore Y. Ts'o <tytso@mit.edu>
+L:	linux-fscrypt@vger.kernel.org
+Q:	https://patchwork.kernel.org/project/linux-fscrypt/list/
+T:	git git://git.kernel.org/pub/scm/fs/fscrypt/fscrypt.git fsverity
+S:	Supported
+F:	fs/verity/
+F:	include/linux/fsverity.h
+F:	include/uapi/linux/fsverity.h
+F:	Documentation/filesystems/fsverity.rst
+
 FUJITSU LAPTOP EXTRAS
 M:	Jonathan Woithe <jwoithe@just42.net>
 L:	platform-driver-x86@vger.kernel.org
-- 
2.21.0

^ permalink raw reply related

* [PATCH v4 01/16] fs-verity: add a documentation file
From: Eric Biggers @ 2019-06-06 15:51 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Theodore Y . Ts'o, Darrick J . Wong, linux-api, Dave Chinner,
	linux-f2fs-devel, linux-fsdevel, Jaegeuk Kim, linux-integrity,
	linux-ext4, Linus Torvalds, Christoph Hellwig, Victor Hsieh
In-Reply-To: <20190606155205.2872-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Add a documentation file for fs-verity, covering:

- Introduction
- Use cases
- User API
    - FS_IOC_ENABLE_VERITY
    - FS_IOC_MEASURE_VERITY
    - FS_IOC_GETFLAGS
- Accessing verity files
- File measurement computation
    - Merkle tree
    - fs-verity descriptor
- Built-in signature verification
- Filesystem support
    - ext4
    - f2fs
- Implementation details
    - Verifying data
        - Pagecache
        - Block device based filesystems
- Userspace utility
- Tests
- FAQ

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 Documentation/filesystems/fsverity.rst | 708 +++++++++++++++++++++++++
 Documentation/filesystems/index.rst    |   1 +
 2 files changed, 709 insertions(+)
 create mode 100644 Documentation/filesystems/fsverity.rst

diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst
new file mode 100644
index 000000000000..0469bd46f2ee
--- /dev/null
+++ b/Documentation/filesystems/fsverity.rst
@@ -0,0 +1,708 @@
+=======================================================
+fs-verity: read-only file-based authenticity protection
+=======================================================
+
+Introduction
+============
+
+fs-verity (``fs/verity/``) is a support layer that filesystems can
+hook into to support transparent integrity and authenticity protection
+of read-only files.  Currently, it is supported by the ext4 and f2fs
+filesystems.  Like fscrypt, not too much filesystem-specific code is
+needed to support fs-verity.
+
+fs-verity is similar to `dm-verity
+<https://www.kernel.org/doc/Documentation/device-mapper/verity.txt>`_
+but works on files rather than block devices.  On regular files on
+filesystems supporting fs-verity, userspace can execute an ioctl that
+causes the filesystem to build a Merkle tree for the file and persist
+it to a filesystem-specific location associated with the file.
+
+After this, the file is made readonly, and all reads from the file are
+automatically verified against the file's Merkle tree.  Reads of any
+corrupted data, including mmap reads, will fail.
+
+Userspace can use another ioctl to retrieve the root hash (actually
+the "file measurement", which is a hash that includes the root hash)
+that fs-verity is enforcing for the file.  This ioctl executes in
+constant time, regardless of the file size.
+
+fs-verity is essentially a way to hash a file in constant time,
+subject to the caveat that reads which would violate the hash will
+fail at runtime.
+
+Use cases
+=========
+
+By itself, the base fs-verity feature only provides integrity
+protection, i.e. detection of accidental (non-malicious) corruption.
+
+However, because fs-verity makes retrieving the file hash extremely
+efficient, it's primarily meant to be used as a tool to support
+authentication (detection of malicious modifications) or auditing
+(logging file hashes before use).
+
+Trusted userspace code (e.g. operating system code running on a
+read-only partition that is itself authenticated by dm-verity) can
+authenticate the contents of an fs-verity file by using the
+`FS_IOC_MEASURE_VERITY`_ ioctl to retrieve its hash, then verifying a
+digital signature of it.
+
+A standard file hash could be used instead of fs-verity.  However,
+this is inefficient if the file is large and only a small portion may
+be accessed.  This is often the case for Android application package
+(APK) files, for example.  These typically contain many translations,
+classes, and other resources that are infrequently or even never
+accessed on a particular device.  It would be slow and wasteful to
+read and hash the entire file before starting the application.
+
+Unlike an ahead-of-time hash, fs-verity also re-verifies data each
+time it's paged in.  This ensures that malicious disk firmware can't
+undetectably change the contents of the file at runtime.
+
+fs-verity does not replace or obsolete dm-verity.  dm-verity should
+still be used on read-only filesystems.  fs-verity is for files that
+must live on a read-write filesystem because they are independently
+updated and potentially user-installed, so dm-verity cannot be used.
+
+The base fs-verity feature is a hashing mechanism only; actually
+authenticating the files is up to userspace.  However, to meet some
+users' needs, fs-verity optionally supports a simple signature
+verification mechanism where users can configure the kernel to require
+that all fs-verity files be signed by a key loaded into a keyring; see
+`Built-in signature verification`_.  Support for fs-verity file hashes
+in IMA (Integrity Measurement Architecture) policies is also planned.
+
+User API
+========
+
+FS_IOC_ENABLE_VERITY
+--------------------
+
+The FS_IOC_ENABLE_VERITY ioctl enables fs-verity on a file.  It takes
+in a pointer to a :c:type:`struct fsverity_enable_arg`, defined as
+follows::
+
+    struct fsverity_enable_arg {
+            __u32 version;
+            __u32 hash_algorithm;
+            __u32 block_size;
+            __u32 salt_size;
+            __u64 salt_ptr;
+            __u32 sig_size;
+            __u32 __reserved1;
+            __u64 sig_ptr;
+            __u64 __reserved2[11];
+    };
+
+This structure contains the parameters of the Merkle tree to build for
+the file, and optionally contains a signature.  It must be initialized
+as follows:
+
+- ``version`` must be 1.
+- ``hash_algorithm`` must be the identifier for the hash algorithm to
+  use for the Merkle tree, such as FS_VERITY_HASH_ALG_SHA256.  See
+  ``include/uapi/linux/fsverity.h`` for the list of possible values.
+- ``block_size`` must be the Merkle tree block size.  Currently, this
+  must be equal to the system page size, which is usually 4096 bytes.
+  Other sizes may be supported in the future.  This value is not
+  necessarily the same as the filesystem block size.
+- ``salt_size`` is the size of the salt in bytes, or 0 if no salt is
+  provided.  The salt is a value that is prepended to every hashed
+  block; it can be used to personalize the hashing for a particular
+  file or device.  Currently the maximum salt size is 32 bytes.
+- ``salt_ptr`` is the pointer to the salt, or NULL if no salt is
+  provided.
+- ``sig_size`` is the size of the signature in bytes, or 0 if no
+  signature is provided.  Currently the signature is (somewhat
+  arbitrarily) limited to 16128 bytes.  See `Built-in signature
+  verification`_ for more information.
+- ``sig_ptr``  is the pointer to the signature, or NULL if no
+  signature is provided.
+- All reserved fields must be zeroed.
+
+FS_IOC_ENABLE_VERITY causes the filesystem to build a Merkle tree for
+the file and persist it to a filesystem-specific location associated
+with the file, then mark the file as a verity file.  This ioctl may
+take a long time to execute on large files, and it is interruptible by
+fatal signals.
+
+FS_IOC_ENABLE_VERITY checks for write access to the inode.  However,
+it must be executed on an O_RDONLY file descriptor and no processes
+can have the file open for writing.  Attempts to open the file for
+writing while this ioctl is executing will fail with ETXTBSY.  (This
+is necessary to guarantee that no writable file descriptors will exist
+after verity is enabled, and to guarantee that the file's contents are
+stable while the Merkle tree is being built over it.)
+
+On success, FS_IOC_ENABLE_VERITY returns 0, and the file becomes a
+verity file.  On failure (including the case of interruption by a
+fatal signal), no changes are made to the file.
+
+FS_IOC_ENABLE_VERITY can fail with the following errors:
+
+- ``EACCES``: the process does not have write access to the file
+- ``EEXIST``: the file already has verity enabled
+- ``EFAULT``: the caller provided inaccessible memory
+- ``EINTR``: the operation was interrupted by a fatal signal
+- ``EINVAL``: unsupported version, hash algorithm, or block size; or
+  reserved bits are set; or the file descriptor refers to neither a
+  regular file nor a directory; or the file is empty.
+- ``EISDIR``: the file descriptor refers to a directory
+- ``EMSGSIZE``: the salt or signature is too long
+- ``ENOENT``: fs-verity recognizes the hash algorithm, but it's not
+  available in the kernel's crypto API as currently configured (e.g.
+  for SHA-512, missing CONFIG_CRYPTO_SHA512).
+- ``ENOTTY``: this type of filesystem does not implement fs-verity
+- ``EOPNOTSUPP``: the kernel was not configured with fs-verity
+  support, or the filesystem superblock has not had the 'verity'
+  feature enabled on it.  (See `Filesystem support`_.)
+- ``EPERM``: the file is append-only
+- ``EROFS``: the filesystem is read-only
+- ``ETXTBSY``: someone has the file open for writing.  This can be the
+  caller's file descriptor, another open file descriptor, or the file
+  reference held by a writable memory map.
+
+FS_IOC_MEASURE_VERITY
+---------------------
+
+The FS_IOC_MEASURE_VERITY ioctl retrieves the measurement of a verity
+file.  The file measurement is a digest that cryptographically
+identifies the file contents that are being enforced on reads.
+
+This ioctl takes in a pointer to a variable-length structure::
+
+    struct fsverity_digest {
+            __u16 digest_algorithm;
+            __u16 digest_size; /* input/output */
+            __u8 digest[];
+    };
+
+``digest_size`` is an input/output field.  On input, it must be
+initialized to the number of bytes allocated for the variable-length
+``digest`` field.
+
+On success, 0 is returned and the kernel fills in the structure as
+follows:
+
+- ``digest_algorithm`` will be the hash algorithm used for the file
+  measurement.  It will match ``fsverity_enable_arg::hash_algorithm``.
+- ``digest_size`` will be the size of the digest in bytes, e.g. 32
+  for SHA-256.  (This can be redundant with ``digest_algorithm``.)
+- ``digest`` will be the actual bytes of the digest.
+
+FS_IOC_MEASURE_VERITY is guaranteed to execute in constant time,
+regardless of the size of the file.
+
+FS_IOC_MEASURE_VERITY can fail with the following errors:
+
+- ``EFAULT``: the caller provided inaccessible memory
+- ``ENODATA``: the file is not a verity file
+- ``ENOTTY``: this type of filesystem does not implement fs-verity
+- ``EOPNOTSUPP``: the kernel was not configured with fs-verity
+  support, or the filesystem superblock has not had the 'verity'
+  feature enabled on it.  (See `Filesystem support`_.)
+- ``EOVERFLOW``: the digest is longer than the specified
+  ``digest_size`` bytes.  Try providing a larger buffer.
+
+FS_IOC_GETFLAGS
+---------------
+
+The existing ioctl FS_IOC_GETFLAGS (which isn't specific to fs-verity)
+can also be used to check whether a file has fs-verity enabled or not.
+To do so, check for FS_VERITY_FL (0x00100000) in the returned flags.
+
+The verity flag is not settable via FS_IOC_SETFLAGS.  You must use
+FS_IOC_ENABLE_VERITY instead, since parameters must be provided.
+
+Accessing verity files
+======================
+
+Applications can transparently access a verity file just like a
+non-verity one, with the following exceptions:
+
+- Verity files are readonly.  They cannot be opened for writing or
+  truncate()d, even if the file mode bits allow it.  Attempts to do
+  one of these things will fail with EPERM.  However, changes to
+  metadata such as owner, mode, timestamps, and xattrs are still
+  allowed, since these are not measured by fs-verity.  Verity files
+  can also still be renamed, deleted, and linked to.
+
+- Direct I/O is not supported on verity files.  Attempts to use direct
+  I/O on such files will fall back to buffered I/O.
+
+- DAX (Direct Access) is not supported on verity files, because this
+  would circumvent the data verification.
+
+- Reads of data that doesn't match the verity Merkle tree will fail
+  with EIO (for read()) or SIGBUS (for mmap() reads).
+
+- If the sysctl "fs.verity.require_signatures" is set to 1 and the
+  file's verity measurement is not signed by a key in the fs-verity
+  keyring, then opening the file will fail.  See `Built-in signature
+  verification`_.
+
+Direct access to the Merkle tree is not supported.  Therefore, if a
+verity file is copied, or is backed up and restored, then it will lose
+its "verity"-ness.  fs-verity is primarily meant for files like
+executables that are managed by a package manager.
+
+File measurement computation
+============================
+
+This section describes how fs-verity hashes the file contents using a
+Merkle tree to produce the "file measurement" which cryptographically
+identifies the file contents.  This algorithm is the same for all
+filesystems that support fs-verity.
+
+Userspace only needs to be aware of this algorithm if it needs to
+compute the file measurement itself, e.g. in order to sign the file.
+
+Merkle tree
+-----------
+
+The file contents is divided into blocks, where the block size is
+configurable but is usually 4096 bytes.  The end of the last block is
+zero-padded if needed.  Each block is then hashed, producing the first
+level of hashes.  Then, the hashes in this first level are grouped
+into 'blocksize'-byte blocks (zero-padding the ends as needed) and
+these blocks are hashed, producing the second level of hashes.  This
+proceeds up the tree until only a single block remains.  The hash of
+this block is the "Merkle tree root hash".
+
+If the entire file contents fit in one block, then the "Merkle tree
+root hash" is simply the hash of the single data block.  Empty files
+are not supported.
+
+The "blocks" here are not necessarily the same as "filesystem blocks".
+
+If a salt was specified, then it's zero-padded to the closest multiple
+of the input size of the hash algorithm's compression function, e.g.
+64 bytes for for SHA-256 or 128 bytes for SHA-512.  The padded salt is
+prepended to every data or Merkle tree block that is hashed.
+
+The purpose of the block padding is to cause every hash to be taken
+over the same amount of data, which simplifies the implementation and
+keeps open more possibilities for hardware acceleration.  The purpose
+of the salt padding is to make the salting "free" when the salted hash
+state is precomputed, then imported for each hash.
+
+Example: in the recommended configuration of SHA-256 and 4K blocks,
+128 hash values fit in each block.  Thus, each level of the Merkle
+tree is approximately 128 times smaller than the previous, and for
+large files the Merkle tree's size converges to approximately 1/127 of
+the original file size.  However, for small files, the padding is
+significant, making the space overhead proportionally more.
+
+fs-verity descriptor
+--------------------
+
+By itself, the Merkle tree root hash is ambiguous.  For example, it
+can't a distinguish a large file from a small second file whose data
+is exactly the top-level hash block of the first file.  Ambiguities
+also arise from the convention of padding to the next block boundary.
+
+To solve this problem, the verity file measurement is actually
+computed as a hash of the following structure, which contains the
+Merkle tree root hash as well as other fields such as the file size::
+
+    struct fsverity_descriptor {
+            __u8 version;           /* must be 1 */
+            __u8 hash_algorithm;    /* Merkle tree hash algorithm */
+            __u8 log_blocksize;     /* log2 of size of data and tree blocks */
+            __u8 salt_size;         /* size of salt in bytes; 0 if none */
+            __le32 sig_size;        /* must be 0 */
+            __le64 data_size;       /* size of file the Merkle tree is built over */
+            __u8 root_hash[64];     /* Merkle tree root hash */
+            __u8 salt[32];          /* salt prepended to each hashed block */
+            __u8 __reserved[144];   /* must be 0's */
+    };
+
+Note that the ``sig_size`` field must be set to 0 for the purpose of
+computing the file measurement, even if a signature was provided (or
+will be provided) to `FS_IOC_ENABLE_VERITY`_.
+
+Built-in signature verification
+===============================
+
+With CONFIG_FS_VERITY_BUILTIN_SIGNATURES=y, fs-verity supports putting
+a portion of an authentication policy (see `Use cases`_) in the
+kernel.  Specifically, it adds support for:
+
+1. At fs-verity module initialization time, a keyring ".fs-verity" is
+   created.  The root user can add trusted X.509 certificates to this
+   keyring using the add_key() system call, then (when done)
+   optionally use keyctl_restrict_keyring() to prevent additional
+   certificates from being added.
+
+2. `FS_IOC_ENABLE_VERITY`_ accepts a pointer to a PKCS#7 formatted
+   signature in DER format of the file measurement.  On success, this
+   signature is persisted alongside the Merkle tree.  Then, any time
+   the file is opened, the kernel will verify this signature against
+   the certificates in the ".fs-verity" keyring, and verify that it
+   matches the actual file measurement.
+
+3. A new sysctl "fs.verity.require_signatures" is made available.
+   When set to 1, the kernel requires that all verity files have a
+   correctly signed file measurement as described in (2).
+
+File measurements must be signed in the following format, which is
+similar to the structure used by `FS_IOC_MEASURE_VERITY`_::
+
+    struct fsverity_signed_digest {
+            char magic[8];                  /* must be "FSVerity" */
+            __le16 digest_algorithm;
+            __le16 digest_size;
+            __u8 digest[];
+    };
+
+fs-verity's built-in signature verification support is meant as a
+relatively simple mechanism that can be used to provide some level of
+authenticity protection for verity files, as an alternative to doing
+the signature verification in userspace or using IMA-appraisal.
+However, with this mechanism, userspace programs still need to check
+that the verity bit is set, and there is no protection against verity
+files being swapped around.
+
+Filesystem support
+==================
+
+fs-verity is currently supported by the ext4 and f2fs filesystems.
+The CONFIG_FS_VERITY kconfig option must be enabled to use fs-verity
+on either filesystem.
+
+``include/linux/fsverity.h`` declares the interface between the
+``fs/verity/`` support layer and filesystems.  Briefly, filesystems
+must provide an ``fsverity_operations`` structure that provides
+methods to read and write the verity metadata to a filesystem-specific
+location, including the Merkle tree blocks and
+``fsverity_descriptor``.  Filesystems must also call functions in
+``fs/verity/`` at certain times, such as when a file is opened or when
+pages have been read into the pagecache.  (See `Verifying data`_.)
+
+ext4
+----
+
+ext4 supports fs-verity since Linux TODO and e2fsprogs v1.45.2.
+
+To create verity files on an ext4 filesystem, the filesystem must have
+been formatted with ``-O verity`` or had ``tune2fs -O verity`` run on
+it.  "verity" is an RO_COMPAT filesystem feature, so once set, old
+kernels will only be able to mount the filesystem readonly, and old
+versions of e2fsck will be unable to check the filesystem.  Moreover,
+currently ext4 only supports mounting a filesystem with the "verity"
+feature when its block size is equal to PAGE_SIZE (often 4096 bytes).
+
+ext4 sets the EXT4_VERITY_FL on-disk inode flag on verity files.  It
+can only be set by `FS_IOC_ENABLE_VERITY`_, and it cannot be cleared.
+
+ext4 also supports encryption, which can be used simultaneously with
+fs-verity.  In this case, the plaintext data is verified rather than
+the ciphertext.  This is necessary in order to make the file
+measurement meaningful, since every file is encrypted differently.
+
+ext4 stores the verity metadata (Merkle tree and fsverity_descriptor)
+past the end of the file, starting at the first page fully beyond
+i_size.  This approach works because (a) verity files are readonly,
+and (b) pages fully beyond i_size aren't visible to userspace but can
+be read/written internally by ext4 with only some relatively small
+changes to ext4.  This approach avoids having to depend on the
+EA_INODE feature and on rearchitecturing ext4's xattr support to
+support paging multi-gigabyte xattrs into memory, and to support
+encrypting xattrs.  Note that the verity metadata *must* be encrypted
+when the file is, since it contains hashes of the plaintext data.
+
+Currently, ext4 verity only supports the case where the Merkle tree
+block size, filesystem block size, and page size are all the same.
+
+f2fs
+----
+
+f2fs supports fs-verity since Linux TODO and f2fs-tools v1.11.0.
+
+To create verity files on an f2fs filesystem, the filesystem must have
+been formatted with ``-O verity``.
+
+f2fs sets the FADVISE_VERITY_BIT on-disk inode flag on verity files.
+It can only be set by `FS_IOC_ENABLE_VERITY`_, and it cannot be
+cleared.
+
+Like ext4, f2fs stores the verity metadata (Merkle tree and
+fsverity_descriptor) past the end of the file, starting at the first
+page fully beyond i_size.  See explanation for ext4 above.  Moreover,
+f2fs supports at most 4096 bytes of xattr entries per inode which
+wouldn't be enough for even a single Merkle tree block.
+
+Currently, f2fs verity only supports a Merkle tree block size of 4096.
+
+Implementation details
+======================
+
+Verifying data
+--------------
+
+fs-verity ensures that all reads of a verity file's data are verified,
+regardless of which syscall is used to do the read (e.g. mmap(),
+read(), pread()) and regardless of whether it's the first read or a
+later read (unless the later read can return cached data that was
+already verified).  Below, we describe how filesystems implement this.
+
+Pagecache
+~~~~~~~~~
+
+For filesystems using Linux's pagecache, the ``->readpage()`` and
+``->readpages()`` methods must be modified to verify pages before they
+are marked Uptodate.  Merely hooking ``->read_iter()`` would be
+insufficient, since ``->read_iter()`` is not used for memory maps.
+
+Therefore, fs/verity/ provides a function fsverity_verify_page() which
+verifies a page that has been read into the pagecache of a verity
+inode, but is still locked and not Uptodate, so it's not yet readable
+by userspace.  As needed to do the verification,
+fsverity_verify_page() will call back into the filesystem to read
+Merkle tree pages via fsverity_operations::read_merkle_tree_page().
+
+fsverity_verify_page() returns false if verification failed; in this
+case, the filesystem must not set the page Uptodate.  Following this,
+as per the usual Linux pagecache behavior, attempts by userspace to
+read() from the part of the file containing the page will fail with
+EIO, and accesses to the page within a memory map will raise SIGBUS.
+
+fsverity_verify_page() currently only supports the case where the
+Merkle tree block size is equal to PAGE_SIZE (often 4096 bytes).
+
+In principle, fsverity_verify_page() verifies the entire path in the
+Merkle tree from the data page to the root hash.  However, for
+efficiency the filesystem may cache the hash pages.  Therefore,
+fsverity_verify_page() only ascends the tree reading hash pages until
+an already-verified hash page is seen, as indicated by the PageChecked
+bit being set.  It then verifies the path to that page.
+
+This optimization, which is also used by dm-verity, results in
+excellent sequential read performance.  This is because usually (e.g.
+127 in 128 times for 4K blocks and SHA-256) the hash page from the
+bottom level of the tree will already be cached and checked from
+reading a previous data page.  However, random reads perform worse.
+
+Block device based filesystems
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Block device based filesystems (e.g. ext4 and f2fs) in Linux also use
+the pagecache, so the above subsection applies too.  However, they
+also usually read many pages from a file at once, grouped into a
+structure called a "bio".  To make it easier for these types of
+filesystems to support fs-verity, fs/verity/ also provides a function
+fsverity_verify_bio() which verifies all pages in a bio.
+
+ext4 and f2fs also support encryption.  If a verity file is also
+encrypted, the pages must be decrypted before being verified.  To
+support this, these filesystems allocate a "post-read context" for
+each bio and store it in ``->bi_private``::
+
+    struct bio_post_read_ctx {
+           struct bio *bio;
+           struct work_struct work;
+           unsigned int cur_step;
+           unsigned int enabled_steps;
+    };
+
+``enabled_steps`` is a bitmask that specifies whether decryption,
+verity, or both is enabled.  After the bio completes, for each needed
+postprocessing step the filesystem enqueues the bio_post_read_ctx on a
+workqueue, and then the workqueue work does the decryption or
+verification.  Finally, pages where no decryption or verity error
+occurred are marked Uptodate, and the pages are unlocked.
+
+Files on ext4 and f2fs may contain holes.  Normally, ``->readpages()``
+simply zeroes holes and sets the corresponding pages Uptodate; no bios
+are issued.  To prevent this case from bypassing fs-verity, these
+filesystems use fsverity_verify_page() to verify hole pages.
+
+ext4 and f2fs disable direct I/O on verity files, since otherwise
+direct I/O would bypass fs-verity.  (They also do the same for
+encrypted files.)
+
+Userspace utility
+=================
+
+This document focuses on the kernel, but a userspace utility for
+fs-verity can be found at:
+
+	https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/fsverity-utils.git
+
+See the README.md file in the fsverity-utils source tree for details,
+including examples of setting up fs-verity protected files.
+
+Tests
+=====
+
+To test fs-verity, use xfstests.  For example, using `kvm-xfstests
+<https://git.kernel.org/pub/scm/fs/ext2/xfstests-bld.git/tree/Documentation/kvm-quickstart.md>`_::
+
+    kvm-xfstests -c ext4,f2fs -g verity
+
+FAQ
+===
+
+This section answers frequently asked questions about fs-verity that
+weren't already directly answered in other parts of this document.
+
+:Q: Why isn't fs-verity part of IMA?
+:A: fs-verity and IMA (Integrity Measurement Architecture) have
+    different focuses.  fs-verity is a filesystem-level mechanism for
+    hashing individual files using a Merkle tree.  In contrast, IMA
+    specifies a system-wide policy that specifies which files are
+    hashed and what to do with those hashes, such as log them,
+    authenticate them, or add them to a measurement list.
+
+    IMA is planned to support the fs-verity hashing mechanism as an
+    alternative to doing full file hashes, for people who want the
+    performance and security benefits of the Merkle tree based hash.
+    But it doesn't make sense to force all uses of fs-verity to be
+    through IMA.  As a standalone filesystem feature, fs-verity
+    already meets many users' needs, and it's testable like other
+    filesystem features e.g. with xfstests.
+
+:Q: Isn't fs-verity useless because the attacker can just modify the
+    hashes in the Merkle tree, which is stored on-disk?
+:A: To verify the authenticity of an fs-verity file you must verify
+    the authenticity of the "file measurement", which is basically the
+    root hash of the Merkle tree.  See `Use cases`_.
+
+:Q: Isn't fs-verity useless because the attacker can just replace a
+    verity file with a non-verity one?
+:A: See `Use cases`_.  In the initial use case, it's really trusted
+    userspace code that authenticates the files; fs-verity is just a
+    tool to do this job efficiently and securely.  The trusted
+    userspace code will consider non-verity files to be inauthentic.
+
+:Q: Why does the Merkle tree need to be stored on-disk?  Couldn't you
+    store just the root hash?
+:A: If the Merkle tree wasn't stored on-disk, then you'd have to
+    compute the entire tree when the file is first accessed, even if
+    just one byte is being read.  This is a fundamental consequence of
+    how Merkle tree hashing works.  To verify a leaf node, you need to
+    verify the whole path to the root hash, including the root node
+    (the thing which the root hash is a hash of).  But if the root
+    node isn't stored on-disk, you have to compute it by hashing its
+    children, and so on until you've actually hashed the entire file.
+
+    That defeats most of the point of doing a Merkle tree-based hash,
+    since if you have to hash the whole file ahead of time anyway,
+    then you could simply do sha256(file) instead.  That would be much
+    simpler, and a bit faster too.
+
+    It's true that an in-memory Merkle tree could still provide the
+    advantage of verification on every read rather than just on the
+    first read.  However, it would be inefficient because every time a
+    hash page gets evicted (you can't pin the entire Merkle tree into
+    memory, since it may be very large), in order to restore it you
+    again need to hash everything below it in the tree.  This again
+    defeats most of the point of doing a Merkle tree-based hash, since
+    a single block read could trigger re-hashing gigabytes of data.
+
+:Q: But couldn't you store just the leaf nodes and compute the rest?
+:A: See previous answer; this really just moves up one level, since
+    one could alternatively interpret the data blocks as being the
+    leaf nodes of the Merkle tree.  It's true that the tree can be
+    computed much faster if the leaf level is stored rather than just
+    the data, but that's only because each level is less than 1% the
+    size of the level below (assuming the recommended settings of
+    SHA-256 and 4K blocks).  For the exact same reason, by storing
+    "just the leaf nodes" you'd already be storing over 99% of the
+    tree, so you might as well simply store the whole tree.
+
+:Q: Can the Merkle tree be built ahead of time, e.g. distributed as
+    part of a package that is installed to many computers?
+:A: This isn't currently supported.  It was part of the original
+    design, but was removed to simplify the kernel UAPI and because it
+    wasn't a critical use case.  Files are usually installed once and
+    used many times, and cryptographic hashing is somewhat fast on
+    most modern processors.
+
+:Q: Why doesn't fs-verity support writes?
+:A: Write support would be very difficult and would require a
+    completely different design, so it's well outside the scope of
+    fs-verity.  Write support would require:
+
+    - A way to maintain consistency between the data and hashes,
+      including all levels of hashes, since corruption after a crash
+      (especially of potentially the entire file!) is unacceptable.
+      The main options for solving this are data journalling,
+      copy-on-write, and log-structured volume.  But it's very hard to
+      retrofit existing filesystems with new consistency mechanisms.
+      Data journalling is available on ext4, but is very slow.
+
+    - Rebuilding the the Merkle tree after every write, which would be
+      extremely inefficient.  Alternatively, a different authenticated
+      dictionary structure such as an "authenticated skiplist" could
+      be used.  However, this would be far more complex.
+
+    Compare it to dm-verity vs. dm-integrity.  dm-verity is very
+    simple: the kernel just verifies read-only data against a
+    read-only Merkle tree.  In contrast, dm-integrity supports writes
+    but is slow, is much more complex, and doesn't actually support
+    full-device authentication since it authenticates each sector
+    independently, i.e. there is no "root hash".  It doesn't really
+    make sense for the same device-mapper target to support these two
+    very different cases; the same applies to fs-verity.
+
+:Q: Since verity files are immutable, why isn't the immutable bit set?
+:A: The existing "immutable" bit (FS_IMMUTABLE_FL) already has a
+    specific set of semantics which not only make the file contents
+    read-only, but also prevent the file from being deleted, renamed,
+    linked to, or having its owner or mode changed.  These extra
+    properties are unwanted for fs-verity, so reusing the immutable
+    bit isn't appropriate.
+
+:Q: Why does the API use ioctls instead of setxattr() and getxattr()?
+:A: Abusing the xattr interface for basically arbitrary syscalls is
+    heavily frowned upon by most of the Linux filesystem developers.
+    An xattr should really just be an xattr on-disk, not an API to
+    e.g. magically trigger construction of a Merkle tree.
+
+:Q: Does fs-verity support remote filesystems?
+:A: Only ext4 and f2fs support is implemented currently, but in
+    principle any filesystem that can store per-file verity metadata
+    can support fs-verity, regardless of whether it's local or remote.
+    Some filesystems may have fewer options of where to store the
+    verity metadata; one possibility is to store it past the end of
+    the file and "hide" it from userspace by manipulating i_size.  The
+    data verification functions provided by ``fs/verity/`` also assume
+    that the filesystem uses the Linux pagecache, but both local and
+    remote filesystems normally do so.
+
+:Q: Why is anything filesystem-specific at all?  Shouldn't fs-verity
+    be implemented entirely at the VFS level?
+:A: There are many reasons why this is not possible or would be very
+    difficult, including the following:
+
+    - To prevent bypassing verification, pages must not be marked
+      Uptodate until they've been verified.  Currently, each
+      filesystem is responsible for marking pages Uptodate via
+      ``->readpages()``.  Therefore, currently it's not possible for
+      the VFS to do the verification on its own.  Changing this would
+      require significant changes to the VFS and all filesystems.
+
+    - It would require defining a filesystem-independent way to store
+      the verity metadata.  Extended attributes don't work for this
+      because (a) the Merkle tree may be gigabytes, but many
+      filesystems assume that all xattrs fit into a single 4K
+      filesystem block, and (b) ext4 and f2fs encryption doesn't
+      encrypt xattrs, yet the Merkle tree *must* be encrypted when the
+      file contents are, because it stores hashes of the plaintext
+      file contents.
+
+      So the verity metadata would have to be stored in an actual
+      file.  Using a separate file would be very ugly, since the
+      metadata is fundamentally part of the file to be protected, and
+      it could cause problems where users could delete the real file
+      but not the metadata file or vice versa.  On the other hand,
+      having it be in the same file would break applications unless
+      filesystems' notion of i_size were divorced from the VFS's,
+      which would be complex and require changes to all filesystems.
+
+    - It's desirable that FS_IOC_ENABLE_VERITY uses the filesystem's
+      transaction mechanism so that either the file ends up with
+      verity enabled, or no changes were made.  Allowing intermediate
+      states to occur after a crash may cause problems.
diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst
index 1131c34d77f6..416c7f0e123a 100644
--- a/Documentation/filesystems/index.rst
+++ b/Documentation/filesystems/index.rst
@@ -31,6 +31,7 @@ filesystem implementations.
 
    journalling
    fscrypt
+   fsverity
 
 Filesystem-specific documentation
 =================================
-- 
2.21.0

^ permalink raw reply related

* [PATCH v4 00/16] fs-verity: read-only file-based authenticity protection
From: Eric Biggers @ 2019-06-06 15:51 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Theodore Y . Ts'o, Darrick J . Wong, linux-api, Dave Chinner,
	linux-f2fs-devel, linux-fsdevel, Jaegeuk Kim, linux-integrity,
	linux-ext4, Linus Torvalds, Christoph Hellwig, Victor Hsieh

Hello,

This is a redesigned version of the fs-verity patchset, implementing
Ted's suggestion to build the Merkle tree in the kernel
(https://lore.kernel.org/linux-fsdevel/20190207031101.GA7387@mit.edu/).
This greatly simplifies the UAPI, since the verity metadata no longer
needs to be transferred to the kernel.  Now to enable fs-verity on a
file, one simply calls FS_IOC_ENABLE_VERITY, passing it this structure:

	struct fsverity_enable_arg {
		__u32 version;
		__u32 hash_algorithm;
		__u32 block_size;
		__u32 salt_size;
		__u64 salt_ptr;
		__u32 sig_size;
		__u32 __reserved1;
		__u64 sig_ptr;
		__u64 __reserved2[11];
	};

The filesystem then builds the file's Merkle tree and stores it in a
filesystem-specific location associated with the file.  Afterwards,
FS_IOC_MEASURE_VERITY can be used to retrieve the file measurement
("root hash").  The way the file measurement is computed is also
effectively part of the API (it has to be), but it's logically
independent of where/how the filesystem stores the Merkle tree.

The API is fully documented in Documentation/filesystems/fsverity.rst,
along with other aspects of fs-verity.  I also added an FAQ section that
answers frequently asked questions about fs-verity, e.g. why isn't it
all at the VFS level, why isn't it part of IMA, why does the Merkle tree
need to be stored on-disk, etc.

Overview
--------

This patchset implements fs-verity for ext4 and f2fs.  fs-verity is
similar to dm-verity, but implemented on a per-file basis: a Merkle tree
is used to measure (hash) a read-only file's data as it is paged in.
ext4 and f2fs hide this Merkle tree beyond the end of the file, but
other filesystems can implement it differently if desired.

In general, fs-verity is intended for use on writable filesystems;
dm-verity is still recommended on read-only ones.

Similar to fscrypt, most of the code is in fs/verity/, and not too many
filesystem-specific changes are needed.  The Merkle tree is built by the
filesystem when the FS_IOC_ENABLE_VERITY ioctl is executed.

fs-verity provides a file measurement (hash) in constant time and
verifies data on-demand.  Thus, it is useful for efficiently verifying
the authenticity of large files of which only a small portion may be
accessed, such as Android application package (APK) files.  It may also
be useful in "audit" use cases where file hashes are logged.

fs-verity can also provide better protection against malicious disks
than an ahead-of-time hash, since fs-verity re-verifies data each time
it's paged in.  Note, however, that any authenticity guarantee is still
dependent on verification of the file measurement and other relevant
metadata in a way that makes sense for the overall system; fs-verity is
only a tool to help with this.

This patchset doesn't include IMA support for fs-verity file
measurements.  This is planned and we'd like to collaborate with the IMA
maintainers.  Although fs-verity can be used on its own without IMA,
fs-verity is primarily a lower level feature (think of it as a way of
hashing a file), so some users may still need IMA's policy mechanism.
However, an optional in-kernel signature verification mechanism within
fs-verity itself is also included.

This patchset is based on v5.2-rc3.  It can also be found in git at tag
fsverity_2019-06-06 of:

	https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux.git

fs-verity has a userspace utility:

	https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/fsverity-utils.git

xfstests for fs-verity can be found at branch "fsverity" of:

	https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/xfstests-dev.git

fs-verity is supported by f2fs-tools v1.11.0+ and e2fsprogs v1.45.2+.

Examples of setting up fs-verity protected files can be found in the
README.md file of fsverity-utils.

Other useful references include:

  - Documentation/filesystems/fsverity.rst, added by the first patch.

  - LWN coverage of v3 patchset: https://lwn.net/Articles/790185/

  - LWN coverage of v2 patchset: https://lwn.net/Articles/775872/

  - LWN coverage of v1 patchset: https://lwn.net/Articles/763729/

  - Presentation at Linux Security Summit North America 2018:
      - Slides: https://schd.ws/hosted_files/lssna18/af/fs-verity%20slide%20deck.pdf
      - Video: https://www.youtube.com/watch?v=Aw5h6aBhu6M
      (This corresponded to the v1 patchset; changes have been made since then.)

  - LWN coverage of LSFMM 2018 discussion: https://lwn.net/Articles/752614/

Changed since v3:

  - The FS_IOC_GETFLAGS ioctl now returns the verity flag.

  - Fixed setting i_verity_info too early.

  - Restored pagecache invalidation in FS_IOC_ENABLE_VERITY.

  - Fixed truncation of fsverity_enable_arg::hash_algorithm.

  - Reject empty files for both open and enable, not just enable.

  - Added a couple more FAQ entries to the documentation.

  - A few minor cleanups.

  - Rebased onto v5.2-rc3.

Changed since v2:

  - Large redesign: the Merkle tree is now built by
    FS_IOC_ENABLE_VERITY, rather than being provided by userspace.  The
    fsverity_operations provide an interface for filesystems to read and
    write the Merkle tree from/to a filesystem-specific location.

  - Lot of refactoring, cleanups, and documentation improvements.

  - Many simplifications, such as simplifying the fsverity_descriptor
    format, dropping CRC-32 support, and limiting the salt size.

  - ext4 and f2fs now store an xattr that gives the location of the
    fsverity_descriptor, so loading it is more straightforward.

  - f2fs no longer counts the verity metadata in the on-disk i_size,
    making it consistent with ext4.

  - Replaced the filesystem-specific fs-verity kconfig options with
    CONFIG_FS_VERITY.

  - Replaced the filesystem-specific verity bit checks with IS_VERITY().

Changed since v1:

  - Added documentation file.

  - Require write permission for FS_IOC_ENABLE_VERITY, rather than
    CAP_SYS_ADMIN.

  - Eliminated dependency on CONFIG_BLOCK and clarified that filesystems
    can verify a page at a time rather than a bio at a time.

  - Fixed conditions for verifying holes.

  - ext4 now only allows fs-verity on extent-based files.

  - Eliminated most of the assumptions that the verity metadata is
    stored beyond EOF, in case filesystems want to do things
    differently.

  - Other cleanups.

Eric Biggers (16):
  fs-verity: add a documentation file
  fs-verity: add MAINTAINERS file entry
  fs-verity: add UAPI header
  fs: uapi: define verity bit for FS_IOC_GETFLAGS
  fs-verity: add Kconfig and the helper functions for hashing
  fs-verity: add inode and superblock fields
  fs-verity: add the hook for file ->open()
  fs-verity: add the hook for file ->setattr()
  fs-verity: add data verification hooks for ->readpages()
  fs-verity: implement FS_IOC_ENABLE_VERITY ioctl
  fs-verity: implement FS_IOC_MEASURE_VERITY ioctl
  fs-verity: add SHA-512 support
  fs-verity: support builtin file signatures
  ext4: add basic fs-verity support
  ext4: add fs-verity read support
  f2fs: add fs-verity support

 Documentation/filesystems/fsverity.rst | 708 +++++++++++++++++++++++++
 Documentation/filesystems/index.rst    |   1 +
 Documentation/ioctl/ioctl-number.txt   |   1 +
 MAINTAINERS                            |  12 +
 fs/Kconfig                             |   2 +
 fs/Makefile                            |   1 +
 fs/ext4/Makefile                       |   1 +
 fs/ext4/ext4.h                         |  23 +-
 fs/ext4/file.c                         |   4 +
 fs/ext4/inode.c                        |  48 +-
 fs/ext4/ioctl.c                        |  12 +
 fs/ext4/readpage.c                     | 207 +++++++-
 fs/ext4/super.c                        |  18 +-
 fs/ext4/sysfs.c                        |   6 +
 fs/ext4/verity.c                       | 272 ++++++++++
 fs/ext4/xattr.h                        |   2 +
 fs/f2fs/Makefile                       |   1 +
 fs/f2fs/data.c                         |  72 ++-
 fs/f2fs/f2fs.h                         |  23 +-
 fs/f2fs/file.c                         |  40 ++
 fs/f2fs/inode.c                        |   5 +-
 fs/f2fs/super.c                        |   3 +
 fs/f2fs/sysfs.c                        |  11 +
 fs/f2fs/verity.c                       | 224 ++++++++
 fs/f2fs/xattr.h                        |   2 +
 fs/verity/Kconfig                      |  55 ++
 fs/verity/Makefile                     |  10 +
 fs/verity/enable.c                     | 353 ++++++++++++
 fs/verity/fsverity_private.h           | 188 +++++++
 fs/verity/hash_algs.c                  | 279 ++++++++++
 fs/verity/init.c                       |  61 +++
 fs/verity/measure.c                    |  57 ++
 fs/verity/open.c                       | 363 +++++++++++++
 fs/verity/signature.c                  | 207 ++++++++
 fs/verity/verify.c                     | 281 ++++++++++
 include/linux/fs.h                     |  11 +
 include/linux/fsverity.h               | 209 ++++++++
 include/uapi/linux/fs.h                |   1 +
 include/uapi/linux/fsverity.h          |  40 ++
 39 files changed, 3755 insertions(+), 59 deletions(-)
 create mode 100644 Documentation/filesystems/fsverity.rst
 create mode 100644 fs/ext4/verity.c
 create mode 100644 fs/f2fs/verity.c
 create mode 100644 fs/verity/Kconfig
 create mode 100644 fs/verity/Makefile
 create mode 100644 fs/verity/enable.c
 create mode 100644 fs/verity/fsverity_private.h
 create mode 100644 fs/verity/hash_algs.c
 create mode 100644 fs/verity/init.c
 create mode 100644 fs/verity/measure.c
 create mode 100644 fs/verity/open.c
 create mode 100644 fs/verity/signature.c
 create mode 100644 fs/verity/verify.c
 create mode 100644 include/linux/fsverity.h
 create mode 100644 include/uapi/linux/fsverity.h

-- 
2.22.0.rc1.311.g5d7573a151-goog

^ permalink raw reply

* [RFC]: Convention for naming syscall revisions
From: Christian Brauner @ 2019-06-06 15:42 UTC (permalink / raw)
  To: linux-api, linux-kernel

Hey everyone,

I hope this is not going to start a trash fire.

While working on a new clone version I tried to find out what the
current naming conventions for syscall revisions is. I was told and
seemed to be able to confirm through the syscall list that revisions of
syscalls are for the most part (for examples see [1]) named after the
number of arguments and not for the number of revisions. But some also
seem to escape that logic (e.g. clone2).

In any case, I would like to document *a* convention for syscall
revisions on https://www.kernel.org/doc/ . So what shall it be:
- number of args
- number of revision
?

Christian

[1]: - accept4(/* 4 args */)
     - dup2(/* 2 args */)
     - dup3(/* 3 args */)
     - eventfd2(/* 2 args */)
     - pipe2(/* 2 args */)
     - pselect6(/* 6 args, including structs */)
     - signalfd4(/* 4 args, one of them a struct */)
     - umount2(/* 2 args */)
     - wait3(/* 3 args, one of them a struct */)
     - wait4(/* 4 args, one of them a struct */)

^ permalink raw reply

* Re: [PATCH 09/10] usb: Add USB subsystem notifications [ver #3]
From: Greg Kroah-Hartman @ 2019-06-06 15:31 UTC (permalink / raw)
  To: Alan Stern
  Cc: David Howells, viro, linux-usb, raven, linux-fsdevel, linux-api,
	linux-block, keyrings, linux-security-module, linux-kernel
In-Reply-To: <Pine.LNX.4.44L0.1906061051310.1641-100000@iolanthe.rowland.org>

On Thu, Jun 06, 2019 at 10:55:24AM -0400, Alan Stern wrote:
> On Thu, 6 Jun 2019, Greg Kroah-Hartman wrote:
> 
> > On Thu, Jun 06, 2019 at 10:24:18AM -0400, Alan Stern wrote:
> > > On Thu, 6 Jun 2019, David Howells wrote:
> > > 
> > > > Add a USB subsystem notification mechanism whereby notifications about
> > > > hardware events such as device connection, disconnection, reset and I/O
> > > > errors, can be reported to a monitoring process asynchronously.
> > > 
> > > USB I/O errors covers an awfully large and vague field.  Do we really
> > > want to include them?  I'm doubtful.
> > 
> > See the other patch on the linux-usb list that wanted to start adding
> > KOBJ_CHANGE notifications about USB "i/o errors".
> 
> That patch wanted to add notifications only for enumeration failures
> (assuming you're talking about the patch from Eugeniu Rosca), not I/O
> errors in general.

Yes, sorry, I was thinking that as a "I/O failed in enumeration" :)

> > So for "severe" issues, yes, we should do this, but perhaps not for all
> > of the "normal" things we see when a device is yanked out of the system
> > and the like.
> 
> Then what counts as a "severe" issue?  Anything besides enumeration 
> failure?

Not that I can think of at the moment, other than the other recently
added KOBJ_CHANGE issue.  I'm sure we have other "hard failure" issues
in the USB stack that people will want exposed over time.

thanks,

greg k-h

^ permalink raw reply

* Re: [PATCH 01/10] security: Override creds in __fput() with last fputter's creds [ver #3]
From: David Howells @ 2019-06-06 15:06 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: dhowells, viro, Casey Schaufler, raven, linux-fsdevel, linux-api,
	linux-block, keyrings, linux-security-module, linux-kernel
In-Reply-To: <176F8189-3BE9-4B8C-A4D5-8915436338FB@amacapital.net>

Andy Lutomirski <luto@amacapital.net> wrote:

> > So that the LSM can see the credentials of the last process to do an fput()
> > on a file object when the file object is being dismantled, do the following
> > steps:
> > 
> 
> I still maintain that this is a giant design error.

Yes, I know.  This was primarily a post so that Greg could play with the USB
notifications stuff I added.  The LSM support isn't resolved and is unchanged.

> Can someone at least come up with a single valid use case that isn't
> entirely full of bugs?

"Entirely full of bugs"?

How would you propose I deal with Casey's requirement?  I'm getting the
feeling you're going to nak it if I try to fulfil that and he's going to nak
it if I don't.

David

^ permalink raw reply

* Re: [PATCH 01/10] security: Override creds in __fput() with last fputter's creds [ver #3]
From: Andy Lutomirski @ 2019-06-06 14:57 UTC (permalink / raw)
  To: David Howells
  Cc: viro, Casey Schaufler, raven, linux-fsdevel, linux-api,
	linux-block, keyrings, linux-security-module, linux-kernel
In-Reply-To: <155981413016.17513.10540579988392555403.stgit@warthog.procyon.org.uk>



> On Jun 6, 2019, at 2:42 AM, David Howells <dhowells@redhat.com> wrote:
> 
> So that the LSM can see the credentials of the last process to do an fput()
> on a file object when the file object is being dismantled, do the following
> steps:
> 

I still maintain that this is a giant design error. Can someone at least come up with a single valid use case that isn’t entirely full of bugs?

^ permalink raw reply

* Re: [PATCH 09/10] usb: Add USB subsystem notifications [ver #3]
From: Alan Stern @ 2019-06-06 14:55 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: David Howells, viro, linux-usb, raven, linux-fsdevel, linux-api,
	linux-block, keyrings, linux-security-module, linux-kernel
In-Reply-To: <20190606143306.GA11294@kroah.com>

On Thu, 6 Jun 2019, Greg Kroah-Hartman wrote:

> On Thu, Jun 06, 2019 at 10:24:18AM -0400, Alan Stern wrote:
> > On Thu, 6 Jun 2019, David Howells wrote:
> > 
> > > Add a USB subsystem notification mechanism whereby notifications about
> > > hardware events such as device connection, disconnection, reset and I/O
> > > errors, can be reported to a monitoring process asynchronously.
> > 
> > USB I/O errors covers an awfully large and vague field.  Do we really
> > want to include them?  I'm doubtful.
> 
> See the other patch on the linux-usb list that wanted to start adding
> KOBJ_CHANGE notifications about USB "i/o errors".

That patch wanted to add notifications only for enumeration failures
(assuming you're talking about the patch from Eugeniu Rosca), not I/O
errors in general.

> So for "severe" issues, yes, we should do this, but perhaps not for all
> of the "normal" things we see when a device is yanked out of the system
> and the like.

Then what counts as a "severe" issue?  Anything besides enumeration 
failure?

Alan Stern

^ permalink raw reply

* Re: [RFC][PATCH 00/10] Mount, FS, Block and Keyrings notifications [ver #3]
From: Christian Brauner @ 2019-06-06 14:34 UTC (permalink / raw)
  To: David Howells
  Cc: viro, Greg Kroah-Hartman, Casey Schaufler, linux-usb, raven,
	linux-fsdevel, linux-api, linux-block, keyrings,
	linux-security-module, linux-kernel
In-Reply-To: <155981411940.17513.7137844619951358374.stgit@warthog.procyon.org.uk>

On Thu, Jun 06, 2019 at 10:41:59AM +0100, David Howells wrote:
> 
> Hi Al,
> 
> Here's a set of patches to add a general variable-length notification queue
> concept and to add sources of events for:
> 
>  (1) Mount topology events, such as mounting, unmounting, mount expiry,
>      mount reconfiguration.
> 
>  (2) Superblock events, such as R/W<->R/O changes, quota overrun and I/O
>      errors (not complete yet).
> 
>  (3) Key/keyring events, such as creating, linking and removal of keys.
> 
>  (4) General device events (single common queue) including:
> 
>      - Block layer events, such as device errors
> 
>      - USB subsystem events, such as device/bus attach/remove, device
>        reset, device errors.
> 
> One of the reasons for this is so that we can remove the issue of processes
> having to repeatedly and regularly scan /proc/mounts, which has proven to
> be a system performance problem.  To further aid this, the fsinfo() syscall
> on which this patch series depends, provides a way to access superblock and
> mount information in binary form without the need to parse /proc/mounts.
> 
> 
> LSM support is included, but controversial:

Apart from the LSM/security controversy here the general direction of
this patchset is pretty well received it seems.
Imho, having a notification mechanism like this is a very good thing for
userspace. So would be great if there'd be a consensus on the LSM bits.

Christian

^ permalink raw reply

* Re: [PATCH 09/10] usb: Add USB subsystem notifications [ver #3]
From: Greg Kroah-Hartman @ 2019-06-06 14:33 UTC (permalink / raw)
  To: Alan Stern
  Cc: David Howells, viro, linux-usb, raven, linux-fsdevel, linux-api,
	linux-block, keyrings, linux-security-module, linux-kernel
In-Reply-To: <Pine.LNX.4.44L0.1906061020000.1641-100000@iolanthe.rowland.org>

On Thu, Jun 06, 2019 at 10:24:18AM -0400, Alan Stern wrote:
> On Thu, 6 Jun 2019, David Howells wrote:
> 
> > Add a USB subsystem notification mechanism whereby notifications about
> > hardware events such as device connection, disconnection, reset and I/O
> > errors, can be reported to a monitoring process asynchronously.
> 
> USB I/O errors covers an awfully large and vague field.  Do we really
> want to include them?  I'm doubtful.

See the other patch on the linux-usb list that wanted to start adding
KOBJ_CHANGE notifications about USB "i/o errors".

So for "severe" issues, yes, we should do this, but perhaps not for all
of the "normal" things we see when a device is yanked out of the system
and the like.

thanks,

greg k-h

^ permalink raw reply

* Re: [PATCH 09/10] usb: Add USB subsystem notifications [ver #3]
From: Alan Stern @ 2019-06-06 14:24 UTC (permalink / raw)
  To: David Howells
  Cc: viro, Greg Kroah-Hartman, linux-usb, raven, linux-fsdevel,
	linux-api, linux-block, keyrings, linux-security-module,
	linux-kernel
In-Reply-To: <155981420247.17513.18371208824032389940.stgit@warthog.procyon.org.uk>

On Thu, 6 Jun 2019, David Howells wrote:

> Add a USB subsystem notification mechanism whereby notifications about
> hardware events such as device connection, disconnection, reset and I/O
> errors, can be reported to a monitoring process asynchronously.

USB I/O errors covers an awfully large and vague field.  Do we really
want to include them?  I'm doubtful.

Alan Stern

^ permalink raw reply

* Re: [RFC][PATCH 00/10] Mount, FS, Block and Keyrings notifications [ver #3]
From: Stephen Smalley @ 2019-06-06 14:05 UTC (permalink / raw)
  To: David Howells
  Cc: viro, Greg Kroah-Hartman, Casey Schaufler, linux-usb, raven,
	linux-fsdevel, linux-api, linux-block, keyrings,
	linux-security-module, linux-kernel, Paul Moore
In-Reply-To: <3813.1559827003@warthog.procyon.org.uk>

On 6/6/19 9:16 AM, David Howells wrote:
> Stephen Smalley <sds@tycho.nsa.gov> wrote:
> 
> This might be easier to discuss if you can reply to:
> 
> 	https://lore.kernel.org/lkml/5393.1559768763@warthog.procyon.org.uk/
> 
> which is on the ver #2 posting of this patchset.

Sorry for being late to the party.  Not sure whether you're asking me to 
respond only there or both there and here to your comments below.  I'll 
start here but can revisit if it's a problem.
> 
>>> LSM support is included, but controversial:
>>>
>>>    (1) The creds of the process that did the fput() that reduced the refcount
>>>        to zero are cached in the file struct.
>>>
>>>    (2) __fput() overrides the current creds with the creds from (1) whilst
>>>        doing the cleanup, thereby making sure that the creds seen by the
>>>        destruction notification generated by mntput() appears to come from
>>>        the last fputter.
>>>
>>>    (3) security_post_notification() is called for each queue that we might
>>>        want to post a notification into, thereby allowing the LSM to prevent
>>>        covert communications.
>>>
>>>    (?) Do I need to add security_set_watch(), say, to rule on whether a watch
>>>        may be set in the first place?  I might need to add a variant per
>>>        watch-type.
>>>
>>>    (?) Do I really need to keep track of the process creds in which an
>>>        implicit object destruction happened?  For example, imagine you create
>>>        an fd with fsopen()/fsmount().  It is marked to dissolve the mount it
>>>        refers to on close unless move_mount() clears that flag.  Now, imagine
>>>        someone looking at that fd through procfs at the same time as you exit
>>>        due to an error.  The LSM sees the destruction notification come from
>>>        the looker if they happen to do their fput() after yours.
>>
>>
>> I'm not in favor of this approach.
> 
> Which bit?  The last point?  Keeping track of the process creds after an
> implicit object destruction.

(1), (2), (3), and the last point.

>> Can we check permission to the object being watched when a watch is set
>> (read-like access),
> 
> Yes, and I need to do that.  I think it's likely to require an extra hook for
> each entry point added because the objects are different:
> 
> 	int security_watch_key(struct watch *watch, struct key *key);
> 	int security_watch_sb(struct watch *watch, struct path *path);
> 	int security_watch_mount(struct watch *watch, struct path *path);
> 	int security_watch_devices(struct watch *watch);
> 
>> make sure every access that can trigger a notification requires a
>> (write-like) permission to the accessed object,
> 
> "write-like permssion" for whom?  The triggerer or the watcher?

The former, i.e. the process that performed the operation that triggered 
the notification.  Think of it as a write from the process to the 
accessed object, which triggers a notification (another write) on some 
related object (the watched object), which is then read by the watcher.

> There are various 'classes' of events:
> 
>   (1) System events (eg. hardware I/O errors, automount points expiring).
> 
>   (2) Direct events (eg. automounts, manual mounts, EDQUOT, key linkage).
> 
>   (3) Indirect events (eg. exit/close doing the last fput and causing an
>       unmount).
> 
> Class (1) are uncaused by a process, so I use init_cred for them.  One could
> argue that the automount point expiry should perhaps take place under the
> creds of whoever triggered it in the first place, but we need to be careful
> about long-term cred pinning.

This seems equivalent to just checking whether the watcher is allowed to 
get that kind of event, no other cred truly needed.

> Class (2) the causing process must've had permission to cause them - otherwise
> we wouldn't have got the event.

So we've already done a check on the causing process, and we're going to 
check whether the watcher can set the watch. We just need to establish 
the connection between the accessed object and the watched object in 
some manner.

> Class (3) is interesting since it's currently entirely cleanup events and the
> process may have the right to do them (close, dup2, exit, but also execve)
> whether the LSM thinks it should be able to cause the object to be destroyed
> or not.
> 
> It gets more complicated than that, though: multiple processes with different
> security attributes can all have fds pointing to a common file object - and
> the last one to close carries the can as far as the LSM is concerned.

Yes, I'd prefer to avoid that.  You can't write policy that is stable 
and meaningful that way.  This may fall under a similar situation as 
class (1) - all we can meaningfully do is check whether the watcher is 
allowed to see all such events.

> And yet more complicated when you throw in unix sockets with partially passed
> fds still in their queues.  That's what patch 01 is designed to try and cope
> with.
> 
>> and make sure there is some sane way to control the relationship between the
>> accessed object and the watched object (write-like)?
> 
> This is the trick.  Keys and superblocks have object labels of their own and
> don't - for now - propagate their watches.  With these, the watch is on the
> object you initially assign it to and it goes no further than that.
> 
> mount_notify() is the interesting case since we want to be able to detect
> mount topology change events from within the vfs subtree rooted at the watched
> directory without having to manually put a watch on every directory in that
> subtree - or even just every mount object. >
> Or, maybe, that's what I'll have to do: make it mount_notify() can only apply
> to the subtree within its superblock, and the caller must call mount_notify()
> for every mount object it wants to monitor.  That would at least ensure that
> the caller can, at that point, reach all those mount points.

Would that at least make it consistent with fanotify (not that it 
provides a great example)?

>> For cases where we have no object per se or at least no security
>> structure/label associated with it, we may have to fall back to a
>> coarse-grained "Can the watcher get this kind of notification in general?".
> 
> Agreed - and we should probably have that anyway.
> 
> David

^ permalink raw reply

* Re: [RFC][PATCH 00/10] Mount, FS, Block and Keyrings notifications [ver #3]
From: David Howells @ 2019-06-06 13:16 UTC (permalink / raw)
  To: Stephen Smalley
  Cc: dhowells, viro, Greg Kroah-Hartman, Casey Schaufler, linux-usb,
	raven, linux-fsdevel, linux-api, linux-block, keyrings,
	linux-security-module, linux-kernel, Paul Moore
In-Reply-To: <b91710d8-cd2d-6b93-8619-130b9d15983d@tycho.nsa.gov>

Stephen Smalley <sds@tycho.nsa.gov> wrote:

This might be easier to discuss if you can reply to:

	https://lore.kernel.org/lkml/5393.1559768763@warthog.procyon.org.uk/

which is on the ver #2 posting of this patchset.

> > LSM support is included, but controversial:
> >
> >   (1) The creds of the process that did the fput() that reduced the refcount
> >       to zero are cached in the file struct.
> >
> >   (2) __fput() overrides the current creds with the creds from (1) whilst
> >       doing the cleanup, thereby making sure that the creds seen by the
> >       destruction notification generated by mntput() appears to come from
> >       the last fputter.
> >
> >   (3) security_post_notification() is called for each queue that we might
> >       want to post a notification into, thereby allowing the LSM to prevent
> >       covert communications.
> >
> >   (?) Do I need to add security_set_watch(), say, to rule on whether a watch
> >       may be set in the first place?  I might need to add a variant per
> >       watch-type.
> >
> >   (?) Do I really need to keep track of the process creds in which an
> >       implicit object destruction happened?  For example, imagine you create
> >       an fd with fsopen()/fsmount().  It is marked to dissolve the mount it
> >       refers to on close unless move_mount() clears that flag.  Now, imagine
> >       someone looking at that fd through procfs at the same time as you exit
> >       due to an error.  The LSM sees the destruction notification come from
> >       the looker if they happen to do their fput() after yours.
> 
> 
> I'm not in favor of this approach.

Which bit?  The last point?  Keeping track of the process creds after an
implicit object destruction.

> Can we check permission to the object being watched when a watch is set
> (read-like access),

Yes, and I need to do that.  I think it's likely to require an extra hook for
each entry point added because the objects are different:

	int security_watch_key(struct watch *watch, struct key *key);
	int security_watch_sb(struct watch *watch, struct path *path);
	int security_watch_mount(struct watch *watch, struct path *path);
	int security_watch_devices(struct watch *watch);

> make sure every access that can trigger a notification requires a
> (write-like) permission to the accessed object,

"write-like permssion" for whom?  The triggerer or the watcher?

There are various 'classes' of events:

 (1) System events (eg. hardware I/O errors, automount points expiring).

 (2) Direct events (eg. automounts, manual mounts, EDQUOT, key linkage).

 (3) Indirect events (eg. exit/close doing the last fput and causing an
     unmount).

Class (1) are uncaused by a process, so I use init_cred for them.  One could
argue that the automount point expiry should perhaps take place under the
creds of whoever triggered it in the first place, but we need to be careful
about long-term cred pinning.

Class (2) the causing process must've had permission to cause them - otherwise
we wouldn't have got the event.

Class (3) is interesting since it's currently entirely cleanup events and the
process may have the right to do them (close, dup2, exit, but also execve)
whether the LSM thinks it should be able to cause the object to be destroyed
or not.

It gets more complicated than that, though: multiple processes with different
security attributes can all have fds pointing to a common file object - and
the last one to close carries the can as far as the LSM is concerned.

And yet more complicated when you throw in unix sockets with partially passed
fds still in their queues.  That's what patch 01 is designed to try and cope
with.

> and make sure there is some sane way to control the relationship between the
> accessed object and the watched object (write-like)?

This is the trick.  Keys and superblocks have object labels of their own and
don't - for now - propagate their watches.  With these, the watch is on the
object you initially assign it to and it goes no further than that.

mount_notify() is the interesting case since we want to be able to detect
mount topology change events from within the vfs subtree rooted at the watched
directory without having to manually put a watch on every directory in that
subtree - or even just every mount object.

Or, maybe, that's what I'll have to do: make it mount_notify() can only apply
to the subtree within its superblock, and the caller must call mount_notify()
for every mount object it wants to monitor.  That would at least ensure that
the caller can, at that point, reach all those mount points.

> For cases where we have no object per se or at least no security
> structure/label associated with it, we may have to fall back to a
> coarse-grained "Can the watcher get this kind of notification in general?".

Agreed - and we should probably have that anyway.

David

^ permalink raw reply

* Re: [RFC][PATCH 00/10] Mount, FS, Block and Keyrings notifications [ver #3]
From: Stephen Smalley @ 2019-06-06 12:32 UTC (permalink / raw)
  To: David Howells, viro
  Cc: Greg Kroah-Hartman, Casey Schaufler, linux-usb, raven,
	linux-fsdevel, linux-api, linux-block, keyrings,
	linux-security-module, linux-kernel, Paul Moore
In-Reply-To: <155981411940.17513.7137844619951358374.stgit@warthog.procyon.org.uk>

On 6/6/19 5:41 AM, David Howells wrote:
> 
> Hi Al,
> 
> Here's a set of patches to add a general variable-length notification queue
> concept and to add sources of events for:
> 
>   (1) Mount topology events, such as mounting, unmounting, mount expiry,
>       mount reconfiguration.
> 
>   (2) Superblock events, such as R/W<->R/O changes, quota overrun and I/O
>       errors (not complete yet).
> 
>   (3) Key/keyring events, such as creating, linking and removal of keys.
> 
>   (4) General device events (single common queue) including:
> 
>       - Block layer events, such as device errors
> 
>       - USB subsystem events, such as device/bus attach/remove, device
>         reset, device errors.
> 
> One of the reasons for this is so that we can remove the issue of processes
> having to repeatedly and regularly scan /proc/mounts, which has proven to
> be a system performance problem.  To further aid this, the fsinfo() syscall
> on which this patch series depends, provides a way to access superblock and
> mount information in binary form without the need to parse /proc/mounts.
> 
> 
> LSM support is included, but controversial:
> 
>   (1) The creds of the process that did the fput() that reduced the refcount
>       to zero are cached in the file struct.
> 
>   (2) __fput() overrides the current creds with the creds from (1) whilst
>       doing the cleanup, thereby making sure that the creds seen by the
>       destruction notification generated by mntput() appears to come from
>       the last fputter.
> 
>   (3) security_post_notification() is called for each queue that we might
>       want to post a notification into, thereby allowing the LSM to prevent
>       covert communications.
> 
>   (?) Do I need to add security_set_watch(), say, to rule on whether a watch
>       may be set in the first place?  I might need to add a variant per
>       watch-type.
> 
>   (?) Do I really need to keep track of the process creds in which an
>       implicit object destruction happened?  For example, imagine you create
>       an fd with fsopen()/fsmount().  It is marked to dissolve the mount it
>       refers to on close unless move_mount() clears that flag.  Now, imagine
>       someone looking at that fd through procfs at the same time as you exit
>       due to an error.  The LSM sees the destruction notification come from
>       the looker if they happen to do their fput() after yours.


I'm not in favor of this approach. Can we check permission to the object 
being watched when a watch is set (read-like access), make sure every 
access that can trigger a notification requires a (write-like) 
permission to the accessed object, and make sure there is some sane way 
to control the relationship between the accessed object and the watched 
object (write-like)?  For cases where we have no object per se or at 
least no security structure/label associated with it, we may have to 
fall back to a coarse-grained "Can the watcher get this kind of 
notification in general?".

> 
> 
> Design decisions:
> 
>   (1) A misc chardev is used to create and open a ring buffer:
> 
> 	fd = open("/dev/watch_queue", O_RDWR);
> 
>       which is then configured and mmap'd into userspace:
> 
> 	ioctl(fd, IOC_WATCH_QUEUE_SET_SIZE, BUF_SIZE);
> 	ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter);
> 	buf = mmap(NULL, BUF_SIZE * page_size, PROT_READ | PROT_WRITE,
> 		   MAP_SHARED, fd, 0);
> 
>       The fd cannot be read or written (though there is a facility to use
>       write to inject records for debugging) and userspace just pulls data
>       directly out of the buffer.
> 
>   (2) The ring index pointers are stored inside the ring and are thus
>       accessible to userspace.  Userspace should only update the tail
>       pointer and never the head pointer or risk breaking the buffer.  The
>       kernel checks that the pointers appear valid before trying to use
>       them.  A 'skip' record is maintained around the pointers.
> 
>   (3) poll() can be used to wait for data to appear in the buffer.
> 
>   (4) Records in the buffer are binary, typed and have a length so that they
>       can be of varying size.
> 
>       This means that multiple heterogeneous sources can share a common
>       buffer.  Tags may be specified when a watchpoint is created to help
>       distinguish the sources.
> 
>   (5) The queue is reusable as there are 16 million types available, of
>       which I've used 4, so there is scope for others to be used.
> 
>   (6) Records are filterable as types have up to 256 subtypes that can be
>       individually filtered.  Other filtration is also available.
> 
>   (7) Each time the buffer is opened, a new buffer is created - this means
>       that there's no interference between watchers.
> 
>   (8) When recording a notification, the kernel will not sleep, but will
>       rather mark a queue as overrun if there's insufficient space, thereby
>       avoiding userspace causing the kernel to hang.
> 
>   (9) The 'watchpoint' should be specific where possible, meaning that you
>       specify the object that you want to watch.
> 
> (10) The buffer is created and then watchpoints are attached to it, using
>       one of:
> 
> 	keyctl_watch_key(KEY_SPEC_SESSION_KEYRING, fd, 0x01);
> 	mount_notify(AT_FDCWD, "/", 0, fd, 0x02);
> 	sb_notify(AT_FDCWD, "/mnt", 0, fd, 0x03);
> 
>       where in all three cases, fd indicates the queue and the number after
>       is a tag between 0 and 255.
> 
> (11) The watch must be removed if either the watch buffer is destroyed or
>       the watched object is destroyed.
> 
> 
> Things I want to avoid:
> 
>   (1) Introducing features that make the core VFS dependent on the network
>       stack or networking namespaces (ie. usage of netlink).
> 
>   (2) Dumping all this stuff into dmesg and having a daemon that sits there
>       parsing the output and distributing it as this then puts the
>       responsibility for security into userspace and makes handling
>       namespaces tricky.  Further, dmesg might not exist or might be
>       inaccessible inside a container.
> 
>   (3) Letting users see events they shouldn't be able to see.
> 
> 
> Further things that could be considered:
> 
>   (1) Adding a keyctl call to allow a watch on a keyring to be extended to
>       "children" of that keyring, such that the watch is removed from the
>       child if it is unlinked from the keyring.
> 
>   (2) Adding global superblock event queue.
> 
>   (3) Propagating watches to child superblock over automounts.
> 
> 
> The patches can be found here also:
> 
> 	http://git.kernel.org/cgit/linux/kernel/git/dhowells/linux-fs.git/log/?h=notifications
> 
> Changes:
> 
>   v3: I've added a USB notification source and reformulated the block
>       notification source so that there's now a common watch list, for which
>       the system call is now device_notify().
> 
>       I've assigned a pair of unused ioctl numbers in the 'W' series to the
>       ioctls added by this series.
> 
>       I've also added a description of the kernel API to the documentation.
> 
>   v2: I've fixed various issues raised by Jann Horn and GregKH and moved to
>       krefs for refcounting.  I've added some security features to try and
>       give Casey Schaufler the LSM control he wants.
> 
> David
> ---
> David Howells (10):
>        security: Override creds in __fput() with last fputter's creds
>        General notification queue with user mmap()'able ring buffer
>        keys: Add a notification facility
>        vfs: Add a mount-notification facility
>        vfs: Add superblock notifications
>        fsinfo: Export superblock notification counter
>        Add a general, global device notification watch list
>        block: Add block layer notifications
>        usb: Add USB subsystem notifications
>        Add sample notification program
> 
> 
>   Documentation/ioctl/ioctl-number.txt   |    1
>   Documentation/security/keys/core.rst   |   58 ++
>   Documentation/watch_queue.rst          |  492 ++++++++++++++++++
>   arch/x86/entry/syscalls/syscall_32.tbl |    3
>   arch/x86/entry/syscalls/syscall_64.tbl |    3
>   block/Kconfig                          |    9
>   block/blk-core.c                       |   29 +
>   drivers/base/Kconfig                   |    9
>   drivers/base/Makefile                  |    1
>   drivers/base/notify.c                  |   82 +++
>   drivers/misc/Kconfig                   |   13
>   drivers/misc/Makefile                  |    1
>   drivers/misc/watch_queue.c             |  889 ++++++++++++++++++++++++++++++++
>   drivers/usb/core/Kconfig               |   10
>   drivers/usb/core/devio.c               |   55 ++
>   drivers/usb/core/hub.c                 |    3
>   fs/Kconfig                             |   21 +
>   fs/Makefile                            |    1
>   fs/file_table.c                        |   12
>   fs/fsinfo.c                            |   12
>   fs/mount.h                             |   33 +
>   fs/mount_notify.c                      |  180 ++++++
>   fs/namespace.c                         |    9
>   fs/super.c                             |  116 ++++
>   include/linux/blkdev.h                 |   15 +
>   include/linux/dcache.h                 |    1
>   include/linux/device.h                 |    7
>   include/linux/fs.h                     |   79 +++
>   include/linux/key.h                    |    4
>   include/linux/lsm_hooks.h              |   15 +
>   include/linux/security.h               |   14 +
>   include/linux/syscalls.h               |    5
>   include/linux/usb.h                    |   19 +
>   include/linux/watch_queue.h            |   87 +++
>   include/uapi/linux/fsinfo.h            |   10
>   include/uapi/linux/keyctl.h            |    1
>   include/uapi/linux/watch_queue.h       |  213 ++++++++
>   kernel/sys_ni.c                        |    7
>   mm/interval_tree.c                     |    2
>   mm/memory.c                            |    1
>   samples/Kconfig                        |    6
>   samples/Makefile                       |    1
>   samples/vfs/test-fsinfo.c              |   13
>   samples/watch_queue/Makefile           |    9
>   samples/watch_queue/watch_test.c       |  310 +++++++++++
>   security/keys/Kconfig                  |   10
>   security/keys/compat.c                 |    2
>   security/keys/gc.c                     |    5
>   security/keys/internal.h               |   30 +
>   security/keys/key.c                    |   37 +
>   security/keys/keyctl.c                 |   88 +++
>   security/keys/keyring.c                |   17 -
>   security/keys/request_key.c            |    4
>   security/security.c                    |    9
>   54 files changed, 3025 insertions(+), 38 deletions(-)
>   create mode 100644 Documentation/watch_queue.rst
>   create mode 100644 drivers/base/notify.c
>   create mode 100644 drivers/misc/watch_queue.c
>   create mode 100644 fs/mount_notify.c
>   create mode 100644 include/linux/watch_queue.h
>   create mode 100644 include/uapi/linux/watch_queue.h
>   create mode 100644 samples/watch_queue/Makefile
>   create mode 100644 samples/watch_queue/watch_test.c
> 

^ permalink raw reply

* Re: [PATCH 1/5] glibc: Perform rseq(2) registration at C startup and thread creation (v10)
From: Florian Weimer @ 2019-06-06 11:57 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: carlos, Joseph Myers, Szabolcs Nagy, libc-alpha, Thomas Gleixner,
	Ben Maurer, Peter Zijlstra, Paul E. McKenney, Boqun Feng,
	Will Deacon, Dave Watson, Paul Turner, Rich Felker, linux-kernel,
	linux-api
In-Reply-To: <117220011.27079.1559663870037.JavaMail.zimbra@efficios.com>

* Mathieu Desnoyers:

> Should we plan ahead for such scheme to override which library "owns" rseq
> registration from a LD_PRELOAD library ? If so, then we would want glibc to
> set __rseq_handled _after_ LD_PRELOAD ctors are executed.

I don't think so.  The LD_PRELOAD phase is not clearly delineated from
the non-preload phase.  So it's not clear to me what this would even
mean in practice.

Let me ask the key question again: Does it matter if code observes the
rseq area first without kernel support, and then with kernel support?
If we don't expect any problems immediately, we do not need to worry
much about the constructor ordering right now.  I expect that over time,
fixing this properly will become easier.

>> The final remaining case is static dlopen.  There is a copy of ld.so on
>> the dynamic side, but it is completely inactive and has never run.  I do
>> not think we need to support that because multi-threading does not work
>> reliably in this scenario, either.  However, we should skip rseq
>> registration in a nested libc (see the rtld_active function).
>
> So for SHARED, if (!rtld_active ()), we should indeed leave the state of
> __rseq_handled as it is, because we are within a nested inactive ld.so.

I think we should add __rseq_handled initialization to ld.so, so it will
only run once, ever.

It's the registration from libc.so which needs some care.  In
particular, we must not override an existing registration.

Thanks,
Florian

^ permalink raw reply

* [PATCH 10/10] Add sample notification program [ver #3]
From: David Howells @ 2019-06-06  9:43 UTC (permalink / raw)
  To: viro
  Cc: dhowells, raven, linux-fsdevel, linux-api, linux-block, keyrings,
	linux-security-module, linux-kernel
In-Reply-To: <155981411940.17513.7137844619951358374.stgit@warthog.procyon.org.uk>

This needs to be linked with -lkeyutils.

It is run like:

	./watch_test

and watches "/" for mount changes and the current session keyring for key
changes:

	# keyctl add user a a @s
	1035096409
	# keyctl unlink 1035096409 @s
	# mount -t tmpfs none /mnt/nfsv3tcp/
	# umount /mnt/nfsv3tcp

producing:

	# ./watch_test
	ptrs h=4 t=2 m=20003
	NOTIFY[00000004-00000002] ty=0003 sy=0002 i=01000010
	KEY 2ffc2e5d change=2[linked] aux=1035096409
	ptrs h=6 t=4 m=20003
	NOTIFY[00000006-00000004] ty=0003 sy=0003 i=01000010
	KEY 2ffc2e5d change=3[unlinked] aux=1035096409
	ptrs h=8 t=6 m=20003
	NOTIFY[00000008-00000006] ty=0001 sy=0000 i=02000010
	MOUNT 00000013 change=0[new_mount] aux=168
	ptrs h=a t=8 m=20003
	NOTIFY[0000000a-00000008] ty=0001 sy=0001 i=02000010
	MOUNT 00000013 change=1[unmount] aux=168

Other events may be produced, such as with a failing disk:

	ptrs h=5 t=2 m=6000004
	NOTIFY[00000005-00000002] ty=0004 sy=0006 i=04000018
	BLOCK 00800050 e=6[critical medium] s=5be8

This corresponds to:

	print_req_error: critical medium error, dev sdf, sector 23528 flags 0

in dmesg.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 samples/Kconfig                  |    6 +
 samples/Makefile                 |    1 
 samples/watch_queue/Makefile     |    9 +
 samples/watch_queue/watch_test.c |  310 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 326 insertions(+)
 create mode 100644 samples/watch_queue/Makefile
 create mode 100644 samples/watch_queue/watch_test.c

diff --git a/samples/Kconfig b/samples/Kconfig
index 0561a94f6fdb..a2b7a7babee5 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -160,4 +160,10 @@ config SAMPLE_VFS
 	  as mount API and statx().  Note that this is restricted to the x86
 	  arch whilst it accesses system calls that aren't yet in all arches.
 
+config SAMPLE_WATCH_QUEUE
+	bool "Build example /dev/watch_queue notification consumer"
+	help
+	  Build example userspace program to use the new mount_notify(),
+	  sb_notify() syscalls and the KEYCTL_WATCH_KEY keyctl() function.
+
 endif # SAMPLES
diff --git a/samples/Makefile b/samples/Makefile
index debf8925f06f..ed3b8bab6e9b 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -20,3 +20,4 @@ obj-$(CONFIG_SAMPLE_TRACE_PRINTK)	+= trace_printk/
 obj-$(CONFIG_VIDEO_PCI_SKELETON)	+= v4l/
 obj-y					+= vfio-mdev/
 subdir-$(CONFIG_SAMPLE_VFS)		+= vfs
+subdir-$(CONFIG_SAMPLE_WATCH_QUEUE)	+= watch_queue
diff --git a/samples/watch_queue/Makefile b/samples/watch_queue/Makefile
new file mode 100644
index 000000000000..42b694430d0f
--- /dev/null
+++ b/samples/watch_queue/Makefile
@@ -0,0 +1,9 @@
+# List of programs to build
+hostprogs-y := watch_test
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
+
+HOSTCFLAGS_watch_test.o += -I$(objtree)/usr/include
+
+HOSTLOADLIBES_watch_test += -lkeyutils
diff --git a/samples/watch_queue/watch_test.c b/samples/watch_queue/watch_test.c
new file mode 100644
index 000000000000..893a5380f792
--- /dev/null
+++ b/samples/watch_queue/watch_test.c
@@ -0,0 +1,310 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Use /dev/watch_queue to watch for notifications.
+ *
+ * Copyright (C) 2019 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <stdbool.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <errno.h>
+#include <sys/wait.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <poll.h>
+#include <limits.h>
+#include <linux/watch_queue.h>
+#include <linux/unistd.h>
+#include <linux/keyctl.h>
+
+#ifndef __NR_mount_notify
+#define __NR_mount_notify -1
+#endif
+#ifndef __NR_sb_notify
+#define __NR_sb_notify -1
+#endif
+#ifndef __NR_device_notify
+#define __NR_device_notify -1
+#endif
+#ifndef KEYCTL_WATCH_KEY
+#define KEYCTL_WATCH_KEY -1
+#endif
+
+#define BUF_SIZE 4
+
+static const char *key_subtypes[256] = {
+	[NOTIFY_KEY_INSTANTIATED]	= "instantiated",
+	[NOTIFY_KEY_UPDATED]		= "updated",
+	[NOTIFY_KEY_LINKED]		= "linked",
+	[NOTIFY_KEY_UNLINKED]		= "unlinked",
+	[NOTIFY_KEY_CLEARED]		= "cleared",
+	[NOTIFY_KEY_REVOKED]		= "revoked",
+	[NOTIFY_KEY_INVALIDATED]	= "invalidated",
+	[NOTIFY_KEY_SETATTR]		= "setattr",
+};
+
+static void saw_key_change(struct watch_notification *n)
+{
+	struct key_notification *k = (struct key_notification *)n;
+	unsigned int len = n->info & WATCH_INFO_LENGTH;
+
+	if (len != sizeof(struct key_notification))
+		return;
+
+	printf("KEY %08x change=%u[%s] aux=%u\n",
+	       k->key_id, n->subtype, key_subtypes[n->subtype], k->aux);
+}
+
+static const char *mount_subtypes[256] = {
+	[NOTIFY_MOUNT_NEW_MOUNT]	= "new_mount",
+	[NOTIFY_MOUNT_UNMOUNT]		= "unmount",
+	[NOTIFY_MOUNT_EXPIRY]		= "expiry",
+	[NOTIFY_MOUNT_READONLY]		= "readonly",
+	[NOTIFY_MOUNT_SETATTR]		= "setattr",
+	[NOTIFY_MOUNT_MOVE_FROM]	= "move_from",
+	[NOTIFY_MOUNT_MOVE_TO]		= "move_to",
+};
+
+static long keyctl_watch_key(int key, int watch_fd, int watch_id)
+{
+	return syscall(__NR_keyctl, KEYCTL_WATCH_KEY, key, watch_fd, watch_id);
+}
+
+static void saw_mount_change(struct watch_notification *n)
+{
+	struct mount_notification *m = (struct mount_notification *)n;
+	unsigned int len = n->info & WATCH_INFO_LENGTH;
+
+	if (len != sizeof(struct mount_notification))
+		return;
+
+	printf("MOUNT %08x change=%u[%s] aux=%u\n",
+	       m->triggered_on, n->subtype, mount_subtypes[n->subtype], m->changed_mount);
+}
+
+static const char *super_subtypes[256] = {
+	[NOTIFY_SUPERBLOCK_READONLY]	= "readonly",
+	[NOTIFY_SUPERBLOCK_ERROR]	= "error",
+	[NOTIFY_SUPERBLOCK_EDQUOT]	= "edquot",
+	[NOTIFY_SUPERBLOCK_NETWORK]	= "network",
+};
+
+static void saw_super_change(struct watch_notification *n)
+{
+	struct superblock_notification *s = (struct superblock_notification *)n;
+	unsigned int len = n->info & WATCH_INFO_LENGTH;
+
+	if (len < sizeof(struct superblock_notification))
+		return;
+
+	printf("SUPER %08llx change=%u[%s]\n",
+	       s->sb_id, n->subtype, super_subtypes[n->subtype]);
+}
+
+static const char *block_subtypes[256] = {
+	[NOTIFY_BLOCK_ERROR_TIMEOUT]			= "timeout",
+	[NOTIFY_BLOCK_ERROR_NO_SPACE]			= "critical space allocation",
+	[NOTIFY_BLOCK_ERROR_RECOVERABLE_TRANSPORT]	= "recoverable transport",
+	[NOTIFY_BLOCK_ERROR_CRITICAL_TARGET]		= "critical target",
+	[NOTIFY_BLOCK_ERROR_CRITICAL_NEXUS]		= "critical nexus",
+	[NOTIFY_BLOCK_ERROR_CRITICAL_MEDIUM]		= "critical medium",
+	[NOTIFY_BLOCK_ERROR_PROTECTION]			= "protection",
+	[NOTIFY_BLOCK_ERROR_KERNEL_RESOURCE]		= "kernel resource",
+	[NOTIFY_BLOCK_ERROR_DEVICE_RESOURCE]		= "device resource",
+	[NOTIFY_BLOCK_ERROR_IO]				= "I/O",
+};
+
+static void saw_block_change(struct watch_notification *n)
+{
+	struct block_notification *b = (struct block_notification *)n;
+	unsigned int len = n->info & WATCH_INFO_LENGTH;
+
+	if (len < sizeof(struct block_notification))
+		return;
+
+	printf("BLOCK %08llx e=%u[%s] s=%llx\n",
+	       (unsigned long long)b->dev,
+	       n->subtype, block_subtypes[n->subtype],
+	       (unsigned long long)b->sector);
+}
+
+static const char *usb_subtypes[256] = {
+	[NOTIFY_USB_DEVICE_ADD]		= "dev-add",
+	[NOTIFY_USB_DEVICE_REMOVE]	= "dev-remove",
+	[NOTIFY_USB_BUS_ADD]		= "bus-add",
+	[NOTIFY_USB_BUS_REMOVE]		= "bus-remove",
+	[NOTIFY_USB_DEVICE_RESET]	= "dev-reset",
+	[NOTIFY_USB_DEVICE_ERROR]	= "dev-error",
+};
+
+static void saw_usb_event(struct watch_notification *n)
+{
+	struct usb_notification *u = (struct usb_notification *)n;
+	unsigned int len = n->info & WATCH_INFO_LENGTH;
+
+	if (len < sizeof(struct usb_notification))
+		return;
+
+	printf("USB %*.*s %s e=%x r=%x\n",
+	       u->name_len, u->name_len, u->name,
+	       usb_subtypes[n->subtype],
+	       u->error, u->reserved);
+}
+
+/*
+ * Consume and display events.
+ */
+static int consumer(int fd, struct watch_queue_buffer *buf)
+{
+	struct watch_notification *n;
+	struct pollfd p[1];
+	unsigned int head, tail, mask = buf->meta.mask;
+
+	for (;;) {
+		p[0].fd = fd;
+		p[0].events = POLLIN | POLLERR;
+		p[0].revents = 0;
+
+		if (poll(p, 1, -1) == -1) {
+			perror("poll");
+			break;
+		}
+
+		printf("ptrs h=%x t=%x m=%x\n",
+		       buf->meta.head, buf->meta.tail, buf->meta.mask);
+
+		while (head = buf->meta.head,
+		       tail = buf->meta.tail,
+		       tail != head
+		       ) {
+			asm ("lfence" : : : "memory" );
+			n = &buf->slots[tail & mask];
+			printf("NOTIFY[%08x-%08x] ty=%04x sy=%04x i=%08x\n",
+			       head, tail, n->type, n->subtype, n->info);
+			if ((n->info & WATCH_INFO_LENGTH) == 0)
+				goto out;
+
+			switch (n->type) {
+			case WATCH_TYPE_META:
+				if (n->subtype == WATCH_META_REMOVAL_NOTIFICATION)
+					printf("REMOVAL of watchpoint %08x\n",
+					       n->info & WATCH_INFO_ID);
+				break;
+			case WATCH_TYPE_MOUNT_NOTIFY:
+				saw_mount_change(n);
+				break;
+			case WATCH_TYPE_SB_NOTIFY:
+				saw_super_change(n);
+				break;
+			case WATCH_TYPE_KEY_NOTIFY:
+				saw_key_change(n);
+				break;
+			case WATCH_TYPE_BLOCK_NOTIFY:
+				saw_block_change(n);
+				break;
+			case WATCH_TYPE_USB_NOTIFY:
+				saw_usb_event(n);
+				break;
+			}
+
+			tail += (n->info & WATCH_INFO_LENGTH) >> WATCH_LENGTH_SHIFT;
+			asm("mfence" ::: "memory");
+			buf->meta.tail = tail;
+		}
+	}
+
+out:
+	return 0;
+}
+
+static struct watch_notification_filter filter = {
+	.nr_filters	= 5,
+	.__reserved	= 0,
+	.filters = {
+		[0] = {
+			.type			= WATCH_TYPE_MOUNT_NOTIFY,
+			// Reject move-from notifications
+			.subtype_filter[0]	= UINT_MAX & ~(1 << NOTIFY_MOUNT_MOVE_FROM),
+		},
+		[1]	= {
+			.type			= WATCH_TYPE_SB_NOTIFY,
+			// Only accept notification of changes to R/O state
+			.subtype_filter[0]	= (1 << NOTIFY_SUPERBLOCK_READONLY),
+			// Only accept notifications of change-to-R/O
+			.info_mask		= WATCH_INFO_FLAG_0,
+			.info_filter		= WATCH_INFO_FLAG_0,
+		},
+		[2]	= {
+			.type			= WATCH_TYPE_KEY_NOTIFY,
+			.subtype_filter[0]	= UINT_MAX,
+		},
+		[3]	= {
+			.type			= WATCH_TYPE_BLOCK_NOTIFY,
+			.subtype_filter[0]	= UINT_MAX,
+		},
+		[4]	= {
+			.type			= WATCH_TYPE_USB_NOTIFY,
+			.subtype_filter[0]	= UINT_MAX,
+		},
+	},
+};
+
+int main(int argc, char **argv)
+{
+	struct watch_queue_buffer *buf;
+	size_t page_size;
+	int fd;
+
+	fd = open("/dev/watch_queue", O_RDWR);
+	if (fd == -1) {
+		perror("/dev/watch_queue");
+		exit(1);
+	}
+
+	if (ioctl(fd, IOC_WATCH_QUEUE_SET_SIZE, BUF_SIZE) == -1) {
+		perror("/dev/watch_queue(size)");
+		exit(1);
+	}
+
+	if (ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter) == -1) {
+		perror("/dev/watch_queue(filter)");
+		exit(1);
+	}
+
+	page_size = sysconf(_SC_PAGESIZE);
+	buf = mmap(NULL, BUF_SIZE * page_size, PROT_READ | PROT_WRITE,
+		   MAP_SHARED, fd, 0);
+	if (buf == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+
+	if (keyctl_watch_key(KEY_SPEC_SESSION_KEYRING, fd, 0x01) == -1) {
+		perror("keyctl");
+		exit(1);
+	}
+
+	if (syscall(__NR_mount_notify, AT_FDCWD, "/", 0, fd, 0x02) == -1) {
+		perror("mount_notify");
+		exit(1);
+	}
+
+	if (syscall(__NR_sb_notify, AT_FDCWD, "/mnt", 0, fd, 0x03) == -1) {
+		perror("sb_notify");
+		exit(1);
+	}
+
+	if (syscall(__NR_device_notify, fd, 0x04) == -1) {
+		perror("device_notify");
+		exit(1);
+	}
+
+	return consumer(fd, buf);
+}

^ permalink raw reply related

* [PATCH 09/10] usb: Add USB subsystem notifications [ver #3]
From: David Howells @ 2019-06-06  9:43 UTC (permalink / raw)
  To: viro
  Cc: Greg Kroah-Hartman, linux-usb, dhowells, raven, linux-fsdevel,
	linux-api, linux-block, keyrings, linux-security-module,
	linux-kernel
In-Reply-To: <155981411940.17513.7137844619951358374.stgit@warthog.procyon.org.uk>

Add a USB subsystem notification mechanism whereby notifications about
hardware events such as device connection, disconnection, reset and I/O
errors, can be reported to a monitoring process asynchronously.

Firstly, an event queue needs to be created:

	fd = open("/dev/event_queue", O_RDWR);
	ioctl(fd, IOC_WATCH_QUEUE_SET_SIZE, page_size << n);

then a notification can be set up to report USB notifications via that
queue:

	struct watch_notification_filter filter = {
		.nr_filters = 1,
		.filters = {
			[0] = {
				.type = WATCH_TYPE_USB_NOTIFY,
				.subtype_filter[0] = UINT_MAX;
			},
		},
	};
	ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter);
	device_notify(fd, 12);

After that, records will be placed into the queue when events occur on a
USB device or bus.  Records are of the following format:

	struct usb_notification {
		struct watch_notification watch;
		__u32	error;
		__u32	reserved;
		__u8	name_len;
		__u8	name[0];
	} *n;

Where:

	n->watch.type will be WATCH_TYPE_USB_NOTIFY

	n->watch.subtype will be the type of notification, such as
	NOTIFY_USB_DEVICE_ADD.

	n->watch.info & WATCH_INFO_LENGTH will indicate the length of the
	record.

	n->watch.info & WATCH_INFO_ID will be the second argument to
	device_notify(), shifted.

	n->error and n->reserved are intended to convey information such as
	error codes, but are currently not used

	n->name_len and n->name convey the USB device name as an
	unterminated string.  This may be truncated - it is currently
	limited to a maximum 63 chars.

Note that it is permissible for event records to be of variable length -
or, at least, the length may be dependent on the subtype.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: linux-usb@vger.kernel.org
---

 Documentation/watch_queue.rst    |    9 ++++++
 drivers/usb/core/Kconfig         |   10 +++++++
 drivers/usb/core/devio.c         |   55 ++++++++++++++++++++++++++++++++++++++
 drivers/usb/core/hub.c           |    3 ++
 include/linux/usb.h              |   19 +++++++++++++
 include/uapi/linux/watch_queue.h |   30 ++++++++++++++++++++-
 6 files changed, 125 insertions(+), 1 deletion(-)

diff --git a/Documentation/watch_queue.rst b/Documentation/watch_queue.rst
index c2954e191989..7ce5d4147fa9 100644
--- a/Documentation/watch_queue.rst
+++ b/Documentation/watch_queue.rst
@@ -15,6 +15,8 @@ receive notifications from the kernel.  This can be used in conjunction with::
 
     * Block layer event notifications
 
+    * USB subsystem event notifications
+
 
 The notifications buffers can be enabled by:
 
@@ -344,6 +346,13 @@ Any particular buffer can be fed from multiple sources.  Sources include:
     or temporary link loss.  Watchpoints of this type are set on the global
     device watch list.
 
+  * WATCH_TYPE_USB_NOTIFY
+
+    Notifications of this type indicate USB subsystem events, such as
+    attachment, removal, reset and I/O errors.  Separate events are generated
+    for buses and devices.  Watchpoints of this type are set on the global
+    device watch list.
+
 
 Event Filtering
 ===============
diff --git a/drivers/usb/core/Kconfig b/drivers/usb/core/Kconfig
index bdb6bd0b63a6..4be88368ab6b 100644
--- a/drivers/usb/core/Kconfig
+++ b/drivers/usb/core/Kconfig
@@ -103,3 +103,13 @@ config USB_AUTOSUSPEND_DELAY
 	  The default value Linux has always had is 2 seconds.  Change
 	  this value if you want a different delay and cannot modify
 	  the command line or module parameter.
+
+config USB_NOTIFICATIONS
+	bool "Provide USB hardware event notifications"
+	depends on USB
+	select DEVICE_NOTIFICATIONS
+	help
+	  This option provides support for getting hardware event notifications
+	  on USB devices and interfaces.  This makes use of the
+	  /dev/watch_queue misc device to handle the notification buffer.
+	  device_notify(2) is used to set/remove watches.
diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index fa783531ee88..af7f339c35c5 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -41,6 +41,7 @@
 #include <linux/dma-mapping.h>
 #include <asm/byteorder.h>
 #include <linux/moduleparam.h>
+#include <linux/watch_queue.h>
 
 #include "usb.h"
 
@@ -2633,13 +2634,67 @@ static void usbdev_remove(struct usb_device *udev)
 	}
 }
 
+#ifdef CONFIG_USB_NOTIFICATIONS
+static noinline void post_usb_notification(const char *devname,
+					   enum usb_notification_type subtype,
+					   u32 error)
+{
+	unsigned int name_len, n_len;
+	u64 id = 0; /* Might want to put a dev# here. */
+
+	struct {
+		struct usb_notification n;
+		char more_name[USB_NOTIFICATION_MAX_NAME_LEN -
+			       (sizeof(struct usb_notification) -
+				offsetof(struct usb_notification, name))];
+	} n;
+
+	name_len = strlen(devname);
+	name_len = min_t(size_t, name_len, USB_NOTIFICATION_MAX_NAME_LEN);
+	n_len = round_up(offsetof(struct usb_notification, name) + name_len,
+			 sizeof(__u64));
+
+	memset(&n, 0, sizeof(n));
+	memcpy(n.n.name, devname, n_len);
+
+	n.n.watch.type		= WATCH_TYPE_USB_NOTIFY;
+	n.n.watch.subtype	= subtype;
+	n.n.watch.info		= n_len;
+	n.n.error		= error;
+	n.n.name_len		= name_len;
+
+	post_device_notification(&n.n.watch, id);
+}
+
+void post_usb_device_notification(const struct usb_device *udev,
+				  enum usb_notification_type subtype, u32 error)
+{
+	post_usb_notification(dev_name(&udev->dev), subtype, error);
+}
+
+void post_usb_bus_notification(const struct usb_bus *ubus,
+			       enum usb_notification_type subtype, u32 error)
+{
+	post_usb_notification(ubus->bus_name, subtype, error);
+}
+#endif
+
 static int usbdev_notify(struct notifier_block *self,
 			       unsigned long action, void *dev)
 {
 	switch (action) {
 	case USB_DEVICE_ADD:
+		post_usb_device_notification(dev, NOTIFY_USB_DEVICE_ADD, 0);
 		break;
 	case USB_DEVICE_REMOVE:
+		post_usb_device_notification(dev, NOTIFY_USB_DEVICE_REMOVE, 0);
+		usbdev_remove(dev);
+		break;
+	case USB_BUS_ADD:
+		post_usb_bus_notification(dev, NOTIFY_USB_BUS_ADD, 0);
+		break;
+	case USB_BUS_REMOVE:
+		post_usb_bus_notification(dev, NOTIFY_USB_BUS_REMOVE, 0);
 		usbdev_remove(dev);
 		break;
 	}
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 2f94568ba385..722013d8142c 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -4596,6 +4596,9 @@ hub_port_init(struct usb_hub *hub, struct usb_device *udev, int port1,
 				(udev->config) ? "reset" : "new", speed,
 				devnum, driver_name);
 
+	if (udev->config)
+		post_usb_device_notification(udev, NOTIFY_USB_DEVICE_RESET, 0);
+
 	/* Set up TT records, if needed  */
 	if (hdev->tt) {
 		udev->tt = hdev->tt;
diff --git a/include/linux/usb.h b/include/linux/usb.h
index ae82d9d1112b..12687b55811d 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -2008,6 +2008,25 @@ extern void usb_led_activity(enum usb_led_event ev);
 static inline void usb_led_activity(enum usb_led_event ev) {}
 #endif
 
+/*
+ * Notification functions.
+ */
+#ifdef CONFIG_USB_NOTIFICATIONS
+extern void post_usb_device_notification(const struct usb_device *udev,
+					 enum usb_notification_type subtype,
+					 u32 error);
+extern void post_usb_bus_notification(const struct usb_bus *ubus,
+				      enum usb_notification_type subtype,
+				      u32 error);
+#else
+static inline void post_usb_device_notification(const struct usb_device *udev,
+						enum usb_notification_type subtype,
+						u32 error) {}
+static inline void post_usb_bus_notification(const struct usb_bus *ubus,
+					     enum usb_notification_type subtype,
+					     u32 error) {}
+#endif
+
 #endif  /* __KERNEL__ */
 
 #endif
diff --git a/include/uapi/linux/watch_queue.h b/include/uapi/linux/watch_queue.h
index 22e3326b83a6..d596ac5a61e4 100644
--- a/include/uapi/linux/watch_queue.h
+++ b/include/uapi/linux/watch_queue.h
@@ -14,7 +14,8 @@ enum watch_notification_type {
 	WATCH_TYPE_SB_NOTIFY	= 2,	/* Superblock notification */
 	WATCH_TYPE_KEY_NOTIFY	= 3,	/* Key/keyring change notification */
 	WATCH_TYPE_BLOCK_NOTIFY	= 4,	/* Block layer notifications */
-#define WATCH_TYPE___NR 5
+	WATCH_TYPE_USB_NOTIFY	= 5,	/* USB subsystem notifications */
+#define WATCH_TYPE___NR 6
 };
 
 enum watch_meta_notification_subtype {
@@ -182,4 +183,31 @@ struct block_notification {
 	__u64	sector;			/* Affected sector */
 };
 
+/*
+ * Type of USB layer notification.
+ */
+enum usb_notification_type {
+	NOTIFY_USB_DEVICE_ADD		= 0, /* USB device added */
+	NOTIFY_USB_DEVICE_REMOVE	= 1, /* USB device removed */
+	NOTIFY_USB_BUS_ADD		= 2, /* USB bus added */
+	NOTIFY_USB_BUS_REMOVE		= 3, /* USB bus removed */
+	NOTIFY_USB_DEVICE_RESET		= 4, /* USB device reset */
+	NOTIFY_USB_DEVICE_ERROR		= 5, /* USB device error */
+};
+
+/*
+ * USB subsystem notification record.
+ * - watch.type = WATCH_TYPE_USB_NOTIFY
+ * - watch.subtype = enum usb_notification_type
+ */
+struct usb_notification {
+	struct watch_notification watch; /* WATCH_TYPE_USB_NOTIFY */
+	__u32	error;
+	__u32	reserved;
+	__u8	name_len;		/* Length of device name */
+	__u8	name[0];		/* Device name (padded to __u64, truncated at 63 chars) */
+};
+
+#define USB_NOTIFICATION_MAX_NAME_LEN 63
+
 #endif /* _UAPI_LINUX_WATCH_QUEUE_H */

^ permalink raw reply related

* [PATCH 08/10] block: Add block layer notifications [ver #3]
From: David Howells @ 2019-06-06  9:43 UTC (permalink / raw)
  To: viro
  Cc: dhowells, raven, linux-fsdevel, linux-api, linux-block, keyrings,
	linux-security-module, linux-kernel
In-Reply-To: <155981411940.17513.7137844619951358374.stgit@warthog.procyon.org.uk>

Add a block layer notification mechanism whereby notifications about
block-layer events such as I/O errors, can be reported to a monitoring
process asynchronously.

Firstly, an event queue needs to be created:

	fd = open("/dev/event_queue", O_RDWR);
	ioctl(fd, IOC_WATCH_QUEUE_SET_SIZE, page_size << n);

then a notification can be set up to report block notifications via that
queue:

	struct watch_notification_filter filter = {
		.nr_filters = 1,
		.filters = {
			[0] = {
				.type = WATCH_TYPE_BLOCK_NOTIFY,
				.subtype_filter[0] = UINT_MAX;
			},
		},
	};
	ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter);
	device_notify(fd, 12);

After that, records will be placed into the queue when, for example, errors
occur on a block device.  Records are of the following format:

	struct block_notification {
		struct watch_notification watch;
		__u64	dev;
		__u64	sector;
	} *n;

Where:

	n->watch.type will be WATCH_TYPE_BLOCK_NOTIFY

	n->watch.subtype will be the type of notification, such as
	NOTIFY_BLOCK_ERROR_CRITICAL_MEDIUM.

	n->watch.info & WATCH_INFO_LENGTH will indicate the length of the
	record.

	n->watch.info & WATCH_INFO_ID will be the second argument to
	device_notify(), shifted.

	n->dev will be the device numbers munged together.

	n->sector will indicate the affected sector (if appropriate for the
	event).

Note that it is permissible for event records to be of variable length -
or, at least, the length may be dependent on the subtype.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 Documentation/watch_queue.rst    |   10 +++++++++-
 block/Kconfig                    |    9 +++++++++
 block/blk-core.c                 |   29 +++++++++++++++++++++++++++++
 include/linux/blkdev.h           |   15 +++++++++++++++
 include/uapi/linux/watch_queue.h |   27 +++++++++++++++++++++++++++
 5 files changed, 89 insertions(+), 1 deletion(-)

diff --git a/Documentation/watch_queue.rst b/Documentation/watch_queue.rst
index e4b8233d5aa8..c2954e191989 100644
--- a/Documentation/watch_queue.rst
+++ b/Documentation/watch_queue.rst
@@ -11,7 +11,9 @@ receive notifications from the kernel.  This can be used in conjunction with::
 
   * Superblock event notifications
 
-  * General device event notifications
+  * General device event notifications, including::
+
+    * Block layer event notifications
 
 
 The notifications buffers can be enabled by:
@@ -336,6 +338,12 @@ Any particular buffer can be fed from multiple sources.  Sources include:
 
     See Documentation/security/keys/core.rst for more information.
 
+  * WATCH_TYPE_BLOCK_NOTIFY
+
+    Notifications of this type indicate block layer events, such as I/O errors
+    or temporary link loss.  Watchpoints of this type are set on the global
+    device watch list.
+
 
 Event Filtering
 ===============
diff --git a/block/Kconfig b/block/Kconfig
index 1b220101a9cb..4ff4a56ba9f9 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -163,6 +163,15 @@ config BLK_SED_OPAL
 	Enabling this option enables users to setup/unlock/lock
 	Locking ranges for SED devices using the Opal protocol.
 
+config BLK_NOTIFICATIONS
+	bool "Block layer event notifications"
+	select DEVICE_NOTIFICATIONS
+	help
+	  This option provides support for getting block layer event
+	  notifications.  This makes use of the /dev/watch_queue misc device to
+	  handle the notification buffer and provides the device_notify() system
+	  call to enable/disable watches.
+
 menu "Partition Types"
 
 source "block/partitions/Kconfig"
diff --git a/block/blk-core.c b/block/blk-core.c
index 419d600e6637..edad86172d47 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -144,6 +144,22 @@ static const struct {
 	[BLK_STS_IOERR]		= { -EIO,	"I/O" },
 };
 
+#ifdef CONFIG_BLK_NOTIFICATIONS
+static const
+enum block_notification_type blk_notifications[ARRAY_SIZE(blk_errors)] = {
+	[BLK_STS_TIMEOUT]	= NOTIFY_BLOCK_ERROR_TIMEOUT,
+	[BLK_STS_NOSPC]		= NOTIFY_BLOCK_ERROR_NO_SPACE,
+	[BLK_STS_TRANSPORT]	= NOTIFY_BLOCK_ERROR_RECOVERABLE_TRANSPORT,
+	[BLK_STS_TARGET]	= NOTIFY_BLOCK_ERROR_CRITICAL_TARGET,
+	[BLK_STS_NEXUS]		= NOTIFY_BLOCK_ERROR_CRITICAL_NEXUS,
+	[BLK_STS_MEDIUM]	= NOTIFY_BLOCK_ERROR_CRITICAL_MEDIUM,
+	[BLK_STS_PROTECTION]	= NOTIFY_BLOCK_ERROR_PROTECTION,
+	[BLK_STS_RESOURCE]	= NOTIFY_BLOCK_ERROR_KERNEL_RESOURCE,
+	[BLK_STS_DEV_RESOURCE]	= NOTIFY_BLOCK_ERROR_DEVICE_RESOURCE,
+	[BLK_STS_IOERR]		= NOTIFY_BLOCK_ERROR_IO,
+};
+#endif
+
 blk_status_t errno_to_blk_status(int errno)
 {
 	int i;
@@ -179,6 +195,19 @@ static void print_req_error(struct request *req, blk_status_t status)
 				req->rq_disk ?  req->rq_disk->disk_name : "?",
 				(unsigned long long)blk_rq_pos(req),
 				req->cmd_flags);
+
+#ifdef CONFIG_BLK_NOTIFICATIONS
+	if (blk_notifications[idx]) {
+		struct block_notification n = {
+			.watch.type	= WATCH_TYPE_BLOCK_NOTIFY,
+			.watch.subtype	= blk_notifications[idx],
+			.watch.info	= sizeof(n),
+			.dev		= req->rq_disk ? disk_devt(req->rq_disk) : 0,
+			.sector		= blk_rq_pos(req),
+		};
+		post_block_notification(&n);
+	}
+#endif
 }
 
 static void req_bio_endio(struct request *rq, struct bio *bio,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1aafeb923e7b..8b8e235f47c9 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -43,6 +43,7 @@ struct pr_ops;
 struct rq_qos;
 struct blk_queue_stats;
 struct blk_stat_callback;
+struct block_notification;
 
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
@@ -1744,6 +1745,20 @@ static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
 }
 #endif /* CONFIG_BLK_DEV_ZONED */
 
+#ifdef CONFIG_BLK_NOTIFICATIONS
+static inline void post_block_notification(struct block_notification *n)
+{
+	u64 id = 0; /* Might want to allow dev# here. */
+
+	post_device_notification(&n->watch, id);
+}
+#else
+static inline void post_block_notification(struct block_notification *n)
+{
+}
+#endif
+
+
 #else /* CONFIG_BLOCK */
 
 struct block_device;
diff --git a/include/uapi/linux/watch_queue.h b/include/uapi/linux/watch_queue.h
index aeffcfd7a742..22e3326b83a6 100644
--- a/include/uapi/linux/watch_queue.h
+++ b/include/uapi/linux/watch_queue.h
@@ -155,4 +155,31 @@ struct superblock_error_notification {
 	__u32	error_cookie;
 };
 
+/*
+ * Type of block layer notification.
+ */
+enum block_notification_type {
+	NOTIFY_BLOCK_ERROR_TIMEOUT		= 1, /* Timeout error */
+	NOTIFY_BLOCK_ERROR_NO_SPACE		= 2, /* Critical space allocation error */
+	NOTIFY_BLOCK_ERROR_RECOVERABLE_TRANSPORT = 3, /* Recoverable transport error */
+	NOTIFY_BLOCK_ERROR_CRITICAL_TARGET	= 4, /* Critical target error */
+	NOTIFY_BLOCK_ERROR_CRITICAL_NEXUS	= 5, /* Critical nexus error */
+	NOTIFY_BLOCK_ERROR_CRITICAL_MEDIUM	= 6, /* Critical medium error */
+	NOTIFY_BLOCK_ERROR_PROTECTION		= 7, /* Protection error */
+	NOTIFY_BLOCK_ERROR_KERNEL_RESOURCE	= 8, /* Kernel resource error */
+	NOTIFY_BLOCK_ERROR_DEVICE_RESOURCE	= 9, /* Device resource error */
+	NOTIFY_BLOCK_ERROR_IO			= 10, /* Other I/O error */
+};
+
+/*
+ * Block layer notification record.
+ * - watch.type = WATCH_TYPE_BLOCK_NOTIFY
+ * - watch.subtype = enum block_notification_type
+ */
+struct block_notification {
+	struct watch_notification watch; /* WATCH_TYPE_BLOCK_NOTIFY */
+	__u64	dev;			/* Device number */
+	__u64	sector;			/* Affected sector */
+};
+
 #endif /* _UAPI_LINUX_WATCH_QUEUE_H */

^ permalink raw reply related

* [PATCH 07/10] Add a general, global device notification watch list [ver #3]
From: David Howells @ 2019-06-06  9:43 UTC (permalink / raw)
  To: viro
  Cc: dhowells, raven, linux-fsdevel, linux-api, linux-block, keyrings,
	linux-security-module, linux-kernel
In-Reply-To: <155981411940.17513.7137844619951358374.stgit@warthog.procyon.org.uk>

Create a general, global watch list that can be used for the posting of
device notification events, for such things as device attachment,
detachment and errors on sources such as block devices and USB devices.
This can be enabled with:

	CONFIG_DEVICE_NOTIFICATIONS

To add a watch on this list, an event queue must be created and configured:

        fd = open("/dev/event_queue", O_RDWR);
        ioctl(fd, IOC_WATCH_QUEUE_SET_SIZE, page_size << n);

and then a watch can be placed upon it using a system call:

        device_notify(fd, 12);

Unless the application wants to receive all events, it should emplace
appropriate filters.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 Documentation/watch_queue.rst          |   27 ++++++++---
 arch/x86/entry/syscalls/syscall_32.tbl |    1 
 arch/x86/entry/syscalls/syscall_64.tbl |    1 
 drivers/base/Kconfig                   |    9 ++++
 drivers/base/Makefile                  |    1 
 drivers/base/notify.c                  |   82 ++++++++++++++++++++++++++++++++
 include/linux/device.h                 |    7 +++
 include/linux/syscalls.h               |    1 
 include/linux/watch_queue.h            |    3 +
 kernel/sys_ni.c                        |    1 
 10 files changed, 126 insertions(+), 7 deletions(-)
 create mode 100644 drivers/base/notify.c

diff --git a/Documentation/watch_queue.rst b/Documentation/watch_queue.rst
index 0668c4a31710..e4b8233d5aa8 100644
--- a/Documentation/watch_queue.rst
+++ b/Documentation/watch_queue.rst
@@ -11,7 +11,7 @@ receive notifications from the kernel.  This can be used in conjunction with::
 
   * Superblock event notifications
 
-  * Block layer event notifications
+  * General device event notifications
 
 
 The notifications buffers can be enabled by:
@@ -292,6 +292,25 @@ The ``id`` is the ID of the source object (such as the serial number on a key).
 Only watches that have the same ID set in them will see this notification.
 
 
+Global Device Watch List
+========================
+
+There is a global watch list that hardware generated events, such as device
+connection, disconnection, failure and error can be posted upon.  It must be
+enabled using::
+
+	CONFIG_DEVICE_NOTIFICATIONS
+
+Watchpoints are set in userspace using the device_notify(2) system call.
+Within the kernel events are posted upon it using::
+
+	void post_device_notification(struct watch_notification *n, u64 id);
+
+where ``n`` is the formatted notification record to post.  ``id`` is an
+identifier that can be used to direct to specific watches, but it should be 0
+for general use on this queue.
+
+
 Watch Sources
 =============
 
@@ -317,12 +336,6 @@ Any particular buffer can be fed from multiple sources.  Sources include:
 
     See Documentation/security/keys/core.rst for more information.
 
-  * WATCH_TYPE_BLOCK_NOTIFY
-
-    Notifications of this type indicate block layer events, such as I/O errors
-    or temporary link loss.  Watchpoints of this type are set on a global
-    queue.
-
 
 Event Filtering
 ===============
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 429416ce60e1..4a12ab8ac7ef 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -441,3 +441,4 @@
 434	i386	fsinfo			sys_fsinfo			__ia32_sys_fsinfo
 435	i386	mount_notify		sys_mount_notify		__ia32_sys_mount_notify
 436	i386	sb_notify		sys_sb_notify			__ia32_sys_sb_notify
+437	i386	device_notify		sys_device_notify		__ia32_sys_device_notify
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 4ae146e472db..60f847eb0977 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -358,6 +358,7 @@
 434	common	fsinfo			__x64_sys_fsinfo
 435	common	mount_notify		__x64_sys_mount_notify
 436	common	sb_notify		__x64_sys_sb_notify
+437	common	device_notify		__x64_sys_device_notify
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index dc404492381d..63db34efb23b 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -1,6 +1,15 @@
 # SPDX-License-Identifier: GPL-2.0
 menu "Generic Driver Options"
 
+config DEVICE_NOTIFICATIONS
+	bool "Provide device event notifications"
+	select WATCH_QUEUE
+	help
+	  This option provides support for getting hardware event notifications
+	  on devices, buses and interfaces.  This makes use of the
+	  /dev/watch_queue misc device to handle the notification buffer.
+	  device_notify(2) is used to set/remove watches.
+
 config UEVENT_HELPER
 	bool "Support for uevent helper"
 	help
diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index 157452080f3d..9fc43539f970 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -7,6 +7,7 @@ obj-y			:= component.o core.o bus.o dd.o syscore.o \
 			   attribute_container.o transport_class.o \
 			   topology.o container.o property.o cacheinfo.o \
 			   devcon.o swnode.o
+obj-$(CONFIG_DEVICE_NOTIFICATIONS) += notify.o
 obj-$(CONFIG_DEVTMPFS)	+= devtmpfs.o
 obj-y			+= power/
 obj-$(CONFIG_ISA_BUS_API)	+= isa.o
diff --git a/drivers/base/notify.c b/drivers/base/notify.c
new file mode 100644
index 000000000000..1c4bb55e387b
--- /dev/null
+++ b/drivers/base/notify.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Event notifications.
+ *
+ * Copyright (C) 2019 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/watch_queue.h>
+#include <linux/syscalls.h>
+#include <linux/init_task.h>
+
+/*
+ * Global queue for watching for device layer events.
+ */
+static struct watch_list device_watchers = {
+	.watchers	= HLIST_HEAD_INIT,
+	.lock		= __SPIN_LOCK_UNLOCKED(&device_watchers.lock),
+};
+
+static DEFINE_SPINLOCK(device_watchers_lock);
+
+/**
+ * post_device_notification - Post notification of a device event
+ * @n - The notification to post
+ * @id - The device ID
+ *
+ * Note that there's only a global queue to which all events are posted.  Might
+ * want to provide per-dev queues also.
+ */
+void post_device_notification(struct watch_notification *n, u64 id)
+{
+	post_watch_notification(&device_watchers, n, &init_cred, id);
+}
+
+/**
+ * sys_device_notify - Watch for superdevice events.
+ * @watch_fd: The watch queue to send notifications to.
+ * @watch_id: The watch ID to be placed in the notification (-1 to remove watch)
+ */
+SYSCALL_DEFINE2(device_notify, int, watch_fd, int, watch_id)
+{
+	struct watch_queue *wqueue;
+	struct watch_list *wlist = &device_watchers;
+	struct watch *watch;
+	long ret = -ENOMEM;
+	u64 id = 0; /* Might want to allow dev# here. */
+
+	if (watch_id < -1 || watch_id > 0xff)
+		return -EINVAL;
+
+	wqueue = get_watch_queue(watch_fd);
+	if (IS_ERR(wqueue)) {
+		ret = PTR_ERR(wqueue);
+		goto err;
+	}
+
+	if (watch_id >= 0) {
+		watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+		if (!watch)
+			goto err_wqueue;
+
+		init_watch(watch, wqueue);
+		watch->id	= id;
+		watch->info_id	= (u32)watch_id << WATCH_INFO_ID__SHIFT;
+
+		spin_lock(&device_watchers_lock);
+		ret = add_watch_to_object(watch, wlist);
+		spin_unlock(&device_watchers_lock);
+		if (ret < 0)
+			kfree(watch);
+	} else {
+		spin_lock(&device_watchers_lock);
+		ret = remove_watch_from_object(wlist, wqueue, id, false);
+		spin_unlock(&device_watchers_lock);
+	}
+
+err_wqueue:
+	put_watch_queue(wqueue);
+err:
+	return ret;
+}
diff --git a/include/linux/device.h b/include/linux/device.h
index e85264fb6616..c947c078b1be 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -26,6 +26,7 @@
 #include <linux/uidgid.h>
 #include <linux/gfp.h>
 #include <linux/overflow.h>
+#include <linux/watch_queue.h>
 #include <asm/device.h>
 
 struct device;
@@ -1396,6 +1397,12 @@ struct device_link *device_link_add(struct device *consumer,
 void device_link_del(struct device_link *link);
 void device_link_remove(void *consumer, struct device *supplier);
 
+#ifdef CONFIG_DEVICE_NOTIFICATIONS
+extern void post_device_notification(struct watch_notification *n, u64 id);
+#else
+static inline void post_device_notification(struct watch_notification *n, u64 id) {}
+#endif
+
 #ifndef dev_fmt
 #define dev_fmt(fmt) fmt
 #endif
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 204a6dbcc34a..8cd9ec564d01 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1005,6 +1005,7 @@ asmlinkage long sys_mount_notify(int dfd, const char __user *path,
 				 unsigned int at_flags, int watch_fd, int watch_id);
 asmlinkage long sys_sb_notify(int dfd, const char __user *path,
 			      unsigned int at_flags, int watch_fd, int watch_id);
+asmlinkage long sys_device_notify(int watch_fd, int watch_id);
 
 /*
  * Architecture-specific system calls
diff --git a/include/linux/watch_queue.h b/include/linux/watch_queue.h
index 91777119db5e..8a5d586dfdf8 100644
--- a/include/linux/watch_queue.h
+++ b/include/linux/watch_queue.h
@@ -12,10 +12,12 @@
 
 #include <uapi/linux/watch_queue.h>
 #include <linux/kref.h>
+#include <linux/rcupdate.h>
 
 #ifdef CONFIG_WATCH_QUEUE
 
 struct watch_queue;
+struct cred;
 
 /*
  * Representation of a watch on an object.
@@ -53,6 +55,7 @@ extern void put_watch_queue(struct watch_queue *);
 extern void init_watch(struct watch *, struct watch_queue *);
 extern int add_watch_to_object(struct watch *, struct watch_list *);
 extern int remove_watch_from_object(struct watch_list *, struct watch_queue *, u64, bool);
+extern void post_device_notification(struct watch_notification *, u64);
 
 static inline void init_watch_list(struct watch_list *wlist,
 				   void (*release_watch)(struct watch *))
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 565d1e3d1bed..580374089f8d 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -51,6 +51,7 @@ COND_SYSCALL_COMPAT(io_pgetevents);
 COND_SYSCALL(io_uring_setup);
 COND_SYSCALL(io_uring_enter);
 COND_SYSCALL(io_uring_register);
+COND_SYSCALL(device_notify);
 
 /* fs/xattr.c */
 

^ permalink raw reply related

* [PATCH 06/10] fsinfo: Export superblock notification counter [ver #3]
From: David Howells @ 2019-06-06  9:42 UTC (permalink / raw)
  To: viro
  Cc: dhowells, raven, linux-fsdevel, linux-api, linux-block, keyrings,
	linux-security-module, linux-kernel
In-Reply-To: <155981411940.17513.7137844619951358374.stgit@warthog.procyon.org.uk>

Provide an fsinfo attribute to export the superblock notification counter
so that it can be polled in the case of a notification buffer overrun.
This is accessed with:

	struct fsinfo_params params = {
		.request = FSINFO_ATTR_SB_NOTIFICATIONS,
	};

and returns a structure that looks like:

	struct fsinfo_sb_notifications {
		__u64	watch_id;
		__u32	notify_counter;
		__u32	__reserved[1];
	};

Where watch_id is a number uniquely identifying the superblock in
notification records and notify_counter is incremented for each
superblock notification posted.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 fs/fsinfo.c                      |   12 ++++++++++++
 fs/super.c                       |    1 +
 include/linux/fs.h               |    1 +
 include/uapi/linux/fsinfo.h      |   10 ++++++++++
 include/uapi/linux/watch_queue.h |    2 +-
 samples/vfs/test-fsinfo.c        |   13 +++++++++++++
 6 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/fs/fsinfo.c b/fs/fsinfo.c
index 3ec64d3cba08..1456e26d2f7c 100644
--- a/fs/fsinfo.c
+++ b/fs/fsinfo.c
@@ -284,6 +284,16 @@ static int fsinfo_generic_param_enum(struct file_system_type *f,
 	return sizeof(*p);
 }
 
+static int fsinfo_generic_sb_notifications(struct path *path,
+					   struct fsinfo_sb_notifications *p)
+{
+	struct super_block *sb = path->dentry->d_sb;
+
+	p->watch_id		= sb->s_unique_id;
+	p->notify_counter	= atomic_read(&sb->s_notify_counter);
+	return sizeof(*p);
+}
+
 static void fsinfo_insert_sb_flag_parameters(struct path *path,
 					     struct fsinfo_kparams *params)
 {
@@ -331,6 +341,7 @@ int generic_fsinfo(struct path *path, struct fsinfo_kparams *params)
 	case _genp(MOUNT_DEVNAME,	mount_devname);
 	case _genp(MOUNT_CHILDREN,	mount_children);
 	case _genp(MOUNT_SUBMOUNT,	mount_submount);
+	case _gen(SB_NOTIFICATIONS,	sb_notifications);
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -606,6 +617,7 @@ static const struct fsinfo_attr_info fsinfo_buffer_info[FSINFO_ATTR__NR] = {
 	FSINFO_STRING_N		(SERVER_NAME,		server_name),
 	FSINFO_STRUCT_NM	(SERVER_ADDRESS,	server_address),
 	FSINFO_STRING		(CELL_NAME,		cell_name),
+	FSINFO_STRUCT		(SB_NOTIFICATIONS,	sb_notifications),
 };
 
 /**
diff --git a/fs/super.c b/fs/super.c
index cddf23f1d648..da428702e725 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1823,6 +1823,7 @@ EXPORT_SYMBOL(thaw_super);
  */
 void post_sb_notification(struct super_block *s, struct superblock_notification *n)
 {
+	atomic_inc(&s->s_notify_counter);
 	post_watch_notification(s->s_watchers, &n->watch, current_cred(),
 				s->s_unique_id);
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 02ba4bfb9cc3..06e272a25ed7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1536,6 +1536,7 @@ struct super_block {
 #ifdef CONFIG_SB_NOTIFICATIONS
 	struct watch_list	*s_watchers;
 #endif
+	atomic_t		s_notify_counter;
 } __randomize_layout;
 
 /* Helper functions so that in most cases filesystems will
diff --git a/include/uapi/linux/fsinfo.h b/include/uapi/linux/fsinfo.h
index 7247088332c2..b4c9446305bb 100644
--- a/include/uapi/linux/fsinfo.h
+++ b/include/uapi/linux/fsinfo.h
@@ -39,6 +39,7 @@ enum fsinfo_attribute {
 	FSINFO_ATTR_SERVER_NAME		= 21,	/* Name of the Nth server (string) */
 	FSINFO_ATTR_SERVER_ADDRESS	= 22,	/* Mth address of the Nth server */
 	FSINFO_ATTR_CELL_NAME		= 23,	/* Cell name (string) */
+	FSINFO_ATTR_SB_NOTIFICATIONS	= 24,	/* sb_notify() information */
 	FSINFO_ATTR__NR
 };
 
@@ -308,4 +309,13 @@ struct fsinfo_server_address {
 	struct __kernel_sockaddr_storage address;
 };
 
+/*
+ * Information struct for fsinfo(FSINFO_ATTR_SB_NOTIFICATIONS).
+ */
+struct fsinfo_sb_notifications {
+	__u64		watch_id;	/* Watch ID for superblock. */
+	__u32		notify_counter;	/* Number of notifications. */
+	__u32		__reserved[1];
+};
+
 #endif /* _UAPI_LINUX_FSINFO_H */
diff --git a/include/uapi/linux/watch_queue.h b/include/uapi/linux/watch_queue.h
index 66b0da7cf888..aeffcfd7a742 100644
--- a/include/uapi/linux/watch_queue.h
+++ b/include/uapi/linux/watch_queue.h
@@ -146,7 +146,7 @@ enum superblock_notification_type {
  */
 struct superblock_notification {
 	struct watch_notification watch; /* WATCH_TYPE_SB_NOTIFY */
-	__u64	sb_id;			/* 64-bit superblock ID [fsinfo_ids::f_sb_id] */
+	__u64	sb_id;		/* 64-bit superblock ID [fsinfo_sb_notifications::watch_id] */
 };
 
 struct superblock_error_notification {
diff --git a/samples/vfs/test-fsinfo.c b/samples/vfs/test-fsinfo.c
index af29da74559e..0f8f9ded0925 100644
--- a/samples/vfs/test-fsinfo.c
+++ b/samples/vfs/test-fsinfo.c
@@ -90,6 +90,7 @@ static const struct fsinfo_attr_info fsinfo_buffer_info[FSINFO_ATTR__NR] = {
 	FSINFO_STRING_N		(SERVER_NAME,		server_name),
 	FSINFO_STRUCT_NM	(SERVER_ADDRESS,	server_address),
 	FSINFO_STRING		(CELL_NAME,		cell_name),
+	FSINFO_STRUCT		(SB_NOTIFICATIONS,	sb_notifications),
 };
 
 #define FSINFO_NAME(X,Y) [FSINFO_ATTR_##X] = #Y
@@ -118,6 +119,7 @@ static const char *fsinfo_attr_names[FSINFO_ATTR__NR] = {
 	FSINFO_NAME		(SERVER_NAME,		server_name),
 	FSINFO_NAME		(SERVER_ADDRESS,	server_address),
 	FSINFO_NAME		(CELL_NAME,		cell_name),
+	FSINFO_NAME		(SB_NOTIFICATIONS,	sb_notifications),
 };
 
 union reply {
@@ -133,6 +135,7 @@ union reply {
 	struct fsinfo_mount_info mount_info;
 	struct fsinfo_mount_child mount_children[1];
 	struct fsinfo_server_address srv_addr;
+	struct fsinfo_sb_notifications sb_notifications;
 };
 
 static void dump_hex(unsigned int *data, int from, int to)
@@ -377,6 +380,15 @@ static void dump_attr_MOUNT_CHILDREN(union reply *r, int size)
 		printf("\t[%u] %8x %8x\n", i++, f->mnt_id, f->notify_counter);
 }
 
+static void dump_attr_SB_NOTIFICATIONS(union reply *r, int size)
+{
+	struct fsinfo_sb_notifications *f = &r->sb_notifications;
+
+	printf("\n");
+	printf("\twatch_id: %llx\n", (unsigned long long)f->watch_id);
+	printf("\tnotifs  : %llx\n", (unsigned long long)f->notify_counter);
+}
+
 /*
  *
  */
@@ -395,6 +407,7 @@ static const dumper_t fsinfo_attr_dumper[FSINFO_ATTR__NR] = {
 	FSINFO_DUMPER(MOUNT_INFO),
 	FSINFO_DUMPER(MOUNT_CHILDREN),
 	FSINFO_DUMPER(SERVER_ADDRESS),
+	FSINFO_DUMPER(SB_NOTIFICATIONS),
 };
 
 static void dump_fsinfo(enum fsinfo_attribute attr,

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox