* [PATCH 03/18] fsinfo: Provide a bitmap of the features a filesystem supports [ver #21]
From: David Howells @ 2020-08-03 13:36 UTC (permalink / raw)
To: viro
Cc: dhowells, torvalds, raven, mszeredi, christian, jannh,
darrick.wong, kzak, jlayton, linux-api, linux-fsdevel,
linux-security-module, linux-kernel
In-Reply-To: <159646178122.1784947.11705396571718464082.stgit@warthog.procyon.org.uk>
Provide a bitmap of features that a filesystem may provide for the path
being queried. Features include such things as:
(1) The general class of filesystem, such as kernel-interface,
block-based, flash-based, network-based.
(2) Supported inode features, such as which timestamps are supported,
whether simple numeric user, group or project IDs are supported and
whether user identification is actually more complex behind the
scenes.
(3) Supported volume features, such as it having a UUID, a name or a
filesystem ID.
(4) Supported filesystem features, such as what types of file are
supported, whether sparse files, extended attributes and quotas are
supported.
(5) Supported interface features, such as whether locking and leases are
supported, what open flags are honoured and how i_version is managed.
For some filesystems, this may be an immutable set and can just be memcpy'd
into the reply buffer.
Signed-off-by: David Howells <dhowells@redhat.com>
---
fs/fsinfo.c | 34 +++++++++++++++++++++
include/linux/fsinfo.h | 38 +++++++++++++++++++++++
include/uapi/linux/fsinfo.h | 68 ++++++++++++++++++++++++++++++++++++++++++
samples/vfs/test-fsinfo.c | 70 +++++++++++++++++++++++++++++++++++++++++++
4 files changed, 210 insertions(+)
diff --git a/fs/fsinfo.c b/fs/fsinfo.c
index 7d9c73e9cbde..79c222d465d8 100644
--- a/fs/fsinfo.c
+++ b/fs/fsinfo.c
@@ -131,6 +131,39 @@ int fsinfo_generic_supports(struct path *path, struct fsinfo_context *ctx)
}
EXPORT_SYMBOL(fsinfo_generic_supports);
+/*
+ * Generic FSINFO_ATTR_FEATURES handler: derive what can be told about a
+ * filesystem from its superblock alone and report it as a feature bitmap.
+ * Returns the size of the reply struct.
+ */
+int fsinfo_generic_features(struct path *path, struct fsinfo_context *ctx)
+{
+	struct super_block *sb = path->dentry->d_sb;
+	struct fsinfo_features *feat = ctx->buffer;
+
+	fsinfo_init_features(feat);
+
+	/* These three timestamps are reported unconditionally. */
+	fsinfo_set_feature(feat, FSINFO_FEAT_HAS_ATIME);
+	fsinfo_set_feature(feat, FSINFO_FEAT_HAS_CTIME);
+	fsinfo_set_feature(feat, FSINFO_FEAT_HAS_MTIME);
+
+	/* Backing store: an MTD device is checked before a block device. */
+	if (sb->s_mtd)
+		fsinfo_set_feature(feat, FSINFO_FEAT_IS_FLASH_FS);
+	else if (sb->s_bdev)
+		fsinfo_set_feature(feat, FSINFO_FEAT_IS_BLOCK_FS);
+
+	/* Superblock flags. */
+	if (sb->s_flags & SB_MANDLOCK)
+		fsinfo_set_feature(feat, FSINFO_FEAT_MAND_LOCKS);
+	if (sb->s_flags & SB_POSIXACL)
+		fsinfo_set_feature(feat, FSINFO_FEAT_HAS_ACL);
+
+	/* Quota types recorded in the superblock. */
+	if (sb->s_quota_types & QTYPE_MASK_USR)
+		fsinfo_set_feature(feat, FSINFO_FEAT_USER_QUOTAS);
+	if (sb->s_quota_types & QTYPE_MASK_GRP)
+		fsinfo_set_feature(feat, FSINFO_FEAT_GROUP_QUOTAS);
+	if (sb->s_quota_types & QTYPE_MASK_PRJ)
+		fsinfo_set_feature(feat, FSINFO_FEAT_PROJECT_QUOTAS);
+
+	/* Automounting needs a ->d_automount dentry operation. */
+	if (sb->s_d_op && sb->s_d_op->d_automount)
+		fsinfo_set_feature(feat, FSINFO_FEAT_AUTOMOUNTS);
+
+	/* A non-empty s_id string counts as a volume ID. */
+	if (sb->s_id[0])
+		fsinfo_set_feature(feat, FSINFO_FEAT_VOLUME_ID);
+
+	return sizeof(*feat);
+}
+EXPORT_SYMBOL(fsinfo_generic_features);
+
static const struct fsinfo_timestamp_info fsinfo_default_timestamp_info = {
.atime = {
.minimum = S64_MIN,
@@ -206,6 +239,7 @@ static const struct fsinfo_attribute fsinfo_common_attributes[] = {
FSINFO_VSTRUCT (FSINFO_ATTR_TIMESTAMP_INFO, fsinfo_generic_timestamp_info),
FSINFO_STRING (FSINFO_ATTR_VOLUME_ID, fsinfo_generic_volume_id),
FSINFO_VSTRUCT (FSINFO_ATTR_VOLUME_UUID, fsinfo_generic_volume_uuid),
+ FSINFO_VSTRUCT (FSINFO_ATTR_FEATURES, fsinfo_generic_features),
FSINFO_LIST (FSINFO_ATTR_FSINFO_ATTRIBUTES, (void *)123UL),
FSINFO_VSTRUCT_N(FSINFO_ATTR_FSINFO_ATTRIBUTE_INFO, (void *)123UL),
diff --git a/include/linux/fsinfo.h b/include/linux/fsinfo.h
index a811d69b02ff..517edd5a2791 100644
--- a/include/linux/fsinfo.h
+++ b/include/linux/fsinfo.h
@@ -68,6 +68,44 @@ extern int fsinfo_generic_supports(struct path *, struct fsinfo_context *);
extern int fsinfo_generic_limits(struct path *, struct fsinfo_context *);
extern int fsinfo_get_attribute(struct path *, struct fsinfo_context *,
const struct fsinfo_attribute *);
+extern int fsinfo_generic_features(struct path *, struct fsinfo_context *);
+
+static inline void fsinfo_init_features(struct fsinfo_features *p)
+{
+ p->nr_features = FSINFO_FEAT__NR; /* only the count is set; features[] is not cleared here */
+}
+
+/* Mark @feature as present in @p: feature N is bit N % 8 of byte N / 8. */
+static inline void fsinfo_set_feature(struct fsinfo_features *p,
+				      enum fsinfo_feature feature)
+{
+	unsigned int byte = feature / 8;
+	unsigned int mask = 1 << (feature % 8);
+
+	p->features[byte] |= mask;
+}
+
+/* Mark @feature as absent in @p: feature N is bit N % 8 of byte N / 8. */
+static inline void fsinfo_clear_feature(struct fsinfo_features *p,
+					enum fsinfo_feature feature)
+{
+	unsigned int byte = feature / 8;
+	unsigned int mask = 1 << (feature % 8);
+
+	p->features[byte] &= ~mask;
+}
+
+/**
+ * fsinfo_set_unix_features - Set standard UNIX features.
+ * @p: The features mask to alter
+ *
+ * Sets the bits for numeric UIDs/GIDs, directories, symlinks, hard links,
+ * device files, UNIX special files, sparse files, atime/ctime/mtime and
+ * inode numbers.  Bits that are already set are left alone; nothing is
+ * cleared.
+ */
+static inline void fsinfo_set_unix_features(struct fsinfo_features *p)
+{
+ fsinfo_set_feature(p, FSINFO_FEAT_UIDS);
+ fsinfo_set_feature(p, FSINFO_FEAT_GIDS);
+ fsinfo_set_feature(p, FSINFO_FEAT_DIRECTORIES);
+ fsinfo_set_feature(p, FSINFO_FEAT_SYMLINKS);
+ fsinfo_set_feature(p, FSINFO_FEAT_HARD_LINKS);
+ fsinfo_set_feature(p, FSINFO_FEAT_DEVICE_FILES);
+ fsinfo_set_feature(p, FSINFO_FEAT_UNIX_SPECIALS);
+ fsinfo_set_feature(p, FSINFO_FEAT_SPARSE);
+ fsinfo_set_feature(p, FSINFO_FEAT_HAS_ATIME);
+ fsinfo_set_feature(p, FSINFO_FEAT_HAS_CTIME);
+ fsinfo_set_feature(p, FSINFO_FEAT_HAS_MTIME);
+ fsinfo_set_feature(p, FSINFO_FEAT_HAS_INODE_NUMBERS);
+}
#endif /* CONFIG_FSINFO */
diff --git a/include/uapi/linux/fsinfo.h b/include/uapi/linux/fsinfo.h
index 65892239ba86..b8b2c836267b 100644
--- a/include/uapi/linux/fsinfo.h
+++ b/include/uapi/linux/fsinfo.h
@@ -23,6 +23,7 @@
#define FSINFO_ATTR_VOLUME_ID 0x05 /* Volume ID (string) */
#define FSINFO_ATTR_VOLUME_UUID 0x06 /* Volume UUID (LE uuid) */
#define FSINFO_ATTR_VOLUME_NAME 0x07 /* Volume name (string) */
+#define FSINFO_ATTR_FEATURES 0x08 /* Filesystem features (bits) */
#define FSINFO_ATTR_FSINFO_ATTRIBUTE_INFO 0x100 /* Information about attr N (for path) */
#define FSINFO_ATTR_FSINFO_ATTRIBUTES 0x101 /* List of supported attrs (for path) */
@@ -157,6 +158,73 @@ struct fsinfo_supports {
#define FSINFO_ATTR_SUPPORTS__STRUCT struct fsinfo_supports
+/*
+ * Information struct for fsinfo(FSINFO_ATTR_FEATURES).
+ *
+ * Bitmask of those filesystem features that can be rendered as single bits.
+ * Each enumerator below is a bit number: feature N is stored in bit (N % 8)
+ * of byte (N / 8) of the features[] array in struct fsinfo_features.
+ */
+enum fsinfo_feature {
+ FSINFO_FEAT_IS_KERNEL_FS = 0, /* fs is kernel-special filesystem */
+ FSINFO_FEAT_IS_BLOCK_FS = 1, /* fs is block-based filesystem */
+ FSINFO_FEAT_IS_FLASH_FS = 2, /* fs is flash filesystem */
+ FSINFO_FEAT_IS_NETWORK_FS = 3, /* fs is network filesystem */
+ FSINFO_FEAT_IS_AUTOMOUNTER_FS = 4, /* fs is automounter special filesystem */
+ FSINFO_FEAT_IS_MEMORY_FS = 5, /* fs is memory-based filesystem */
+ FSINFO_FEAT_AUTOMOUNTS = 6, /* fs supports automounts */
+ FSINFO_FEAT_ADV_LOCKS = 7, /* fs supports advisory file locking */
+ FSINFO_FEAT_MAND_LOCKS = 8, /* fs supports mandatory file locking */
+ FSINFO_FEAT_LEASES = 9, /* fs supports file leases */
+ FSINFO_FEAT_UIDS = 10, /* fs supports numeric uids */
+ FSINFO_FEAT_GIDS = 11, /* fs supports numeric gids */
+ FSINFO_FEAT_PROJIDS = 12, /* fs supports numeric project ids */
+ FSINFO_FEAT_STRING_USER_IDS = 13, /* fs supports string user identifiers */
+ FSINFO_FEAT_GUID_USER_IDS = 14, /* fs supports GUID user identifiers */
+ FSINFO_FEAT_WINDOWS_ATTRS = 15, /* fs has windows attributes */
+ FSINFO_FEAT_USER_QUOTAS = 16, /* fs has per-user quotas */
+ FSINFO_FEAT_GROUP_QUOTAS = 17, /* fs has per-group quotas */
+ FSINFO_FEAT_PROJECT_QUOTAS = 18, /* fs has per-project quotas */
+ FSINFO_FEAT_XATTRS = 19, /* fs has xattrs */
+ FSINFO_FEAT_JOURNAL = 20, /* fs has a journal */
+ FSINFO_FEAT_DATA_IS_JOURNALLED = 21, /* fs is using data journalling */
+ FSINFO_FEAT_O_SYNC = 22, /* fs supports O_SYNC */
+ FSINFO_FEAT_O_DIRECT = 23, /* fs supports O_DIRECT */
+ FSINFO_FEAT_VOLUME_ID = 24, /* fs has a volume ID */
+ FSINFO_FEAT_VOLUME_UUID = 25, /* fs has a volume UUID */
+ FSINFO_FEAT_VOLUME_NAME = 26, /* fs has a volume name */
+ FSINFO_FEAT_VOLUME_FSID = 27, /* fs has a volume FSID */
+ FSINFO_FEAT_IVER_ALL_CHANGE = 28, /* i_version represents data + meta changes */
+ FSINFO_FEAT_IVER_DATA_CHANGE = 29, /* i_version represents data changes only */
+ FSINFO_FEAT_IVER_MONO_INCR = 30, /* i_version incremented monotonically */
+ FSINFO_FEAT_DIRECTORIES = 31, /* fs supports (sub)directories */
+ FSINFO_FEAT_SYMLINKS = 32, /* fs supports symlinks */
+ FSINFO_FEAT_HARD_LINKS = 33, /* fs supports hard links */
+ FSINFO_FEAT_HARD_LINKS_1DIR = 34, /* fs supports hard links in same dir only */
+ FSINFO_FEAT_DEVICE_FILES = 35, /* fs supports bdev, cdev */
+ FSINFO_FEAT_UNIX_SPECIALS = 36, /* fs supports pipe, fifo, socket */
+ FSINFO_FEAT_RESOURCE_FORKS = 37, /* fs supports resource forks/streams */
+ FSINFO_FEAT_NAME_CASE_INDEP = 38, /* Filename case independence is mandatory */
+ FSINFO_FEAT_NAME_CASE_FOLD = 39, /* Filename case is folded on medium */
+ FSINFO_FEAT_NAME_NON_UTF8 = 40, /* fs has non-utf8 names */
+ FSINFO_FEAT_NAME_HAS_CODEPAGE = 41, /* fs has a filename codepage */
+ FSINFO_FEAT_SPARSE = 42, /* fs supports sparse files */
+ FSINFO_FEAT_NOT_PERSISTENT = 43, /* fs is not persistent */
+ FSINFO_FEAT_NO_UNIX_MODE = 44, /* fs does not support unix mode bits */
+ FSINFO_FEAT_HAS_ATIME = 45, /* fs supports access time */
+ FSINFO_FEAT_HAS_BTIME = 46, /* fs supports birth/creation time */
+ FSINFO_FEAT_HAS_CTIME = 47, /* fs supports change time */
+ FSINFO_FEAT_HAS_MTIME = 48, /* fs supports modification time */
+ FSINFO_FEAT_HAS_ACL = 49, /* fs supports ACLs of some sort */
+ FSINFO_FEAT_HAS_INODE_NUMBERS = 50, /* fs has inode numbers */
+ FSINFO_FEAT__NR /* Number of feature bits defined above */
+};
+
+struct fsinfo_features {
+ __u32 nr_features; /* Number of supported features (FSINFO_FEAT__NR) */
+ __u8 features[(FSINFO_FEAT__NR + 7) / 8]; /* One bit per enum fsinfo_feature value, rounded up to whole bytes */
+};
+
+#define FSINFO_ATTR_FEATURES__STRUCT struct fsinfo_features
+
struct fsinfo_timestamp_one {
__s64 minimum; /* Minimum timestamp value in seconds */
__s64 maximum; /* Maximum timestamp value in seconds */
diff --git a/samples/vfs/test-fsinfo.c b/samples/vfs/test-fsinfo.c
index 934b25399ffe..c5932109f683 100644
--- a/samples/vfs/test-fsinfo.c
+++ b/samples/vfs/test-fsinfo.c
@@ -190,6 +190,75 @@ static void dump_fsinfo_generic_supports(void *reply, unsigned int size)
printf("\twin_fattrs : %x\n", f->win_file_attrs);
}
+/*
+ * Printable names for the feature bits, indexed by enum fsinfo_feature.  Each
+ * entry is the enumerator name with the FSINFO_FEAT_ prefix removed.  A
+ * feature without an entry here is left as a NULL slot.  Both the strings and
+ * the table itself are read-only.
+ */
+#define FSINFO_FEATURE_NAME(C) [FSINFO_FEAT_##C] = #C
+static const char *const fsinfo_feature_names[FSINFO_FEAT__NR] = {
+	FSINFO_FEATURE_NAME(IS_KERNEL_FS),
+	FSINFO_FEATURE_NAME(IS_BLOCK_FS),
+	FSINFO_FEATURE_NAME(IS_FLASH_FS),
+	FSINFO_FEATURE_NAME(IS_NETWORK_FS),
+	FSINFO_FEATURE_NAME(IS_AUTOMOUNTER_FS),
+	FSINFO_FEATURE_NAME(IS_MEMORY_FS),
+	FSINFO_FEATURE_NAME(AUTOMOUNTS),
+	FSINFO_FEATURE_NAME(ADV_LOCKS),
+	FSINFO_FEATURE_NAME(MAND_LOCKS),
+	FSINFO_FEATURE_NAME(LEASES),
+	FSINFO_FEATURE_NAME(UIDS),
+	FSINFO_FEATURE_NAME(GIDS),
+	FSINFO_FEATURE_NAME(PROJIDS),
+	FSINFO_FEATURE_NAME(STRING_USER_IDS),
+	FSINFO_FEATURE_NAME(GUID_USER_IDS),
+	FSINFO_FEATURE_NAME(WINDOWS_ATTRS),
+	FSINFO_FEATURE_NAME(USER_QUOTAS),
+	FSINFO_FEATURE_NAME(GROUP_QUOTAS),
+	FSINFO_FEATURE_NAME(PROJECT_QUOTAS),
+	FSINFO_FEATURE_NAME(XATTRS),
+	FSINFO_FEATURE_NAME(JOURNAL),
+	FSINFO_FEATURE_NAME(DATA_IS_JOURNALLED),
+	FSINFO_FEATURE_NAME(O_SYNC),
+	FSINFO_FEATURE_NAME(O_DIRECT),
+	FSINFO_FEATURE_NAME(VOLUME_ID),
+	FSINFO_FEATURE_NAME(VOLUME_UUID),
+	FSINFO_FEATURE_NAME(VOLUME_NAME),
+	FSINFO_FEATURE_NAME(VOLUME_FSID),
+	FSINFO_FEATURE_NAME(IVER_ALL_CHANGE),
+	FSINFO_FEATURE_NAME(IVER_DATA_CHANGE),
+	FSINFO_FEATURE_NAME(IVER_MONO_INCR),
+	FSINFO_FEATURE_NAME(DIRECTORIES),
+	FSINFO_FEATURE_NAME(SYMLINKS),
+	FSINFO_FEATURE_NAME(HARD_LINKS),
+	FSINFO_FEATURE_NAME(HARD_LINKS_1DIR),
+	FSINFO_FEATURE_NAME(DEVICE_FILES),
+	FSINFO_FEATURE_NAME(UNIX_SPECIALS),
+	FSINFO_FEATURE_NAME(RESOURCE_FORKS),
+	FSINFO_FEATURE_NAME(NAME_CASE_INDEP),
+	FSINFO_FEATURE_NAME(NAME_CASE_FOLD),
+	FSINFO_FEATURE_NAME(NAME_NON_UTF8),
+	FSINFO_FEATURE_NAME(NAME_HAS_CODEPAGE),
+	FSINFO_FEATURE_NAME(SPARSE),
+	FSINFO_FEATURE_NAME(NOT_PERSISTENT),
+	FSINFO_FEATURE_NAME(NO_UNIX_MODE),
+	FSINFO_FEATURE_NAME(HAS_ATIME),
+	FSINFO_FEATURE_NAME(HAS_BTIME),
+	FSINFO_FEATURE_NAME(HAS_CTIME),
+	FSINFO_FEATURE_NAME(HAS_MTIME),
+	FSINFO_FEATURE_NAME(HAS_ACL),
+	FSINFO_FEATURE_NAME(HAS_INODE_NUMBERS),
+};
+
+/*
+ * Dump the reply to FSINFO_ATTR_FEATURES: first the raw bitmap bytes in hex
+ * followed by the feature count from the reply, then one line per feature
+ * bit that is set.
+ *
+ * @reply: The reply buffer, interpreted as a struct fsinfo_features
+ * @size: The size of the reply (not used here)
+ */
+static void dump_fsinfo_generic_features(void *reply, unsigned int size)
+{
+	struct fsinfo_features *f = reply;
+	unsigned int i;	/* unsigned: compared against sizeof() below */
+
+	printf("\n\t");
+	for (i = 0; i < sizeof(f->features); i++)
+		printf("%02x", f->features[i]);
+	printf(" (nr=%u)\n", f->nr_features);
+
+	for (i = 0; i < FSINFO_FEAT__NR; i++) {
+		const char *name = fsinfo_feature_names[i];
+
+		if (!(f->features[i / 8] & (1 << (i % 8))))
+			continue;
+
+		/*
+		 * A feature added to the UAPI enum but not to the name table
+		 * leaves a NULL slot, and passing NULL to %s is undefined.
+		 */
+		if (name)
+			printf("\t- %s\n", name);
+		else
+			printf("\t- <feature %u>\n", i);
+	}
+}
+
static void print_time(struct fsinfo_timestamp_one *t, char stamp)
{
printf("\t%ctime : gran=%uE%d range=%llx-%llx\n",
@@ -290,6 +359,7 @@ static const struct fsinfo_attribute fsinfo_attributes[] = {
FSINFO_VSTRUCT (FSINFO_ATTR_IDS, fsinfo_generic_ids),
FSINFO_VSTRUCT (FSINFO_ATTR_LIMITS, fsinfo_generic_limits),
FSINFO_VSTRUCT (FSINFO_ATTR_SUPPORTS, fsinfo_generic_supports),
+ FSINFO_VSTRUCT (FSINFO_ATTR_FEATURES, fsinfo_generic_features),
FSINFO_VSTRUCT (FSINFO_ATTR_TIMESTAMP_INFO, fsinfo_generic_timestamp_info),
FSINFO_STRING (FSINFO_ATTR_VOLUME_ID, string),
FSINFO_VSTRUCT (FSINFO_ATTR_VOLUME_UUID, fsinfo_generic_volume_uuid),
^ permalink raw reply related
* [PATCH 02/18] fsinfo: Add fsinfo() syscall to query filesystem information [ver #21]
From: David Howells @ 2020-08-03 13:36 UTC (permalink / raw)
To: viro
Cc: linux-api, dhowells, torvalds, raven, mszeredi, christian, jannh,
darrick.wong, kzak, jlayton, linux-api, linux-fsdevel,
linux-security-module, linux-kernel
In-Reply-To: <159646178122.1784947.11705396571718464082.stgit@warthog.procyon.org.uk>
Add a system call to allow filesystem information to be queried. A request
value can be given to indicate the desired attribute. Support is provided
for enumerating multi-value attributes.
===============
NEW SYSTEM CALL
===============
The new system call looks like:
int ret = fsinfo(int dfd,
const char *pathname,
const struct fsinfo_params *params,
size_t params_size,
void *result_buffer,
size_t result_buf_size);
The params parameter optionally points to a block of parameters:
struct fsinfo_params {
__u64 resolve_flags;
__u32 at_flags;
__u32 flags;
__u32 request;
__u32 Nth;
__u32 Mth;
};
If params is NULL, the default is that params->request is
FSINFO_ATTR_STATFS and all the other fields are 0. params_size indicates
the size of the parameter struct. If the parameter block is short compared
to what the kernel expects, the missing length will be set to 0; if the
parameter block is longer, an error will be given if the excess is not all
zeros.
The object to be queried is specified as follows - part of params->flags
indicates the type of reference:
(1) FSINFO_FLAGS_QUERY_PATH - dfd, pathname and at_flags indicate a
filesystem object to query.
There is no separate system call providing an analogue of lstat() -
AT_SYMLINK_NOFOLLOW should be set in at_flags instead.
AT_NO_AUTOMOUNT can also be used to allow an automount point to be
queried without triggering it.
RESOLVE_* flags can also be set in resolve_flags to further restrict
the pathwalk.
(2) FSINFO_FLAGS_QUERY_FD - dfd indicates a file descriptor pointing to
the filesystem object to query. pathname should be NULL.
(3) FSINFO_FLAGS_QUERY_MOUNT - pathname indicates the numeric ID of the
mountpoint to query as a string. dfd is used to constrain which
mounts can be accessed. If dfd is AT_FDCWD, the mount must be within
the subtree rooted at chroot, otherwise the mount must be within the
subtree rooted at the directory specified by dfd.
(4) In the future FSINFO_FLAGS_QUERY_FSCONTEXT will be added - dfd will
indicate a context handle fd obtained from fsopen() or fspick(),
allowing that to be queried before the target superblock is attached
to the filesystem or even created.
params->request indicates the attribute/attributes to be queried. This can
be one of:
FSINFO_ATTR_STATFS - statfs-style info
FSINFO_ATTR_IDS - Filesystem IDs
FSINFO_ATTR_LIMITS - Filesystem limits
FSINFO_ATTR_SUPPORTS - Support for statx, ioctl, etc.
FSINFO_ATTR_TIMESTAMP_INFO - Inode timestamp info
FSINFO_ATTR_VOLUME_ID - Volume ID (string)
FSINFO_ATTR_VOLUME_UUID - Volume UUID
FSINFO_ATTR_VOLUME_NAME - Volume name (string)
FSINFO_ATTR_FSINFO_ATTRIBUTE_INFO - Information about attr Nth
FSINFO_ATTR_FSINFO_ATTRIBUTES - List of supported attrs
Some attributes (such as the servers backing a network filesystem) can have
multiple values. These can be enumerated by setting params->Nth and
params->Mth to 0, 1, ... until ENODATA is returned.
result_buffer and result_buf_size point to the reply buffer. The buffer is
filled up to the specified size, even if this means truncating the reply.
The size of the full reply is returned, irrespective of the amount of data
that was copied. In future versions, this will allow extra fields to be
tacked on to the end of the reply, but anyone not expecting them will only
get the subset they're expecting. If either result_buffer or result_buf_size
is 0, no copy will take place and the data size will be returned.
Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-api@vger.kernel.org
---
arch/alpha/kernel/syscalls/syscall.tbl | 1
arch/arm/tools/syscall.tbl | 1
arch/arm64/include/asm/unistd.h | 2
arch/arm64/include/asm/unistd32.h | 2
arch/ia64/kernel/syscalls/syscall.tbl | 1
arch/m68k/kernel/syscalls/syscall.tbl | 1
arch/microblaze/kernel/syscalls/syscall.tbl | 1
arch/mips/kernel/syscalls/syscall_n32.tbl | 1
arch/mips/kernel/syscalls/syscall_n64.tbl | 1
arch/mips/kernel/syscalls/syscall_o32.tbl | 1
arch/parisc/kernel/syscalls/syscall.tbl | 1
arch/powerpc/kernel/syscalls/syscall.tbl | 1
arch/s390/kernel/syscalls/syscall.tbl | 1
arch/sh/kernel/syscalls/syscall.tbl | 1
arch/sparc/kernel/syscalls/syscall.tbl | 1
arch/x86/entry/syscalls/syscall_32.tbl | 1
arch/x86/entry/syscalls/syscall_64.tbl | 1
arch/xtensa/kernel/syscalls/syscall.tbl | 1
fs/Kconfig | 7
fs/Makefile | 1
fs/fsinfo.c | 596 +++++++++++++++++++++++++
include/linux/fs.h | 4
include/linux/fsinfo.h | 74 +++
include/linux/syscalls.h | 4
include/uapi/asm-generic/unistd.h | 4
include/uapi/linux/fsinfo.h | 189 ++++++++
kernel/sys_ni.c | 1
samples/vfs/Makefile | 2
samples/vfs/test-fsinfo.c | 646 +++++++++++++++++++++++++++
29 files changed, 1545 insertions(+), 3 deletions(-)
create mode 100644 fs/fsinfo.c
create mode 100644 include/linux/fsinfo.h
create mode 100644 include/uapi/linux/fsinfo.h
create mode 100644 samples/vfs/test-fsinfo.c
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index b6cf8403da35..984abd1ac058 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -479,3 +479,4 @@
548 common pidfd_getfd sys_pidfd_getfd
549 common faccessat2 sys_faccessat2
550 common watch_mount sys_watch_mount
+551 common fsinfo sys_fsinfo
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 27cc1f53f4a0..bd791f91f5bb 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -453,3 +453,4 @@
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
+441 common fsinfo sys_fsinfo
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index b3b2019f8d16..86a9d7b3eabe 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -38,7 +38,7 @@
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
#define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
-#define __NR_compat_syscalls 441
+#define __NR_compat_syscalls 442
#endif
#define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 4f9cf98cdf0f..bd78eb2c487a 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -887,6 +887,8 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
__SYSCALL(__NR_faccessat2, sys_faccessat2)
#define __NR_watch_mount 440
__SYSCALL(__NR_watch_mount, sys_watch_mount)
+#define __NR_fsinfo 441
+__SYSCALL(__NR_fsinfo, sys_fsinfo)
/*
* Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index fc6d87903781..09d144487b7d 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -360,3 +360,4 @@
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
+441 common fsinfo sys_fsinfo
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index c671aa0e4d25..1bdc26af3c54 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -439,3 +439,4 @@
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
+441 common fsinfo sys_fsinfo
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 65cc53f129ef..fb8543122904 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -445,3 +445,4 @@
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
+441 common fsinfo sys_fsinfo
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 7f034a239930..b8362bd6bd4a 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -378,3 +378,4 @@
438 n32 pidfd_getfd sys_pidfd_getfd
439 n32 faccessat2 sys_faccessat2
440 n32 watch_mount sys_watch_mount
+441 n32 fsinfo sys_fsinfo
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index d39b90de3642..60ca4091d378 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -354,3 +354,4 @@
438 n64 pidfd_getfd sys_pidfd_getfd
439 n64 faccessat2 sys_faccessat2
440 n64 watch_mount sys_watch_mount
+441 n64 fsinfo sys_fsinfo
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 09f426cb45b1..07aea9379ca0 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -427,3 +427,4 @@
438 o32 pidfd_getfd sys_pidfd_getfd
439 o32 faccessat2 sys_faccessat2
440 o32 watch_mount sys_watch_mount
+441 o32 fsinfo sys_fsinfo
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 52ff3454baa1..f8060767f11a 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -437,3 +437,4 @@
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
+441 common fsinfo sys_fsinfo
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 10b7ed3c7a1b..3036bf1336d2 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -529,3 +529,4 @@
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
+441 common fsinfo sys_fsinfo
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 86f317bf52df..c0a111fdb3ce 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -442,3 +442,4 @@
438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount sys_watch_mount
+441 common fsinfo sys_fsinfo sys_fsinfo
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 0bb0f0b372c7..03b55c32441f 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -442,3 +442,4 @@
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
+441 common fsinfo sys_fsinfo
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 369ab65c1e9a..a0144db9fb8c 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -485,3 +485,4 @@
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
+441 common fsinfo sys_fsinfo
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index e760ba92c58d..edf90a2be0b9 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -444,3 +444,4 @@
438 i386 pidfd_getfd sys_pidfd_getfd
439 i386 faccessat2 sys_faccessat2
440 i386 watch_mount sys_watch_mount
+441 i386 fsinfo sys_fsinfo
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 5b58621d4f75..ab0eda639d67 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -361,6 +361,7 @@
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
+441 common fsinfo sys_fsinfo
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 5b28ee39f70f..979013890caf 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -410,3 +410,4 @@
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
+441 common fsinfo sys_fsinfo
diff --git a/fs/Kconfig b/fs/Kconfig
index 1a55e56d5c54..df76451ab49a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -15,6 +15,13 @@ config VALIDATE_FS_PARSER
Enable this to perform validation of the parameter description for a
filesystem when it is registered.
+config FSINFO
+ bool "Enable the fsinfo() system call"
+ help
+ Enable the file system information querying system call to allow
+ comprehensive information to be retrieved about a filesystem,
+ superblock or mount object.
+
if BLOCK
config FS_IOMAP
diff --git a/fs/Makefile b/fs/Makefile
index dd0d87e2ef19..93a7f8047585 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_COREDUMP) += coredump.o
obj-$(CONFIG_SYSCTL) += drop_caches.o
obj-$(CONFIG_FHANDLE) += fhandle.o
+obj-$(CONFIG_FSINFO) += fsinfo.o
obj-y += iomap/
obj-y += quota/
diff --git a/fs/fsinfo.c b/fs/fsinfo.c
new file mode 100644
index 000000000000..7d9c73e9cbde
--- /dev/null
+++ b/fs/fsinfo.c
@@ -0,0 +1,596 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Filesystem information query.
+ *
+ * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+#include <linux/syscalls.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/statfs.h>
+#include <linux/security.h>
+#include <linux/uaccess.h>
+#include <linux/fsinfo.h>
+#include <uapi/linux/mount.h>
+#include "internal.h"
+
+/**
+ * fsinfo_opaque - Store opaque blob as an fsinfo attribute value.
+ * @s: The blob to store (may be NULL)
+ * @ctx: The parameter context
+ * @len: The length of the blob
+ *
+ * Return: 0 if @s is NULL, otherwise @len.  The blob is copied into
+ * @ctx->buffer unless @ctx->want_size_only is set.
+ */
+int fsinfo_opaque(const void *s, struct fsinfo_context *ctx, unsigned int len)
+{
+	if (!s)
+		return 0;
+
+	if (!ctx->want_size_only)
+		memcpy(ctx->buffer, s, len);
+	return len;
+}
+EXPORT_SYMBOL(fsinfo_opaque);
+
+/**
+ * fsinfo_string - Store a NUL-terminated string as an fsinfo attribute value.
+ * @s: The string to store (may be NULL)
+ * @ctx: The parameter context
+ *
+ * Return: 1 if @s is NULL (nothing is copied in that case), otherwise the
+ * result of fsinfo_opaque() for the string including its NUL, with the
+ * length capped at @ctx->buf_size.
+ */
+int fsinfo_string(const char *s, struct fsinfo_context *ctx)
+{
+	size_t len;
+
+	if (!s)
+		return 1;
+
+	/* Count the terminator, but never exceed the buffer. */
+	len = strlen(s) + 1;
+	if (len > ctx->buf_size)
+		len = ctx->buf_size;
+	return fsinfo_opaque(s, ctx, len);
+}
+EXPORT_SYMBOL(fsinfo_string);
+
+/*
+ * Get basic filesystem stats from statfs.
+ *
+ * Calls vfs_statfs() on @path and copies the block and file counts into the
+ * .lo parts of the reply fields, plus the block and fragment sizes.  The .hi
+ * parts are not written here.  Returns the size of the reply struct, or the
+ * negative error from vfs_statfs().
+ */
+static int fsinfo_generic_statfs(struct path *path, struct fsinfo_context *ctx)
+{
+ struct fsinfo_statfs *p = ctx->buffer;
+ struct kstatfs buf;
+ int ret;
+
+ ret = vfs_statfs(path, &buf);
+ if (ret < 0)
+ return ret;
+
+ p->f_blocks.lo = buf.f_blocks;
+ p->f_bfree.lo = buf.f_bfree;
+ p->f_bavail.lo = buf.f_bavail;
+ p->f_files.lo = buf.f_files;
+ p->f_ffree.lo = buf.f_ffree;
+ p->f_favail.lo = buf.f_ffree; /* NOTE(review): taken from f_ffree, not a separate "available" count - confirm intended */
+ p->f_bsize = buf.f_bsize;
+ p->f_frsize = buf.f_frsize;
+ return sizeof(*p);
+}
+
+/*
+ * Generic FSINFO_ATTR_IDS handler: report the fsid (when statfs supplies
+ * one) together with identifiers taken from the superblock.  Returns the
+ * size of the reply struct or a negative error from vfs_statfs().
+ */
+static int fsinfo_generic_ids(struct path *path, struct fsinfo_context *ctx)
+{
+	struct super_block *sb = path->dentry->d_sb;
+	struct fsinfo_ids *ids = ctx->buffer;
+	struct kstatfs stats;
+	int err;
+
+	/* -ENOSYS from statfs is tolerated; f_fsid is then left unwritten. */
+	err = vfs_statfs(path, &stats);
+	if (err == 0)
+		memcpy(&ids->f_fsid, &stats.f_fsid, sizeof(ids->f_fsid));
+	else if (err < 0 && err != -ENOSYS)
+		return err;
+
+	ids->f_fstype = sb->s_magic;
+	ids->f_dev_major = MAJOR(sb->s_dev);
+	ids->f_dev_minor = MINOR(sb->s_dev);
+	ids->f_sb_id = sb->s_unique_id;
+	strlcpy(ids->f_fs_name, sb->s_type->name, sizeof(ids->f_fs_name));
+	return sizeof(*ids);
+}
+
+int fsinfo_generic_limits(struct path *path, struct fsinfo_context *ctx)
+{
+ struct fsinfo_limits *p = ctx->buffer;
+ struct super_block *sb = path->dentry->d_sb;
+
+ p->max_file_size.hi = 0;
+ p->max_file_size.lo = sb->s_maxbytes; /* taken from the superblock */
+ p->max_ino.hi = 0;
+ p->max_ino.lo = UINT_MAX;
+ p->max_hard_links = sb->s_max_links; /* taken from the superblock */
+ p->max_uid = UINT_MAX;
+ p->max_gid = UINT_MAX;
+ p->max_projid = UINT_MAX;
+ p->max_filename_len = NAME_MAX;
+ p->max_symlink_len = PATH_MAX;
+ p->max_xattr_name_len = XATTR_NAME_MAX;
+ p->max_xattr_body_len = XATTR_SIZE_MAX;
+ p->max_dev_major = 0xffffff; /* NOTE(review): major limit is wider than the minor limit below - confirm against the dev_t layout */
+ p->max_dev_minor = 0xff;
+ return sizeof(*p); /* size of the reply struct */
+}
+EXPORT_SYMBOL(fsinfo_generic_limits);
+
+/*
+ * Generic FSINFO_ATTR_SUPPORTS handler: report the basic statx mask and,
+ * when the superblock's dentry operations provide ->d_automount, the
+ * automount statx attribute.  Returns the size of the reply struct.
+ */
+int fsinfo_generic_supports(struct path *path, struct fsinfo_context *ctx)
+{
+	struct super_block *sb = path->dentry->d_sb;
+	struct fsinfo_supports *sup = ctx->buffer;
+
+	sup->stx_mask = STATX_BASIC_STATS;
+
+	if (!sb->s_d_op || !sb->s_d_op->d_automount)
+		return sizeof(*sup);
+
+	/* OR-ed in, so any bits already present in the buffer are kept. */
+	sup->stx_attributes |= STATX_ATTR_AUTOMOUNT;
+	return sizeof(*sup);
+}
+EXPORT_SYMBOL(fsinfo_generic_supports);
+
+static const struct fsinfo_timestamp_info fsinfo_default_timestamp_info = {
+ .atime = {
+ .minimum = S64_MIN, /* all four timestamps default to the full s64 range */
+ .maximum = S64_MAX,
+ .gran_mantissa = 1, /* NOTE(review): mantissa 1, exponent 0 - presumably a granularity of 1E0; confirm units */
+ .gran_exponent = 0,
+ },
+ .mtime = {
+ .minimum = S64_MIN,
+ .maximum = S64_MAX,
+ .gran_mantissa = 1,
+ .gran_exponent = 0,
+ },
+ .ctime = {
+ .minimum = S64_MIN,
+ .maximum = S64_MAX,
+ .gran_mantissa = 1,
+ .gran_exponent = 0,
+ },
+ .btime = {
+ .minimum = S64_MIN,
+ .maximum = S64_MAX,
+ .gran_mantissa = 1,
+ .gran_exponent = 0,
+ },
+};
+
+/*
+ * Generic FSINFO_ATTR_TIMESTAMP_INFO handler: start from the default table
+ * and, when s_time_gran is below 1000000000, set the granularity exponent of
+ * all four timestamps to -9, -6 or -3 depending on which band s_time_gran
+ * falls in.  Returns the size of the reply struct.
+ */
+int fsinfo_generic_timestamp_info(struct path *path, struct fsinfo_context *ctx)
+{
+	struct super_block *sb = path->dentry->d_sb;
+	struct fsinfo_timestamp_info *info = ctx->buffer;
+	s8 exp;
+
+	*info = fsinfo_default_timestamp_info;
+
+	/* At or above 1000000000 the defaults are kept as they are. */
+	if (sb->s_time_gran >= 1000000000)
+		return sizeof(*info);
+
+	if (sb->s_time_gran >= 1000000)
+		exp = -3;
+	else if (sb->s_time_gran >= 1000)
+		exp = -6;
+	else
+		exp = -9;
+
+	info->atime.gran_exponent = exp;
+	info->mtime.gran_exponent = exp;
+	info->ctime.gran_exponent = exp;
+	info->btime.gran_exponent = exp;
+	return sizeof(*info);
+}
+EXPORT_SYMBOL(fsinfo_generic_timestamp_info);
+
+/*
+ * Generic FSINFO_ATTR_VOLUME_UUID handler: copy the superblock's s_uuid
+ * into the reply and return the size of the reply struct.
+ */
+static int fsinfo_generic_volume_uuid(struct path *path, struct fsinfo_context *ctx)
+{
+	struct fsinfo_volume_uuid *uuid = ctx->buffer;
+
+	memcpy(uuid, &path->dentry->d_sb->s_uuid, sizeof(*uuid));
+	return sizeof(*uuid);
+}
+
+/* Generic FSINFO_ATTR_VOLUME_ID handler: report the superblock's s_id string. */
+static int fsinfo_generic_volume_id(struct path *path, struct fsinfo_context *ctx)
+{
+	struct super_block *sb = path->dentry->d_sb;
+
+	return fsinfo_string(sb->s_id, ctx);
+}
+
+static const struct fsinfo_attribute fsinfo_common_attributes[] = {
+ FSINFO_VSTRUCT (FSINFO_ATTR_STATFS, fsinfo_generic_statfs),
+ FSINFO_VSTRUCT (FSINFO_ATTR_IDS, fsinfo_generic_ids),
+ FSINFO_VSTRUCT (FSINFO_ATTR_LIMITS, fsinfo_generic_limits),
+ FSINFO_VSTRUCT (FSINFO_ATTR_SUPPORTS, fsinfo_generic_supports),
+ FSINFO_VSTRUCT (FSINFO_ATTR_TIMESTAMP_INFO, fsinfo_generic_timestamp_info),
+ FSINFO_STRING (FSINFO_ATTR_VOLUME_ID, fsinfo_generic_volume_id),
+ FSINFO_VSTRUCT (FSINFO_ATTR_VOLUME_UUID, fsinfo_generic_volume_uuid),
+
+ FSINFO_LIST (FSINFO_ATTR_FSINFO_ATTRIBUTES, (void *)123UL), /* dummy non-NULL ->get; NOTE(review): presumably handled specially and never called - confirm */
+ FSINFO_VSTRUCT_N(FSINFO_ATTR_FSINFO_ATTRIBUTE_INFO, (void *)123UL), /* same dummy ->get value as above */
+ {} /* terminator: table walkers stop at the first entry with a NULL ->get */
+};
+
+/*
+ * Work out the minimum buffer size for one attribute and fetch its value if
+ * the current buffer is big enough.
+ *
+ * Returns -ENODATA if a non-zero Nth/Mth is requested for an attribute that is
+ * not flagged as multi-valued, -ENOPKG for an unknown value type, the
+ * required size if ctx->buf_size is too small, or otherwise the result of the
+ * attribute's ->get() handler.  For VSTRUCT attributes ctx->clear_tail is set.
+ */
+static int fsinfo_get_this_attribute(struct path *path,
+				     struct fsinfo_context *ctx,
+				     const struct fsinfo_attribute *attr)
+{
+	int needed;
+
+	if (ctx->Nth != 0 &&
+	    (attr->flags & (FSINFO_FLAGS_N | FSINFO_FLAGS_NM)) == 0)
+		return -ENODATA;
+	if (ctx->Mth != 0 && (attr->flags & FSINFO_FLAGS_NM) == 0)
+		return -ENODATA;
+
+	if (attr->type == FSINFO_TYPE_VSTRUCT) {
+		ctx->clear_tail = true;
+		needed = attr->size;
+	} else if (attr->type == FSINFO_TYPE_STRING ||
+		   attr->type == FSINFO_TYPE_OPAQUE ||
+		   attr->type == FSINFO_TYPE_LIST) {
+		needed = 4096;
+	} else {
+		return -ENOPKG;
+	}
+
+	/* Too small: report how much is wanted instead of calling ->get(). */
+	if (ctx->buf_size < needed)
+		return needed;
+
+	return attr->get(path, ctx);
+}
+
+/*
+ * Append an attribute ID to the __u32 list being built in ctx->buffer, unless
+ * that ID is already in the list.
+ *
+ * ctx->usage counts bytes.  When there is no room for another entry the ID is
+ * not stored and no duplicate check is done, but ctx->usage is still advanced
+ * so that the total can be used as a required-size figure.
+ */
+static void fsinfo_attributes_insert(struct fsinfo_context *ctx,
+ const struct fsinfo_attribute *attr)
+{
+ __u32 *p = ctx->buffer;
+ unsigned int i;
+
+ /* No space left for one more __u32: just account for it. */
+ if (ctx->usage >= ctx->buf_size ||
+ ctx->buf_size - ctx->usage < sizeof(__u32)) {
+ ctx->usage += sizeof(__u32);
+ return;
+ }
+
+ /* Skip IDs already recorded (e.g. by an earlier attribute table). */
+ for (i = 0; i < ctx->usage / sizeof(__u32); i++)
+ if (p[i] == attr->attr_id)
+ return;
+
+ /* i now indexes the first unused slot. */
+ p[i] = attr->attr_id;
+ ctx->usage += sizeof(__u32);
+}
+
+/*
+ * Handle FSINFO_ATTR_FSINFO_ATTRIBUTES for one attribute table: feed every
+ * entry (up to the NULL ->get terminator) to fsinfo_attributes_insert().
+ *
+ * Always returns -EOPNOTSUPP so that the caller carries on to the next table;
+ * the accumulated size is left in ctx->usage.
+ */
+static int fsinfo_list_attributes(struct path *path,
+ struct fsinfo_context *ctx,
+ const struct fsinfo_attribute *attributes)
+{
+ const struct fsinfo_attribute *a;
+
+ for (a = attributes; a->get; a++)
+ fsinfo_attributes_insert(ctx, a);
+ return -EOPNOTSUPP; /* We want to go through all the lists */
+}
+
+/*
+ * Handle FSINFO_ATTR_FSINFO_ATTRIBUTE_INFO for one attribute table: find the
+ * entry whose ID equals ctx->Nth and describe it in the reply buffer.
+ *
+ * Returns the size of struct fsinfo_attribute_info, either as a size probe
+ * (ctx->buf_size == 0) or after filling in the reply, or -EOPNOTSUPP if this
+ * table has no such attribute so that the caller can try the next table.
+ */
+static int fsinfo_get_attribute_info(struct path *path,
+				     struct fsinfo_context *ctx,
+				     const struct fsinfo_attribute *attributes)
+{
+	const struct fsinfo_attribute *a;
+	struct fsinfo_attribute_info *p = ctx->buffer;
+
+	if (!ctx->buf_size)
+		return sizeof(*p);
+
+	for (a = attributes; a->get; a++) {
+		if (a->attr_id == ctx->Nth) {
+			p->attr_id = a->attr_id;
+			p->type = a->type;
+			p->flags = a->flags;
+			p->size = a->size;
+			return sizeof(*p);
+		}
+	}
+	return -EOPNOTSUPP; /* We want to go through all the lists */
+}
+
+/**
+ * fsinfo_get_attribute - Look up and handle an attribute
+ * @path: The object to query
+ * @ctx: Parameters to define a request and place to store result
+ * @attributes: List of attributes to search.
+ *
+ * Look through a list of attributes for one that matches the requested
+ * attribute then call the handler for it.
+ *
+ * The two meta-attributes (attribute info and attribute list) are serviced
+ * from the table itself rather than through a handler.  -EOPNOTSUPP is
+ * returned if the requested attribute is not in @attributes.
+ */
+int fsinfo_get_attribute(struct path *path, struct fsinfo_context *ctx,
+ const struct fsinfo_attribute *attributes)
+{
+ const struct fsinfo_attribute *a;
+
+ switch (ctx->requested_attr) {
+ case FSINFO_ATTR_FSINFO_ATTRIBUTE_INFO:
+ return fsinfo_get_attribute_info(path, ctx, attributes);
+ case FSINFO_ATTR_FSINFO_ATTRIBUTES:
+ return fsinfo_list_attributes(path, ctx, attributes);
+ default:
+ for (a = attributes; a->get; a++)
+ if (a->attr_id == ctx->requested_attr)
+ return fsinfo_get_this_attribute(path, ctx, a);
+ return -EOPNOTSUPP;
+ }
+}
+EXPORT_SYMBOL(fsinfo_get_attribute);
+
+/**
+ * fsinfo_call - Ask the filesystem, then the common table, for an attribute
+ * @path: The object to query
+ * @ctx: Parameters to define a request and place to store result
+ *
+ * The superblock's ->fsinfo() op is tried first if there is one; any result
+ * other than -EOPNOTSUPP is returned directly.  Otherwise the request is
+ * looked up in fsinfo_common_attributes.
+ *
+ * If that also yields -EOPNOTSUPP, the outcome depends on the request: an
+ * attribute-info query gives -ENODATA, an attribute-list query returns the
+ * number of bytes accumulated in @ctx->usage, and anything else gives
+ * -EOPNOTSUPP.
+ */
+static int fsinfo_call(struct path *path, struct fsinfo_context *ctx)
+{
+ int ret;
+
+ if (path->dentry->d_sb->s_op->fsinfo) {
+ ret = path->dentry->d_sb->s_op->fsinfo(path, ctx);
+ if (ret != -EOPNOTSUPP)
+ return ret;
+ }
+ ret = fsinfo_get_attribute(path, ctx, fsinfo_common_attributes);
+ if (ret != -EOPNOTSUPP)
+ return ret;
+
+ switch (ctx->requested_attr) {
+ case FSINFO_ATTR_FSINFO_ATTRIBUTE_INFO:
+ return -ENODATA;
+ case FSINFO_ATTR_FSINFO_ATTRIBUTES:
+ return ctx->usage;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+/**
+ * vfs_fsinfo - Retrieve filesystem information
+ * @path: The object to query
+ * @ctx: Parameters to define a request and place to store result
+ *
+ * Get an attribute on a filesystem or an object within a filesystem.  The
+ * filesystem attribute to be queried is indicated by @ctx->requested_attr, and
+ * if it's a multi-valued attribute, the particular value is selected by
+ * @ctx->Nth and then @ctx->Mth.
+ *
+ * For common attributes, a value may be fabricated if it is not supported by
+ * the filesystem.
+ *
+ * On success, the size of the attribute's value is returned (0 is a valid
+ * size).  A buffer will have been allocated and will be pointed to by
+ * @ctx->buffer.  The caller must free this with kvfree().
+ *
+ * On error, @ctx->buffer is left NULL; any buffer allocated along the way has
+ * already been freed, so the function may safely be called again with the
+ * same context (as the -ESTALE retry in vfs_fsinfo_path() does).
+ *
+ * Errors can also be returned: -ENOMEM if a buffer cannot be allocated, -EPERM
+ * or -EACCES if permission is denied by the LSM, -EOPNOTSUPP if an attribute
+ * doesn't exist for the specified object or -ENODATA if the attribute exists,
+ * but the Nth,Mth value does not exist.  -EMSGSIZE indicates that the value is
+ * unmanageable internally and -ENOPKG indicates other internal failure.
+ * -ERESTARTSYS is returned if a signal becomes pending while the buffer is
+ * being resized.
+ *
+ * Errors such as -EIO may also come from attempts to access media or servers
+ * to obtain the requested information if it's not immediately to hand.
+ *
+ * [*] Note that the caller may set @ctx->want_size_only if it only wants the
+ *     size of the value and not the data.  If this is set, no buffer is
+ *     allocated: the result of the initial size probe is returned directly.
+ *
+ * [*] Note that @ctx->clear_tail will be returned set if the data should be
+ *     padded out with zeros when writing it to userspace.
+ */
+static int vfs_fsinfo(struct path *path, struct fsinfo_context *ctx)
+{
+	struct dentry *dentry = path->dentry;
+	int ret;
+
+	ret = security_sb_statfs(dentry);
+	if (ret)
+		return ret;
+
+	/*
+	 * Call the handler to find out the buffer size required.  The usage
+	 * count is reset first so that a repeated call on the same context
+	 * does not add to a stale total.
+	 */
+	ctx->buf_size = 0;
+	ctx->usage = 0;
+	ret = fsinfo_call(path, ctx);
+	if (ret < 0 || ctx->want_size_only)
+		return ret;
+	ctx->buf_size = ret;
+
+	do {
+		/* Allocate a buffer of the requested size. */
+		if (ctx->buf_size > INT_MAX)
+			return -EMSGSIZE;
+		ctx->buffer = kvzalloc(ctx->buf_size, GFP_KERNEL);
+		if (!ctx->buffer)
+			return -ENOMEM;
+
+		ctx->usage = 0;
+		ctx->skip = 0;
+		ret = fsinfo_call(path, ctx);
+		if (IS_ERR_VALUE((long)ret)) {
+			/*
+			 * Don't hand a buffer back on failure: a caller that
+			 * retries would otherwise overwrite and leak it.
+			 */
+			kvfree(ctx->buffer);
+			ctx->buffer = NULL;
+			return ret;
+		}
+		if ((unsigned int)ret <= ctx->buf_size)
+			return ret; /* It fitted */
+
+		/* We need to resize the buffer */
+		ctx->buf_size = roundup(ret, PAGE_SIZE);
+		kvfree(ctx->buffer);
+		ctx->buffer = NULL;
+	} while (!signal_pending(current));
+
+	return -ERESTARTSYS;
+}
+
+/*
+ * Resolve @pathname relative to @dfd and run vfs_fsinfo() on the result.
+ *
+ * The RESOLVE_* bits in @up->resolve_flags and the AT_* bits in @up->at_flags
+ * are validated (unknown bits give -EINVAL) and translated into LOOKUP_*
+ * flags.  If retry_estale() says the result warrants it, the lookup and query
+ * are repeated with LOOKUP_REVAL added.
+ *
+ * Returns the result of vfs_fsinfo() or a negative error from the lookup.
+ */
+static int vfs_fsinfo_path(int dfd, const char __user *pathname,
+ const struct fsinfo_params *up,
+ struct fsinfo_context *ctx)
+{
+ struct path path;
+ unsigned lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
+ int ret = -EINVAL;
+
+ if (up->resolve_flags & ~VALID_RESOLVE_FLAGS)
+ return -EINVAL;
+ if (up->at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
+ AT_EMPTY_PATH))
+ return -EINVAL;
+
+ /* RESOLVE_* flags add restrictions to the path walk. */
+ if (up->resolve_flags & RESOLVE_NO_XDEV)
+ lookup_flags |= LOOKUP_NO_XDEV;
+ if (up->resolve_flags & RESOLVE_NO_MAGICLINKS)
+ lookup_flags |= LOOKUP_NO_MAGICLINKS;
+ if (up->resolve_flags & RESOLVE_NO_SYMLINKS)
+ lookup_flags |= LOOKUP_NO_SYMLINKS;
+ if (up->resolve_flags & RESOLVE_BENEATH)
+ lookup_flags |= LOOKUP_BENEATH;
+ if (up->resolve_flags & RESOLVE_IN_ROOT)
+ lookup_flags |= LOOKUP_IN_ROOT;
+ /* AT_* flags switch off the follow/automount defaults set above. */
+ if (up->at_flags & AT_SYMLINK_NOFOLLOW)
+ lookup_flags &= ~LOOKUP_FOLLOW;
+ if (up->at_flags & AT_NO_AUTOMOUNT)
+ lookup_flags &= ~LOOKUP_AUTOMOUNT;
+ if (up->at_flags & AT_EMPTY_PATH)
+ lookup_flags |= LOOKUP_EMPTY;
+
+retry:
+ ret = user_path_at(dfd, pathname, lookup_flags, &path);
+ if (ret)
+ goto out;
+
+ ret = vfs_fsinfo(&path, ctx);
+ path_put(&path);
+ if (retry_estale(ret, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
+out:
+ return ret;
+}
+
+/*
+ * Run vfs_fsinfo() on the path behind an already-open file descriptor.
+ *
+ * Returns -EBADF if @fd does not refer to an open file, otherwise the result
+ * of vfs_fsinfo().
+ */
+static int vfs_fsinfo_fd(unsigned int fd, struct fsinfo_context *ctx)
+{
+	struct fd f = fdget_raw(fd);
+	int ret;
+
+	if (!f.file)
+		return -EBADF;
+
+	ret = vfs_fsinfo(&f.file->f_path, ctx);
+	fdput(f);
+	return ret;
+}
+
+/**
+ * sys_fsinfo - System call to get filesystem information
+ * @dfd: Base directory to pathwalk from or fd referring to filesystem.
+ * @pathname: Filesystem to query or NULL.
+ * @params: Parameters to define request (NULL: FSINFO_ATTR_STATFS).
+ * @params_size: Size of parameter buffer.
+ * @result_buffer: Result buffer.
+ * @result_buf_size: Size of result buffer.
+ *
+ * Get information on a filesystem.  The filesystem attribute to be queried is
+ * indicated by @params->request, and some of the attributes can have multiple
+ * values, indexed by @params->Nth and @params->Mth.  If @params is NULL,
+ * then the 0th fsinfo_attr_statfs attribute is queried by path with no
+ * resolve or AT flags set.  If an attribute does not exist, EOPNOTSUPP is
+ * returned; if the Nth,Mth value does not exist, ENODATA is returned.
+ *
+ * @params and @params_size must be given together or not at all, as must
+ * @result_buffer and @result_buf_size; otherwise EINVAL is returned.
+ * EOVERFLOW is returned if @result_buf_size exceeds UINT_MAX.
+ *
+ * On success, the size of the attribute's value is returned.  If
+ * @result_buf_size is 0 or @result_buffer is NULL, only the size is returned.
+ * If the size of the value is larger than @result_buf_size, it will be
+ * truncated by the copy.  If the size of the value is smaller than
+ * @result_buf_size then, for structure-type values, the excess buffer space
+ * will be cleared.  The full size of the value will be returned, irrespective
+ * of how much data is actually placed in the buffer.
+ */
+SYSCALL_DEFINE6(fsinfo,
+		int, dfd,
+		const char __user *, pathname,
+		const struct fsinfo_params __user *, params,
+		size_t, params_size,
+		void __user *, result_buffer,
+		size_t, result_buf_size)
+{
+	struct fsinfo_context ctx;
+	/*
+	 * Zero-initialised so that, when no parameter block is supplied,
+	 * vfs_fsinfo_path() sees empty resolve_flags and at_flags rather than
+	 * uninitialised stack contents.
+	 */
+	struct fsinfo_params user_params = {};
+	unsigned int result_size;
+	void *r;
+	int ret;
+
+	if ((!params &&  params_size) ||
+	    ( params && !params_size) ||
+	    (!result_buffer &&  result_buf_size) ||
+	    ( result_buffer && !result_buf_size))
+		return -EINVAL;
+	if (result_buf_size > UINT_MAX)
+		return -EOVERFLOW;
+
+	memset(&ctx, 0, sizeof(ctx));
+	ctx.requested_attr = FSINFO_ATTR_STATFS;
+	ctx.flags = FSINFO_FLAGS_QUERY_PATH;
+	ctx.want_size_only = (result_buf_size == 0);
+
+	if (params) {
+		ret = copy_struct_from_user(&user_params, sizeof(user_params),
+					    params, params_size);
+		if (ret < 0)
+			return ret;
+		if (user_params.flags & ~FSINFO_FLAGS_QUERY_MASK)
+			return -EINVAL;
+		ctx.flags = user_params.flags;
+		ctx.requested_attr = user_params.request;
+		ctx.Nth = user_params.Nth;
+		ctx.Mth = user_params.Mth;
+	}
+
+	switch (ctx.flags & FSINFO_FLAGS_QUERY_MASK) {
+	case FSINFO_FLAGS_QUERY_PATH:
+		ret = vfs_fsinfo_path(dfd, pathname, &user_params, &ctx);
+		break;
+	case FSINFO_FLAGS_QUERY_FD:
+		if (pathname)
+			return -EINVAL;
+		ret = vfs_fsinfo_fd(dfd, &ctx);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (ret < 0)
+		goto error;
+
+	/* The handler may have left the value at an offset in the buffer. */
+	r = ctx.buffer + ctx.skip;
+	result_size = min_t(size_t, ret, result_buf_size);
+	if (result_size > 0 &&
+	    copy_to_user(result_buffer, r, result_size) != 0) {
+		ret = -EFAULT;
+		goto error;
+	}
+
+	/* Clear any part of the buffer that we won't fill if we're putting a
+	 * struct in there.  Strings, opaque objects and arrays are expected to
+	 * be variable length.
+	 */
+	if (ctx.clear_tail &&
+	    result_buf_size > result_size &&
+	    clear_user(result_buffer + result_size,
+		       result_buf_size - result_size) != 0) {
+		ret = -EFAULT;
+		goto error;
+	}
+
+	/* Success also falls through here to release the reply buffer. */
+error:
+	kvfree(ctx.buffer);
+	return ret;
+}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 28a29356eace..3284f497de0a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -68,6 +68,7 @@ struct fsverity_info;
struct fsverity_operations;
struct fs_context;
struct fs_parameter_spec;
+struct fsinfo_context;
extern void __init inode_init(void);
extern void __init inode_init_early(void);
@@ -1963,6 +1964,9 @@ struct super_operations {
int (*thaw_super) (struct super_block *);
int (*unfreeze_fs) (struct super_block *);
int (*statfs) (struct dentry *, struct kstatfs *);
+#ifdef CONFIG_FSINFO
+ int (*fsinfo)(struct path *, struct fsinfo_context *);
+#endif
int (*remount_fs) (struct super_block *, int *, char *);
void (*umount_begin) (struct super_block *);
diff --git a/include/linux/fsinfo.h b/include/linux/fsinfo.h
new file mode 100644
index 000000000000..a811d69b02ff
--- /dev/null
+++ b/include/linux/fsinfo.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Filesystem information query
+ *
+ * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#ifndef _LINUX_FSINFO_H
+#define _LINUX_FSINFO_H
+
+#ifdef CONFIG_FSINFO
+
+#include <uapi/linux/fsinfo.h>
+
+struct path;
+
+#define FSINFO_NORMAL_ATTR_MAX_SIZE 4096
+
+/*
+ * Per-request state shared between the fsinfo() core and attribute handlers.
+ * Fields are tagged [in] (set up before a handler is called), [out] (set for
+ * the benefit of the core) or [tmp] (scratch used while building the reply).
+ */
+struct fsinfo_context {
+ __u32 flags; /* [in] FSINFO_FLAGS_* */
+ __u32 requested_attr; /* [in] What is being asked for */
+ __u32 Nth; /* [in] Instance of it (some may have multiple) */
+ __u32 Mth; /* [in] Subinstance */
+ bool want_size_only; /* [in] Just want to know the size, not the data */
+ bool clear_tail; /* [out] T if tail of buffer should be cleared */
+ unsigned int skip; /* [out] Number of bytes to skip in buffer */
+ unsigned int usage; /* [tmp] Amount of buffer used (if large) */
+ unsigned int buf_size; /* [tmp] Size of ->buffer[] */
+ void *buffer; /* [out] The reply buffer */
+};
+
+/*
+ * A filesystem information attribute definition.
+ *
+ * Attribute tables are arrays of these, terminated by an entry whose ->get
+ * pointer is NULL.
+ */
+struct fsinfo_attribute {
+ unsigned int attr_id; /* The ID of the attribute */
+ enum fsinfo_value_type type:8; /* The type of the attribute's value(s) */
+ unsigned int flags:8; /* FSINFO_FLAGS_N / FSINFO_FLAGS_NM or 0 */
+ unsigned int size:16; /* - Value size (FSINFO_STRUCT/LIST) */
+ int (*get)(struct path *path, struct fsinfo_context *params);
+};
+
+/*
+ * Initialisers for struct fsinfo_attribute table entries.
+ *
+ * A is the attribute ID, T the value type, S the value size, G the ->get
+ * handler and F the flags.  The FSINFO_VSTRUCT*() and FSINFO_LIST*() forms
+ * take the size from sizeof(A##__STRUCT), so each such attribute ID needs a
+ * matching <ID>__STRUCT definition; the STRING and OPAQUE forms use size 0.
+ * The _N and _NM suffixes set FSINFO_FLAGS_N and FSINFO_FLAGS_NM respectively.
+ */
+#define __FSINFO(A, T, S, G, F) \
+ { .attr_id = A, .type = T, .flags = F, .size = S, .get = G }
+
+#define _FSINFO(A, T, S, G) __FSINFO(A, T, S, G, 0)
+#define _FSINFO_N(A, T, S, G) __FSINFO(A, T, S, G, FSINFO_FLAGS_N)
+#define _FSINFO_NM(A, T, S, G) __FSINFO(A, T, S, G, FSINFO_FLAGS_NM)
+
+#define _FSINFO_VSTRUCT(A,S,G) _FSINFO (A, FSINFO_TYPE_VSTRUCT, sizeof(S), G)
+#define _FSINFO_VSTRUCT_N(A,S,G) _FSINFO_N (A, FSINFO_TYPE_VSTRUCT, sizeof(S), G)
+#define _FSINFO_VSTRUCT_NM(A,S,G) _FSINFO_NM(A, FSINFO_TYPE_VSTRUCT, sizeof(S), G)
+
+#define FSINFO_VSTRUCT(A,G) _FSINFO_VSTRUCT (A, A##__STRUCT, G)
+#define FSINFO_VSTRUCT_N(A,G) _FSINFO_VSTRUCT_N (A, A##__STRUCT, G)
+#define FSINFO_VSTRUCT_NM(A,G) _FSINFO_VSTRUCT_NM(A, A##__STRUCT, G)
+#define FSINFO_STRING(A,G) _FSINFO (A, FSINFO_TYPE_STRING, 0, G)
+#define FSINFO_STRING_N(A,G) _FSINFO_N (A, FSINFO_TYPE_STRING, 0, G)
+#define FSINFO_STRING_NM(A,G) _FSINFO_NM(A, FSINFO_TYPE_STRING, 0, G)
+#define FSINFO_OPAQUE(A,G) _FSINFO (A, FSINFO_TYPE_OPAQUE, 0, G)
+#define FSINFO_LIST(A,G) _FSINFO (A, FSINFO_TYPE_LIST, sizeof(A##__STRUCT), G)
+#define FSINFO_LIST_N(A,G) _FSINFO_N (A, FSINFO_TYPE_LIST, sizeof(A##__STRUCT), G)
+
+extern int fsinfo_opaque(const void *, struct fsinfo_context *, unsigned int);
+extern int fsinfo_string(const char *, struct fsinfo_context *);
+extern int fsinfo_generic_timestamp_info(struct path *, struct fsinfo_context *);
+extern int fsinfo_generic_supports(struct path *, struct fsinfo_context *);
+extern int fsinfo_generic_limits(struct path *, struct fsinfo_context *);
+extern int fsinfo_get_attribute(struct path *, struct fsinfo_context *,
+ const struct fsinfo_attribute *);
+
+#endif /* CONFIG_FSINFO */
+
+#endif /* _LINUX_FSINFO_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 88d03fd627ab..e31ad49af4c3 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -47,6 +47,7 @@ struct stat64;
struct statfs;
struct statfs64;
struct statx;
+struct fsinfo_params;
struct __sysctl_args;
struct sysinfo;
struct timespec;
@@ -1007,6 +1008,9 @@ asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
asmlinkage long sys_pidfd_getfd(int pidfd, int fd, unsigned int flags);
asmlinkage long sys_watch_mount(int dfd, const char __user *path,
unsigned int at_flags, int watch_fd, int watch_id);
+asmlinkage long sys_fsinfo(int dfd, const char __user *pathname,
+ const struct fsinfo_params __user *params, size_t params_size,
+ void __user *result_buffer, size_t result_buf_size);
/*
* Architecture-specific system calls
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index fcdca8c7d30a..9e38f611ab56 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -859,9 +859,11 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
__SYSCALL(__NR_faccessat2, sys_faccessat2)
#define __NR_watch_mount 440
__SYSCALL(__NR_watch_mount, sys_watch_mount)
+#define __NR_fsinfo 441
+__SYSCALL(__NR_fsinfo, sys_fsinfo)
#undef __NR_syscalls
-#define __NR_syscalls 441
+#define __NR_syscalls 442
/*
* 32 bit systems traditionally used different
diff --git a/include/uapi/linux/fsinfo.h b/include/uapi/linux/fsinfo.h
new file mode 100644
index 000000000000..65892239ba86
--- /dev/null
+++ b/include/uapi/linux/fsinfo.h
@@ -0,0 +1,189 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* fsinfo() definitions.
+ *
+ * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+#ifndef _UAPI_LINUX_FSINFO_H
+#define _UAPI_LINUX_FSINFO_H
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/openat2.h>
+
+/*
+ * The filesystem attributes that can be requested. Note that some attributes
+ * may have multiple instances which can be switched in the parameter block.
+ */
+#define FSINFO_ATTR_STATFS 0x00 /* statfs()-style state */
+#define FSINFO_ATTR_IDS 0x01 /* Filesystem IDs */
+#define FSINFO_ATTR_LIMITS 0x02 /* Filesystem limits */
+#define FSINFO_ATTR_SUPPORTS 0x03 /* What's supported in statx, iocflags, ... */
+#define FSINFO_ATTR_TIMESTAMP_INFO 0x04 /* Inode timestamp info */
+#define FSINFO_ATTR_VOLUME_ID 0x05 /* Volume ID (string) */
+#define FSINFO_ATTR_VOLUME_UUID 0x06 /* Volume UUID (LE uuid) */
+#define FSINFO_ATTR_VOLUME_NAME 0x07 /* Volume name (string) */
+
+#define FSINFO_ATTR_FSINFO_ATTRIBUTE_INFO 0x100 /* Information about attr N (for path) */
+#define FSINFO_ATTR_FSINFO_ATTRIBUTES 0x101 /* List of supported attrs (for path) */
+
+/*
+ * Optional fsinfo() parameter structure.
+ *
+ * If this is not given, it is assumed that fsinfo_attr_statfs instance 0,0 is
+ * desired.
+ */
+struct fsinfo_params {
+ __u64 resolve_flags; /* RESOLVE_* flags */
+ __u32 at_flags; /* AT_* flags */
+ __u32 flags; /* Flags controlling fsinfo() specifically */
+#define FSINFO_FLAGS_QUERY_MASK 0x0007 /* What object should fsinfo() query? */
+#define FSINFO_FLAGS_QUERY_PATH 0x0000 /* - path, specified by dirfd,pathname,AT_EMPTY_PATH */
+#define FSINFO_FLAGS_QUERY_FD 0x0001 /* - fd specified by dirfd */
+ __u32 request; /* ID of requested attribute */
+ __u32 Nth; /* Instance of it (some may have multiple) */
+ __u32 Mth; /* Subinstance of Nth instance */
+};
+
+enum fsinfo_value_type {
+ FSINFO_TYPE_VSTRUCT = 0, /* Version-lengthed struct (up to 4096 bytes) */
+ FSINFO_TYPE_STRING = 1, /* NUL-term var-length string (up to 4095 chars) */
+ FSINFO_TYPE_OPAQUE = 2, /* Opaque blob (unlimited size) */
+ FSINFO_TYPE_LIST = 3, /* List of ints/structs (unlimited size) */
+};
+
+/*
+ * Information struct for fsinfo(FSINFO_ATTR_FSINFO_ATTRIBUTE_INFO).
+ *
+ * This gives information about the attributes supported by fsinfo for the
+ * given path.
+ */
+struct fsinfo_attribute_info {
+ unsigned int attr_id; /* The ID of the attribute */
+ enum fsinfo_value_type type; /* The type of the attribute's value(s) */
+ unsigned int flags;
+#define FSINFO_FLAGS_N 0x01 /* - Attr has a set of values */
+#define FSINFO_FLAGS_NM 0x02 /* - Attr has a set of sets of values */
+ unsigned int size; /* - Value size (FSINFO_STRUCT/FSINFO_LIST) */
+};
+
+#define FSINFO_ATTR_FSINFO_ATTRIBUTE_INFO__STRUCT struct fsinfo_attribute_info
+#define FSINFO_ATTR_FSINFO_ATTRIBUTES__STRUCT __u32
+
+/*
+ * A 128-bit quantity carried as two 64-bit halves.  The order of the halves
+ * follows the byte order selected at compile time (hi first for big-endian,
+ * lo first for little-endian).
+ *
+ * NOTE(review): if neither branch of the preprocessor test matches, the
+ * struct ends up with no members - confirm every supported build defines one
+ * of the endianness macros.
+ */
+struct fsinfo_u128 {
+#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __BIG_ENDIAN : defined(__BIG_ENDIAN)
+ __u64 hi;
+ __u64 lo;
+#elif defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN)
+ __u64 lo;
+ __u64 hi;
+#endif
+};
+
+/*
+ * Information struct for fsinfo(FSINFO_ATTR_STATFS).
+ * - This gives extended filesystem information.
+ */
+struct fsinfo_statfs {
+ struct fsinfo_u128 f_blocks; /* Total number of blocks in fs */
+ struct fsinfo_u128 f_bfree; /* Total number of free blocks */
+ struct fsinfo_u128 f_bavail; /* Number of free blocks available to ordinary user */
+ struct fsinfo_u128 f_files; /* Total number of file nodes in fs */
+ struct fsinfo_u128 f_ffree; /* Number of free file nodes */
+ struct fsinfo_u128 f_favail; /* Number of file nodes available to ordinary user */
+ __u64 f_bsize; /* Optimal block size */
+ __u64 f_frsize; /* Fragment size */
+};
+
+#define FSINFO_ATTR_STATFS__STRUCT struct fsinfo_statfs
+
+/*
+ * Information struct for fsinfo(FSINFO_ATTR_IDS).
+ *
+ * List of basic identifiers as is normally found in statfs().
+ */
+struct fsinfo_ids {
+ char f_fs_name[15 + 1]; /* Filesystem name */
+ __u64 f_fsid; /* Short 64-bit Filesystem ID (as statfs) */
+ __u64 f_sb_id; /* Internal superblock ID for sbnotify()/mntnotify() */
+ __u32 f_fstype; /* Filesystem type from linux/magic.h [uncond] */
+ __u32 f_dev_major; /* As st_dev_* from struct statx [uncond] */
+ __u32 f_dev_minor;
+ __u32 __padding[1];
+};
+
+#define FSINFO_ATTR_IDS__STRUCT struct fsinfo_ids
+
+/*
+ * Information struct for fsinfo(FSINFO_ATTR_LIMITS).
+ *
+ * List of supported filesystem limits.
+ */
+struct fsinfo_limits {
+ struct fsinfo_u128 max_file_size; /* Maximum file size */
+ struct fsinfo_u128 max_ino; /* Maximum inode number */
+ __u64 max_uid; /* Maximum UID supported */
+ __u64 max_gid; /* Maximum GID supported */
+ __u64 max_projid; /* Maximum project ID supported */
+ __u64 max_hard_links; /* Maximum number of hard links on a file */
+ __u64 max_xattr_body_len; /* Maximum xattr content length */
+ __u32 max_xattr_name_len; /* Maximum xattr name length */
+ __u32 max_filename_len; /* Maximum filename length */
+ __u32 max_symlink_len; /* Maximum symlink content length */
+ __u32 max_dev_major; /* Maximum device major representable */
+ __u32 max_dev_minor; /* Maximum device minor representable */
+ __u32 __padding[1];
+};
+
+#define FSINFO_ATTR_LIMITS__STRUCT struct fsinfo_limits
+
+/*
+ * Information struct for fsinfo(FSINFO_ATTR_SUPPORTS).
+ *
+ * What's supported in various masks, such as statx() attribute and mask bits
+ * and IOC flags.
+ */
+struct fsinfo_supports {
+ __u64 stx_attributes; /* What statx::stx_attributes are supported */
+ __u32 stx_mask; /* What statx::stx_mask bits are supported */
+ __u32 fs_ioc_getflags; /* What FS_IOC_GETFLAGS may return */
+ __u32 fs_ioc_setflags_set; /* What FS_IOC_SETFLAGS may set */
+ __u32 fs_ioc_setflags_clear; /* What FS_IOC_SETFLAGS may clear */
+ __u32 fs_ioc_fsgetxattr_xflags; /* What FS_IOC_FSGETXATTR[A] may return in fsx_xflags */
+ __u32 fs_ioc_fssetxattr_xflags_set; /* What FS_IOC_FSSETXATTR may set in fsx_xflags */
+ __u32 fs_ioc_fssetxattr_xflags_clear; /* What FS_IOC_FSSETXATTR may clear in fsx_xflags */
+ __u32 win_file_attrs; /* What DOS/Windows FILE_* attributes are supported */
+};
+
+#define FSINFO_ATTR_SUPPORTS__STRUCT struct fsinfo_supports
+
+/*
+ * Range and granularity of a single kind of inode timestamp; four of these
+ * make up struct fsinfo_timestamp_info.
+ */
+struct fsinfo_timestamp_one {
+ __s64 minimum; /* Minimum timestamp value in seconds */
+ __s64 maximum; /* Maximum timestamp value in seconds */
+ __u16 gran_mantissa; /* Granularity(secs) = mant * 10^exp */
+ __s8 gran_exponent;
+ __u8 __padding[5];
+};
+
+/*
+ * Information struct for fsinfo(FSINFO_ATTR_TIMESTAMP_INFO).
+ */
+struct fsinfo_timestamp_info {
+ struct fsinfo_timestamp_one atime; /* Access time */
+ struct fsinfo_timestamp_one mtime; /* Modification time */
+ struct fsinfo_timestamp_one ctime; /* Change time */
+ struct fsinfo_timestamp_one btime; /* Birth/creation time */
+};
+
+#define FSINFO_ATTR_TIMESTAMP_INFO__STRUCT struct fsinfo_timestamp_info
+
+/*
+ * Information struct for fsinfo(FSINFO_ATTR_VOLUME_UUID).
+ */
+struct fsinfo_volume_uuid {
+ __u8 uuid[16];
+};
+
+#define FSINFO_ATTR_VOLUME_UUID__STRUCT struct fsinfo_volume_uuid
+
+#endif /* _UAPI_LINUX_FSINFO_H */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 3e1c5c9d2efe..f72a9e4ddc9a 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -51,6 +51,7 @@ COND_SYSCALL_COMPAT(io_pgetevents);
COND_SYSCALL(io_uring_setup);
COND_SYSCALL(io_uring_enter);
COND_SYSCALL(io_uring_register);
+COND_SYSCALL(fsinfo);
/* fs/xattr.c */
diff --git a/samples/vfs/Makefile b/samples/vfs/Makefile
index 00b6824f9237..d63af5106fc2 100644
--- a/samples/vfs/Makefile
+++ b/samples/vfs/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
-userprogs := test-fsmount test-statx
+userprogs := test-fsinfo test-fsmount test-statx
always-y := $(userprogs)
userccflags += -I usr/include
diff --git a/samples/vfs/test-fsinfo.c b/samples/vfs/test-fsinfo.c
new file mode 100644
index 000000000000..934b25399ffe
--- /dev/null
+++ b/samples/vfs/test-fsinfo.c
@@ -0,0 +1,646 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Test the fsinfo() system call
+ *
+ * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define _GNU_SOURCE
+#define _ATFILE_SOURCE
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <errno.h>
+#include <time.h>
+#include <math.h>
+#include <fcntl.h>
+#include <sys/syscall.h>
+#include <linux/fsinfo.h>
+#include <linux/socket.h>
+#include <sys/stat.h>
+#include <arpa/inet.h>
+
+#ifndef __NR_fsinfo
+#define __NR_fsinfo -1
+#endif
+
+static bool debug = 0;
+static bool list_last;
+
+/*
+ * Thin wrapper that invokes the fsinfo system call through syscall(), passing
+ * all six arguments straight through and returning syscall()'s result.
+ */
+static __attribute__((unused))
+ssize_t fsinfo(int dfd, const char *filename,
+ struct fsinfo_params *params, size_t params_size,
+ void *result_buffer, size_t result_buf_size)
+{
+ return syscall(__NR_fsinfo, dfd, filename,
+ params, params_size,
+ result_buffer, result_buf_size);
+}
+
+/*
+ * Test-program description of one attribute: its ID, value type and size,
+ * a printable name and the function used to display a reply.
+ */
+struct fsinfo_attribute {
+ unsigned int attr_id;
+ enum fsinfo_value_type type;
+ unsigned int size;
+ const char *name;
+ void (*dump)(void *reply, unsigned int size);
+};
+
+static const struct fsinfo_attribute fsinfo_attributes[];
+
+static ssize_t get_fsinfo(const char *, const char *, struct fsinfo_params *, void **);
+
+/*
+ * Hex-dump data[from..to) to @f: sixteen bytes per line, each line prefixed
+ * with the offset of its first byte, and a space after every fourth byte.
+ * A final newline is added if the last line was left incomplete.
+ */
+static void dump_hex(FILE *f, unsigned char *data, int from, int to)
+{
+	unsigned offset, col = 0;
+	bool at_line_start = true;
+
+	for (offset = from; offset < to; offset++) {
+		if (at_line_start) {
+			fprintf(f, "%04x: ", offset);
+			at_line_start = false;
+		}
+		fprintf(f, "%02x", data[offset]);
+		col++;
+
+		/* Separators only go after complete groups of four bytes. */
+		if ((col & 3) != 0)
+			continue;
+
+		if ((col & 15) != 0) {
+			fprintf(f, " ");
+		} else {
+			fprintf(f, "\n");
+			at_line_start = true;
+		}
+	}
+
+	if (!at_line_start)
+		fprintf(f, "\n");
+}
+
+/*
+ * Print one line describing a struct fsinfo_attribute_info reply: attribute
+ * ID, value type (with an "x N" / "x NM" suffix for multi-valued attributes),
+ * flags, value size ("-" if zero) and the attribute's name if it is found in
+ * fsinfo_attributes[].
+ */
+static void dump_attribute_info(void *reply, unsigned int size)
+{
+ struct fsinfo_attribute_info *attr_info = reply;
+ const struct fsinfo_attribute *attr;
+ char type[32], val_size[32];
+
+ switch (attr_info->type) {
+ case FSINFO_TYPE_VSTRUCT: strcpy(type, "V-STRUCT"); break;
+ case FSINFO_TYPE_STRING: strcpy(type, "STRING"); break;
+ case FSINFO_TYPE_OPAQUE: strcpy(type, "OPAQUE"); break;
+ case FSINFO_TYPE_LIST: strcpy(type, "LIST"); break;
+ default:
+ sprintf(type, "type-%x", attr_info->type);
+ break;
+ }
+
+ if (attr_info->flags & FSINFO_FLAGS_N)
+ strcat(type, " x N");
+ else if (attr_info->flags & FSINFO_FLAGS_NM)
+ strcat(type, " x NM");
+
+ /* If no entry matches, attr ends on the terminator, whose name is NULL. */
+ for (attr = fsinfo_attributes; attr->name; attr++)
+ if (attr->attr_id == attr_info->attr_id)
+ break;
+
+ if (attr_info->size)
+ sprintf(val_size, "%u", attr_info->size);
+ else
+ strcpy(val_size, "-");
+
+ printf("%8x %-12s %08x %5s %s\n",
+ attr_info->attr_id,
+ type,
+ attr_info->flags,
+ val_size,
+ attr->name ? attr->name : "");
+}
+
+/*
+ * Print a struct fsinfo_statfs reply.  Only the low 64 bits of each 128-bit
+ * block/file count are shown.
+ */
+static void dump_fsinfo_generic_statfs(void *reply, unsigned int size)
+{
+ struct fsinfo_statfs *f = reply;
+
+ printf("\n");
+ printf("\tblocks : n=%llu fr=%llu av=%llu\n",
+ (unsigned long long)f->f_blocks.lo,
+ (unsigned long long)f->f_bfree.lo,
+ (unsigned long long)f->f_bavail.lo);
+
+ printf("\tfiles : n=%llu fr=%llu av=%llu\n",
+ (unsigned long long)f->f_files.lo,
+ (unsigned long long)f->f_ffree.lo,
+ (unsigned long long)f->f_favail.lo);
+ printf("\tbsize : %llu\n",
+ (unsigned long long)f->f_bsize);
+ printf("\tfrsize : %llu\n",
+ (unsigned long long)f->f_frsize);
+}
+
+/*
+ * Print a struct fsinfo_ids reply: device numbers, filesystem type and name,
+ * fsid and superblock ID.
+ */
+static void dump_fsinfo_generic_ids(void *reply, unsigned int size)
+{
+ struct fsinfo_ids *f = reply;
+
+ printf("\n");
+ printf("\tdev : %02x:%02x\n", f->f_dev_major, f->f_dev_minor);
+ printf("\tfs : type=%x name=%s\n", f->f_fstype, f->f_fs_name);
+ printf("\tfsid : %llx\n", (unsigned long long)f->f_fsid);
+ printf("\tsbid : %llx\n", (unsigned long long)f->f_sb_id);
+}
+
+/*
+ * Print a struct fsinfo_limits reply in hex.  The 128-bit limits are shown as
+ * the high half followed by the zero-padded low half.
+ */
+static void dump_fsinfo_generic_limits(void *reply, unsigned int size)
+{
+ struct fsinfo_limits *f = reply;
+
+ printf("\n");
+ printf("\tmax file size: %llx%016llx\n",
+ (unsigned long long)f->max_file_size.hi,
+ (unsigned long long)f->max_file_size.lo);
+ printf("\tmax ino : %llx%016llx\n",
+ (unsigned long long)f->max_ino.hi,
+ (unsigned long long)f->max_ino.lo);
+ printf("\tmax ids : u=%llx g=%llx p=%llx\n",
+ (unsigned long long)f->max_uid,
+ (unsigned long long)f->max_gid,
+ (unsigned long long)f->max_projid);
+ printf("\tmax dev : maj=%x min=%x\n",
+ f->max_dev_major, f->max_dev_minor);
+ printf("\tmax links : %llx\n",
+ (unsigned long long)f->max_hard_links);
+ printf("\tmax xattr : n=%x b=%llx\n",
+ f->max_xattr_name_len,
+ (unsigned long long)f->max_xattr_body_len);
+ printf("\tmax len : file=%x sym=%x\n",
+ f->max_filename_len, f->max_symlink_len);
+}
+
+/*
+ * Print a struct fsinfo_supports reply: every supported-feature mask, in hex.
+ */
+static void dump_fsinfo_generic_supports(void *reply, unsigned int size)
+{
+ struct fsinfo_supports *f = reply;
+
+ printf("\n");
+ printf("\tstx_attr : %llx\n", (unsigned long long)f->stx_attributes);
+ printf("\tstx_mask : %x\n", f->stx_mask);
+ printf("\tfs_ioc_*flags: get=%x set=%x clr=%x\n",
+ f->fs_ioc_getflags, f->fs_ioc_setflags_set, f->fs_ioc_setflags_clear);
+ printf("\tfs_ioc_*xattr: fsx_xflags: get=%x set=%x clr=%x\n",
+ f->fs_ioc_fsgetxattr_xflags,
+ f->fs_ioc_fssetxattr_xflags_set,
+ f->fs_ioc_fssetxattr_xflags_clear);
+ printf("\twin_fattrs : %x\n", f->win_file_attrs);
+}
+
+/*
+ * Print the granularity (mantissa E exponent) and range of one timestamp.
+ * @stamp is the letter placed before "time" in the label (e.g. 'a' gives
+ * "atime").  The range bounds are signed values shown as raw hex, hence the
+ * casts to the unsigned type that %llx expects.
+ */
+static void print_time(struct fsinfo_timestamp_one *t, char stamp)
+{
+	printf("\t%ctime : gran=%uE%d range=%llx-%llx\n",
+	       stamp,
+	       t->gran_mantissa, t->gran_exponent,
+	       (unsigned long long)t->minimum,
+	       (unsigned long long)t->maximum);
+}
+
+/*
+ * Print a struct fsinfo_timestamp_info reply, one line per timestamp kind
+ * (access, modification, change, birth) via print_time().
+ */
+static void dump_fsinfo_generic_timestamp_info(void *reply, unsigned int size)
+{
+ struct fsinfo_timestamp_info *f = reply;
+
+ printf("\n");
+ print_time(&f->atime, 'a');
+ print_time(&f->mtime, 'm');
+ print_time(&f->ctime, 'c');
+ print_time(&f->btime, 'b');
+}
+
+/*
+ * Print a struct fsinfo_volume_uuid reply as 32 hex digits grouped
+ * 8-4-4-4-12, taking the bytes in array order.
+ */
+static void dump_fsinfo_generic_volume_uuid(void *reply, unsigned int size)
+{
+ struct fsinfo_volume_uuid *f = reply;
+
+ printf("%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x"
+ "-%02x%02x%02x%02x%02x%02x\n",
+ f->uuid[ 0], f->uuid[ 1],
+ f->uuid[ 2], f->uuid[ 3],
+ f->uuid[ 4], f->uuid[ 5],
+ f->uuid[ 6], f->uuid[ 7],
+ f->uuid[ 8], f->uuid[ 9],
+ f->uuid[10], f->uuid[11],
+ f->uuid[12], f->uuid[13],
+ f->uuid[14], f->uuid[15]);
+}
+
+/*
+ * Print a string-type reply, modifying the buffer in place.
+ *
+ * The string is terminated at @size bytes, or, if @size is 4096 or more, cut
+ * to 4095 characters ending in "...".  Bytes that are neither printable, tab
+ * nor newline are replaced with '?'.  If the text contains a newline it is
+ * started on a fresh line, and a trailing newline is added unless the text
+ * already ends with one.
+ *
+ * NOTE(review): writes reply[size], so the buffer is assumed to be at least
+ * @size + 1 bytes long - confirm against the caller.
+ */
+static void dump_string(void *reply, unsigned int size)
+{
+	char *s = reply, *p;
+	bool nl = false, last_nl = false;
+
+	p = s;
+	if (size >= 4096) {
+		size = 4096;
+		p[4092] = '.';
+		p[4093] = '.';
+		p[4094] = '.';
+		p[4095] = 0;
+	} else {
+		p[size] = 0;
+	}
+
+	for (p = s; *p; p++) {
+		if (*p == '\n') {
+			last_nl = nl = true;
+			continue;
+		}
+		last_nl = false;
+		/*
+		 * isprint() is only defined for values representable as
+		 * unsigned char (or EOF); plain char may be negative.
+		 */
+		if (!isprint((unsigned char)*p) && *p != '\t')
+			*p = '?';
+	}
+
+	if (nl)
+		putchar('\n');
+	printf("%s", s);
+	if (!last_nl)
+		putchar('\n');
+}
+
+/*
+ * Placeholder "dump functions" for the two meta-attributes: a recognisable
+ * dummy pointer rather than a real function.
+ * NOTE(review): presumably the caller special-cases these attributes and
+ * never calls through the pointer - confirm in the code that uses ->dump.
+ */
+#define dump_fsinfo_meta_attribute_info (void *)0x123
+#define dump_fsinfo_meta_attributes (void *)0x123
+
+/*
+ * Initialisers for this program's fsinfo_attributes[] table.
+ *
+ * A is the attribute ID, T the value type, S the value size, G the suffix of
+ * the dump_<G>() function and N the printable name (the stringified ID).
+ * The flags argument F is accepted by __FSINFO() but not stored, as the
+ * test program's struct fsinfo_attribute has no flags member.
+ */
+#define __FSINFO(A, T, S, G, F, N) \
+ { .attr_id = A, .type = T, .size = S, .name = N, .dump = dump_##G }
+
+#define _FSINFO(A,T,S,G,N) __FSINFO(A, T, S, G, 0, N)
+#define _FSINFO_N(A,T,S,G,N) __FSINFO(A, T, S, G, FSINFO_FLAGS_N, N)
+#define _FSINFO_NM(A,T,S,G,N) __FSINFO(A, T, S, G, FSINFO_FLAGS_NM, N)
+
+#define _FSINFO_VSTRUCT(A,S,G,N) _FSINFO (A, FSINFO_TYPE_VSTRUCT, sizeof(S), G, N)
+#define _FSINFO_VSTRUCT_N(A,S,G,N) _FSINFO_N (A, FSINFO_TYPE_VSTRUCT, sizeof(S), G, N)
+#define _FSINFO_VSTRUCT_NM(A,S,G,N) _FSINFO_NM(A, FSINFO_TYPE_VSTRUCT, sizeof(S), G, N)
+
+#define FSINFO_VSTRUCT(A,G) _FSINFO_VSTRUCT (A, A##__STRUCT, G, #A)
+#define FSINFO_VSTRUCT_N(A,G) _FSINFO_VSTRUCT_N (A, A##__STRUCT, G, #A)
+#define FSINFO_VSTRUCT_NM(A,G) _FSINFO_VSTRUCT_NM(A, A##__STRUCT, G, #A)
+#define FSINFO_STRING(A,G) _FSINFO (A, FSINFO_TYPE_STRING, 0, G, #A)
+#define FSINFO_STRING_N(A,G) _FSINFO_N (A, FSINFO_TYPE_STRING, 0, G, #A)
+#define FSINFO_STRING_NM(A,G) _FSINFO_NM(A, FSINFO_TYPE_STRING, 0, G, #A)
+#define FSINFO_OPAQUE(A,G) _FSINFO (A, FSINFO_TYPE_OPAQUE, 0, G, #A)
+#define FSINFO_LIST(A,G) _FSINFO (A, FSINFO_TYPE_LIST, sizeof(A##__STRUCT), G, #A)
+#define FSINFO_LIST_N(A,G) _FSINFO_N (A, FSINFO_TYPE_LIST, sizeof(A##__STRUCT), G, #A)
+
+ /*
+ * Table of the attributes this program knows how to display, pairing each
+ * attribute ID with its type, expected size, name and dump routine.  The
+ * empty final entry has a NULL name, which is what try_one() uses to detect
+ * the end of the table.
+ */
+ static const struct fsinfo_attribute fsinfo_attributes[] = {
+ FSINFO_VSTRUCT (FSINFO_ATTR_STATFS, fsinfo_generic_statfs),
+ FSINFO_VSTRUCT (FSINFO_ATTR_IDS, fsinfo_generic_ids),
+ FSINFO_VSTRUCT (FSINFO_ATTR_LIMITS, fsinfo_generic_limits),
+ FSINFO_VSTRUCT (FSINFO_ATTR_SUPPORTS, fsinfo_generic_supports),
+ FSINFO_VSTRUCT (FSINFO_ATTR_TIMESTAMP_INFO, fsinfo_generic_timestamp_info),
+ FSINFO_STRING (FSINFO_ATTR_VOLUME_ID, string),
+ FSINFO_VSTRUCT (FSINFO_ATTR_VOLUME_UUID, fsinfo_generic_volume_uuid),
+ FSINFO_STRING (FSINFO_ATTR_VOLUME_NAME, string),
+ FSINFO_VSTRUCT_N(FSINFO_ATTR_FSINFO_ATTRIBUTE_INFO, fsinfo_meta_attribute_info),
+ FSINFO_LIST (FSINFO_ATTR_FSINFO_ATTRIBUTES, fsinfo_meta_attributes),
+ {}
+ };
+
+/*
+ * Report an inconsistent reply from fsinfo() on stderr and exit(1).
+ *
+ * @what:      short description of what was wrong
+ * @params:    the query that was issued (Nth/Mth are reported)
+ * @attr:      local table entry for the attribute, or NULL if it is unknown
+ * @attr_info: the kernel's description of the attribute
+ * @reply:     reply buffer to hex-dump, or NULL if there is none
+ * @size:      number of bytes in @reply
+ */
+static __attribute__((noreturn))
+void bad_value(const char *what,
+	       struct fsinfo_params *params,
+	       const struct fsinfo_attribute *attr,
+	       const struct fsinfo_attribute_info *attr_info,
+	       void *reply, unsigned int size)
+{
+	/* try_one() passes attr == NULL for IDs missing from the local table. */
+	const char *name = attr ? attr->name : "<unknown>";
+
+	printf("\n");
+	fprintf(stderr, "%s %s{%u}{%u} t=%x f=%x s=%x\n",
+		what, name, params->Nth, params->Mth,
+		attr_info->type, attr_info->flags, attr_info->size);
+	fprintf(stderr, "size=%u\n", size);
+	/* On the error paths there is no reply buffer to dump. */
+	if (reply)
+		dump_hex(stderr, reply, 0, size);
+	exit(1);
+}
+
+/*
+ * Display a single (non-list) attribute value using the dumper from the local
+ * attribute table.  Prints a placeholder instead when no dumper is known, or
+ * when a VSTRUCT reply is shorter than the size the table expects.
+ * @attr_id and @attr_info are not consulted here.
+ */
+static void dump_value(unsigned int attr_id,
+		       const struct fsinfo_attribute *attr,
+		       const struct fsinfo_attribute_info *attr_info,
+		       void *reply, unsigned int size)
+{
+	if (attr == NULL || attr->dump == NULL)
+		printf("<no dumper>\n");
+	else if (attr->type == FSINFO_TYPE_VSTRUCT && size < attr->size)
+		printf("<short data %u/%u>\n", size, attr->size);
+	else
+		attr->dump(reply, size);
+}
+
+/*
+ * Display a list attribute one element at a time.
+ *
+ * The element size comes from the kernel's attribute description
+ * (@attr_info->size); each element is handed to the table's dumper together
+ * with the number of bytes remaining.  list_last is set while the final
+ * element is being dumped.  Trailing bytes that do not make up a whole
+ * element are ignored.  @attr_id is not consulted here.
+ */
+static void dump_list(unsigned int attr_id,
+		      const struct fsinfo_attribute *attr,
+		      const struct fsinfo_attribute_info *attr_info,
+		      void *reply, unsigned int size)
+{
+	size_t elem_size = attr_info->size;
+	char *elem = reply;	/* byte cursor; avoids arithmetic on void * */
+	unsigned int ix = 0;
+
+	printf("\n");
+	if (!attr || !attr->dump) {
+		printf("<no dumper>\n");
+		return;
+	}
+
+	if (attr->type == FSINFO_TYPE_VSTRUCT && size < attr->size) {
+		printf("<short data %u/%u>\n", size, attr->size);
+		return;
+	}
+
+	/* A zero element size would never consume the buffer: refuse it. */
+	if (elem_size == 0) {
+		printf("<zero element size>\n");
+		return;
+	}
+
+	list_last = false;
+	while (size >= elem_size) {
+		printf("\t[%02x] ", ix);
+		if (size == elem_size)
+			list_last = true;
+		attr->dump(elem, size);
+		elem += elem_size;
+		size -= elem_size;
+		ix++;
+	}
+}
+
+/*
+ * Call fsinfo, expanding the buffer as necessary.
+ */
+static ssize_t get_fsinfo(const char *file, const char *name,
+ struct fsinfo_params *params, void **_r)
+{
+ ssize_t ret;
+ size_t buf_size = 4096;
+ void *r;
+
+ for (;;) {
+ r = malloc(buf_size);
+ if (!r) {
+ perror("malloc");
+ exit(1);
+ }
+ memset(r, 0xbd, buf_size);
+
+ errno = 0;
+ ret = fsinfo(AT_FDCWD, file, params, sizeof(*params), r, buf_size - 1);
+ if (ret == -1)
+ goto error;
+
+ if (ret <= buf_size - 1)
+ break;
+ buf_size = (ret + 4096 - 1) & ~(4096 - 1);
+ }
+
+ if (debug)
+ printf("fsinfo(%s,%s,%u,%u) = %zd\n",
+ file, name, params->Nth, params->Mth, ret);
+
+ ((char *)r)[ret] = 0;
+ *_r = r;
+ return ret;
+
+error:
+ *_r = NULL;
+ free(r);
+ if (debug)
+ printf("fsinfo(%s,%s,%u,%u) = %m\n",
+ file, name, params->Nth, params->Mth);
+ return ret;
+}
+
+/*
+ * Try one subinstance of an attribute.
+ */
+static int try_one(const char *file, struct fsinfo_params *params,
+ const struct fsinfo_attribute_info *attr_info, bool raw)
+{
+ const struct fsinfo_attribute *attr;
+ const char *name;
+ size_t size = 4096;
+ char namebuf[32];
+ void *r;
+
+ for (attr = fsinfo_attributes; attr->name; attr++) {
+ if (attr->attr_id == params->request) {
+ name = attr->name;
+ if (strncmp(name, "fsinfo_generic_", 15) == 0)
+ name += 15;
+ goto found;
+ }
+ }
+
+ sprintf(namebuf, "<unknown-%x>", params->request);
+ name = namebuf;
+ attr = NULL;
+
+found:
+ size = get_fsinfo(file, name, params, &r);
+
+ if (size == -1) {
+ if (errno == ENODATA) {
+ if (!(attr_info->flags & (FSINFO_FLAGS_N | FSINFO_FLAGS_NM)) &&
+ params->Nth == 0 && params->Mth == 0)
+ bad_value("Unexpected ENODATA",
+ params, attr, attr_info, r, size);
+ free(r);
+ return (params->Mth == 0) ? 2 : 1;
+ }
+ if (errno == EOPNOTSUPP) {
+ if (params->Nth > 0 || params->Mth > 0)
+ bad_value("Should return ENODATA",
+ params, attr, attr_info, r, size);
+ //printf("\e[33m%s\e[m: <not supported>\n",
+ // fsinfo_attr_names[attr]);
+ free(r);
+ return 2;
+ }
+ perror(file);
+ exit(1);
+ }
+
+ if (raw) {
+ if (size > 4096)
+ size = 4096;
+ dump_hex(stdout, r, 0, size);
+ free(r);
+ return 0;
+ }
+
+ switch (attr_info->flags & (FSINFO_FLAGS_N | FSINFO_FLAGS_NM)) {
+ case 0:
+ printf("\e[33m%s\e[m: ", name);
+ break;
+ case FSINFO_FLAGS_N:
+ printf("\e[33m%s{%u}\e[m: ", name, params->Nth);
+ break;
+ case FSINFO_FLAGS_NM:
+ printf("\e[33m%s{%u,%u}\e[m: ", name, params->Nth, params->Mth);
+ break;
+ }
+
+ switch (attr_info->type) {
+ case FSINFO_TYPE_STRING:
+ if (size == 0 || ((char *)r)[size - 1] != 0)
+ bad_value("Unterminated string",
+ params, attr, attr_info, r, size);
+ case FSINFO_TYPE_VSTRUCT:
+ case FSINFO_TYPE_OPAQUE:
+ dump_value(params->request, attr, attr_info, r, size);
+ free(r);
+ return 0;
+
+ case FSINFO_TYPE_LIST:
+ dump_list(params->request, attr, attr_info, r, size);
+ free(r);
+ return 0;
+
+ default:
+ bad_value("Fishy type", params, attr, attr_info, r, size);
+ }
+}
+
+/*
+ * qsort() comparison function for unsigned 32-bit attribute IDs.
+ *
+ * Returns a negative, zero or positive value as *a is less than, equal to or
+ * greater than *b.  The values are compared rather than subtracted so that
+ * IDs of 0x80000000 and above order correctly and no signed overflow occurs.
+ */
+static int cmp_u32(const void *a, const void *b)
+{
+	unsigned int x = *(const unsigned int *)a;
+	unsigned int y = *(const unsigned int *)b;
+
+	return (x > y) - (x < y);
+}
+
+/*
+ *
+ */
+int main(int argc, char **argv)
+{
+ struct fsinfo_attribute_info attr_info;
+ struct fsinfo_params params = {
+ .at_flags = AT_SYMLINK_NOFOLLOW,
+ .flags = FSINFO_FLAGS_QUERY_PATH,
+ };
+ unsigned int *attrs, ret, nr, i;
+ bool meta = false;
+ int raw = 0, opt, Nth, Mth;
+
+ while ((opt = getopt(argc, argv, "Madlr"))) {
+ switch (opt) {
+ case 'M':
+ meta = true;
+ continue;
+ case 'a':
+ params.at_flags |= AT_NO_AUTOMOUNT;
+ params.flags = FSINFO_FLAGS_QUERY_PATH;
+ continue;
+ case 'd':
+ debug = true;
+ continue;
+ case 'l':
+ params.at_flags &= ~AT_SYMLINK_NOFOLLOW;
+ params.flags = FSINFO_FLAGS_QUERY_PATH;
+ continue;
+ case 'r':
+ raw = 1;
+ continue;
+ }
+ break;
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc != 1) {
+ printf("Format: test-fsinfo [-Madlr] <path>\n");
+ exit(2);
+ }
+
+ /* Retrieve a list of supported attribute IDs */
+ params.request = FSINFO_ATTR_FSINFO_ATTRIBUTES;
+ params.Nth = 0;
+ params.Mth = 0;
+ ret = get_fsinfo(argv[0], "attributes", ¶ms, (void **)&attrs);
+ if (ret == -1) {
+ fprintf(stderr, "Unable to get attribute list: %m\n");
+ exit(1);
+ }
+
+ if (ret % sizeof(attrs[0])) {
+ fprintf(stderr, "Bad length of attribute list (0x%x)\n", ret);
+ exit(2);
+ }
+
+ nr = ret / sizeof(attrs[0]);
+ qsort(attrs, nr, sizeof(attrs[0]), cmp_u32);
+
+ if (meta) {
+ printf("ATTR ID TYPE FLAGS SIZE NAME\n");
+ printf("======== ============ ======== ===== =========\n");
+ for (i = 0; i < nr; i++) {
+ params.request = FSINFO_ATTR_FSINFO_ATTRIBUTE_INFO;
+ params.Nth = attrs[i];
+ params.Mth = 0;
+ ret = fsinfo(AT_FDCWD, argv[0],
+ ¶ms, sizeof(params),
+ &attr_info, sizeof(attr_info));
+ if (ret == -1) {
+ fprintf(stderr, "Can't get info for attribute %x: %m\n", attrs[i]);
+ exit(1);
+ }
+
+ dump_attribute_info(&attr_info, ret);
+ }
+ exit(0);
+ }
+
+ for (i = 0; i < nr; i++) {
+ params.request = FSINFO_ATTR_FSINFO_ATTRIBUTE_INFO;
+ params.Nth = attrs[i];
+ params.Mth = 0;
+ ret = fsinfo(AT_FDCWD, argv[0],
+ ¶ms, sizeof(params),
+ &attr_info, sizeof(attr_info));
+ if (ret == -1) {
+ fprintf(stderr, "Can't get info for attribute %x: %m\n", attrs[i]);
+ exit(1);
+ }
+
+ if (attrs[i] == FSINFO_ATTR_FSINFO_ATTRIBUTE_INFO ||
+ attrs[i] == FSINFO_ATTR_FSINFO_ATTRIBUTES)
+ continue;
+
+ if (attrs[i] != attr_info.attr_id) {
+ fprintf(stderr, "ID for %03x returned %03x\n",
+ attrs[i], attr_info.attr_id);
+ break;
+ }
+ Nth = 0;
+ do {
+ Mth = 0;
+ do {
+ params.request = attrs[i];
+ params.Nth = Nth;
+ params.Mth = Mth;
+
+ switch (try_one(argv[0], ¶ms, &attr_info, raw)) {
+ case 0:
+ continue;
+ case 1:
+ goto done_M;
+ case 2:
+ goto done_N;
+ }
+ } while (++Mth < 100);
+
+ done_M:
+ if (Mth >= 100) {
+ fprintf(stderr, "Fishy: Mth %x[%u][%u]\n", attrs[i], Nth, Mth);
+ break;
+ }
+
+ } while (++Nth < 100);
+
+ done_N:
+ if (Nth >= 100) {
+ fprintf(stderr, "Fishy: Nth %x[%u]\n", attrs[i], Nth);
+ break;
+ }
+ }
+
+ return 0;
+}
^ permalink raw reply related
* [PATCH 01/18] fsinfo: Introduce a non-repeating system-unique superblock ID [ver #21]
From: David Howells @ 2020-08-03 13:36 UTC (permalink / raw)
To: viro
Cc: dhowells, torvalds, raven, mszeredi, christian, jannh,
darrick.wong, kzak, jlayton, linux-api, linux-fsdevel,
linux-security-module, linux-kernel
In-Reply-To: <159646178122.1784947.11705396571718464082.stgit@warthog.procyon.org.uk>
Introduce an (effectively) non-repeating system-unique superblock ID that
can be used to determine that two objects are in the same superblock
without needing to worry about the ID changing in the meantime (as is
possible with device IDs).
The counter could also be used to tag other features, such as mount
objects.
Signed-off-by: David Howells <dhowells@redhat.com>
---
fs/internal.h | 1 +
fs/super.c | 2 ++
include/linux/fs.h | 3 +++
3 files changed, 6 insertions(+)
diff --git a/fs/internal.h b/fs/internal.h
index 9b863a7bd708..ea60d864a8cb 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -103,6 +103,7 @@ extern struct file *alloc_empty_file_noaccount(int, const struct cred *);
/*
* super.c
*/
+extern atomic64_t vfs_unique_counter;
extern int reconfigure_super(struct fs_context *);
extern bool trylock_super(struct super_block *sb);
extern struct super_block *user_get_super(dev_t);
diff --git a/fs/super.c b/fs/super.c
index 904459b35119..21ae8afeba3a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -44,6 +44,7 @@ static int thaw_super_locked(struct super_block *sb);
static LIST_HEAD(super_blocks);
static DEFINE_SPINLOCK(sb_lock);
+atomic64_t vfs_unique_counter; /* Unique identifier counter */
static char *sb_writers_name[SB_FREEZE_LEVELS] = {
"sb_writers",
@@ -273,6 +274,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
goto fail;
if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink))
goto fail;
+ s->s_unique_id = atomic64_inc_return(&vfs_unique_counter);
return s;
fail:
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f5abba86107d..28a29356eace 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1564,6 +1564,9 @@ struct super_block {
spinlock_t s_inode_wblist_lock;
struct list_head s_inodes_wb; /* writeback inodes */
+
+ /* Superblock information */
+ u64 s_unique_id;
} __randomize_layout;
/* Helper functions so that in most cases filesystems will
^ permalink raw reply related
* [PATCH 00/18] VFS: Filesystem information [ver #21]
From: David Howells @ 2020-08-03 13:36 UTC (permalink / raw)
To: viro
Cc: Theodore Ts'o, Andreas Dilger, Eric Biggers, Jeff Layton,
linux-ext4, Carlos Maiolino, Darrick J. Wong, linux-api, dhowells,
torvalds, raven, mszeredi, christian, jannh, darrick.wong, kzak,
jlayton, linux-api, linux-fsdevel, linux-security-module,
linux-kernel
Here's a set of patches that adds a system call, fsinfo(), that allows
information about the VFS, mount topology, superblock and files to be
retrieved.
The patchset is based on top of the notifications patchset and allows event
counters implemented in the latter to be retrieved to allow overruns to be
efficiently managed.
=======
THE WHY
=======
Why do we want this?
Using /proc/mounts (or similar) has problems:
(1) Reading from it holds a global lock (namespace_sem) that prevents
mounting and unmounting. Lots of data is encoded and mangled into
text whilst the lock is held, including superblock option strings and
mount point paths. This causes performance problems when there are a
lot of mount objects in a system.
(2) Even though namespace_sem is held during a read, reading the whole
file isn't necessarily atomic with respect to mount-type operations.
If a read isn't satisfied in one go, then it may return to userspace
briefly and then continue reading some way into the file. But changes
can occur in the interval that may then go unseen.
(3) Determining what has changed means parsing and comparing consecutive
outputs of /proc/mounts.
(4) Querying a specific mount or superblock means searching through
/proc/mounts and searching by path or mount ID - but we might have an
fd we want to query.
(5) Whilst you can poll() it for events, it only tells you that something
changed in the namespace, not what or whether you can even see the
change.
To fix the notification issues, the preceding notifications patchset added
mount watch notifications whereby you can watch for notifications in a
specific mount subtree. The notification messages include the ID(s) of the
affected mounts.
To support notifications, however, we need to be able to handle overruns in
the notification queue. I added a number of event counters to struct
super_block and struct mount to allow you to pin down the changes, but
there needs to be a way to retrieve them. Exposing them through /proc
would require adding yet another /proc/mounts-type file. We could add
per-mount directories full of attributes in sysfs, but that has issues also
(see below).
Adding an extensible system call interface for retrieving filesystem
information also allows other things to be exposed:
(1) Jeff Layton's error handling changes need a way to allow error event
information to be retrieved.
(2) Bits in masks returned by things like statx() and FS_IOC_GETFLAGS are
actually 3-state { Set, Unset, Not supported }. It could be useful to
provide a way to expose information like this[*].
(3) Limits of the numerical metadata values in a filesystem[*].
(4) Filesystem capability information[*]. Filesystems don't all have the
same capabilities, and even different instances may have different
capabilities, particularly with network filesystems where the set of
capabilities may be server-dependent. Capabilities might even vary at file
granularity - though possibly such information should be conveyed
through statx() instead.
(5) ID mapping/shifting tables in use for a superblock.
(6) Filesystem-specific information. I need something for AFS so that I
can do pioctl()-emulation, thereby allowing me to implement certain of
the AFS command line utilities that query state of a particular file.
This could also have application for other filesystems, such as NFS,
CIFS and ext4.
[*] In a lot of cases these are probably invariant and can be memcpy'd
from static data.
There's a further consideration: I want to make it possible to have
fsconfig(fd, FSCONFIG_CMD_CREATE) be intercepted by a container manager
such that the manager can supervise a mount attempted inside the container.
The manager would be given an fd pointing to the fs_context struct and
would then need some way to query it (fsinfo()) and modify it (fsconfig()).
This could also be used to arbitrate user-requested mounts when containers
are not in play.
================
DESIGN DECISIONS
================
(1) Information is partitioned into sets of attributes.
(2) Attribute IDs are integers as they're fast to compare.
(3) Attribute values are typed (struct, list of structs, string, opaque
blob). The type is fixed for a particular attribute.
(4) For structure types, the length is also a version. New fields can be
tacked onto the end.
(5) When copying a versioned struct to userspace, the core handles a
version mismatch by truncating or zero-padding the data as necessary.
This is transparent to the filesystem.
(6) The core handles all the buffering and buffer resizing.
(7) The filesystem never gets any access to the userspace parameter buffer
or result buffer.
(8) "Meta" attributes can describe other attributes.
========
OVERVIEW
========
fsinfo() is a system call that allows information about the filesystem at a
particular path point to be queried as a set of attributes.
Attribute values are of four basic types:
(1) Structure with version-dependent length (the length is the version).
(2) Variable-length string.
(3) List of structures (all the same length).
(4) Opaque blob.
Attributes can have multiple values either as a sequence of values or a
sequence-of-sequences of values and all the values of a particular
attribute must be of the same type. Values can be up to INT_MAX size,
subject to memory availability.
Note that the values of an attribute *are* allowed to vary between dentries
within a single superblock, depending on the specific dentry that you're
looking at, but the values still have to be of the type for that attribute.
I've tried to make the interface as light as possible, so integer attribute
IDs rather than string and the core does all the buffer allocation and
expansion and all the extensibility support work rather than leaving that
to the filesystems. This also means that userspace pointers are not
exposed to the filesystem.
fsinfo() allows a variety of information to be retrieved about a filesystem
and the mount topology:
(1) General superblock attributes:
- Filesystem identifiers (UUID, volume label, device numbers, ...)
- The limits on a filesystem's capabilities
- Information on supported statx fields and attributes and IOC flags.
- A variety of single-bit flags indicating supported capabilities.
- Timestamp resolution and range.
- The amount of space/free space in a filesystem (as statfs()).
- Superblock notification counter.
(2) Filesystem-specific superblock attributes:
- Superblock-level timestamps.
- Cell name, workgroup or other netfs grouping concept.
- Server names and addresses.
(3) VFS information:
- Mount topology information.
- Mount attributes.
- Mount notification counter.
- Mount point path.
(4) Information about what the fsinfo() syscall itself supports, including
the type and struct size of attributes.
The system is extensible:
(1) New attributes can be added. There is no requirement that a
filesystem implement every attribute. A helper function is provided
to scan a list of attributes and a filesystem can have multiple such
lists.
(2) Version length-dependent structure attributes can be made larger and
have additional information tacked on the end, provided it keeps the
layout of the existing fields. If an older process asks for a shorter
structure, it will only be given the bits it asks for. If a newer
process asks for a longer structure on an older kernel, the extra
space will be set to 0. In all cases, the size of the data actually
available is returned.
In essence, the size of a structure is that structure's version: a
smaller size is an earlier version and a later version includes
everything that the earlier version did.
(3) New single-bit capability flags can be added. This is a structure-typed
attribute and, as such, (2) applies. Any bits you wanted but the kernel
doesn't support are automatically set to 0.
fsinfo() may be called like the following, for example:
struct fsinfo_params params = {
.at_flags = AT_SYMLINK_NOFOLLOW,
.flags = FSINFO_FLAGS_QUERY_PATH,
.request = FSINFO_ATTR_AFS_SERVER_ADDRESSES,
.Nth = 2,
};
struct fsinfo_server_address address;
len = fsinfo(AT_FDCWD, "/afs/grand.central.org/doc",
&params, sizeof(params),
&address, sizeof(address));
The above example would query an AFS filesystem to retrieve the address
list for the 3rd server, and:
struct fsinfo_params params = {
.at_flags = AT_SYMLINK_NOFOLLOW,
.flags = FSINFO_FLAGS_QUERY_PATH,
.request = FSINFO_ATTR_NFS_SERVER_NAME,
};
char server_name[256];
len = fsinfo(AT_FDCWD, "/home/dhowells/",
&params, sizeof(params),
&server_name, sizeof(server_name));
would retrieve the name of the NFS server as a string.
In future, I want to make fsinfo() capable of querying a context created by
fsopen() or fspick(), e.g.:
fd = fsopen("ext4", 0);
struct fsinfo_params params = {
.flags = FSINFO_FLAGS_QUERY_FSCONTEXT,
.request = FSINFO_ATTR_CONFIGURATION,
};
char buffer[65536];
fsinfo(fd, NULL, &params, sizeof(params), &buffer, sizeof(buffer));
even if that context doesn't currently have a superblock attached.
The patches can be found here also:
https://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git
on branch:
fsinfo-core
===================
SIGNIFICANT CHANGES
===================
ver #21:
(*) Moved the mount event counters here from the mount notifications
patchset. Made the counters in the kernel atomic_long_t and the UAPI
counters __u64.
(*) Added Jeff Layton's patches to allow userspace to retrieve writeback
error information through fsinfo().
ver #20:
(*) Changed MOUNT_PROPAGATION_SLAVE to MOUNT_PROPAGATION_DEPENDENT and
renamed the fields in the fsinfo_mount_topology struct. The
MOUNT_PROPAGATION_* settings have been turned into an enum and will
also be passed to mount_setattr().
(*) Adjusted the Ext4 patch from feedback and removed the example status
from it.
(*) Dropped the NFS patch.
(*) I've dropped the superblock notifications for now.
ver #19:
(*) Split FSINFO_ATTR_MOUNT_TOPOLOGY from FSINFO_ATTR_MOUNT_INFO. The
latter requires no locking as it looks no further than the mount
object it's dealing with. The topology attribute, however, has to
take the namespace lock. That said, the info attribute includes a
counter that indicates how many times a mount object's position in the
topology has changed.
(*) A bit of patch rearrangement to put the mount topology-exposing
attributes into one patch.
(*) Pass both AT_* and RESOLVE_* flags to fsinfo() as suggested by Linus,
rather than adding missing RESOLVE_* flags.
David
---
David Howells (15):
fsinfo: Introduce a non-repeating system-unique superblock ID
fsinfo: Add fsinfo() syscall to query filesystem information
fsinfo: Provide a bitmap of the features a filesystem supports
fsinfo: Allow retrieval of superblock devname, options and stats
fsinfo: Allow fsinfo() to look up a mount object by ID
fsinfo: Add a uniquifier ID to struct mount
fsinfo: Allow mount information to be queried
fsinfo: Allow mount topology and propagation info to be retrieved
watch_queue: Mount event counters
fsinfo: Provide notification overrun handling support
fsinfo: sample: Mount listing program
fsinfo: Add API documentation
fsinfo: Add support for AFS
fsinfo: Add support to ext4
fsinfo: Add an attribute that lists all the visible mounts in a namespace
Jeff Layton (3):
errseq: add a new errseq_scrape function
vfs: allow fsinfo to fetch the current state of s_wb_err
samples: add error state information to test-fsinfo.c
Documentation/filesystems/fsinfo.rst | 574 +++++++++++++
arch/alpha/kernel/syscalls/syscall.tbl | 1 +
arch/arm/tools/syscall.tbl | 1 +
arch/arm64/include/asm/unistd.h | 2 +-
arch/arm64/include/asm/unistd32.h | 2 +
arch/ia64/kernel/syscalls/syscall.tbl | 1 +
arch/m68k/kernel/syscalls/syscall.tbl | 1 +
arch/microblaze/kernel/syscalls/syscall.tbl | 1 +
arch/mips/kernel/syscalls/syscall_n32.tbl | 1 +
arch/mips/kernel/syscalls/syscall_n64.tbl | 1 +
arch/mips/kernel/syscalls/syscall_o32.tbl | 1 +
arch/parisc/kernel/syscalls/syscall.tbl | 1 +
arch/powerpc/kernel/syscalls/syscall.tbl | 1 +
arch/s390/kernel/syscalls/syscall.tbl | 1 +
arch/sh/kernel/syscalls/syscall.tbl | 1 +
arch/sparc/kernel/syscalls/syscall.tbl | 1 +
arch/x86/entry/syscalls/syscall_32.tbl | 1 +
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
arch/xtensa/kernel/syscalls/syscall.tbl | 1 +
fs/Kconfig | 7 +
fs/Makefile | 1 +
fs/afs/internal.h | 1 +
fs/afs/super.c | 216 ++++-
fs/d_path.c | 2 +-
fs/ext4/Makefile | 1 +
fs/ext4/ext4.h | 6 +
fs/ext4/fsinfo.c | 97 +++
fs/ext4/super.c | 3 +
fs/fsinfo.c | 748 +++++++++++++++++
fs/internal.h | 15 +
fs/mount.h | 6 +
fs/mount_notify.c | 10 +-
fs/namespace.c | 427 +++++++++-
include/linux/errseq.h | 1 +
include/linux/fs.h | 4 +
include/linux/fsinfo.h | 112 +++
include/linux/syscalls.h | 4 +
include/uapi/asm-generic/unistd.h | 4 +-
include/uapi/linux/fsinfo.h | 344 ++++++++
include/uapi/linux/mount.h | 13 +-
kernel/sys_ni.c | 1 +
lib/errseq.c | 33 +-
samples/vfs/Makefile | 6 +-
samples/vfs/test-fsinfo.c | 883 ++++++++++++++++++++
samples/vfs/test-mntinfo.c | 277 ++++++
45 files changed, 3802 insertions(+), 14 deletions(-)
create mode 100644 Documentation/filesystems/fsinfo.rst
create mode 100644 fs/ext4/fsinfo.c
create mode 100644 fs/fsinfo.c
create mode 100644 include/linux/fsinfo.h
create mode 100644 include/uapi/linux/fsinfo.h
create mode 100644 samples/vfs/test-fsinfo.c
create mode 100644 samples/vfs/test-mntinfo.c
^ permalink raw reply
* Re: [PATCH 0/5] Mount notifications [ver #2]
From: David Howells @ 2020-08-03 13:22 UTC (permalink / raw)
To: viro
Cc: dhowells, linux-security-module, Stephen Smalley, Linus Torvalds,
Casey Schaufler, Miklos Szeredi, Jarkko Sakkinen, James Morris,
nicolas.dichtel, raven, christian, jlayton, kzak, linux-api,
linux-fsdevel, linux-kernel
In-Reply-To: <159645997768.1779777.8286723139418624756.stgit@warthog.procyon.org.uk>
David Howells <dhowells@redhat.com> wrote:
>
> ===================
> SIGNIFICANT CHANGES
> ===================
>
> ver #2:
>
> (*) Make the ID fields in the mount notification 64-bits. They're left
> referring to the mount ID here, but switched to the mount unique ID in
> the patch in fsinfo that adds that. [Requested by Miklós Szeredi]
>
> (*) Dropped the event counters from the mount notification message.
> [Requested by Miklós].
>
> This can easily be added back later as the message length can be
> increased to show it.
>
> (*) Moved the mount event counters over to the fsinfo patchset.
Also:
(*) Added limitations on the number of concurrent watches that can be held
by a user. [Requested by Linus]
(*) Removed the unused NOTIFY_MOUNT_IS_RECURSIVE flag. [Requested by
Miklós]
David
^ permalink raw reply
* [PATCH 2/5] watch_queue: Make watch_sizeof() check record size [ver #2]
From: David Howells @ 2020-08-03 13:06 UTC (permalink / raw)
To: viro
Cc: Miklos Szeredi, dhowells, torvalds, casey, sds, nicolas.dichtel,
raven, christian, jlayton, kzak, mszeredi, linux-api,
linux-fsdevel, linux-security-module, linux-kernel
In-Reply-To: <159645997768.1779777.8286723139418624756.stgit@warthog.procyon.org.uk>
Make watch_sizeof() give a build error if the size of the struct won't fit
into the size field in the header.
Reported-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
include/linux/watch_queue.h | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/include/linux/watch_queue.h b/include/linux/watch_queue.h
index 5e08db2adc31..38e04c7a7951 100644
--- a/include/linux/watch_queue.h
+++ b/include/linux/watch_queue.h
@@ -120,7 +120,12 @@ static inline void remove_watch_list(struct watch_list *wlist, u64 id)
* watch_sizeof - Calculate the information part of the size of a watch record,
* given the structure size.
*/
-#define watch_sizeof(STRUCT) (sizeof(STRUCT) << WATCH_INFO_LENGTH__SHIFT)
+#define watch_sizeof(STRUCT) \
+ ({ \
+ size_t max = WATCH_INFO_LENGTH >> WATCH_INFO_LENGTH__SHIFT; \
+ BUILD_BUG_ON(sizeof(STRUCT) > max); \
+ sizeof(STRUCT) << WATCH_INFO_LENGTH__SHIFT; \
+ })
#endif
^ permalink raw reply related
* [PATCH 5/5] watch_queue: sample: Display mount tree change notifications [ver #2]
From: David Howells @ 2020-08-03 13:07 UTC (permalink / raw)
To: viro
Cc: dhowells, torvalds, casey, sds, nicolas.dichtel, raven, christian,
jlayton, kzak, mszeredi, linux-api, linux-fsdevel,
linux-security-module, linux-kernel
In-Reply-To: <159645997768.1779777.8286723139418624756.stgit@warthog.procyon.org.uk>
This is run like:
./watch_test
and watches "/" for changes to the mount topology and the attributes of
individual mount objects.
# mount -t tmpfs none /mnt
# mount -o remount,ro /mnt
# mount -o remount,rw /mnt
producing:
# ./watch_test
read() = 16
NOTIFY[000]: ty=000002 sy=00 i=02000010
MOUNT 00000060 change=0[new_mount] aux=416
read() = 16
NOTIFY[000]: ty=000002 sy=04 i=02010010
MOUNT 000001a0 change=4[setattr] aux=0
read() = 16
NOTIFY[000]: ty=000002 sy=04 i=02010010
MOUNT 000001a0 change=4[setattr] aux=0
Signed-off-by: David Howells <dhowells@redhat.com>
---
samples/watch_queue/watch_test.c | 41 +++++++++++++++++++++++++++++++++++++-
1 file changed, 40 insertions(+), 1 deletion(-)
diff --git a/samples/watch_queue/watch_test.c b/samples/watch_queue/watch_test.c
index 46e618a897fe..d82554436152 100644
--- a/samples/watch_queue/watch_test.c
+++ b/samples/watch_queue/watch_test.c
@@ -26,6 +26,9 @@
#ifndef __NR_keyctl
#define __NR_keyctl -1
#endif
+#ifndef __NR_watch_mount
+#define __NR_watch_mount -1
+#endif
#define BUF_SIZE 256
@@ -58,6 +61,29 @@ static void saw_key_change(struct watch_notification *n, size_t len)
k->key_id, n->subtype, key_subtypes[n->subtype], k->aux);
}
+/* Printable names for the mount notification subtypes, indexed by subtype. */
+static const char *mount_subtypes[256] = {
+	[NOTIFY_MOUNT_NEW_MOUNT]	= "new_mount",
+	[NOTIFY_MOUNT_UNMOUNT]		= "unmount",
+	[NOTIFY_MOUNT_EXPIRY]		= "expiry",
+	[NOTIFY_MOUNT_READONLY]		= "readonly",
+	[NOTIFY_MOUNT_SETATTR]		= "setattr",
+	[NOTIFY_MOUNT_MOVE_FROM]	= "move_from",
+	[NOTIFY_MOUNT_MOVE_TO]		= "move_to",
+};
+
+/*
+ * Display a mount notification record: the mount the event was accounted to,
+ * the subtype (number and name) and the auxiliary mount.  Records whose
+ * length is not that of struct mount_notification are silently ignored.
+ */
+static void saw_mount_change(struct watch_notification *n, size_t len)
+{
+	struct mount_notification *m = (struct mount_notification *)n;
+	const char *subtype;
+
+	if (len != sizeof(struct mount_notification))
+		return;
+
+	/* Subtypes missing from the table are NULL; never hand that to %s. */
+	subtype = mount_subtypes[n->subtype];
+	if (!subtype)
+		subtype = "?";
+
+	printf("MOUNT %08llx change=%u[%s] aux=%llx\n",
+	       (unsigned long long)m->triggered_on,
+	       n->subtype, subtype,
+	       (unsigned long long)m->auxiliary_mount);
+}
+
/*
* Consume and display events.
*/
@@ -134,6 +160,9 @@ static void consumer(int fd)
default:
printf("other type\n");
break;
+ case WATCH_TYPE_MOUNT_NOTIFY:
+ saw_mount_change(&n.n, len);
+ break;
}
p += len;
@@ -142,12 +171,17 @@ static void consumer(int fd)
}
static struct watch_notification_filter filter = {
- .nr_filters = 1,
+ .nr_filters = 2,
.filters = {
[0] = {
.type = WATCH_TYPE_KEY_NOTIFY,
.subtype_filter[0] = UINT_MAX,
},
+ [1] = {
+ .type = WATCH_TYPE_MOUNT_NOTIFY,
+ // Reject move-from notifications
+ .subtype_filter[0] = UINT_MAX & ~(1 << NOTIFY_MOUNT_MOVE_FROM),
+ },
},
};
@@ -181,6 +215,11 @@ int main(int argc, char **argv)
exit(1);
}
+ if (syscall(__NR_watch_mount, AT_FDCWD, "/", 0, fd, 0xde) == -1) {
+ perror("watch_mount");
+ exit(1);
+ }
+
consumer(fd);
exit(0);
}
^ permalink raw reply related
* [PATCH 4/5] watch_queue: Implement mount topology and attribute change notifications [ver #2]
From: David Howells @ 2020-08-03 13:06 UTC (permalink / raw)
To: viro
Cc: dhowells, torvalds, casey, sds, nicolas.dichtel, raven, christian,
jlayton, kzak, mszeredi, linux-api, linux-fsdevel,
linux-security-module, linux-kernel
In-Reply-To: <159645997768.1779777.8286723139418624756.stgit@warthog.procyon.org.uk>
Add a mount notification facility whereby notifications about changes in
mount topology and configuration can be received. Note that this only
covers vfsmount topology changes and not superblock events. A separate
facility will be added for that.
Firstly, a watch queue needs to be created:
pipe2(fds, O_NOTIFICATION_PIPE);
ioctl(fds[1], IOC_WATCH_QUEUE_SET_SIZE, 256);
then a notification can be set up to report notifications via that queue:
struct watch_notification_filter filter = {
.nr_filters = 1,
.filters = {
[0] = {
.type = WATCH_TYPE_MOUNT_NOTIFY,
.subtype_filter[0] = UINT_MAX,
},
},
};
ioctl(fds[1], IOC_WATCH_QUEUE_SET_FILTER, &filter);
watch_mount(AT_FDCWD, "/", 0, fds[1], 0x02);
In this case, it would let me monitor the mount topology subtree rooted at
"/" for events. Mount notifications propagate up the tree towards the
root, so a watch will catch all of the events happening in the subtree
rooted at the watch.
After setting the watch, records will be placed into the queue when, for
example, a superblock switches between read-write and read-only. Records
are of the following format:
struct mount_notification {
struct watch_notification watch;
__u64 triggered_on;
__u64 auxiliary_mount;
} *n;
Where:
n->watch.type will be WATCH_TYPE_MOUNT_NOTIFY.
n->watch.subtype will indicate the type of event, such as
NOTIFY_MOUNT_NEW_MOUNT.
n->watch.info & WATCH_INFO_LENGTH will indicate the length of the
record.
n->watch.info & WATCH_INFO_ID will be the fifth argument to
watch_mount(), shifted.
n->watch.info & NOTIFY_MOUNT_IN_SUBTREE if true indicates that the
notification was generated in the mount subtree rooted at the
watch, and not actually in the watch itself.
n->watch.info & NOTIFY_MOUNT_IS_RECURSIVE if true indicates that
the notification was generated by an event (eg. SETATTR) that was
applied recursively. The notification is only generated for the
object that initially triggered it.
n->watch.info & NOTIFY_MOUNT_IS_NOW_RO will be used for
NOTIFY_MOUNT_READONLY, being set if the mount becomes R/O, and
being cleared otherwise, and for NOTIFY_MOUNT_NEW_MOUNT, being set
if the new mount is readonly.
n->watch.info & NOTIFY_MOUNT_IS_SUBMOUNT if true indicates that the
NOTIFY_MOUNT_NEW_MOUNT notification is in response to a mount
performed by the kernel (e.g. an automount).
n->triggered_on indicates the ID of the mount to which the change
was accounted (e.g. the new parent of a new mount).
n->auxiliary_mount indicates the ID of an additional mount that was
affected (e.g. a new mount itself) or 0.
Note that it is permissible for event records to be of variable length -
or, at least, the length may be dependent on the subtype. Note also that
the queue can be shared between multiple notifications of various types.
Signed-off-by: David Howells <dhowells@redhat.com>
---
Documentation/watch_queue.rst | 12 +
arch/alpha/kernel/syscalls/syscall.tbl | 1
arch/arm/tools/syscall.tbl | 1
arch/arm64/include/asm/unistd.h | 2
arch/arm64/include/asm/unistd32.h | 2
arch/ia64/kernel/syscalls/syscall.tbl | 1
arch/m68k/kernel/syscalls/syscall.tbl | 1
arch/microblaze/kernel/syscalls/syscall.tbl | 1
arch/mips/kernel/syscalls/syscall_n32.tbl | 1
arch/mips/kernel/syscalls/syscall_n64.tbl | 1
arch/mips/kernel/syscalls/syscall_o32.tbl | 1
arch/parisc/kernel/syscalls/syscall.tbl | 1
arch/powerpc/kernel/syscalls/syscall.tbl | 1
arch/s390/kernel/syscalls/syscall.tbl | 1
arch/sh/kernel/syscalls/syscall.tbl | 1
arch/sparc/kernel/syscalls/syscall.tbl | 1
arch/x86/entry/syscalls/syscall_32.tbl | 1
arch/x86/entry/syscalls/syscall_64.tbl | 1
arch/xtensa/kernel/syscalls/syscall.tbl | 1
fs/Kconfig | 9 +
fs/Makefile | 1
fs/mount.h | 18 ++
fs/mount_notify.c | 222 +++++++++++++++++++++++++++
fs/namespace.c | 22 +++
include/linux/dcache.h | 1
include/linux/syscalls.h | 2
include/uapi/asm-generic/unistd.h | 4
include/uapi/linux/watch_queue.h | 31 ++++
kernel/sys_ni.c | 3
29 files changed, 341 insertions(+), 4 deletions(-)
create mode 100644 fs/mount_notify.c
diff --git a/Documentation/watch_queue.rst b/Documentation/watch_queue.rst
index 849fad6893ef..3e647992be31 100644
--- a/Documentation/watch_queue.rst
+++ b/Documentation/watch_queue.rst
@@ -8,6 +8,7 @@ opened by userspace. This can be used in conjunction with::
* Key/keyring notifications
+ * Mount notifications.
The notifications buffers can be enabled by:
@@ -233,6 +234,11 @@ Any particular buffer can be fed from multiple sources. Sources include:
See Documentation/security/keys/core.rst for more information.
+ * WATCH_TYPE_MOUNT_NOTIFY
+
+ Notifications of this type indicate changes to mount attributes and the
+ mount topology within the subtree at the indicated point.
+
Event Filtering
===============
@@ -292,9 +298,10 @@ A buffer is created with something like the following::
pipe2(fds, O_TMPFILE);
ioctl(fds[1], IOC_WATCH_QUEUE_SET_SIZE, 256);
-It can then be set to receive keyring change notifications::
+It can then be set to receive notifications::
keyctl(KEYCTL_WATCH_KEY, KEY_SPEC_SESSION_KEYRING, fds[1], 0x01);
+ watch_mount(AT_FDCWD, "/", 0, fds[1], 0x02);
The notifications can then be consumed by something like the following::
@@ -331,6 +338,9 @@ The notifications can then be consumed by something like the following::
case WATCH_TYPE_KEY_NOTIFY:
saw_key_change(&n.n);
break;
+ case WATCH_TYPE_MOUNT_NOTIFY:
+ saw_mount_change(&n.n);
+ break;
}
p += len;
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 5ddd128d4b7a..b6cf8403da35 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -478,3 +478,4 @@
547 common openat2 sys_openat2
548 common pidfd_getfd sys_pidfd_getfd
549 common faccessat2 sys_faccessat2
+550 common watch_mount sys_watch_mount
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index d5cae5ffede0..27cc1f53f4a0 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -452,3 +452,4 @@
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
+440 common watch_mount sys_watch_mount
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 3b859596840d..b3b2019f8d16 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -38,7 +38,7 @@
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
#define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
-#define __NR_compat_syscalls 440
+#define __NR_compat_syscalls 441
#endif
#define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 6d95d0c8bf2f..4f9cf98cdf0f 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -885,6 +885,8 @@ __SYSCALL(__NR_openat2, sys_openat2)
__SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
#define __NR_faccessat2 439
__SYSCALL(__NR_faccessat2, sys_faccessat2)
+#define __NR_watch_mount 440
+__SYSCALL(__NR_watch_mount, sys_watch_mount)
/*
* Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index 49e325b604b3..fc6d87903781 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -359,3 +359,4 @@
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
+440 common watch_mount sys_watch_mount
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index f71b1bbcc198..c671aa0e4d25 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -438,3 +438,4 @@
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
+440 common watch_mount sys_watch_mount
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index edacc4561f2b..65cc53f129ef 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -444,3 +444,4 @@
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
+440 common watch_mount sys_watch_mount
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index f777141f5256..7f034a239930 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -377,3 +377,4 @@
437 n32 openat2 sys_openat2
438 n32 pidfd_getfd sys_pidfd_getfd
439 n32 faccessat2 sys_faccessat2
+440 n32 watch_mount sys_watch_mount
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index da8c76394e17..d39b90de3642 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -353,3 +353,4 @@
437 n64 openat2 sys_openat2
438 n64 pidfd_getfd sys_pidfd_getfd
439 n64 faccessat2 sys_faccessat2
+440 n64 watch_mount sys_watch_mount
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 13280625d312..09f426cb45b1 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -426,3 +426,4 @@
437 o32 openat2 sys_openat2
438 o32 pidfd_getfd sys_pidfd_getfd
439 o32 faccessat2 sys_faccessat2
+440 o32 watch_mount sys_watch_mount
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 5a758fa6ec52..52ff3454baa1 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -436,3 +436,4 @@
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
+440 common watch_mount sys_watch_mount
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index f833a3190822..10b7ed3c7a1b 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -528,3 +528,4 @@
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
+440 common watch_mount sys_watch_mount
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index bfdcb7633957..86f317bf52df 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -441,3 +441,4 @@
437 common openat2 sys_openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2 sys_faccessat2
+440 common watch_mount sys_watch_mount sys_watch_mount
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index acc35daa1b79..0bb0f0b372c7 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -441,3 +441,4 @@
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
+440 common watch_mount sys_watch_mount
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 8004a276cb74..369ab65c1e9a 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -484,3 +484,4 @@
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
+440 common watch_mount sys_watch_mount
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index d8f8a1a69ed1..e760ba92c58d 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -443,3 +443,4 @@
437 i386 openat2 sys_openat2
438 i386 pidfd_getfd sys_pidfd_getfd
439 i386 faccessat2 sys_faccessat2
+440 i386 watch_mount sys_watch_mount
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 78847b32e137..5b58621d4f75 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -360,6 +360,7 @@
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
+440 common watch_mount sys_watch_mount
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 69d0d73876b3..5b28ee39f70f 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -409,3 +409,4 @@
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
+440 common watch_mount sys_watch_mount
diff --git a/fs/Kconfig b/fs/Kconfig
index a88aa3af73c1..1a55e56d5c54 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -117,6 +117,15 @@ source "fs/verity/Kconfig"
source "fs/notify/Kconfig"
+config MOUNT_NOTIFICATIONS
+ bool "Mount topology change notifications"
+ select WATCH_QUEUE
+ help
+ This option provides support for getting change notifications on the
+ mount tree topology. This makes use of a watch queue (a notification
+ pipe) to handle the notification buffer and provides the
+ watch_mount() system call to enable/disable watchpoints.
+
source "fs/quota/Kconfig"
source "fs/autofs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 2ce5112b02c8..dd0d87e2ef19 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -22,6 +22,7 @@ obj-y += no-block.o
endif
obj-$(CONFIG_PROC_FS) += proc_namespace.o
+obj-$(CONFIG_MOUNT_NOTIFICATIONS) += mount_notify.o
obj-y += notify/
obj-$(CONFIG_EPOLL) += eventpoll.o
diff --git a/fs/mount.h b/fs/mount.h
index c7abb7b394d8..85456a5f5a3a 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -4,6 +4,7 @@
#include <linux/poll.h>
#include <linux/ns_common.h>
#include <linux/fs_pin.h>
+#include <linux/watch_queue.h>
struct mnt_namespace {
atomic_t count;
@@ -78,6 +79,9 @@ struct mount {
int mnt_expiry_mark; /* true if marked for expiry */
struct hlist_head mnt_pins;
struct hlist_head mnt_stuck_children;
+#ifdef CONFIG_MOUNT_NOTIFICATIONS
+ struct watch_list *mnt_watchers; /* Watches on dentries within this mount */
+#endif
} __randomize_layout;
#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
@@ -159,3 +163,17 @@ static inline bool is_anon_ns(struct mnt_namespace *ns)
}
extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
+
+#ifdef CONFIG_MOUNT_NOTIFICATIONS
+extern void notify_mount(struct mount *triggered,
+ struct mount *aux,
+ enum mount_notification_subtype subtype,
+ u32 info_flags);
+#else
+static inline void notify_mount(struct mount *triggered,
+ struct mount *aux,
+ enum mount_notification_subtype subtype,
+ u32 info_flags)
+{
+}
+#endif
diff --git a/fs/mount_notify.c b/fs/mount_notify.c
new file mode 100644
index 000000000000..44f570e4cebe
--- /dev/null
+++ b/fs/mount_notify.c
@@ -0,0 +1,222 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Provide mount topology/attribute change notifications.
+ *
+ * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+#include "mount.h"
+
+/*
+ * Post mount notifications to all watches going rootwards along the tree.
+ *
+ * Must be called with the mount_lock held.
+ */
+static void post_mount_notification(struct mount *changed,
+ struct mount_notification *notify)
+{
+ const struct cred *cred = current_cred();
+ struct path cursor;
+ struct mount *mnt;
+ unsigned seq;
+
+ seq = 0;
+ rcu_read_lock();
+restart:
+ cursor.mnt = &changed->mnt;
+ cursor.dentry = changed->mnt.mnt_root;
+ mnt = real_mount(cursor.mnt);
+ notify->watch.info &= ~NOTIFY_MOUNT_IN_SUBTREE;
+
+ read_seqbegin_or_lock(&rename_lock, &seq);
+ for (;;) {
+ if (mnt->mnt_watchers &&
+ !hlist_empty(&mnt->mnt_watchers->watchers)) {
+ if (cursor.dentry->d_flags & DCACHE_MOUNT_WATCH)
+ post_watch_notification(mnt->mnt_watchers,
+ &notify->watch, cred,
+ (unsigned long)cursor.dentry);
+ } else {
+ cursor.dentry = mnt->mnt.mnt_root;
+ }
+ notify->watch.info |= NOTIFY_MOUNT_IN_SUBTREE;
+
+ if (cursor.dentry == cursor.mnt->mnt_root ||
+ IS_ROOT(cursor.dentry)) {
+ struct mount *parent = READ_ONCE(mnt->mnt_parent);
+
+ /* Escaped? */
+ if (cursor.dentry != cursor.mnt->mnt_root)
+ break;
+
+ /* Global root? */
+ if (mnt == parent)
+ break;
+
+ cursor.dentry = READ_ONCE(mnt->mnt_mountpoint);
+ mnt = parent;
+ cursor.mnt = &mnt->mnt;
+ } else {
+ cursor.dentry = cursor.dentry->d_parent;
+ }
+ }
+
+ if (need_seqretry(&rename_lock, seq)) {
+ seq = 1;
+ goto restart;
+ }
+
+ done_seqretry(&rename_lock, seq);
+ rcu_read_unlock();
+}
+
+/*
+ * Generate a mount notification.
+ */
+void notify_mount(struct mount *trigger,
+ struct mount *aux,
+ enum mount_notification_subtype subtype,
+ u32 info_flags)
+{
+
+ struct mount_notification n;
+
+ memset(&n, 0, sizeof(n));
+ n.watch.type = WATCH_TYPE_MOUNT_NOTIFY;
+ n.watch.subtype = subtype;
+ n.watch.info = info_flags | watch_sizeof(n);
+ n.triggered_on = trigger->mnt_id;
+
+ switch (subtype) {
+ case NOTIFY_MOUNT_EXPIRY:
+ case NOTIFY_MOUNT_READONLY:
+ case NOTIFY_MOUNT_SETATTR:
+ break;
+
+ case NOTIFY_MOUNT_NEW_MOUNT:
+ case NOTIFY_MOUNT_UNMOUNT:
+ case NOTIFY_MOUNT_MOVE_FROM:
+ case NOTIFY_MOUNT_MOVE_TO:
+ n.auxiliary_mount = aux->mnt_id;
+ break;
+
+ default:
+ BUG();
+ }
+
+ post_mount_notification(trigger, &n);
+}
+
+static void release_mount_watch(struct watch *watch)
+{
+ struct dentry *dentry = (struct dentry *)(unsigned long)watch->id;
+
+ dput(dentry);
+}
+
+/**
+ * sys_watch_mount - Watch for mount topology/attribute changes
+ * @dfd: Base directory to pathwalk from or fd referring to mount.
+ * @filename: Path to mount to place the watch upon
+ * @at_flags: Pathwalk control flags
+ * @watch_fd: The watch queue to send notifications to.
+ * @watch_id: The watch ID to be placed in the notification (-1 to remove watch)
+ */
+SYSCALL_DEFINE5(watch_mount,
+ int, dfd,
+ const char __user *, filename,
+ unsigned int, at_flags,
+ int, watch_fd,
+ int, watch_id)
+{
+ struct watch_queue *wqueue;
+ struct watch_list *wlist = NULL;
+ struct watch *watch = NULL;
+ struct mount *m;
+ struct path path;
+ unsigned int lookup_flags =
+ LOOKUP_DIRECTORY | LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
+ int ret;
+
+ if (watch_id < -1 || watch_id > 0xff)
+ return -EINVAL;
+ if ((at_flags & ~(AT_NO_AUTOMOUNT | AT_EMPTY_PATH)) != 0)
+ return -EINVAL;
+ if (at_flags & AT_NO_AUTOMOUNT)
+ lookup_flags &= ~LOOKUP_AUTOMOUNT;
+ if (at_flags & AT_EMPTY_PATH)
+ lookup_flags |= LOOKUP_EMPTY;
+
+ ret = user_path_at(dfd, filename, lookup_flags, &path);
+ if (ret)
+ return ret;
+
+ ret = inode_permission(path.dentry->d_inode, MAY_EXEC);
+ if (ret)
+ goto err_path;
+
+ wqueue = get_watch_queue(watch_fd);
+ if (IS_ERR(wqueue))
+ goto err_path;
+
+ m = real_mount(path.mnt);
+
+ if (watch_id >= 0) {
+ ret = -ENOMEM;
+ if (!READ_ONCE(m->mnt_watchers)) {
+ wlist = kzalloc(sizeof(*wlist), GFP_KERNEL);
+ if (!wlist)
+ goto err_wqueue;
+ init_watch_list(wlist, release_mount_watch);
+ }
+
+ watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+ if (!watch)
+ goto err_wlist;
+
+ init_watch(watch, wqueue);
+ watch->id = (unsigned long)path.dentry;
+ watch->info_id = (u32)watch_id << WATCH_INFO_ID__SHIFT;
+
+ ret = security_watch_mount(watch, &path);
+ if (ret < 0)
+ goto err_watch;
+
+ down_write(&m->mnt.mnt_sb->s_umount);
+ if (!m->mnt_watchers) {
+ m->mnt_watchers = wlist;
+ wlist = NULL;
+ }
+
+ ret = add_watch_to_object(watch, m->mnt_watchers);
+ if (ret == 0) {
+ spin_lock(&path.dentry->d_lock);
+ path.dentry->d_flags |= DCACHE_MOUNT_WATCH;
+ spin_unlock(&path.dentry->d_lock);
+ dget(path.dentry);
+ watch = NULL;
+ }
+ up_write(&m->mnt.mnt_sb->s_umount);
+ } else {
+ down_write(&m->mnt.mnt_sb->s_umount);
+ ret = remove_watch_from_object(m->mnt_watchers, wqueue,
+ (unsigned long)path.dentry,
+ false);
+ up_write(&m->mnt.mnt_sb->s_umount);
+ }
+
+err_watch:
+ kfree(watch);
+err_wlist:
+ kfree(wlist);
+err_wqueue:
+ put_watch_queue(wqueue);
+err_path:
+ path_put(&path);
+ return ret;
+}
diff --git a/fs/namespace.c b/fs/namespace.c
index 4a0f600a3328..73ff5bf0c9af 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -498,6 +498,9 @@ static int mnt_make_readonly(struct mount *mnt)
smp_wmb();
mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
unlock_mount_hash();
+ if (ret == 0)
+ notify_mount(mnt, NULL, NOTIFY_MOUNT_READONLY,
+ NOTIFY_MOUNT_IS_NOW_RO);
return ret;
}
@@ -506,6 +509,7 @@ static int __mnt_unmake_readonly(struct mount *mnt)
lock_mount_hash();
mnt->mnt.mnt_flags &= ~MNT_READONLY;
unlock_mount_hash();
+ notify_mount(mnt, NULL, NOTIFY_MOUNT_READONLY, 0);
return 0;
}
@@ -835,6 +839,7 @@ static struct mountpoint *unhash_mnt(struct mount *mnt)
*/
static void umount_mnt(struct mount *mnt)
{
+ notify_mount(mnt->mnt_parent, mnt, NOTIFY_MOUNT_UNMOUNT, 0);
put_mountpoint(unhash_mnt(mnt));
}
@@ -1175,6 +1180,11 @@ static void mntput_no_expire(struct mount *mnt)
mnt->mnt.mnt_flags |= MNT_DOOMED;
rcu_read_unlock();
+#ifdef CONFIG_MOUNT_NOTIFICATIONS
+ if (mnt->mnt_watchers)
+ remove_watch_list(mnt->mnt_watchers, mnt->mnt_id);
+#endif
+
list_del(&mnt->mnt_instance);
if (unlikely(!list_empty(&mnt->mnt_mounts))) {
@@ -1503,6 +1513,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
p = list_first_entry(&tmp_list, struct mount, mnt_list);
list_del_init(&p->mnt_expire);
list_del_init(&p->mnt_list);
+
ns = p->mnt_ns;
if (ns) {
ns->mounts--;
@@ -2137,7 +2148,10 @@ static int attach_recursive_mnt(struct mount *source_mnt,
}
if (moving) {
unhash_mnt(source_mnt);
+ notify_mount(source_mnt->mnt_parent, source_mnt,
+ NOTIFY_MOUNT_MOVE_FROM, 0);
attach_mnt(source_mnt, dest_mnt, dest_mp);
+ notify_mount(dest_mnt, source_mnt, NOTIFY_MOUNT_MOVE_TO, 0);
touch_mnt_namespace(source_mnt->mnt_ns);
} else {
if (source_mnt->mnt_ns) {
@@ -2146,6 +2160,11 @@ static int attach_recursive_mnt(struct mount *source_mnt,
}
mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
commit_tree(source_mnt);
+ notify_mount(dest_mnt, source_mnt, NOTIFY_MOUNT_NEW_MOUNT,
+ (source_mnt->mnt.mnt_sb->s_flags & SB_RDONLY ?
+ NOTIFY_MOUNT_IS_NOW_RO : 0) |
+ (source_mnt->mnt.mnt_sb->s_flags & SB_SUBMOUNT ?
+ NOTIFY_MOUNT_IS_SUBMOUNT : 0));
}
hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
@@ -2522,6 +2541,8 @@ static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
mnt->mnt.mnt_flags = mnt_flags;
touch_mnt_namespace(mnt->mnt_ns);
unlock_mount_hash();
+ notify_mount(mnt, NULL, NOTIFY_MOUNT_SETATTR,
+ (mnt_flags & SB_RDONLY ? NOTIFY_MOUNT_IS_NOW_RO : 0));
}
static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt)
@@ -2992,6 +3013,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
propagate_mount_busy(mnt, 1))
continue;
list_move(&mnt->mnt_expire, &graveyard);
+ notify_mount(mnt, NULL, NOTIFY_MOUNT_EXPIRY, 0);
}
while (!list_empty(&graveyard)) {
mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index a81f0c3cf352..a94c551c62a3 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -219,6 +219,7 @@ struct dentry_operations {
#define DCACHE_PAR_LOOKUP 0x10000000 /* being looked up (with parent locked shared) */
#define DCACHE_DENTRY_CURSOR 0x20000000
#define DCACHE_NORCU 0x40000000 /* No RCU delay for freeing */
+#define DCACHE_MOUNT_WATCH 0x80000000 /* There's a mount watch here */
extern seqlock_t rename_lock;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index b951a87da987..88d03fd627ab 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1005,6 +1005,8 @@ asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
siginfo_t __user *info,
unsigned int flags);
asmlinkage long sys_pidfd_getfd(int pidfd, int fd, unsigned int flags);
+asmlinkage long sys_watch_mount(int dfd, const char __user *path,
+ unsigned int at_flags, int watch_fd, int watch_id);
/*
* Architecture-specific system calls
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index f4a01305d9a6..fcdca8c7d30a 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -857,9 +857,11 @@ __SYSCALL(__NR_openat2, sys_openat2)
__SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
#define __NR_faccessat2 439
__SYSCALL(__NR_faccessat2, sys_faccessat2)
+#define __NR_watch_mount 440
+__SYSCALL(__NR_watch_mount, sys_watch_mount)
#undef __NR_syscalls
-#define __NR_syscalls 440
+#define __NR_syscalls 441
/*
* 32 bit systems traditionally used different
diff --git a/include/uapi/linux/watch_queue.h b/include/uapi/linux/watch_queue.h
index c3d8320b5d3a..83b11242c10e 100644
--- a/include/uapi/linux/watch_queue.h
+++ b/include/uapi/linux/watch_queue.h
@@ -14,7 +14,8 @@
enum watch_notification_type {
WATCH_TYPE_META = 0, /* Special record */
WATCH_TYPE_KEY_NOTIFY = 1, /* Key change event notification */
- WATCH_TYPE__NR = 2
+ WATCH_TYPE_MOUNT_NOTIFY = 2, /* Mount topology change notification */
+ WATCH_TYPE___NR = 3
};
enum watch_meta_notification_subtype {
@@ -101,4 +102,32 @@ struct key_notification {
__u32 aux; /* Per-type auxiliary data */
};
+/*
+ * Type of mount topology change notification.
+ */
+enum mount_notification_subtype {
+ NOTIFY_MOUNT_NEW_MOUNT = 0, /* New mount added */
+ NOTIFY_MOUNT_UNMOUNT = 1, /* Mount removed manually */
+ NOTIFY_MOUNT_EXPIRY = 2, /* Automount expired */
+ NOTIFY_MOUNT_READONLY = 3, /* Mount R/O state changed */
+ NOTIFY_MOUNT_SETATTR = 4, /* Mount attributes changed */
+ NOTIFY_MOUNT_MOVE_FROM = 5, /* Mount moved from here */
+ NOTIFY_MOUNT_MOVE_TO = 6, /* Mount moved to here (compare op_id) */
+};
+
+#define NOTIFY_MOUNT_IN_SUBTREE WATCH_INFO_FLAG_0 /* Event not actually at watched dentry */
+#define NOTIFY_MOUNT_IS_NOW_RO WATCH_INFO_FLAG_1 /* Mount changed to R/O */
+#define NOTIFY_MOUNT_IS_SUBMOUNT WATCH_INFO_FLAG_2 /* New mount is submount */
+
+/*
+ * Mount topology/configuration change notification record.
+ * - watch.type = WATCH_TYPE_MOUNT_NOTIFY
+ * - watch.subtype = enum mount_notification_subtype
+ */
+struct mount_notification {
+ struct watch_notification watch; /* WATCH_TYPE_MOUNT_NOTIFY */
+ __u64 triggered_on; /* The mount that triggered the notification */
+ __u64 auxiliary_mount; /* Added/moved/removed mount or 0 */
+};
+
#endif /* _UAPI_LINUX_WATCH_QUEUE_H */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 3b69a560a7ac..3e1c5c9d2efe 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -85,6 +85,9 @@ COND_SYSCALL(ioprio_get);
/* fs/locks.c */
COND_SYSCALL(flock);
+/* fs/mount_notify.c */
+COND_SYSCALL(watch_mount);
+
/* fs/namei.c */
/* fs/namespace.c */
^ permalink raw reply related
* [PATCH 3/5] watch_queue: Add security hooks to rule on setting mount watches [ver #2]
From: David Howells @ 2020-08-03 13:06 UTC (permalink / raw)
To: viro
Cc: James Morris, Casey Schaufler, Stephen Smalley,
linux-security-module, dhowells, torvalds, casey, sds,
nicolas.dichtel, raven, christian, jlayton, kzak, mszeredi,
linux-api, linux-fsdevel, linux-security-module, linux-kernel
In-Reply-To: <159645997768.1779777.8286723139418624756.stgit@warthog.procyon.org.uk>
Add a security hook that will allow an LSM to rule on whether or not a
watch may be set on a mount.
Signed-off-by: David Howells <dhowells@redhat.com>
cc: James Morris <jamorris@linux.microsoft.com>
cc: Casey Schaufler <casey@schaufler-ca.com>
cc: Stephen Smalley <sds@tycho.nsa.gov>
cc: linux-security-module@vger.kernel.org
---
include/linux/lsm_hook_defs.h | 3 +++
include/linux/lsm_hooks.h | 6 ++++++
include/linux/security.h | 8 ++++++++
security/security.c | 7 +++++++
4 files changed, 24 insertions(+)
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index af998f93d256..f6eaf8bd617b 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -264,6 +264,9 @@ LSM_HOOK(int, 0, post_notification, const struct cred *w_cred,
#if defined(CONFIG_SECURITY) && defined(CONFIG_KEY_NOTIFICATIONS)
LSM_HOOK(int, 0, watch_key, struct key *key)
#endif /* CONFIG_SECURITY && CONFIG_KEY_NOTIFICATIONS */
+#ifdef CONFIG_MOUNT_NOTIFICATIONS
+LSM_HOOK(int, 0, watch_mount, struct watch *watch, struct path *path)
+#endif
#ifdef CONFIG_SECURITY_NETWORK
LSM_HOOK(int, 0, unix_stream_connect, struct sock *sock, struct sock *other,
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 95b7c1d32062..56275145b91d 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1468,6 +1468,12 @@
* from a key or keyring.
* @key: The key to watch.
*
+ * @watch_mount:
+ * Check to see if a process is allowed to watch for mount topology change
+ * notifications on a mount subtree.
+ * @watch: The watch object
+ * @path: The root of the subtree to watch.
+ *
* Security hooks for using the eBPF maps and programs functionalities through
* eBPF syscalls.
*
diff --git a/include/linux/security.h b/include/linux/security.h
index 0a0a03b36a3b..318fdfe7f4d6 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1314,6 +1314,14 @@ static inline int security_watch_key(struct key *key)
return 0;
}
#endif
+#if defined(CONFIG_SECURITY) && defined(CONFIG_MOUNT_NOTIFICATIONS)
+int security_watch_mount(struct watch *watch, struct path *path);
+#else
+static inline int security_watch_mount(struct watch *watch, struct path *path)
+{
+ return 0;
+}
+#endif
#ifdef CONFIG_SECURITY_NETWORK
diff --git a/security/security.c b/security/security.c
index 70a7ad357bc6..3cdf5039f727 100644
--- a/security/security.c
+++ b/security/security.c
@@ -2067,6 +2067,13 @@ int security_watch_key(struct key *key)
}
#endif
+#ifdef CONFIG_MOUNT_NOTIFICATIONS
+int security_watch_mount(struct watch *watch, struct path *path)
+{
+ return call_int_hook(watch_mount, 0, watch, path);
+}
+#endif
+
#ifdef CONFIG_SECURITY_NETWORK
int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk)
^ permalink raw reply related
* [PATCH 1/5] watch_queue: Limit the number of watches a user can hold [ver #2]
From: David Howells @ 2020-08-03 13:06 UTC (permalink / raw)
To: viro
Cc: Linus Torvalds, Jarkko Sakkinen, dhowells, torvalds, casey, sds,
nicolas.dichtel, raven, christian, jlayton, kzak, mszeredi,
linux-api, linux-fsdevel, linux-security-module, linux-kernel
In-Reply-To: <159645997768.1779777.8286723139418624756.stgit@warthog.procyon.org.uk>
Impose a limit on the number of watches that a user can hold so that they
can't use this mechanism to fill up all the available memory.
This is done by putting a counter in user_struct that's incremented when a
watch is allocated and decreased when it is released. If the number
exceeds the RLIMIT_NOFILE limit, the watch is rejected with EAGAIN.
This can be tested by the following means:
(1) Create a watch queue and attach it to fd 5 in the program given - in
this case, bash:
keyctl watch_session /tmp/nlog /tmp/gclog 5 bash
(2) In the shell, set the maximum number of files to, say, 99:
ulimit -n 99
(3) Add 200 keyrings:
for ((i=0; i<200; i++)); do keyctl newring a$i @s || break; done
(4) Try to watch all of the keyrings:
for ((i=0; i<200; i++)); do echo $i; keyctl watch_add 5 %:a$i || break; done
This should fail when the number of watches belonging to the user hits
99.
(5) Remove all the keyrings and all of those watches should go away:
for ((i=0; i<200; i++)); do keyctl unlink %:a$i; done
(6) Kill off the watch queue by exiting the shell spawned by
watch_session.
Fixes: c73be61cede5 ("pipe: Add general notification queue support")
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
---
include/linux/sched/user.h | 3 +++
kernel/watch_queue.c | 8 ++++++++
2 files changed, 11 insertions(+)
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index 917d88edb7b9..a8ec3b6093fc 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -36,6 +36,9 @@ struct user_struct {
defined(CONFIG_NET) || defined(CONFIG_IO_URING)
atomic_long_t locked_vm;
#endif
+#ifdef CONFIG_WATCH_QUEUE
+ atomic_t nr_watches; /* The number of watches this user currently has */
+#endif
/* Miscellaneous per-user rate limit */
struct ratelimit_state ratelimit;
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index f74020f6bd9d..0ef8f65bd2d7 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -393,6 +393,7 @@ static void free_watch(struct rcu_head *rcu)
struct watch *watch = container_of(rcu, struct watch, rcu);
put_watch_queue(rcu_access_pointer(watch->queue));
+ atomic_dec(&watch->cred->user->nr_watches);
put_cred(watch->cred);
}
@@ -452,6 +453,13 @@ int add_watch_to_object(struct watch *watch, struct watch_list *wlist)
watch->cred = get_current_cred();
rcu_assign_pointer(watch->watch_list, wlist);
+ if (atomic_inc_return(&watch->cred->user->nr_watches) >
+ task_rlimit(current, RLIMIT_NOFILE)) {
+ atomic_dec(&watch->cred->user->nr_watches);
+ put_cred(watch->cred);
+ return -EAGAIN;
+ }
+
spin_lock_bh(&wqueue->lock);
kref_get(&wqueue->usage);
kref_get(&watch->usage);
^ permalink raw reply related
* [PATCH 0/5] Mount notifications [ver #2]
From: David Howells @ 2020-08-03 13:06 UTC (permalink / raw)
To: viro
Cc: linux-security-module, Stephen Smalley, Linus Torvalds,
Casey Schaufler, Miklos Szeredi, Jarkko Sakkinen, James Morris,
dhowells, torvalds, casey, sds, nicolas.dichtel, raven, christian,
jlayton, kzak, mszeredi, linux-api, linux-fsdevel,
linux-security-module, linux-kernel
Here's a set of patches to add notifications for mount topology events,
such as mounting, unmounting, mount expiry, mount reconfiguration.
An LSM hook is included to allow an LSM to rule on whether or not a mount watch
may be set on a particular path.
Why do we want mount notifications? Whilst /proc/mounts can be polled, it
only tells you that something changed in your namespace. To find out, you
have to trawl /proc/mounts or similar to work out what changed in the mount
object attributes and mount topology. I'm told that the proc file holding
the namespace_sem is a point of contention, especially as the process of
generating the text descriptions of the mounts/superblocks can be quite
involved.
The notification generated here directly indicates the mounts involved in
any particular event and gives an idea of what the change was.
This is combined with a new fsinfo() system call that allows, amongst other
things, the ability to retrieve in one go an { id, change_counter } tuple
from all the children of a specified mount, allowing buffer overruns to be
dealt with quickly.
This is of use to systemd to improve efficiency:
https://lore.kernel.org/linux-fsdevel/20200227151421.3u74ijhqt6ekbiss@ws.net.home/
And it's not just Red Hat that's potentially interested in this:
https://lore.kernel.org/linux-fsdevel/293c9bd3-f530-d75e-c353-ddeabac27cf6@6wind.com/
The kernel patches can also be found here:
https://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git/log/?h=notifications-pipe-core
===================
SIGNIFICANT CHANGES
===================
ver #2:
(*) Make the ID fields in the mount notification 64-bits. They're left
referring to the mount ID here, but switched to the mount unique ID in
the patch in fsinfo that adds that. [Requested by Miklós Szeredi]
(*) Dropped the event counters from the mount notification message.
[Requested by Miklós].
This can easily be added back later as the message length can be
increased to show it.
(*) Moved the mount event counters over to the fsinfo patchset.
David
---
David Howells (5):
watch_queue: Limit the number of watches a user can hold
watch_queue: Make watch_sizeof() check record size
watch_queue: Add security hooks to rule on setting mount watches
watch_queue: Implement mount topology and attribute change notifications
watch_queue: sample: Display mount tree change notifications
Documentation/watch_queue.rst | 12 +-
arch/alpha/kernel/syscalls/syscall.tbl | 1 +
arch/arm/tools/syscall.tbl | 1 +
arch/arm64/include/asm/unistd.h | 2 +-
arch/arm64/include/asm/unistd32.h | 2 +
arch/ia64/kernel/syscalls/syscall.tbl | 1 +
arch/m68k/kernel/syscalls/syscall.tbl | 1 +
arch/microblaze/kernel/syscalls/syscall.tbl | 1 +
arch/mips/kernel/syscalls/syscall_n32.tbl | 1 +
arch/mips/kernel/syscalls/syscall_n64.tbl | 1 +
arch/mips/kernel/syscalls/syscall_o32.tbl | 1 +
arch/parisc/kernel/syscalls/syscall.tbl | 1 +
arch/powerpc/kernel/syscalls/syscall.tbl | 1 +
arch/s390/kernel/syscalls/syscall.tbl | 1 +
arch/sh/kernel/syscalls/syscall.tbl | 1 +
arch/sparc/kernel/syscalls/syscall.tbl | 1 +
arch/x86/entry/syscalls/syscall_32.tbl | 1 +
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
arch/xtensa/kernel/syscalls/syscall.tbl | 1 +
fs/Kconfig | 9 +
fs/Makefile | 1 +
fs/mount.h | 18 ++
fs/mount_notify.c | 222 ++++++++++++++++++++
fs/namespace.c | 22 ++
include/linux/dcache.h | 1 +
include/linux/lsm_hook_defs.h | 3 +
include/linux/lsm_hooks.h | 6 +
include/linux/security.h | 8 +
include/linux/syscalls.h | 2 +
include/linux/watch_queue.h | 7 +-
include/uapi/asm-generic/unistd.h | 4 +-
include/uapi/linux/watch_queue.h | 31 ++-
kernel/sys_ni.c | 3 +
samples/watch_queue/watch_test.c | 41 +++-
security/security.c | 7 +
35 files changed, 411 insertions(+), 6 deletions(-)
create mode 100644 fs/mount_notify.c
^ permalink raw reply
* Re: [PATCH 13/17] watch_queue: Implement mount topology and attribute change notifications [ver #5]
From: David Howells @ 2020-08-03 12:31 UTC (permalink / raw)
To: Ian Kent
Cc: dhowells, Miklos Szeredi, Linus Torvalds, Al Viro,
Casey Schaufler, Stephen Smalley, Nicolas Dichtel,
Christian Brauner, andres, Jeff Layton, dray, Karel Zak, keyrings,
Linux API, linux-fsdevel, LSM, linux-kernel
In-Reply-To: <303106be4785135446e56cb606138a6e94885887.camel@themaw.net>
Ian Kent <raven@themaw.net> wrote:
> > I'm changing it so that the fields are 64-bit, but initialised with the
> > existing mount ID in the notifications set. The fsinfo set changes that
> > to a unique ID. I'm tempted to make the unique IDs start at UINT_MAX+1 to
> > disambiguate them.
>
> Mmm ... so what would I use as a mount id that's not used, like NULL
> for strings?
Zero is skipped, so you could use that.
> I'm using -1 now but changing this will mean I need something
> different.
It's 64-bits, so you're not likely to see it reach -1, even if it does start
at UINT_MAX+1.
David
^ permalink raw reply
* Re: [PATCH 13/17] watch_queue: Implement mount topology and attribute change notifications [ver #5]
From: Ian Kent @ 2020-08-03 12:01 UTC (permalink / raw)
To: David Howells, Miklos Szeredi
Cc: Linus Torvalds, Al Viro, Casey Schaufler, Stephen Smalley,
Nicolas Dichtel, Christian Brauner, andres, Jeff Layton, dray,
Karel Zak, keyrings, Linux API, linux-fsdevel, LSM, linux-kernel
In-Reply-To: <1576646.1596455376@warthog.procyon.org.uk>
On Mon, 2020-08-03 at 12:49 +0100, David Howells wrote:
> Miklos Szeredi <miklos@szeredi.hu> wrote:
>
> > OTOH mount notification is way smaller and IMO a more mature
> > interface. So just picking the unique ID patch into this set might
> > make sense.
>
> But userspace can't retrieve the unique ID without fsinfo() as things
> stand.
>
> I'm changing it so that the fields are 64-bit, but initialised with
> the
> existing mount ID in the notifications set. The fsinfo set changes
> that to a
> unique ID. I'm tempted to make the unique IDs start at UINT_MAX+1 to
> disambiguate them.
Mmm ... so what would I use as a mount id that's not used, like NULL
for strings?
I'm using -1 now but changing this will mean I need something
different.
Could we set aside a mount id that will never be used so it can be
used for this case?
Maybe mount ids should start at 1 instead of zero ...
Ian
^ permalink raw reply
* Re: [PATCH 13/17] watch_queue: Implement mount topology and attribute change notifications [ver #5]
From: David Howells @ 2020-08-03 11:49 UTC (permalink / raw)
To: Miklos Szeredi
Cc: dhowells, Ian Kent, Linus Torvalds, Al Viro, Casey Schaufler,
Stephen Smalley, Nicolas Dichtel, Christian Brauner, andres,
Jeff Layton, dray, Karel Zak, keyrings, Linux API, linux-fsdevel,
LSM, linux-kernel
In-Reply-To: <CAJfpeguO8Qwkzx9zfGVT7W+pT5p6fgj-_8oJqJbXX_KQBpLLEQ@mail.gmail.com>
Miklos Szeredi <miklos@szeredi.hu> wrote:
> OTOH mount notification is way smaller and IMO a more mature
> interface. So just picking the unique ID patch into this set might
> make sense.
But userspace can't retrieve the unique ID without fsinfo() as things stand.
I'm changing it so that the fields are 64-bit, but initialised with the
existing mount ID in the notifications set. The fsinfo set changes that to a
unique ID. I'm tempted to make the unique IDs start at UINT_MAX+1 to
disambiguate them.
David
^ permalink raw reply
* Re: [PATCH 13/17] watch_queue: Implement mount topology and attribute change notifications [ver #5]
From: Miklos Szeredi @ 2020-08-03 11:17 UTC (permalink / raw)
To: David Howells
Cc: Ian Kent, Linus Torvalds, Al Viro, Casey Schaufler,
Stephen Smalley, Nicolas Dichtel, Christian Brauner, andres,
Jeff Layton, dray, Karel Zak, keyrings, Linux API, linux-fsdevel,
LSM, linux-kernel
In-Reply-To: <1283475.1596449889@warthog.procyon.org.uk>
On Mon, Aug 3, 2020 at 12:18 PM David Howells <dhowells@redhat.com> wrote:
>
> Miklos Szeredi <miklos@szeredi.hu> wrote:
>
> > > fsinfo() then allows you to retrieve them by path or by mount ID.
> >
> > Shouldn't the notification interface provide the unique ID?
>
> Hmmm... If I'm going to do that, I have to put the fsinfo-core branch first
> otherwise you can't actually retrieve the unique ID - and thus won't be able
> to make sense of the notification record. Such a rearrangement might make
> sense anyway since Ian and Karel have been primarily concentrating on fsinfo
> and only more recently started adding notification support.
OTOH mount notification is way smaller and IMO a more mature
interface. So just picking the unique ID patch into this set might
make sense.
Thanks,
Miklos
^ permalink raw reply
* Re: [PATCH 13/17] watch_queue: Implement mount topology and attribute change notifications [ver #5]
From: David Howells @ 2020-08-03 10:18 UTC (permalink / raw)
To: Miklos Szeredi
Cc: dhowells, Ian Kent, Linus Torvalds, Al Viro, Casey Schaufler,
Stephen Smalley, Nicolas Dichtel, Christian Brauner, andres,
Jeff Layton, dray, Karel Zak, keyrings, Linux API, linux-fsdevel,
LSM, linux-kernel
In-Reply-To: <CAJfpegsT_3YqHPWCZGX7Lr+sE0NVmczWz5L6cN8CzsVz4YKLCQ@mail.gmail.com>
Miklos Szeredi <miklos@szeredi.hu> wrote:
> > fsinfo() then allows you to retrieve them by path or by mount ID.
>
> Shouldn't the notification interface provide the unique ID?
Hmmm... If I'm going to do that, I have to put the fsinfo-core branch first
otherwise you can't actually retrieve the unique ID - and thus won't be able
to make sense of the notification record. Such a rearrangement might make
sense anyway since Ian and Karel have been primarily concentrating on fsinfo
and only more recently started adding notification support.
David
^ permalink raw reply
* Re: [PATCH 13/17] watch_queue: Implement mount topology and attribute change notifications [ver #5]
From: David Howells @ 2020-08-03 10:08 UTC (permalink / raw)
To: Miklos Szeredi
Cc: dhowells, Ian Kent, Linus Torvalds, Al Viro, Casey Schaufler,
Stephen Smalley, Nicolas Dichtel, Christian Brauner, andres,
Jeff Layton, dray, Karel Zak, keyrings, Linux API, linux-fsdevel,
LSM, linux-kernel
In-Reply-To: <CAJfpegsT_3YqHPWCZGX7Lr+sE0NVmczWz5L6cN8CzsVz4YKLCQ@mail.gmail.com>
Miklos Szeredi <miklos@szeredi.hu> wrote:
> > fsinfo() then allows you to retrieve them by path or by mount ID.
>
> Shouldn't the notification interface provide the unique ID?
It could make sense - instead of the reusable mnt_id.
David
^ permalink raw reply
* Re: [PATCH 13/17] watch_queue: Implement mount topology and attribute change notifications [ver #5]
From: Miklos Szeredi @ 2020-08-03 10:02 UTC (permalink / raw)
To: David Howells
Cc: Ian Kent, Linus Torvalds, Al Viro, Casey Schaufler,
Stephen Smalley, Nicolas Dichtel, Christian Brauner, andres,
Jeff Layton, dray, Karel Zak, keyrings, Linux API, linux-fsdevel,
LSM, linux-kernel
In-Reply-To: <2023286.1595590563@warthog.procyon.org.uk>
On Fri, Jul 24, 2020 at 1:36 PM David Howells <dhowells@redhat.com> wrote:
>
> Ian Kent <raven@themaw.net> wrote:
>
> > I was wondering about id re-use.
> >
> > Assuming that ids that are returned to the idr db are re-used
> > what would the chance that a recently used id would end up
> > being used?
> >
> > Would that chance increase as ids are consumed and freed over
> > time?
>
> I've added something to deal with that in the fsinfo branch. I've given each
> mount object and superblock a supplementary 64-bit unique ID that's not likely
> to repeat before we're no longer around to have to worry about it.
>
> fsinfo() then allows you to retrieve them by path or by mount ID.
Shouldn't the notification interface provide the unique ID?
Thanks,
Miklos
>
> So, yes, mnt_id and s_dev are not unique and may be reused very quickly, but
> I'm also providing uniquifiers that you can check.
>
> David
>
^ permalink raw reply
* Re: [PATCH 13/17] watch_queue: Implement mount topology and attribute change notifications [ver #5]
From: Miklos Szeredi @ 2020-08-03 9:29 UTC (permalink / raw)
To: David Howells
Cc: Linus Torvalds, Al Viro, Casey Schaufler, Stephen Smalley,
Nicolas Dichtel, Ian Kent, Christian Brauner, andres, Jeff Layton,
dray, Karel Zak, keyrings, Linux API, linux-fsdevel, LSM,
linux-kernel
In-Reply-To: <1293241.1595501326@warthog.procyon.org.uk>
On Thu, Jul 23, 2020 at 12:48 PM David Howells <dhowells@redhat.com> wrote:
>
> > > __u32 topology_changes;
> > > __u32 attr_changes;
> > > __u32 aux_topology_changes;
> >
> > Being 32bit this introduces wraparound effects. Is that really worth it?
>
> You'd have to make 2 billion changes without whoever's monitoring getting a
> chance to update their counters. But maybe it's not worth it putting them
> here. If you'd prefer, I can make the counters all 64-bit and just retrieve
> them with fsinfo().
Yes, I think that would be preferable.
> > > n->watch.info & NOTIFY_MOUNT_IS_RECURSIVE if true indicates that
> > > the notification was generated by an event (eg. SETATTR) that was
> > > applied recursively. The notification is only generated for the
> > > object that initially triggered it.
> >
> > Unused in this patchset. Please don't add things to the API which are not
> > used.
>
> Christian Brauner has patches for mount_setattr() that will need to use this.
Fine, then that patch can add the flag.
Thanks,
Miklos
^ permalink raw reply
* RE: [PATCH v1 0/4] [RFC] Implement Trampoline File Descriptor
From: David Laight @ 2020-08-03 8:27 UTC (permalink / raw)
To: 'Mark Rutland', Madhavan T. Venkataraman
Cc: Andy Lutomirski, Kernel Hardening, Linux API, linux-arm-kernel,
Linux FS Devel, linux-integrity, LKML, LSM List, Oleg Nesterov,
X86 ML
In-Reply-To: <20200731183146.GD67415@C02TD0UTHF1T.local>
From: Mark Rutland
> Sent: 31 July 2020 19:32
...
> > It requires PC-relative data references. I have not worked on all architectures.
> > So, I need to study this. But do all ISAs support PC-relative data references?
>
> Not all do, but pretty much any recent ISA will as it's a practical
> necessity for fast position-independent code.
i386 has neither PC-relative addressing nor moves from %pc.
The cpu architecture knows that the sequence:
call 1f
1: pop %reg
is used to get the %pc value so is treated specially so that
it doesn't 'trash' the return stack.
So PIC code isn't too bad, but you have to use the correct
sequence.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
^ permalink raw reply
* RE: [PATCH v1 0/4] [RFC] Implement Trampoline File Descriptor
From: David Laight @ 2020-08-03 8:23 UTC (permalink / raw)
To: 'Madhavan T. Venkataraman', Andy Lutomirski
Cc: Kernel Hardening, Linux API, linux-arm-kernel, Linux FS Devel,
linux-integrity, LKML, LSM List, Oleg Nesterov, X86 ML
In-Reply-To: <3b916198-3a98-bd19-9a1c-f2d8d44febe8@linux.microsoft.com>
From: Madhavan T. Venkataraman
> Sent: 02 August 2020 19:55
> To: Andy Lutomirski <luto@kernel.org>
> Cc: Kernel Hardening <kernel-hardening@lists.openwall.com>; Linux API <linux-api@vger.kernel.org>;
> linux-arm-kernel <linux-arm-kernel@lists.infradead.org>; Linux FS Devel <linux-
> fsdevel@vger.kernel.org>; linux-integrity <linux-integrity@vger.kernel.org>; LKML <linux-
> kernel@vger.kernel.org>; LSM List <linux-security-module@vger.kernel.org>; Oleg Nesterov
> <oleg@redhat.com>; X86 ML <x86@kernel.org>
> Subject: Re: [PATCH v1 0/4] [RFC] Implement Trampoline File Descriptor
>
> More responses inline..
>
> On 7/28/20 12:31 PM, Andy Lutomirski wrote:
> >> On Jul 28, 2020, at 6:11 AM, madvenka@linux.microsoft.com wrote:
> >>
> >> From: "Madhavan T. Venkataraman" <madvenka@linux.microsoft.com>
> >>
> >
> > 2. Use existing kernel functionality. Raise a signal, modify the
> > state, and return from the signal. This is very flexible and may not
> > be all that much slower than trampfd.
>
> Let me understand this. You are saying that the trampoline code
> would raise a signal and, in the signal handler, set up the context
> so that when the signal handler returns, we end up in the target
> function with the context correctly set up. And, this trampoline code
> can be generated statically at build time so that there are no
> security issues using it.
>
> Have I understood your suggestion correctly?
I was thinking that you'd just let the 'not executable' page fault
signal happen (SIGSEGV?) when the jump to the on-stack trampoline
is executed.
The user signal handler can then decode the faulting instruction
and, if it matches the expected on-stack trampoline, modify the
saved registers before returning from the signal.
No kernel changes and all you need to add to the program is
an architecture-dependent signal handler.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
^ permalink raw reply
* RE: [PATCH v1 0/4] [RFC] Implement Trampoline File Descriptor
From: David Laight @ 2020-08-03 8:08 UTC (permalink / raw)
To: 'Pavel Machek'
Cc: 'Andy Lutomirski', madvenka@linux.microsoft.com,
Kernel Hardening, Linux API, linux-arm-kernel, Linux FS Devel,
linux-integrity, LKML, LSM List, Oleg Nesterov, X86 ML
In-Reply-To: <20200802115600.GB1162@bug>
From: Pavel Machek <pavel@ucw.cz>
> Sent: 02 August 2020 12:56
> Hi!
>
> > > This is quite clever, but now I'm wondering just how much kernel help
> > > is really needed. In your series, the trampoline is a non-executable
> > > page. I can think of at least two alternative approaches, and I'd
> > > like to know the pros and cons.
> > >
> > > 1. Entirely userspace: a return trampoline would be something like:
> > >
> > > 1:
> > > pushq %rax
> > > pushq %rbx
> > > pushq %rcx
> > > ...
> > > pushq %r15
> > > movq %rsp, %rdi # pointer to saved regs
> > > leaq 1b(%rip), %rsi # pointer to the trampoline itself
> > > callq trampoline_handler # see below
> >
> > For nested calls (where the trampoline needs to pass the
> > original stack frame to the nested function) I think you
> > just need a page full of:
> > mov $0, scratch_reg; jmp trampoline_handler
>
> I believe you could do with mov %pc, scratch_reg; jmp ...
>
> That has advantage of being able to share single physical
> page across multiple virtual pages...
A lot of architectures don't let you copy %pc that way so you would
have to use 'call' - but that trashes the return address cache.
It also needs the trampoline handler to know the addresses
of the trampolines.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
^ permalink raw reply
* Re: [PATCH v1 0/4] [RFC] Implement Trampoline File Descriptor
From: Madhavan T. Venkataraman @ 2020-08-02 22:58 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Kernel Hardening, Linux API, linux-arm-kernel, Linux FS Devel,
linux-integrity, LKML, LSM List, Oleg Nesterov, X86 ML
In-Reply-To: <CALCETrUJ2hBmJujyCtEqx4=pknRvjvi1-Gj9wfRcMMzejjKQsQ@mail.gmail.com>
On 8/2/20 3:00 PM, Andy Lutomirski wrote:
> On Sun, Aug 2, 2020 at 11:54 AM Madhavan T. Venkataraman
> <madvenka@linux.microsoft.com> wrote:
>> More responses inline..
>>
>> On 7/28/20 12:31 PM, Andy Lutomirski wrote:
>>>> On Jul 28, 2020, at 6:11 AM, madvenka@linux.microsoft.com wrote:
>>>>
>>>> From: "Madhavan T. Venkataraman" <madvenka@linux.microsoft.com>
>>>>
>>> 2. Use existing kernel functionality. Raise a signal, modify the
>>> state, and return from the signal. This is very flexible and may not
>>> be all that much slower than trampfd.
>> Let me understand this. You are saying that the trampoline code
>> would raise a signal and, in the signal handler, set up the context
>> so that when the signal handler returns, we end up in the target
>> function with the context correctly set up. And, this trampoline code
>> can be generated statically at build time so that there are no
>> security issues using it.
>>
>> Have I understood your suggestion correctly?
> yes.
>
>> So, my argument would be that this would always incur the overhead
>> of a trip to the kernel. I think twice the overhead if I am not mistaken.
>> With trampfd, we can have the kernel generate the code so that there
>> is no performance penalty at all.
> I feel like trampfd is too poorly defined at this point to evaluate.
> There are three general things it could do. It could generate actual
> code that varies by instance. It could have static code that does not
> vary. And it could actually involve a kernel entry.
>
> If it involves a kernel entry, then it's slow. Maybe this is okay for
> some use cases.
Yes. IMO, it is OK for most cases except where dynamic code
is used specifically for enhancing performance such as interpreters
using JIT code for frequently executed sequences and dynamic
binary translation.
> If it involves only static code, I see no good reason that it should
> be in the kernel.
It does not involve only static code. This is meant for dynamic code.
However, see below.
> If it involves dynamic code, then I think it needs a clearly defined
> use case that actually requires dynamic code.
Fair enough. I will work on this and get back to you. This might take
a little time. So, bear with me.
But I would like to make one point here. There are many applications
and libraries out there that use trampolines. They may all require the
same sort of things:
- set register context
- push stuff on stack
- jump to a target PC
But in each case, the context would be different:
- only register context
- only stack context
- both register and stack contexts
- different registers
- different values pushed on the stack
- different target PCs
If we had to do this purely at user level, each application/library would
need to roll its own solution, the solution has to be implemented for
each supported architecture and maintained. While the code is static
in each separate case, it is dynamic across all of them.
That is, the kernel will generate the code on the fly for each trampoline
instance based on its current context. It will not maintain any static
trampoline code at all.
Basically, it will supply the context to an arch-specific function and say:
- generate instructions for loading these regs with these values
- generate instructions to push these values on the stack
- generate an instruction to jump to this target PC
It will place all of those generated instructions on a page and return the address.
So, even with the static case, there is a lot of value in the kernel providing
this. Plus, it has the framework to handle dynamic code.
>> Also, signals are asynchronous. So, they are vulnerable to race conditions.
>> To prevent other signals from coming in while handling the raised signal,
>> we would need to block and unblock signals. This will cause more
>> overhead.
> If you're worried about raise() racing against signals from out of
> thread, you have bigger problems to deal with.
Agreed. The signal blocking is just one example of problems related
to signals. There are other bigger problems as well. So, let us remove
the signal-based approach from our discussions.
Thanks.
Madhavan
^ permalink raw reply
* [PATCH v20 04/12] landlock: Add ptrace restrictions
From: Mickaël Salaün @ 2020-08-02 21:58 UTC (permalink / raw)
To: linux-kernel
Cc: Mickaël Salaün, Al Viro, Andy Lutomirski, Anton Ivanov,
Arnd Bergmann, Casey Schaufler, James Morris, Jann Horn,
Jeff Dike, Jonathan Corbet, Kees Cook, Michael Kerrisk,
Richard Weinberger, Serge E . Hallyn, Shuah Khan,
Vincent Dagonneau, kernel-hardening, linux-api, linux-arch,
linux-doc, linux-fsdevel, linux-kselftest, linux-security-module,
x86
In-Reply-To: <20200802215903.91936-1-mic@digikod.net>
Using ptrace(2) and related debug features on a target process can lead
to a privilege escalation. Indeed, ptrace(2) can be used by an attacker
to impersonate another task and to remain undetected while performing
malicious activities. Thanks to ptrace_may_access(), various parts of
the kernel can check if a tracer is more privileged than a tracee.
A landlocked process has fewer privileges than a non-landlocked process
and must then be subject to additional restrictions when manipulating
processes. To be allowed to use ptrace(2) and related syscalls on a
target process, a landlocked process must have a subset of the target
process' rules (i.e. the tracee must be in a sub-domain of the tracer).
Signed-off-by: Mickaël Salaün <mic@digikod.net>
Cc: James Morris <jmorris@namei.org>
Cc: Jann Horn <jannh@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Serge E. Hallyn <serge@hallyn.com>
---
Changes since v14:
* Constify variables.
Changes since v13:
* Make the ptrace restriction mandatory, like in the v10.
* Remove the eBPF dependency.
Previous changes:
https://lore.kernel.org/lkml/20191104172146.30797-5-mic@digikod.net/
---
security/landlock/Makefile | 2 +-
security/landlock/ptrace.c | 120 +++++++++++++++++++++++++++++++++++++
security/landlock/ptrace.h | 14 +++++
security/landlock/setup.c | 2 +
4 files changed, 137 insertions(+), 1 deletion(-)
create mode 100644 security/landlock/ptrace.c
create mode 100644 security/landlock/ptrace.h
diff --git a/security/landlock/Makefile b/security/landlock/Makefile
index 041ea242e627..f1d1eb72fa76 100644
--- a/security/landlock/Makefile
+++ b/security/landlock/Makefile
@@ -1,4 +1,4 @@
obj-$(CONFIG_SECURITY_LANDLOCK) := landlock.o
landlock-y := setup.o object.o ruleset.o \
- cred.o
+ cred.o ptrace.o
diff --git a/security/landlock/ptrace.c b/security/landlock/ptrace.c
new file mode 100644
index 000000000000..61df38b13f5c
--- /dev/null
+++ b/security/landlock/ptrace.c
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Landlock LSM - Ptrace hooks
+ *
+ * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2020 ANSSI
+ */
+
+#include <asm/current.h>
+#include <linux/cred.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/lsm_hooks.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+
+#include "common.h"
+#include "cred.h"
+#include "ptrace.h"
+#include "ruleset.h"
+#include "setup.h"
+
+/**
+ * domain_scope_le - Checks domain ordering for scoped ptrace
+ *
+ * @parent: Parent domain.
+ * @child: Potential child of @parent.
+ *
+ * Checks if the @parent domain is less or equal to (i.e. an ancestor, which
+ * means a subset of) the @child domain.
+ */
+static bool domain_scope_le(const struct landlock_ruleset *const parent,
+ const struct landlock_ruleset *const child)
+{
+ const struct landlock_hierarchy *walker;
+
+ if (!parent)
+ return true;
+ if (!child)
+ return false;
+ for (walker = child->hierarchy; walker; walker = walker->parent) {
+ if (walker == parent->hierarchy)
+ /* @parent is in the scoped hierarchy of @child. */
+ return true;
+ }
+ /* There is no relationship between @parent and @child. */
+ return false;
+}
+
+static bool task_is_scoped(const struct task_struct *const parent,
+ const struct task_struct *const child)
+{
+ bool is_scoped;
+ const struct landlock_ruleset *dom_parent, *dom_child;
+
+ rcu_read_lock();
+ dom_parent = landlock_get_task_domain(parent);
+ dom_child = landlock_get_task_domain(child);
+ is_scoped = domain_scope_le(dom_parent, dom_child);
+ rcu_read_unlock();
+ return is_scoped;
+}
+
+static int task_ptrace(const struct task_struct *const parent,
+ const struct task_struct *const child)
+{
+ /* Quick return for non-landlocked tasks. */
+ if (!landlocked(parent))
+ return 0;
+ if (task_is_scoped(parent, child))
+ return 0;
+ return -EPERM;
+}
+
+/**
+ * hook_ptrace_access_check - Determines whether the current process may access
+ * another
+ *
+ * @child: Process to be accessed.
+ * @mode: Mode of attachment.
+ *
+ * If the current task has Landlock rules, then the child must have at least
+ * the same rules. Else denied.
+ *
+ * Determines whether a process may access another, returning 0 if permission
+ * granted, -errno if denied.
+ */
+static int hook_ptrace_access_check(struct task_struct *const child,
+ const unsigned int mode)
+{
+ return task_ptrace(current, child);
+}
+
+/**
+ * hook_ptrace_traceme - Determines whether another process may trace the
+ * current one
+ *
+ * @parent: Task proposed to be the tracer.
+ *
+ * If the parent has Landlock rules, then the current task must have the same
+ * or more rules. Else denied.
+ *
+ * Determines whether the nominated task is permitted to trace the current
+ * process, returning 0 if permission is granted, -errno if denied.
+ */
+static int hook_ptrace_traceme(struct task_struct *const parent)
+{
+ return task_ptrace(parent, current);
+}
+
+static struct security_hook_list landlock_hooks[] __lsm_ro_after_init = {
+ LSM_HOOK_INIT(ptrace_access_check, hook_ptrace_access_check),
+ LSM_HOOK_INIT(ptrace_traceme, hook_ptrace_traceme),
+};
+
+__init void landlock_add_hooks_ptrace(void)
+{
+ security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks),
+ LANDLOCK_NAME);
+}
diff --git a/security/landlock/ptrace.h b/security/landlock/ptrace.h
new file mode 100644
index 000000000000..6740c6a723de
--- /dev/null
+++ b/security/landlock/ptrace.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Landlock LSM - Ptrace hooks
+ *
+ * Copyright © 2017-2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2019 ANSSI
+ */
+
+#ifndef _SECURITY_LANDLOCK_PTRACE_H
+#define _SECURITY_LANDLOCK_PTRACE_H
+
+__init void landlock_add_hooks_ptrace(void);
+
+#endif /* _SECURITY_LANDLOCK_PTRACE_H */
diff --git a/security/landlock/setup.c b/security/landlock/setup.c
index 39ee1766f175..5e7540fdeefa 100644
--- a/security/landlock/setup.c
+++ b/security/landlock/setup.c
@@ -11,6 +11,7 @@
#include "common.h"
#include "cred.h"
+#include "ptrace.h"
#include "setup.h"
struct lsm_blob_sizes landlock_blob_sizes __lsm_ro_after_init = {
@@ -20,6 +21,7 @@ struct lsm_blob_sizes landlock_blob_sizes __lsm_ro_after_init = {
static int __init landlock_init(void)
{
landlock_add_hooks_cred();
+ landlock_add_hooks_ptrace();
pr_info("Up and running.\n");
return 0;
}
--
2.28.0.rc2
^ permalink raw reply related
* [PATCH v20 05/12] LSM: Infrastructure management of the superblock
From: Mickaël Salaün @ 2020-08-02 21:58 UTC (permalink / raw)
To: linux-kernel
Cc: Mickaël Salaün, Al Viro, Andy Lutomirski, Anton Ivanov,
Arnd Bergmann, Casey Schaufler, James Morris, Jann Horn,
Jeff Dike, Jonathan Corbet, Kees Cook, Michael Kerrisk,
Richard Weinberger, Serge E . Hallyn, Shuah Khan,
Vincent Dagonneau, kernel-hardening, linux-api, linux-arch,
linux-doc, linux-fsdevel, linux-kselftest, linux-security-module,
x86, John Johansen, Stephen Smalley
In-Reply-To: <20200802215903.91936-1-mic@digikod.net>
From: Casey Schaufler <casey@schaufler-ca.com>
Move management of the superblock->sb_security blob out
of the individual security modules and into the security
infrastructure. Instead of allocating the blobs from within
the modules, the modules tell the infrastructure how much
space is required, and the space is allocated there.
Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: John Johansen <john.johansen@canonical.com>
Reviewed-by: Stephen Smalley <sds@tycho.nsa.gov>
Reviewed-by: Mickaël Salaün <mic@digikod.net>
Link: https://lore.kernel.org/r/20190829232935.7099-2-casey@schaufler-ca.com
---
Changes since v17:
* Rebase the original LSM stacking patch from v5.3 to v5.7: I fixed some
diff conflicts caused by code moves and function renames in
selinux/include/objsec.h and selinux/hooks.c . I checked that it
builds but I didn't test the changes for SELinux nor SMACK.
---
include/linux/lsm_hooks.h | 1 +
security/security.c | 46 ++++++++++++++++++++----
security/selinux/hooks.c | 58 ++++++++++++-------------------
security/selinux/include/objsec.h | 6 ++++
security/selinux/ss/services.c | 3 +-
security/smack/smack.h | 6 ++++
security/smack/smack_lsm.c | 35 +++++--------------
7 files changed, 85 insertions(+), 70 deletions(-)
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 95b7c1d32062..80c629d8a2ad 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1550,6 +1550,7 @@ struct lsm_blob_sizes {
int lbs_cred;
int lbs_file;
int lbs_inode;
+ int lbs_superblock;
int lbs_ipc;
int lbs_msg_msg;
int lbs_task;
diff --git a/security/security.c b/security/security.c
index 70a7ad357bc6..d60aa835b670 100644
--- a/security/security.c
+++ b/security/security.c
@@ -201,6 +201,7 @@ static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed)
lsm_set_blob_size(&needed->lbs_inode, &blob_sizes.lbs_inode);
lsm_set_blob_size(&needed->lbs_ipc, &blob_sizes.lbs_ipc);
lsm_set_blob_size(&needed->lbs_msg_msg, &blob_sizes.lbs_msg_msg);
+ lsm_set_blob_size(&needed->lbs_superblock, &blob_sizes.lbs_superblock);
lsm_set_blob_size(&needed->lbs_task, &blob_sizes.lbs_task);
}
@@ -331,12 +332,13 @@ static void __init ordered_lsm_init(void)
for (lsm = ordered_lsms; *lsm; lsm++)
prepare_lsm(*lsm);
- init_debug("cred blob size = %d\n", blob_sizes.lbs_cred);
- init_debug("file blob size = %d\n", blob_sizes.lbs_file);
- init_debug("inode blob size = %d\n", blob_sizes.lbs_inode);
- init_debug("ipc blob size = %d\n", blob_sizes.lbs_ipc);
- init_debug("msg_msg blob size = %d\n", blob_sizes.lbs_msg_msg);
- init_debug("task blob size = %d\n", blob_sizes.lbs_task);
+ init_debug("cred blob size = %d\n", blob_sizes.lbs_cred);
+ init_debug("file blob size = %d\n", blob_sizes.lbs_file);
+ init_debug("inode blob size = %d\n", blob_sizes.lbs_inode);
+ init_debug("ipc blob size = %d\n", blob_sizes.lbs_ipc);
+ init_debug("msg_msg blob size = %d\n", blob_sizes.lbs_msg_msg);
+ init_debug("superblock blob size = %d\n", blob_sizes.lbs_superblock);
+ init_debug("task blob size = %d\n", blob_sizes.lbs_task);
/*
* Create any kmem_caches needed for blobs
@@ -668,6 +670,27 @@ static void __init lsm_early_task(struct task_struct *task)
panic("%s: Early task alloc failed.\n", __func__);
}
+/**
+ * lsm_superblock_alloc - allocate a composite superblock blob
+ * @sb: the superblock that needs a blob
+ *
+ * Allocate the superblock blob for all the modules
+ *
+ * Returns 0, or -ENOMEM if memory can't be allocated.
+ */
+static int lsm_superblock_alloc(struct super_block *sb)
+{
+ if (blob_sizes.lbs_superblock == 0) {
+ sb->s_security = NULL;
+ return 0;
+ }
+
+ sb->s_security = kzalloc(blob_sizes.lbs_superblock, GFP_KERNEL);
+ if (sb->s_security == NULL)
+ return -ENOMEM;
+ return 0;
+}
+
/*
* The default value of the LSM hook is defined in linux/lsm_hook_defs.h and
* can be accessed with:
@@ -865,12 +888,21 @@ int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *
int security_sb_alloc(struct super_block *sb)
{
- return call_int_hook(sb_alloc_security, 0, sb);
+ int rc = lsm_superblock_alloc(sb);
+
+ if (unlikely(rc))
+ return rc;
+ rc = call_int_hook(sb_alloc_security, 0, sb);
+ if (unlikely(rc))
+ security_sb_free(sb);
+ return rc;
}
void security_sb_free(struct super_block *sb)
{
call_void_hook(sb_free_security, sb);
+ kfree(sb->s_security);
+ sb->s_security = NULL;
}
void security_free_mnt_opts(void **mnt_opts)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index efa6108b1ce9..a88e21b90639 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -321,7 +321,7 @@ static void inode_free_security(struct inode *inode)
if (!isec)
return;
- sbsec = inode->i_sb->s_security;
+ sbsec = selinux_superblock(inode->i_sb);
/*
* As not all inode security structures are in a list, we check for
* empty list outside of the lock to make sure that we won't waste
@@ -339,13 +339,6 @@ static void inode_free_security(struct inode *inode)
}
}
-static void superblock_free_security(struct super_block *sb)
-{
- struct superblock_security_struct *sbsec = sb->s_security;
- sb->s_security = NULL;
- kfree(sbsec);
-}
-
struct selinux_mnt_opts {
const char *fscontext, *context, *rootcontext, *defcontext;
};
@@ -457,7 +450,7 @@ static int selinux_is_genfs_special_handling(struct super_block *sb)
static int selinux_is_sblabel_mnt(struct super_block *sb)
{
- struct superblock_security_struct *sbsec = sb->s_security;
+ struct superblock_security_struct *sbsec = selinux_superblock(sb);
/*
* IMPORTANT: Double-check logic in this function when adding a new
@@ -485,7 +478,7 @@ static int selinux_is_sblabel_mnt(struct super_block *sb)
static int sb_finish_set_opts(struct super_block *sb)
{
- struct superblock_security_struct *sbsec = sb->s_security;
+ struct superblock_security_struct *sbsec = selinux_superblock(sb);
struct dentry *root = sb->s_root;
struct inode *root_inode = d_backing_inode(root);
int rc = 0;
@@ -598,7 +591,7 @@ static int selinux_set_mnt_opts(struct super_block *sb,
unsigned long *set_kern_flags)
{
const struct cred *cred = current_cred();
- struct superblock_security_struct *sbsec = sb->s_security;
+ struct superblock_security_struct *sbsec = selinux_superblock(sb);
struct dentry *root = sbsec->sb->s_root;
struct selinux_mnt_opts *opts = mnt_opts;
struct inode_security_struct *root_isec;
@@ -835,8 +828,8 @@ static int selinux_set_mnt_opts(struct super_block *sb,
static int selinux_cmp_sb_context(const struct super_block *oldsb,
const struct super_block *newsb)
{
- struct superblock_security_struct *old = oldsb->s_security;
- struct superblock_security_struct *new = newsb->s_security;
+ struct superblock_security_struct *old = selinux_superblock(oldsb);
+ struct superblock_security_struct *new = selinux_superblock(newsb);
char oldflags = old->flags & SE_MNTMASK;
char newflags = new->flags & SE_MNTMASK;
@@ -868,8 +861,9 @@ static int selinux_sb_clone_mnt_opts(const struct super_block *oldsb,
unsigned long *set_kern_flags)
{
int rc = 0;
- const struct superblock_security_struct *oldsbsec = oldsb->s_security;
- struct superblock_security_struct *newsbsec = newsb->s_security;
+ const struct superblock_security_struct *oldsbsec =
+ selinux_superblock(oldsb);
+ struct superblock_security_struct *newsbsec = selinux_superblock(newsb);
int set_fscontext = (oldsbsec->flags & FSCONTEXT_MNT);
int set_context = (oldsbsec->flags & CONTEXT_MNT);
@@ -1048,7 +1042,7 @@ static int show_sid(struct seq_file *m, u32 sid)
static int selinux_sb_show_options(struct seq_file *m, struct super_block *sb)
{
- struct superblock_security_struct *sbsec = sb->s_security;
+ struct superblock_security_struct *sbsec = selinux_superblock(sb);
int rc;
if (!(sbsec->flags & SE_SBINITIALIZED))
@@ -1398,7 +1392,7 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
if (isec->sclass == SECCLASS_FILE)
isec->sclass = inode_mode_to_security_class(inode->i_mode);
- sbsec = inode->i_sb->s_security;
+ sbsec = selinux_superblock(inode->i_sb);
if (!(sbsec->flags & SE_SBINITIALIZED)) {
/* Defer initialization until selinux_complete_init,
after the initial policy is loaded and the security
@@ -1741,7 +1735,8 @@ selinux_determine_inode_label(const struct task_security_struct *tsec,
const struct qstr *name, u16 tclass,
u32 *_new_isid)
{
- const struct superblock_security_struct *sbsec = dir->i_sb->s_security;
+ const struct superblock_security_struct *sbsec =
+ selinux_superblock(dir->i_sb);
if ((sbsec->flags & SE_SBINITIALIZED) &&
(sbsec->behavior == SECURITY_FS_USE_MNTPOINT)) {
@@ -1772,7 +1767,7 @@ static int may_create(struct inode *dir,
int rc;
dsec = inode_security(dir);
- sbsec = dir->i_sb->s_security;
+ sbsec = selinux_superblock(dir->i_sb);
sid = tsec->sid;
@@ -1921,7 +1916,7 @@ static int superblock_has_perm(const struct cred *cred,
struct superblock_security_struct *sbsec;
u32 sid = cred_sid(cred);
- sbsec = sb->s_security;
+ sbsec = selinux_superblock(sb);
return avc_has_perm(&selinux_state,
sid, sbsec->sid, SECCLASS_FILESYSTEM, perms, ad);
}
@@ -2550,11 +2545,7 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm)
static int selinux_sb_alloc_security(struct super_block *sb)
{
- struct superblock_security_struct *sbsec;
-
- sbsec = kzalloc(sizeof(struct superblock_security_struct), GFP_KERNEL);
- if (!sbsec)
- return -ENOMEM;
+ struct superblock_security_struct *sbsec = selinux_superblock(sb);
mutex_init(&sbsec->lock);
INIT_LIST_HEAD(&sbsec->isec_head);
@@ -2563,16 +2554,10 @@ static int selinux_sb_alloc_security(struct super_block *sb)
sbsec->sid = SECINITSID_UNLABELED;
sbsec->def_sid = SECINITSID_FILE;
sbsec->mntpoint_sid = SECINITSID_UNLABELED;
- sb->s_security = sbsec;
return 0;
}
-static void selinux_sb_free_security(struct super_block *sb)
-{
- superblock_free_security(sb);
-}
-
static inline int opt_len(const char *s)
{
bool open_quote = false;
@@ -2651,7 +2636,7 @@ static int selinux_sb_eat_lsm_opts(char *options, void **mnt_opts)
static int selinux_sb_remount(struct super_block *sb, void *mnt_opts)
{
struct selinux_mnt_opts *opts = mnt_opts;
- struct superblock_security_struct *sbsec = sb->s_security;
+ struct superblock_security_struct *sbsec = selinux_superblock(sb);
u32 sid;
int rc;
@@ -2889,7 +2874,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
int rc;
char *context;
- sbsec = dir->i_sb->s_security;
+ sbsec = selinux_superblock(dir->i_sb);
newsid = tsec->create_sid;
@@ -3134,7 +3119,7 @@ static int selinux_inode_setxattr(struct dentry *dentry, const char *name,
if (!selinux_initialized(&selinux_state))
return (inode_owner_or_capable(inode) ? 0 : -EPERM);
- sbsec = inode->i_sb->s_security;
+ sbsec = selinux_superblock(inode->i_sb);
if (!(sbsec->flags & SBLABEL_MNT))
return -EOPNOTSUPP;
@@ -3368,13 +3353,14 @@ static int selinux_inode_setsecurity(struct inode *inode, const char *name,
const void *value, size_t size, int flags)
{
struct inode_security_struct *isec = inode_security_novalidate(inode);
- struct superblock_security_struct *sbsec = inode->i_sb->s_security;
+ struct superblock_security_struct *sbsec;
u32 newsid;
int rc;
if (strcmp(name, XATTR_SELINUX_SUFFIX))
return -EOPNOTSUPP;
+ sbsec = selinux_superblock(inode->i_sb);
if (!(sbsec->flags & SBLABEL_MNT))
return -EOPNOTSUPP;
@@ -6870,6 +6856,7 @@ struct lsm_blob_sizes selinux_blob_sizes __lsm_ro_after_init = {
.lbs_inode = sizeof(struct inode_security_struct),
.lbs_ipc = sizeof(struct ipc_security_struct),
.lbs_msg_msg = sizeof(struct msg_security_struct),
+ .lbs_superblock = sizeof(struct superblock_security_struct),
};
#ifdef CONFIG_PERF_EVENTS
@@ -6970,7 +6957,6 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
LSM_HOOK_INIT(bprm_committing_creds, selinux_bprm_committing_creds),
LSM_HOOK_INIT(bprm_committed_creds, selinux_bprm_committed_creds),
- LSM_HOOK_INIT(sb_free_security, selinux_sb_free_security),
LSM_HOOK_INIT(sb_free_mnt_opts, selinux_free_mnt_opts),
LSM_HOOK_INIT(sb_remount, selinux_sb_remount),
LSM_HOOK_INIT(sb_kern_mount, selinux_sb_kern_mount),
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
index 330b7b6d44e0..dcebd2b95ca7 100644
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@@ -189,4 +189,10 @@ static inline u32 current_sid(void)
return tsec->sid;
}
+static inline struct superblock_security_struct *selinux_superblock(
+ const struct super_block *superblock)
+{
+ return superblock->s_security + selinux_blob_sizes.lbs_superblock;
+}
+
#endif /* _SELINUX_OBJSEC_H_ */
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index ef0afd878bfc..f838010cb0fe 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -47,6 +47,7 @@
#include <linux/sched.h>
#include <linux/audit.h>
#include <linux/vmalloc.h>
+#include <linux/lsm_hooks.h>
#include <net/netlabel.h>
#include "flask.h"
@@ -2795,7 +2796,7 @@ int security_fs_use(struct selinux_state *state, struct super_block *sb)
struct sidtab *sidtab;
int rc = 0;
struct ocontext *c;
- struct superblock_security_struct *sbsec = sb->s_security;
+ struct superblock_security_struct *sbsec = selinux_superblock(sb);
const char *fstype = sb->s_type->name;
read_lock(&state->ss->policy_rwlock);
diff --git a/security/smack/smack.h b/security/smack/smack.h
index e9e817d09785..d33d2a7b73a3 100644
--- a/security/smack/smack.h
+++ b/security/smack/smack.h
@@ -364,6 +364,12 @@ static inline struct smack_known **smack_ipc(const struct kern_ipc_perm *ipc)
return ipc->security + smack_blob_sizes.lbs_ipc;
}
+static inline struct superblock_smack *smack_superblock(
+ const struct super_block *superblock)
+{
+ return superblock->s_security + smack_blob_sizes.lbs_superblock;
+}
+
/*
* Is the directory transmuting?
*/
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 8ffbf951b7ed..8d8f70a99374 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -535,12 +535,7 @@ static int smack_syslog(int typefrom_file)
*/
static int smack_sb_alloc_security(struct super_block *sb)
{
- struct superblock_smack *sbsp;
-
- sbsp = kzalloc(sizeof(struct superblock_smack), GFP_KERNEL);
-
- if (sbsp == NULL)
- return -ENOMEM;
+ struct superblock_smack *sbsp = smack_superblock(sb);
sbsp->smk_root = &smack_known_floor;
sbsp->smk_default = &smack_known_floor;
@@ -549,22 +544,10 @@ static int smack_sb_alloc_security(struct super_block *sb)
/*
* SMK_SB_INITIALIZED will be zero from kzalloc.
*/
- sb->s_security = sbsp;
return 0;
}
-/**
- * smack_sb_free_security - free a superblock blob
- * @sb: the superblock getting the blob
- *
- */
-static void smack_sb_free_security(struct super_block *sb)
-{
- kfree(sb->s_security);
- sb->s_security = NULL;
-}
-
struct smack_mnt_opts {
const char *fsdefault, *fsfloor, *fshat, *fsroot, *fstransmute;
};
@@ -772,7 +755,7 @@ static int smack_set_mnt_opts(struct super_block *sb,
{
struct dentry *root = sb->s_root;
struct inode *inode = d_backing_inode(root);
- struct superblock_smack *sp = sb->s_security;
+ struct superblock_smack *sp = smack_superblock(sb);
struct inode_smack *isp;
struct smack_known *skp;
struct smack_mnt_opts *opts = mnt_opts;
@@ -871,7 +854,7 @@ static int smack_set_mnt_opts(struct super_block *sb,
*/
static int smack_sb_statfs(struct dentry *dentry)
{
- struct superblock_smack *sbp = dentry->d_sb->s_security;
+ struct superblock_smack *sbp = smack_superblock(dentry->d_sb);
int rc;
struct smk_audit_info ad;
@@ -905,7 +888,7 @@ static int smack_bprm_creds_for_exec(struct linux_binprm *bprm)
if (isp->smk_task == NULL || isp->smk_task == bsp->smk_task)
return 0;
- sbsp = inode->i_sb->s_security;
+ sbsp = smack_superblock(inode->i_sb);
if ((sbsp->smk_flags & SMK_SB_UNTRUSTED) &&
isp->smk_task != sbsp->smk_root)
return 0;
@@ -1157,7 +1140,7 @@ static int smack_inode_rename(struct inode *old_inode,
*/
static int smack_inode_permission(struct inode *inode, int mask)
{
- struct superblock_smack *sbsp = inode->i_sb->s_security;
+ struct superblock_smack *sbsp = smack_superblock(inode->i_sb);
struct smk_audit_info ad;
int no_block = mask & MAY_NOT_BLOCK;
int rc;
@@ -1398,7 +1381,7 @@ static int smack_inode_removexattr(struct dentry *dentry, const char *name)
*/
if (strcmp(name, XATTR_NAME_SMACK) == 0) {
struct super_block *sbp = dentry->d_sb;
- struct superblock_smack *sbsp = sbp->s_security;
+ struct superblock_smack *sbsp = smack_superblock(sbp);
isp->smk_inode = sbsp->smk_default;
} else if (strcmp(name, XATTR_NAME_SMACKEXEC) == 0)
@@ -1668,7 +1651,7 @@ static int smack_mmap_file(struct file *file,
isp = smack_inode(file_inode(file));
if (isp->smk_mmap == NULL)
return 0;
- sbsp = file_inode(file)->i_sb->s_security;
+ sbsp = smack_superblock(file_inode(file)->i_sb);
if (sbsp->smk_flags & SMK_SB_UNTRUSTED &&
isp->smk_mmap != sbsp->smk_root)
return -EACCES;
@@ -3268,7 +3251,7 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode)
return;
sbp = inode->i_sb;
- sbsp = sbp->s_security;
+ sbsp = smack_superblock(sbp);
/*
* We're going to use the superblock default label
* if there's no label on the file.
@@ -4653,6 +4636,7 @@ struct lsm_blob_sizes smack_blob_sizes __lsm_ro_after_init = {
.lbs_inode = sizeof(struct inode_smack),
.lbs_ipc = sizeof(struct smack_known *),
.lbs_msg_msg = sizeof(struct smack_known *),
+ .lbs_superblock = sizeof(struct superblock_smack),
};
static struct security_hook_list smack_hooks[] __lsm_ro_after_init = {
@@ -4664,7 +4648,6 @@ static struct security_hook_list smack_hooks[] __lsm_ro_after_init = {
LSM_HOOK_INIT(fs_context_parse_param, smack_fs_context_parse_param),
LSM_HOOK_INIT(sb_alloc_security, smack_sb_alloc_security),
- LSM_HOOK_INIT(sb_free_security, smack_sb_free_security),
LSM_HOOK_INIT(sb_free_mnt_opts, smack_free_mnt_opts),
LSM_HOOK_INIT(sb_eat_lsm_opts, smack_sb_eat_lsm_opts),
LSM_HOOK_INIT(sb_statfs, smack_sb_statfs),
--
2.28.0.rc2
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox