* [PATCH v7 03/16] fscrypt: use FSCRYPT_* definitions, not FS_*
From: Eric Biggers @ 2019-07-26 22:41 UTC (permalink / raw)
To: linux-fscrypt
Cc: Satya Tangirala, linux-api, linux-f2fs-devel, keyrings, linux-mtd,
linux-crypto, linux-fsdevel, linux-ext4, Paul Crowley
In-Reply-To: <20190726224141.14044-1-ebiggers@kernel.org>
From: Eric Biggers <ebiggers@google.com>
Update fs/crypto/ to use the new names for the UAPI constants rather
than the old names, then make the old definitions conditional on
!__KERNEL__.
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
fs/crypto/crypto.c | 2 +-
fs/crypto/fname.c | 2 +-
fs/crypto/fscrypt_private.h | 16 +++++------
fs/crypto/keyinfo.c | 53 ++++++++++++++++++------------------
fs/crypto/policy.c | 14 +++++-----
include/uapi/linux/fscrypt.h | 2 ++
6 files changed, 46 insertions(+), 43 deletions(-)
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 3e4624cfe4b54..7502c1f0ede9e 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -141,7 +141,7 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
memset(iv, 0, ci->ci_mode->ivsize);
iv->lblk_num = cpu_to_le64(lblk_num);
- if (ci->ci_flags & FS_POLICY_FLAG_DIRECT_KEY)
+ if (ci->ci_flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY)
memcpy(iv->nonce, ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE);
if (ci->ci_essiv_tfm != NULL)
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 5cab3bb2d1fc0..f4977d44d69b8 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -182,7 +182,7 @@ bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
u32 max_len, u32 *encrypted_len_ret)
{
int padding = 4 << (inode->i_crypt_info->ci_flags &
- FS_POLICY_FLAGS_PAD_MASK);
+ FSCRYPT_POLICY_FLAGS_PAD_MASK);
u32 encrypted_len;
if (orig_len > max_len)
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 4d715708c6e1f..fae411b2f78dc 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -34,7 +34,7 @@ struct fscrypt_context {
u8 contents_encryption_mode;
u8 filenames_encryption_mode;
u8 flags;
- u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+ u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
} __packed;
@@ -84,7 +84,7 @@ struct fscrypt_info {
u8 ci_data_mode;
u8 ci_filename_mode;
u8 ci_flags;
- u8 ci_master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+ u8 ci_master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
u8 ci_nonce[FS_KEY_DERIVATION_NONCE_SIZE];
};
@@ -98,16 +98,16 @@ typedef enum {
static inline bool fscrypt_valid_enc_modes(u32 contents_mode,
u32 filenames_mode)
{
- if (contents_mode == FS_ENCRYPTION_MODE_AES_128_CBC &&
- filenames_mode == FS_ENCRYPTION_MODE_AES_128_CTS)
+ if (contents_mode == FSCRYPT_MODE_AES_128_CBC &&
+ filenames_mode == FSCRYPT_MODE_AES_128_CTS)
return true;
- if (contents_mode == FS_ENCRYPTION_MODE_AES_256_XTS &&
- filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS)
+ if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
+ filenames_mode == FSCRYPT_MODE_AES_256_CTS)
return true;
- if (contents_mode == FS_ENCRYPTION_MODE_ADIANTUM &&
- filenames_mode == FS_ENCRYPTION_MODE_ADIANTUM)
+ if (contents_mode == FSCRYPT_MODE_ADIANTUM &&
+ filenames_mode == FSCRYPT_MODE_ADIANTUM)
return true;
return false;
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 2129943002335..22345ddede119 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -20,7 +20,7 @@
static struct crypto_shash *essiv_hash_tfm;
-/* Table of keys referenced by FS_POLICY_FLAG_DIRECT_KEY policies */
+/* Table of keys referenced by DIRECT_KEY policies */
static DEFINE_HASHTABLE(fscrypt_master_keys, 6); /* 6 bits = 64 buckets */
static DEFINE_SPINLOCK(fscrypt_master_keys_lock);
@@ -77,7 +77,7 @@ static int derive_key_aes(const u8 *master_key,
*/
static struct key *
find_and_lock_process_key(const char *prefix,
- const u8 descriptor[FS_KEY_DESCRIPTOR_SIZE],
+ const u8 descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE],
unsigned int min_keysize,
const struct fscrypt_key **payload_ret)
{
@@ -87,7 +87,7 @@ find_and_lock_process_key(const char *prefix,
const struct fscrypt_key *payload;
description = kasprintf(GFP_NOFS, "%s%*phN", prefix,
- FS_KEY_DESCRIPTOR_SIZE, descriptor);
+ FSCRYPT_KEY_DESCRIPTOR_SIZE, descriptor);
if (!description)
return ERR_PTR(-ENOMEM);
@@ -105,7 +105,7 @@ find_and_lock_process_key(const char *prefix,
payload = (const struct fscrypt_key *)ukp->data;
if (ukp->datalen != sizeof(struct fscrypt_key) ||
- payload->size < 1 || payload->size > FS_MAX_KEY_SIZE) {
+ payload->size < 1 || payload->size > FSCRYPT_MAX_KEY_SIZE) {
fscrypt_warn(NULL,
"key with description '%s' has invalid payload",
key->description);
@@ -129,32 +129,32 @@ find_and_lock_process_key(const char *prefix,
}
static struct fscrypt_mode available_modes[] = {
- [FS_ENCRYPTION_MODE_AES_256_XTS] = {
+ [FSCRYPT_MODE_AES_256_XTS] = {
.friendly_name = "AES-256-XTS",
.cipher_str = "xts(aes)",
.keysize = 64,
.ivsize = 16,
},
- [FS_ENCRYPTION_MODE_AES_256_CTS] = {
+ [FSCRYPT_MODE_AES_256_CTS] = {
.friendly_name = "AES-256-CTS-CBC",
.cipher_str = "cts(cbc(aes))",
.keysize = 32,
.ivsize = 16,
},
- [FS_ENCRYPTION_MODE_AES_128_CBC] = {
+ [FSCRYPT_MODE_AES_128_CBC] = {
.friendly_name = "AES-128-CBC",
.cipher_str = "cbc(aes)",
.keysize = 16,
.ivsize = 16,
.needs_essiv = true,
},
- [FS_ENCRYPTION_MODE_AES_128_CTS] = {
+ [FSCRYPT_MODE_AES_128_CTS] = {
.friendly_name = "AES-128-CTS-CBC",
.cipher_str = "cts(cbc(aes))",
.keysize = 16,
.ivsize = 16,
},
- [FS_ENCRYPTION_MODE_ADIANTUM] = {
+ [FSCRYPT_MODE_ADIANTUM] = {
.friendly_name = "Adiantum",
.cipher_str = "adiantum(xchacha12,aes)",
.keysize = 32,
@@ -192,7 +192,7 @@ static int find_and_derive_key(const struct inode *inode,
const struct fscrypt_key *payload;
int err;
- key = find_and_lock_process_key(FS_KEY_DESC_PREFIX,
+ key = find_and_lock_process_key(FSCRYPT_KEY_DESC_PREFIX,
ctx->master_key_descriptor,
mode->keysize, &payload);
if (key == ERR_PTR(-ENOKEY) && inode->i_sb->s_cop->key_prefix) {
@@ -203,7 +203,7 @@ static int find_and_derive_key(const struct inode *inode,
if (IS_ERR(key))
return PTR_ERR(key);
- if (ctx->flags & FS_POLICY_FLAG_DIRECT_KEY) {
+ if (ctx->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
if (mode->ivsize < offsetofend(union fscrypt_iv, nonce)) {
fscrypt_warn(inode,
"Direct key mode not allowed with %s",
@@ -272,14 +272,14 @@ allocate_skcipher_for_mode(struct fscrypt_mode *mode, const u8 *raw_key,
return ERR_PTR(err);
}
-/* Master key referenced by FS_POLICY_FLAG_DIRECT_KEY policy */
+/* Master key referenced by DIRECT_KEY policy */
struct fscrypt_master_key {
struct hlist_node mk_node;
refcount_t mk_refcount;
const struct fscrypt_mode *mk_mode;
struct crypto_skcipher *mk_ctfm;
- u8 mk_descriptor[FS_KEY_DESCRIPTOR_SIZE];
- u8 mk_raw[FS_MAX_KEY_SIZE];
+ u8 mk_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
+ u8 mk_raw[FSCRYPT_MAX_KEY_SIZE];
};
static void free_master_key(struct fscrypt_master_key *mk)
@@ -320,13 +320,13 @@ find_or_insert_master_key(struct fscrypt_master_key *to_insert,
* raw key, and use crypto_memneq() when comparing raw keys.
*/
- BUILD_BUG_ON(sizeof(hash_key) > FS_KEY_DESCRIPTOR_SIZE);
+ BUILD_BUG_ON(sizeof(hash_key) > FSCRYPT_KEY_DESCRIPTOR_SIZE);
memcpy(&hash_key, ci->ci_master_key_descriptor, sizeof(hash_key));
spin_lock(&fscrypt_master_keys_lock);
hash_for_each_possible(fscrypt_master_keys, mk, mk_node, hash_key) {
if (memcmp(ci->ci_master_key_descriptor, mk->mk_descriptor,
- FS_KEY_DESCRIPTOR_SIZE) != 0)
+ FSCRYPT_KEY_DESCRIPTOR_SIZE) != 0)
continue;
if (mode != mk->mk_mode)
continue;
@@ -370,7 +370,7 @@ fscrypt_get_master_key(const struct fscrypt_info *ci, struct fscrypt_mode *mode,
goto err_free_mk;
}
memcpy(mk->mk_descriptor, ci->ci_master_key_descriptor,
- FS_KEY_DESCRIPTOR_SIZE);
+ FSCRYPT_KEY_DESCRIPTOR_SIZE);
memcpy(mk->mk_raw, raw_key, mode->keysize);
return find_or_insert_master_key(mk, raw_key, mode, ci);
@@ -448,8 +448,8 @@ static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key,
/*
* Given the encryption mode and key (normally the derived key, but for
- * FS_POLICY_FLAG_DIRECT_KEY mode it's the master key), set up the inode's
- * symmetric cipher transform object(s).
+ * DIRECT_KEY mode it's the master key), set up the inode's symmetric cipher
+ * transform object(s).
*/
static int setup_crypto_transform(struct fscrypt_info *ci,
struct fscrypt_mode *mode,
@@ -459,7 +459,7 @@ static int setup_crypto_transform(struct fscrypt_info *ci,
struct crypto_skcipher *ctfm;
int err;
- if (ci->ci_flags & FS_POLICY_FLAG_DIRECT_KEY) {
+ if (ci->ci_flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
mk = fscrypt_get_master_key(ci, mode, raw_key, inode);
if (IS_ERR(mk))
return PTR_ERR(mk);
@@ -476,7 +476,7 @@ static int setup_crypto_transform(struct fscrypt_info *ci,
if (mode->needs_essiv) {
/* ESSIV implies 16-byte IVs which implies !DIRECT_KEY */
WARN_ON(mode->ivsize != AES_BLOCK_SIZE);
- WARN_ON(ci->ci_flags & FS_POLICY_FLAG_DIRECT_KEY);
+ WARN_ON(ci->ci_flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY);
err = init_essiv_generator(ci, raw_key, mode->keysize);
if (err) {
@@ -530,9 +530,10 @@ int fscrypt_get_encryption_info(struct inode *inode)
/* Fake up a context for an unencrypted directory */
memset(&ctx, 0, sizeof(ctx));
ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
- ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS;
- ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS;
- memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE);
+ ctx.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
+ ctx.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
+ memset(ctx.master_key_descriptor, 0x42,
+ FSCRYPT_KEY_DESCRIPTOR_SIZE);
} else if (res != sizeof(ctx)) {
fscrypt_warn(inode,
"Unknown encryption context size (%d bytes)", res);
@@ -545,7 +546,7 @@ int fscrypt_get_encryption_info(struct inode *inode)
return -EINVAL;
}
- if (ctx.flags & ~FS_POLICY_FLAGS_VALID) {
+ if (ctx.flags & ~FSCRYPT_POLICY_FLAGS_VALID) {
fscrypt_warn(inode, "Unknown encryption context flags (0x%02x)",
ctx.flags);
return -EINVAL;
@@ -559,7 +560,7 @@ int fscrypt_get_encryption_info(struct inode *inode)
crypt_info->ci_data_mode = ctx.contents_encryption_mode;
crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
memcpy(crypt_info->ci_master_key_descriptor, ctx.master_key_descriptor,
- FS_KEY_DESCRIPTOR_SIZE);
+ FSCRYPT_KEY_DESCRIPTOR_SIZE);
memcpy(crypt_info->ci_nonce, ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
mode = select_encryption_mode(crypt_info, inode);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 4941fe8471cef..da7ae9c8b4ad0 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -22,7 +22,7 @@ static bool is_encryption_context_consistent_with_policy(
const struct fscrypt_policy *policy)
{
return memcmp(ctx->master_key_descriptor, policy->master_key_descriptor,
- FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+ FSCRYPT_KEY_DESCRIPTOR_SIZE) == 0 &&
(ctx->flags == policy->flags) &&
(ctx->contents_encryption_mode ==
policy->contents_encryption_mode) &&
@@ -37,13 +37,13 @@ static int create_encryption_context_from_policy(struct inode *inode,
ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
- FS_KEY_DESCRIPTOR_SIZE);
+ FSCRYPT_KEY_DESCRIPTOR_SIZE);
if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode,
policy->filenames_encryption_mode))
return -EINVAL;
- if (policy->flags & ~FS_POLICY_FLAGS_VALID)
+ if (policy->flags & ~FSCRYPT_POLICY_FLAGS_VALID)
return -EINVAL;
ctx.contents_encryption_mode = policy->contents_encryption_mode;
@@ -128,7 +128,7 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
policy.filenames_encryption_mode = ctx.filenames_encryption_mode;
policy.flags = ctx.flags;
memcpy(policy.master_key_descriptor, ctx.master_key_descriptor,
- FS_KEY_DESCRIPTOR_SIZE);
+ FSCRYPT_KEY_DESCRIPTOR_SIZE);
if (copy_to_user(arg, &policy, sizeof(policy)))
return -EFAULT;
@@ -202,7 +202,7 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
if (parent_ci && child_ci) {
return memcmp(parent_ci->ci_master_key_descriptor,
child_ci->ci_master_key_descriptor,
- FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+ FSCRYPT_KEY_DESCRIPTOR_SIZE) == 0 &&
(parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
(parent_ci->ci_filename_mode ==
child_ci->ci_filename_mode) &&
@@ -219,7 +219,7 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
return memcmp(parent_ctx.master_key_descriptor,
child_ctx.master_key_descriptor,
- FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+ FSCRYPT_KEY_DESCRIPTOR_SIZE) == 0 &&
(parent_ctx.contents_encryption_mode ==
child_ctx.contents_encryption_mode) &&
(parent_ctx.filenames_encryption_mode ==
@@ -257,7 +257,7 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child,
ctx.filenames_encryption_mode = ci->ci_filename_mode;
ctx.flags = ci->ci_flags;
memcpy(ctx.master_key_descriptor, ci->ci_master_key_descriptor,
- FS_KEY_DESCRIPTOR_SIZE);
+ FSCRYPT_KEY_DESCRIPTOR_SIZE);
get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE);
res = parent->i_sb->s_cop->set_context(child, &ctx,
diff --git a/include/uapi/linux/fscrypt.h b/include/uapi/linux/fscrypt.h
index 674b0452ef575..29a945d165def 100644
--- a/include/uapi/linux/fscrypt.h
+++ b/include/uapi/linux/fscrypt.h
@@ -55,6 +55,7 @@ struct fscrypt_key {
/**********************************************************************/
/* old names; don't add anything new here! */
+#ifndef __KERNEL__
#define FS_KEY_DESCRIPTOR_SIZE FSCRYPT_KEY_DESCRIPTOR_SIZE
#define FS_POLICY_FLAGS_PAD_4 FSCRYPT_POLICY_FLAGS_PAD_4
#define FS_POLICY_FLAGS_PAD_8 FSCRYPT_POLICY_FLAGS_PAD_8
@@ -76,5 +77,6 @@ struct fscrypt_key {
#define FS_KEY_DESC_PREFIX FSCRYPT_KEY_DESC_PREFIX
#define FS_KEY_DESC_PREFIX_SIZE FSCRYPT_KEY_DESC_PREFIX_SIZE
#define FS_MAX_KEY_SIZE FSCRYPT_MAX_KEY_SIZE
+#endif /* !__KERNEL__ */
#endif /* _UAPI_LINUX_FSCRYPT_H */
--
2.22.0
^ permalink raw reply related
* [PATCH v7 02/16] fscrypt: use FSCRYPT_ prefix for uapi constants
From: Eric Biggers @ 2019-07-26 22:41 UTC (permalink / raw)
To: linux-fscrypt
Cc: Satya Tangirala, linux-api, linux-f2fs-devel, keyrings, linux-mtd,
linux-crypto, linux-fsdevel, linux-ext4, Paul Crowley
In-Reply-To: <20190726224141.14044-1-ebiggers@kernel.org>
From: Eric Biggers <ebiggers@google.com>
Prefix all filesystem encryption UAPI constants except the ioctl numbers
with "FSCRYPT_" rather than with "FS_". This namespaces the constants
more appropriately and makes it clear that they are related specifically
to the filesystem encryption feature, and to the 'fscrypt_*' structures.
With some of the old names like "FS_POLICY_FLAGS_VALID", it was not
immediately clear that the constant had anything to do with encryption.
This is also useful because we'll be adding more encryption-related
constants, e.g. for the policy version, and we'd otherwise have to
choose whether to use unclear names like FS_POLICY_V1 or inconsistent
names like FS_ENCRYPTION_POLICY_V1.
For source compatibility with existing userspace programs, keep the old
names defined as aliases to the new names.
Finally, as long as new names are being defined anyway, I skipped
defining new names for the fscrypt mode numbers that aren't actually
used: INVALID (0), AES_256_GCM (2), AES_256_CBC (3), SPECK128_256_XTS
(7), and SPECK128_256_CTS (8).
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
Documentation/filesystems/fscrypt.rst | 36 +++++++--------
include/uapi/linux/fscrypt.h | 65 +++++++++++++++++----------
2 files changed, 60 insertions(+), 41 deletions(-)
diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst
index 82efa41b0e6c0..2c4b6e56b81c5 100644
--- a/Documentation/filesystems/fscrypt.rst
+++ b/Documentation/filesystems/fscrypt.rst
@@ -225,9 +225,10 @@ a little endian number, except that:
is encrypted with AES-256 where the AES-256 key is the SHA-256 hash
of the file's data encryption key.
-- In the "direct key" configuration (FS_POLICY_FLAG_DIRECT_KEY set in
- the fscrypt_policy), the file's nonce is also appended to the IV.
- Currently this is only allowed with the Adiantum encryption mode.
+- In the "direct key" configuration (FSCRYPT_POLICY_FLAG_DIRECT_KEY
+ set in the fscrypt_policy), the file's nonce is also appended to the
+ IV. Currently this is only allowed with the Adiantum encryption
+ mode.
Filenames encryption
--------------------
@@ -274,14 +275,14 @@ empty directory or verifies that a directory or regular file already
has the specified encryption policy. It takes in a pointer to a
:c:type:`struct fscrypt_policy`, defined as follows::
- #define FS_KEY_DESCRIPTOR_SIZE 8
+ #define FSCRYPT_KEY_DESCRIPTOR_SIZE 8
struct fscrypt_policy {
__u8 version;
__u8 contents_encryption_mode;
__u8 filenames_encryption_mode;
__u8 flags;
- __u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+ __u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
};
This structure must be initialized as follows:
@@ -290,18 +291,17 @@ This structure must be initialized as follows:
- ``contents_encryption_mode`` and ``filenames_encryption_mode`` must
be set to constants from ``<linux/fs.h>`` which identify the
- encryption modes to use. If unsure, use
- FS_ENCRYPTION_MODE_AES_256_XTS (1) for ``contents_encryption_mode``
- and FS_ENCRYPTION_MODE_AES_256_CTS (4) for
- ``filenames_encryption_mode``.
+ encryption modes to use. If unsure, use FSCRYPT_MODE_AES_256_XTS
+ (1) for ``contents_encryption_mode`` and FSCRYPT_MODE_AES_256_CTS
+ (4) for ``filenames_encryption_mode``.
- ``flags`` must contain a value from ``<linux/fs.h>`` which
identifies the amount of NUL-padding to use when encrypting
- filenames. If unsure, use FS_POLICY_FLAGS_PAD_32 (0x3).
- In addition, if the chosen encryption modes are both
- FS_ENCRYPTION_MODE_ADIANTUM, this can contain
- FS_POLICY_FLAG_DIRECT_KEY to specify that the master key should be
- used directly, without key derivation.
+ filenames. If unsure, use FSCRYPT_POLICY_FLAGS_PAD_32 (0x3). In
+ addition, if the chosen encryption modes are both
+ FSCRYPT_MODE_ADIANTUM, this can contain
+ FSCRYPT_POLICY_FLAG_DIRECT_KEY to specify that the master key should
+ be used directly, without key derivation.
- ``master_key_descriptor`` specifies how to find the master key in
the keyring; see `Adding keys`_. It is up to userspace to choose a
@@ -401,11 +401,11 @@ followed by the 16-character lower case hex representation of the
``master_key_descriptor`` that was set in the encryption policy. The
key payload must conform to the following structure::
- #define FS_MAX_KEY_SIZE 64
+ #define FSCRYPT_MAX_KEY_SIZE 64
struct fscrypt_key {
u32 mode;
- u8 raw[FS_MAX_KEY_SIZE];
+ u8 raw[FSCRYPT_MAX_KEY_SIZE];
u32 size;
};
@@ -574,7 +574,7 @@ much confusion if an encryption policy were to be added to or removed
from anything other than an empty directory.) The struct is defined
as follows::
- #define FS_KEY_DESCRIPTOR_SIZE 8
+ #define FSCRYPT_KEY_DESCRIPTOR_SIZE 8
#define FS_KEY_DERIVATION_NONCE_SIZE 16
struct fscrypt_context {
@@ -582,7 +582,7 @@ as follows::
u8 contents_encryption_mode;
u8 filenames_encryption_mode;
u8 flags;
- u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+ u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
};
diff --git a/include/uapi/linux/fscrypt.h b/include/uapi/linux/fscrypt.h
index 26f6d2c19afd3..674b0452ef575 100644
--- a/include/uapi/linux/fscrypt.h
+++ b/include/uapi/linux/fscrypt.h
@@ -10,35 +10,30 @@
#include <linux/types.h>
-#define FS_KEY_DESCRIPTOR_SIZE 8
+#define FSCRYPT_KEY_DESCRIPTOR_SIZE 8
/* Encryption policy flags */
-#define FS_POLICY_FLAGS_PAD_4 0x00
-#define FS_POLICY_FLAGS_PAD_8 0x01
-#define FS_POLICY_FLAGS_PAD_16 0x02
-#define FS_POLICY_FLAGS_PAD_32 0x03
-#define FS_POLICY_FLAGS_PAD_MASK 0x03
-#define FS_POLICY_FLAG_DIRECT_KEY 0x04 /* use master key directly */
-#define FS_POLICY_FLAGS_VALID 0x07
+#define FSCRYPT_POLICY_FLAGS_PAD_4 0x00
+#define FSCRYPT_POLICY_FLAGS_PAD_8 0x01
+#define FSCRYPT_POLICY_FLAGS_PAD_16 0x02
+#define FSCRYPT_POLICY_FLAGS_PAD_32 0x03
+#define FSCRYPT_POLICY_FLAGS_PAD_MASK 0x03
+#define FSCRYPT_POLICY_FLAG_DIRECT_KEY 0x04 /* use master key directly */
+#define FSCRYPT_POLICY_FLAGS_VALID 0x07
/* Encryption algorithms */
-#define FS_ENCRYPTION_MODE_INVALID 0
-#define FS_ENCRYPTION_MODE_AES_256_XTS 1
-#define FS_ENCRYPTION_MODE_AES_256_GCM 2
-#define FS_ENCRYPTION_MODE_AES_256_CBC 3
-#define FS_ENCRYPTION_MODE_AES_256_CTS 4
-#define FS_ENCRYPTION_MODE_AES_128_CBC 5
-#define FS_ENCRYPTION_MODE_AES_128_CTS 6
-#define FS_ENCRYPTION_MODE_SPECK128_256_XTS 7 /* Removed, do not use. */
-#define FS_ENCRYPTION_MODE_SPECK128_256_CTS 8 /* Removed, do not use. */
-#define FS_ENCRYPTION_MODE_ADIANTUM 9
+#define FSCRYPT_MODE_AES_256_XTS 1
+#define FSCRYPT_MODE_AES_256_CTS 4
+#define FSCRYPT_MODE_AES_128_CBC 5
+#define FSCRYPT_MODE_AES_128_CTS 6
+#define FSCRYPT_MODE_ADIANTUM 9
struct fscrypt_policy {
__u8 version;
__u8 contents_encryption_mode;
__u8 filenames_encryption_mode;
__u8 flags;
- __u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+ __u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
};
#define FS_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct fscrypt_policy)
@@ -46,16 +41,40 @@ struct fscrypt_policy {
#define FS_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct fscrypt_policy)
/* Parameters for passing an encryption key into the kernel keyring */
-#define FS_KEY_DESC_PREFIX "fscrypt:"
-#define FS_KEY_DESC_PREFIX_SIZE 8
+#define FSCRYPT_KEY_DESC_PREFIX "fscrypt:"
+#define FSCRYPT_KEY_DESC_PREFIX_SIZE 8
/* Structure that userspace passes to the kernel keyring */
-#define FS_MAX_KEY_SIZE 64
+#define FSCRYPT_MAX_KEY_SIZE 64
struct fscrypt_key {
__u32 mode;
- __u8 raw[FS_MAX_KEY_SIZE];
+ __u8 raw[FSCRYPT_MAX_KEY_SIZE];
__u32 size;
};
+/**********************************************************************/
+
+/* old names; don't add anything new here! */
+#define FS_KEY_DESCRIPTOR_SIZE FSCRYPT_KEY_DESCRIPTOR_SIZE
+#define FS_POLICY_FLAGS_PAD_4 FSCRYPT_POLICY_FLAGS_PAD_4
+#define FS_POLICY_FLAGS_PAD_8 FSCRYPT_POLICY_FLAGS_PAD_8
+#define FS_POLICY_FLAGS_PAD_16 FSCRYPT_POLICY_FLAGS_PAD_16
+#define FS_POLICY_FLAGS_PAD_32 FSCRYPT_POLICY_FLAGS_PAD_32
+#define FS_POLICY_FLAGS_PAD_MASK FSCRYPT_POLICY_FLAGS_PAD_MASK
+#define FS_POLICY_FLAG_DIRECT_KEY FSCRYPT_POLICY_FLAG_DIRECT_KEY
+#define FS_POLICY_FLAGS_VALID FSCRYPT_POLICY_FLAGS_VALID
+#define FS_ENCRYPTION_MODE_INVALID 0 /* never used */
+#define FS_ENCRYPTION_MODE_AES_256_XTS FSCRYPT_MODE_AES_256_XTS
+#define FS_ENCRYPTION_MODE_AES_256_GCM 2 /* never used */
+#define FS_ENCRYPTION_MODE_AES_256_CBC 3 /* never used */
+#define FS_ENCRYPTION_MODE_AES_256_CTS FSCRYPT_MODE_AES_256_CTS
+#define FS_ENCRYPTION_MODE_AES_128_CBC FSCRYPT_MODE_AES_128_CBC
+#define FS_ENCRYPTION_MODE_AES_128_CTS FSCRYPT_MODE_AES_128_CTS
+#define FS_ENCRYPTION_MODE_SPECK128_256_XTS 7 /* removed */
+#define FS_ENCRYPTION_MODE_SPECK128_256_CTS 8 /* removed */
+#define FS_ENCRYPTION_MODE_ADIANTUM FSCRYPT_MODE_ADIANTUM
+#define FS_KEY_DESC_PREFIX FSCRYPT_KEY_DESC_PREFIX
+#define FS_KEY_DESC_PREFIX_SIZE FSCRYPT_KEY_DESC_PREFIX_SIZE
+#define FS_MAX_KEY_SIZE FSCRYPT_MAX_KEY_SIZE
#endif /* _UAPI_LINUX_FSCRYPT_H */
--
2.22.0
^ permalink raw reply related
* [PATCH v7 01/16] fs, fscrypt: move uapi definitions to new header <linux/fscrypt.h>
From: Eric Biggers @ 2019-07-26 22:41 UTC (permalink / raw)
To: linux-fscrypt
Cc: Satya Tangirala, linux-api, linux-f2fs-devel, keyrings, linux-mtd,
linux-crypto, linux-fsdevel, linux-ext4, Paul Crowley
In-Reply-To: <20190726224141.14044-1-ebiggers@kernel.org>
From: Eric Biggers <ebiggers@google.com>
More fscrypt definitions are being added, and we shouldn't use a
disproportionate amount of space in <linux/fs.h> for fscrypt stuff.
So move the fscrypt definitions to a new header <linux/fscrypt.h>.
For source compatibility with existing userspace programs, <linux/fs.h>
still includes the new header.
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
MAINTAINERS | 1 +
include/linux/fscrypt.h | 1 +
include/uapi/linux/fs.h | 54 ++-----------------------------
include/uapi/linux/fscrypt.h | 61 ++++++++++++++++++++++++++++++++++++
4 files changed, 66 insertions(+), 51 deletions(-)
create mode 100644 include/uapi/linux/fscrypt.h
diff --git a/MAINTAINERS b/MAINTAINERS
index 783569e3c4b48..e7c473fb08163 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6598,6 +6598,7 @@ T: git git://git.kernel.org/pub/scm/fs/fscrypt/fscrypt.git
S: Supported
F: fs/crypto/
F: include/linux/fscrypt*.h
+F: include/uapi/linux/fscrypt.h
F: Documentation/filesystems/fscrypt.rst
FSI SUBSYSTEM
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index bd8f207a2fb68..81c0c754f8b21 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -16,6 +16,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/slab.h>
+#include <uapi/linux/fscrypt.h>
#define FS_CRYPTO_BLOCK_SIZE 16
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 59c71fa8c553a..41bd84d25a980 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -13,6 +13,9 @@
#include <linux/limits.h>
#include <linux/ioctl.h>
#include <linux/types.h>
+#ifndef __KERNEL__
+#include <linux/fscrypt.h>
+#endif
/* Use of MS_* flags within the kernel is restricted to core mount(2) code. */
#if !defined(__KERNEL__)
@@ -212,57 +215,6 @@ struct fsxattr {
#define FS_IOC_GETFSLABEL _IOR(0x94, 49, char[FSLABEL_MAX])
#define FS_IOC_SETFSLABEL _IOW(0x94, 50, char[FSLABEL_MAX])
-/*
- * File system encryption support
- */
-/* Policy provided via an ioctl on the topmost directory */
-#define FS_KEY_DESCRIPTOR_SIZE 8
-
-#define FS_POLICY_FLAGS_PAD_4 0x00
-#define FS_POLICY_FLAGS_PAD_8 0x01
-#define FS_POLICY_FLAGS_PAD_16 0x02
-#define FS_POLICY_FLAGS_PAD_32 0x03
-#define FS_POLICY_FLAGS_PAD_MASK 0x03
-#define FS_POLICY_FLAG_DIRECT_KEY 0x04 /* use master key directly */
-#define FS_POLICY_FLAGS_VALID 0x07
-
-/* Encryption algorithms */
-#define FS_ENCRYPTION_MODE_INVALID 0
-#define FS_ENCRYPTION_MODE_AES_256_XTS 1
-#define FS_ENCRYPTION_MODE_AES_256_GCM 2
-#define FS_ENCRYPTION_MODE_AES_256_CBC 3
-#define FS_ENCRYPTION_MODE_AES_256_CTS 4
-#define FS_ENCRYPTION_MODE_AES_128_CBC 5
-#define FS_ENCRYPTION_MODE_AES_128_CTS 6
-#define FS_ENCRYPTION_MODE_SPECK128_256_XTS 7 /* Removed, do not use. */
-#define FS_ENCRYPTION_MODE_SPECK128_256_CTS 8 /* Removed, do not use. */
-#define FS_ENCRYPTION_MODE_ADIANTUM 9
-
-struct fscrypt_policy {
- __u8 version;
- __u8 contents_encryption_mode;
- __u8 filenames_encryption_mode;
- __u8 flags;
- __u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
-};
-
-#define FS_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct fscrypt_policy)
-#define FS_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16])
-#define FS_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct fscrypt_policy)
-
-/* Parameters for passing an encryption key into the kernel keyring */
-#define FS_KEY_DESC_PREFIX "fscrypt:"
-#define FS_KEY_DESC_PREFIX_SIZE 8
-
-/* Structure that userspace passes to the kernel keyring */
-#define FS_MAX_KEY_SIZE 64
-
-struct fscrypt_key {
- __u32 mode;
- __u8 raw[FS_MAX_KEY_SIZE];
- __u32 size;
-};
-
/*
* Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
*
diff --git a/include/uapi/linux/fscrypt.h b/include/uapi/linux/fscrypt.h
new file mode 100644
index 0000000000000..26f6d2c19afd3
--- /dev/null
+++ b/include/uapi/linux/fscrypt.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * fscrypt user API
+ *
+ * These ioctls can be used on filesystems that support fscrypt. See the
+ * "User API" section of Documentation/filesystems/fscrypt.rst.
+ */
+#ifndef _UAPI_LINUX_FSCRYPT_H
+#define _UAPI_LINUX_FSCRYPT_H
+
+#include <linux/types.h>
+
+#define FS_KEY_DESCRIPTOR_SIZE 8
+
+/* Encryption policy flags */
+#define FS_POLICY_FLAGS_PAD_4 0x00
+#define FS_POLICY_FLAGS_PAD_8 0x01
+#define FS_POLICY_FLAGS_PAD_16 0x02
+#define FS_POLICY_FLAGS_PAD_32 0x03
+#define FS_POLICY_FLAGS_PAD_MASK 0x03
+#define FS_POLICY_FLAG_DIRECT_KEY 0x04 /* use master key directly */
+#define FS_POLICY_FLAGS_VALID 0x07
+
+/* Encryption algorithms */
+#define FS_ENCRYPTION_MODE_INVALID 0
+#define FS_ENCRYPTION_MODE_AES_256_XTS 1
+#define FS_ENCRYPTION_MODE_AES_256_GCM 2
+#define FS_ENCRYPTION_MODE_AES_256_CBC 3
+#define FS_ENCRYPTION_MODE_AES_256_CTS 4
+#define FS_ENCRYPTION_MODE_AES_128_CBC 5
+#define FS_ENCRYPTION_MODE_AES_128_CTS 6
+#define FS_ENCRYPTION_MODE_SPECK128_256_XTS 7 /* Removed, do not use. */
+#define FS_ENCRYPTION_MODE_SPECK128_256_CTS 8 /* Removed, do not use. */
+#define FS_ENCRYPTION_MODE_ADIANTUM 9
+
+struct fscrypt_policy {
+ __u8 version;
+ __u8 contents_encryption_mode;
+ __u8 filenames_encryption_mode;
+ __u8 flags;
+ __u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+};
+
+#define FS_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct fscrypt_policy)
+#define FS_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16])
+#define FS_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct fscrypt_policy)
+
+/* Parameters for passing an encryption key into the kernel keyring */
+#define FS_KEY_DESC_PREFIX "fscrypt:"
+#define FS_KEY_DESC_PREFIX_SIZE 8
+
+/* Structure that userspace passes to the kernel keyring */
+#define FS_MAX_KEY_SIZE 64
+
+struct fscrypt_key {
+ __u32 mode;
+ __u8 raw[FS_MAX_KEY_SIZE];
+ __u32 size;
+};
+
+#endif /* _UAPI_LINUX_FSCRYPT_H */
--
2.22.0
______________________________________________________
Linux MTD discussion mailing list
http://lists.infradead.org/mailman/listinfo/linux-mtd/
^ permalink raw reply related
* [PATCH v7 00/16] fscrypt: key management improvements
From: Eric Biggers @ 2019-07-26 22:41 UTC (permalink / raw)
To: linux-fscrypt
Cc: Satya Tangirala, linux-api, linux-f2fs-devel, keyrings, linux-mtd,
linux-crypto, linux-fsdevel, linux-ext4, Paul Crowley
Hello,
[Note: I'd like to apply this for v5.4. Additional review is greatly
appreciated, especially of the API before it's set in stone. Thanks!]
This patchset makes major improvements to how keys are added, removed,
and derived in fscrypt, aka ext4/f2fs/ubifs encryption. It does this by
adding new ioctls that add and remove encryption keys directly to/from
the filesystem, and by adding a new encryption policy version ("v2")
where the user-provided keys are only used as input to HKDF-SHA512 and
are identified by their cryptographic hash.
All new APIs and all cryptosystem changes are documented in
Documentation/filesystems/fscrypt.rst. Userspace can use the new key
management ioctls with existing encrypted directories, but migrating to
v2 encryption policies is needed for the full benefits.
These changes solve four interrelated problems:
(1) Providing fscrypt keys via process-subscribed keyrings is abusing
encryption as an OS-level access control mechanism, causing many
bugs where processes don't get access to the keys they need -- e.g.,
when a 'sudo' command or a system service needs to access encrypted
files. It's also inconsistent with the filesystem/VFS "view" of
encrypted files which is global, so sometimes things randomly happen
to work anyway due to caching. Regardless, currently almost all
fscrypt users actually do need global keys, so they're having to use
workarounds that heavily abuse the session or user keyrings, e.g.
Android and Chromium OS both use a systemwide "session keyring" and
the 'fscrypt' tool links all user keyrings into root's user keyring.
(2) Currently there's no way to securely and efficiently remove a
fscrypt key such that not only is the original key wiped, but also
all files and directories protected by that key are "locked" and
their per-file keys wiped. Many users want this and are using
'echo 2 > /proc/sys/vm/drop_caches' as a workaround, but this is
root-only, and also is overkill so can be a performance disaster.
(3) The key derivation function (KDF) that fscrypt uses to derive
per-file keys is nonstandard, inflexible, and has some weaknesses
such as being reversible and not evenly distributing the entropy
from the user-provided keys.
(4) fscrypt doesn't check that the correct key was supplied. This can
be a security vulnerability, since it allows malicious local users
to associate the wrong key with files to which they have read-only
access, causing other users' processes to read/write the wrong data.
Ultimately, the solutions to these problems all tie into each other. By
adding a filesystem-level encryption keyring with ioctls to add/remove
keys to/from it, the keys are made usable filesystem-wide (solves
problem #1). It also becomes easy to track the inodes that were
"unlocked" with each key, so they can be evicted when the key is removed
(solves problem #2). Moreover, the filesystem-level keyring is a
natural place to store an HMAC transform keyed by each key, thus making
it easy and efficient to switch the KDF to HKDF (solves problem #3).
Finally, to check that the correct key was supplied, I use HKDF to
derive a cryptographically secure key_identifier for each key (solves
problem #4). This in combination with key quotas and other careful
precautions also makes it safe to allow non-root users to add and remove
keys to/from the filesystem-level keyring. Thus, all problems are
solved without having to restrict the fscrypt API to root only.
The patchset is organized as follows:
- Patches 1-8 add new ioctls FS_IOC_ADD_ENCRYPTION_KEY,
FS_IOC_REMOVE_ENCRYPTION_KEY, and FS_IOC_GET_ENCRYPTION_KEY_STATUS.
Adding a key logically "unlocks" all files on the filesystem that are
protected by that key; removing a key "locks" them again.
- Patches 9-12 add support for v2 encryption policies.
- Patches 13-15 wire up the new ioctls to ext4, f2fs, and ubifs.
- Patch 16 updates the fscrypt documentation for all the changes.
This patchset applies to v5.3-rc1 with the pending fscrypt cleanup
patches applied (https://patchwork.kernel.org/patch/11057589/ and
https://patchwork.kernel.org/cover/11057583/).
You can also get it from git at:
Repository: https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux.git
Branch: fscrypt-key-mgmt-improvements-v7
I've written xfstests for the new APIs. They test the APIs themselves
as well as verify the correctness of the ciphertext stored on-disk for
v2 encryption policies. The tests can be found at:
Repository: https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/xfstests-dev.git
Branch: fscrypt-key-mgmt-improvements
The xfstests depend on new xfs_io commands which can be found at:
Repository: https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/xfsprogs-dev.git
Branch: fscrypt-key-mgmt-improvements
I've also made proof-of-concept changes to the 'fscrypt' userspace
program (https://github.com/google/fscrypt) to make it support v2
encryption policies. You can find these changes in git at:
Repository: https://github.com/ebiggers/fscrypt.git
Branch: fscrypt-key-mgmt-improvements
To make the 'fscrypt' userspace program experimentally use v2 encryption
policies on new encrypted directories, add the following to
/etc/fscrypt.conf within the "options" section:
"policy_version": "2"
Finally, it's also planned for Android and Chromium OS to switch to the
new ioctls and eventually to v2 encryption policies. Work-in-progress,
proof-of-concept changes by Satya Tangirala for AOSP can be found at
https://android-review.googlesource.com/q/topic:fscrypt-key-mgmt-improvements
Changes v6 => v7:
- Rebase onto v5.3-rc1 and the pending fscrypt cleanups.
- Work around false positive compile-time buffer overflow check in
copy_from_user() in fscrypt_ioctl_set_policy() when building an
i386 kernel in a specific config with an old gcc version.
- A few very minor cleanups.
Changes v5 => v6:
- Change HKDF to use the specification-defined default salt rather
than a custom fixed salt, and prepend the string "fscrypt" to
'info' instead. This is arguably needed to match how RFC 5869 and
SP 800-56C are worded. Both ways are secure in this context, so
prefer the "boring" way that clearly matches the standards.
- Rebase onto v5.2-rc1.
- A few small cleanups.
Changes v4 => v5:
- Simplify shrink_dcache_inode(), as suggested by Al Viro;
also move it into fs/crypto/.
- Fix a build error on some architectures by calling
copy_from_user() rather than get_user() with a __u64 pointer.
Changes v3 => v4:
- Introduce fscrypt_sb_free() to avoid an extra #ifdef.
- Fix UBIFS's ->drop_inode().
- Add 'version' to union fscrypt_policy and union fscrypt_context.
Changes v2 => v3:
- Use ->drop_inode() to trigger the inode eviction during/after
FS_IOC_REMOVE_ENCRYPTION_KEY, as suggested by Dave Chinner.
- A few small cleanups.
v1 of this patchset was sent in October 2017 with title "fscrypt:
filesystem-level keyring and v2 policy support". This revived version
follows the same basic design but incorporates numerous improvements,
such as splitting keyinfo.c into multiple files for much better
understandability, and introducing "per-mode" encryption keys to
implement the semantics of the DIRECT_KEY encryption policy flag.
Eric Biggers (16):
fs, fscrypt: move uapi definitions to new header <linux/fscrypt.h>
fscrypt: use FSCRYPT_ prefix for uapi constants
fscrypt: use FSCRYPT_* definitions, not FS_*
fscrypt: add ->ci_inode to fscrypt_info
fscrypt: refactor v1 policy key setup into keysetup_legacy.c
fscrypt: add FS_IOC_ADD_ENCRYPTION_KEY ioctl
fscrypt: add FS_IOC_REMOVE_ENCRYPTION_KEY ioctl
fscrypt: add FS_IOC_GET_ENCRYPTION_KEY_STATUS ioctl
fscrypt: add an HKDF-SHA512 implementation
fscrypt: v2 encryption policy support
fscrypt: allow unprivileged users to add/remove keys for v2 policies
fscrypt: require that key be added when setting a v2 encryption policy
ext4: wire up new fscrypt ioctls
f2fs: wire up new fscrypt ioctls
ubifs: wire up new fscrypt ioctls
fscrypt: document the new ioctls and policy version
Documentation/filesystems/fscrypt.rst | 670 ++++++++++++++----
MAINTAINERS | 1 +
fs/crypto/Kconfig | 2 +
fs/crypto/Makefile | 10 +-
fs/crypto/crypto.c | 12 +-
fs/crypto/fname.c | 5 +-
fs/crypto/fscrypt_private.h | 366 +++++++++-
fs/crypto/hkdf.c | 181 +++++
fs/crypto/keyinfo.c | 627 -----------------
fs/crypto/keyring.c | 957 ++++++++++++++++++++++++++
fs/crypto/keysetup.c | 569 +++++++++++++++
fs/crypto/keysetup_legacy.c | 340 +++++++++
fs/crypto/policy.c | 428 +++++++++---
fs/ext4/ioctl.c | 24 +
fs/ext4/super.c | 3 +
fs/f2fs/file.c | 46 ++
fs/f2fs/super.c | 2 +
fs/super.c | 2 +
fs/ubifs/ioctl.c | 16 +
fs/ubifs/super.c | 11 +
include/linux/fs.h | 1 +
include/linux/fscrypt.h | 48 +-
include/uapi/linux/fs.h | 54 +-
include/uapi/linux/fscrypt.h | 164 +++++
24 files changed, 3583 insertions(+), 956 deletions(-)
create mode 100644 fs/crypto/hkdf.c
delete mode 100644 fs/crypto/keyinfo.c
create mode 100644 fs/crypto/keyring.c
create mode 100644 fs/crypto/keysetup.c
create mode 100644 fs/crypto/keysetup_legacy.c
create mode 100644 include/uapi/linux/fscrypt.h
--
2.22.0
^ permalink raw reply
* Re: [PATCH v3 2/2] doc: Update documentation for page_idle virtual address indexing
From: Joel Fernandes @ 2019-07-26 20:26 UTC (permalink / raw)
To: sspatil
Cc: linux-kernel, adobriyan, akpm, bgregg, chansen3, dancol, fmayer,
joaodias, corbet, keescook, kernel-team, linux-api, linux-doc,
linux-fsdevel, linux-mm, mhocko, rppt, minchan, namhyung, guro,
sfr, surenb, tkjos, vdavydov.dev, vbabka, wvw, sspatil+mutt
In-Reply-To: <20190726201710.GA144547@google.com>
On Fri, Jul 26, 2019 at 01:17:10PM -0700, sspatil@google.com wrote:
> Thanks Joel, just a couple of nits for the doc inline below. Other than that,
>
> Reviewed-by: Sandeep Patil <sspatil@google.com>
Thanks!
> I'll plan on making changes to Android to use this instead of the pagemap +
> page_idle. I think it will also be considerably faster.
Cool, glad to know.
> On Fri, Jul 26, 2019 at 11:23:19AM -0400, Joel Fernandes (Google) wrote:
> > This patch updates the documentation with the new page_idle tracking
> > feature which uses virtual address indexing.
> >
> > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > ---
> > .../admin-guide/mm/idle_page_tracking.rst | 43 ++++++++++++++++---
> > 1 file changed, 36 insertions(+), 7 deletions(-)
> >
> > diff --git a/Documentation/admin-guide/mm/idle_page_tracking.rst b/Documentation/admin-guide/mm/idle_page_tracking.rst
> > index df9394fb39c2..1eeac78c94a7 100644
> > --- a/Documentation/admin-guide/mm/idle_page_tracking.rst
> > +++ b/Documentation/admin-guide/mm/idle_page_tracking.rst
> > @@ -19,10 +19,14 @@ It is enabled by CONFIG_IDLE_PAGE_TRACKING=y.
> >
> > User API
> > ========
> > +There are 2 ways to access the idle page tracking API. One uses physical
> > +address indexing, another uses a simpler virtual address indexing scheme.
> >
> > -The idle page tracking API is located at ``/sys/kernel/mm/page_idle``.
> > -Currently, it consists of the only read-write file,
> > -``/sys/kernel/mm/page_idle/bitmap``.
> > +Physical address indexing
> > +-------------------------
> > +The idle page tracking API for physical address indexing using page frame
> > +numbers (PFN) is located at ``/sys/kernel/mm/page_idle``. Currently, it
> > +consists of the only read-write file, ``/sys/kernel/mm/page_idle/bitmap``.
> >
> > The file implements a bitmap where each bit corresponds to a memory page. The
> > bitmap is represented by an array of 8-byte integers, and the page at PFN #i is
> > @@ -74,6 +78,31 @@ See :ref:`Documentation/admin-guide/mm/pagemap.rst <pagemap>` for more
> > information about ``/proc/pid/pagemap``, ``/proc/kpageflags``, and
> > ``/proc/kpagecgroup``.
> >
> > +Virtual address indexing
> > +------------------------
> > +The idle page tracking API for virtual address indexing using virtual page
> > +frame numbers (VFN) is located at ``/proc/<pid>/page_idle``. It is a bitmap
> > +that follows the same semantics as ``/sys/kernel/mm/page_idle/bitmap``
> > +except that it uses virtual instead of physical frame numbers.
> > +
> > +This idle page tracking API does not need deal with PFN so it does not require
>
> s/need//
>
> > +prior lookups of ``pagemap`` in order to find if page is idle or not. This is
>
> s/in order to find if page is idle or not//
Fixed both, thank you! Will send out update soon.
thanks,
- Joel
^ permalink raw reply
* Re: [PATCH v3 2/2] doc: Update documentation for page_idle virtual address indexing
From: sspatil @ 2019-07-26 20:17 UTC (permalink / raw)
To: joel, sspatil+mutt
Cc: linux-kernel, Alexey Dobriyan, Andrew Morton, Brendan Gregg,
Christian Hansen, dancol, fmayer, joaodias, joelaf,
Jonathan Corbet, Kees Cook, kernel-team, linux-api, linux-doc,
linux-fsdevel, linux-mm, Michal Hocko, Mike Rapoport, minchan,
namhyung, Roman Gushchin, Stephen Rothwell, surenb, tkjos,
Vladimir Davydov, Vlastimil Babka, wvw
In-Reply-To: <20190726152319.134152-2-joel@joelfernandes.org>
Thanks Joel, just a couple of nits for the doc inline below. Other than that,
Reviewed-by: Sandeep Patil <sspatil@google.com>
I'll plan on making changes to Android to use this instead of the pagemap +
page_idle. I think it will also be considerably faster.
On Fri, Jul 26, 2019 at 11:23:19AM -0400, Joel Fernandes (Google) wrote:
> This patch updates the documentation with the new page_idle tracking
> feature which uses virtual address indexing.
>
> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> ---
> .../admin-guide/mm/idle_page_tracking.rst | 43 ++++++++++++++++---
> 1 file changed, 36 insertions(+), 7 deletions(-)
>
> diff --git a/Documentation/admin-guide/mm/idle_page_tracking.rst b/Documentation/admin-guide/mm/idle_page_tracking.rst
> index df9394fb39c2..1eeac78c94a7 100644
> --- a/Documentation/admin-guide/mm/idle_page_tracking.rst
> +++ b/Documentation/admin-guide/mm/idle_page_tracking.rst
> @@ -19,10 +19,14 @@ It is enabled by CONFIG_IDLE_PAGE_TRACKING=y.
>
> User API
> ========
> +There are 2 ways to access the idle page tracking API. One uses physical
> +address indexing, another uses a simpler virtual address indexing scheme.
>
> -The idle page tracking API is located at ``/sys/kernel/mm/page_idle``.
> -Currently, it consists of the only read-write file,
> -``/sys/kernel/mm/page_idle/bitmap``.
> +Physical address indexing
> +-------------------------
> +The idle page tracking API for physical address indexing using page frame
> +numbers (PFN) is located at ``/sys/kernel/mm/page_idle``. Currently, it
> +consists of the only read-write file, ``/sys/kernel/mm/page_idle/bitmap``.
>
> The file implements a bitmap where each bit corresponds to a memory page. The
> bitmap is represented by an array of 8-byte integers, and the page at PFN #i is
> @@ -74,6 +78,31 @@ See :ref:`Documentation/admin-guide/mm/pagemap.rst <pagemap>` for more
> information about ``/proc/pid/pagemap``, ``/proc/kpageflags``, and
> ``/proc/kpagecgroup``.
>
> +Virtual address indexing
> +------------------------
> +The idle page tracking API for virtual address indexing using virtual page
> +frame numbers (VFN) is located at ``/proc/<pid>/page_idle``. It is a bitmap
> +that follows the same semantics as ``/sys/kernel/mm/page_idle/bitmap``
> +except that it uses virtual instead of physical frame numbers.
> +
> +This idle page tracking API does not need deal with PFN so it does not require
s/need//
> +prior lookups of ``pagemap`` in order to find if page is idle or not. This is
s/in order to find if page is idle or not//
> +an advantage on some systems where looking up PFN is considered a security
> +issue. Also in some cases, this interface could be slightly more reliable to
> +use than physical address indexing, since in physical address indexing, address
> +space changes can occur between reading the ``pagemap`` and reading the
> +``bitmap``, while in virtual address indexing, the process's ``mmap_sem`` is
> +held for the duration of the access.
> +
> +To estimate the amount of pages that are not used by a workload one should:
> +
> + 1. Mark all the workload's pages as idle by setting corresponding bits in
> + ``/proc/<pid>/page_idle``.
> +
> + 2. Wait until the workload accesses its working set.
> +
> + 3. Read ``/proc/<pid>/page_idle`` and count the number of bits set.
> +
> .. _impl_details:
>
> Implementation Details
> @@ -99,10 +128,10 @@ When a dirty page is written to swap or disk as a result of memory reclaim or
> exceeding the dirty memory limit, it is not marked referenced.
>
> The idle memory tracking feature adds a new page flag, the Idle flag. This flag
> -is set manually, by writing to ``/sys/kernel/mm/page_idle/bitmap`` (see the
> -:ref:`User API <user_api>`
> -section), and cleared automatically whenever a page is referenced as defined
> -above.
> +is set manually, by writing to ``/sys/kernel/mm/page_idle/bitmap`` for physical
> +addressing or by writing to ``/proc/<pid>/page_idle`` for virtual
> +addressing (see the :ref:`User API <user_api>` section), and cleared
> +automatically whenever a page is referenced as defined above.
>
> When a page is marked idle, the Accessed bit must be cleared in all PTEs it is
> mapped to, otherwise we will not be able to detect accesses to the page coming
> --
> 2.22.0.709.g102302147b-goog
>
> --
> To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
>
^ permalink raw reply
* [PATCH v3 2/2] doc: Update documentation for page_idle virtual address indexing
From: Joel Fernandes (Google) @ 2019-07-26 15:23 UTC (permalink / raw)
To: linux-kernel
Cc: Joel Fernandes (Google), Alexey Dobriyan, Andrew Morton,
Brendan Gregg, Christian Hansen, dancol, fmayer, joaodias, joelaf,
Jonathan Corbet, Kees Cook, kernel-team, linux-api, linux-doc,
linux-fsdevel, linux-mm, Michal Hocko, Mike Rapoport, minchan,
namhyung, Roman Gushchin, Stephen Rothwell, surenb, tkjos,
Vladimir Davydov, Vlastimil Babka
In-Reply-To: <20190726152319.134152-1-joel@joelfernandes.org>
This patch updates the documentation with the new page_idle tracking
feature which uses virtual address indexing.
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
.../admin-guide/mm/idle_page_tracking.rst | 43 ++++++++++++++++---
1 file changed, 36 insertions(+), 7 deletions(-)
diff --git a/Documentation/admin-guide/mm/idle_page_tracking.rst b/Documentation/admin-guide/mm/idle_page_tracking.rst
index df9394fb39c2..1eeac78c94a7 100644
--- a/Documentation/admin-guide/mm/idle_page_tracking.rst
+++ b/Documentation/admin-guide/mm/idle_page_tracking.rst
@@ -19,10 +19,14 @@ It is enabled by CONFIG_IDLE_PAGE_TRACKING=y.
User API
========
+There are 2 ways to access the idle page tracking API. One uses physical
+address indexing, another uses a simpler virtual address indexing scheme.
-The idle page tracking API is located at ``/sys/kernel/mm/page_idle``.
-Currently, it consists of the only read-write file,
-``/sys/kernel/mm/page_idle/bitmap``.
+Physical address indexing
+-------------------------
+The idle page tracking API for physical address indexing using page frame
+numbers (PFN) is located at ``/sys/kernel/mm/page_idle``. Currently, it
+consists of the only read-write file, ``/sys/kernel/mm/page_idle/bitmap``.
The file implements a bitmap where each bit corresponds to a memory page. The
bitmap is represented by an array of 8-byte integers, and the page at PFN #i is
@@ -74,6 +78,31 @@ See :ref:`Documentation/admin-guide/mm/pagemap.rst <pagemap>` for more
information about ``/proc/pid/pagemap``, ``/proc/kpageflags``, and
``/proc/kpagecgroup``.
+Virtual address indexing
+------------------------
+The idle page tracking API for virtual address indexing using virtual page
+frame numbers (VFN) is located at ``/proc/<pid>/page_idle``. It is a bitmap
+that follows the same semantics as ``/sys/kernel/mm/page_idle/bitmap``
+except that it uses virtual instead of physical frame numbers.
+
+This idle page tracking API does not need deal with PFN so it does not require
+prior lookups of ``pagemap`` in order to find if page is idle or not. This is
+an advantage on some systems where looking up PFN is considered a security
+issue. Also in some cases, this interface could be slightly more reliable to
+use than physical address indexing, since in physical address indexing, address
+space changes can occur between reading the ``pagemap`` and reading the
+``bitmap``, while in virtual address indexing, the process's ``mmap_sem`` is
+held for the duration of the access.
+
+To estimate the amount of pages that are not used by a workload one should:
+
+ 1. Mark all the workload's pages as idle by setting corresponding bits in
+ ``/proc/<pid>/page_idle``.
+
+ 2. Wait until the workload accesses its working set.
+
+ 3. Read ``/proc/<pid>/page_idle`` and count the number of bits set.
+
.. _impl_details:
Implementation Details
@@ -99,10 +128,10 @@ When a dirty page is written to swap or disk as a result of memory reclaim or
exceeding the dirty memory limit, it is not marked referenced.
The idle memory tracking feature adds a new page flag, the Idle flag. This flag
-is set manually, by writing to ``/sys/kernel/mm/page_idle/bitmap`` (see the
-:ref:`User API <user_api>`
-section), and cleared automatically whenever a page is referenced as defined
-above.
+is set manually, by writing to ``/sys/kernel/mm/page_idle/bitmap`` for physical
+addressing or by writing to ``/proc/<pid>/page_idle`` for virtual
+addressing (see the :ref:`User API <user_api>` section), and cleared
+automatically whenever a page is referenced as defined above.
When a page is marked idle, the Accessed bit must be cleared in all PTEs it is
mapped to, otherwise we will not be able to detect accesses to the page coming
--
2.22.0.709.g102302147b-goog
^ permalink raw reply related
* [PATCH v3 1/2] mm/page_idle: Add per-pid idle page tracking using virtual indexing
From: Joel Fernandes (Google) @ 2019-07-26 15:23 UTC (permalink / raw)
To: linux-kernel
Cc: Joel Fernandes (Google), Alexey Dobriyan, Andrew Morton,
Brendan Gregg, Christian Hansen, dancol, fmayer, joaodias, joelaf,
Jonathan Corbet, Kees Cook, kernel-team, linux-api, linux-doc,
linux-fsdevel, linux-mm, Michal Hocko, Mike Rapoport, minchan,
namhyung, Roman Gushchin, Stephen Rothwell, surenb, tkjos,
Vladimir Davydov, Vlastimil Babka
The page_idle tracking feature currently requires looking up the pagemap
for a process followed by interacting with /sys/kernel/mm/page_idle.
Looking up PFN from pagemap in Android devices is not supported by
unprivileged process and requires SYS_ADMIN and gives 0 for the PFN.
This patch adds support to directly interact with page_idle tracking at
the PID level by introducing a /proc/<pid>/page_idle file. It follows
the exact same semantics as the global /sys/kernel/mm/page_idle, but now
looking up PFN through pagemap is not needed since the interface uses
virtual frame numbers, and at the same time also does not require
SYS_ADMIN.
In Android, we are using this for the heap profiler (heapprofd) which
profiles and pin points code paths which allocates and leaves memory
idle for long periods of time. This method solves the security issue
with userspace learning the PFN, and while at it is also shown to yield
better results than the pagemap lookup, the theory being that the window
where the address space can change is reduced by eliminating the
intermediate pagemap look up stage. In virtual address indexing, the
process's mmap_sem is held for the duration of the access.
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
v2->v3:
Fixed a bug where I was doing a kfree that is not needed due to not
needing to do GFP_ATOMIC allocations.
v1->v2:
Mark swap ptes as idle (Minchan)
Avoid need for GFP_ATOMIC (Andrew)
Get rid of idle_page_list lock by moving list to stack
Internal review -> v1:
Fixes from Suren.
Corrections to change log, docs (Florian, Sandeep)
fs/proc/base.c | 3 +
fs/proc/internal.h | 1 +
fs/proc/task_mmu.c | 57 +++++++
include/linux/page_idle.h | 4 +
mm/page_idle.c | 340 +++++++++++++++++++++++++++++++++-----
5 files changed, 360 insertions(+), 45 deletions(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 77eb628ecc7f..a58dd74606e9 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3021,6 +3021,9 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("smaps", S_IRUGO, proc_pid_smaps_operations),
REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
REG("pagemap", S_IRUSR, proc_pagemap_operations),
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+ REG("page_idle", S_IRUSR|S_IWUSR, proc_page_idle_operations),
+#endif
#endif
#ifdef CONFIG_SECURITY
DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index cd0c8d5ce9a1..bc9371880c63 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -293,6 +293,7 @@ extern const struct file_operations proc_pid_smaps_operations;
extern const struct file_operations proc_pid_smaps_rollup_operations;
extern const struct file_operations proc_clear_refs_operations;
extern const struct file_operations proc_pagemap_operations;
+extern const struct file_operations proc_page_idle_operations;
extern unsigned long task_vsize(struct mm_struct *);
extern unsigned long task_statm(struct mm_struct *,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4d2b860dbc3f..11ccc53da38e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1642,6 +1642,63 @@ const struct file_operations proc_pagemap_operations = {
.open = pagemap_open,
.release = pagemap_release,
};
+
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+static ssize_t proc_page_idle_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ int ret;
+ struct task_struct *tsk = get_proc_task(file_inode(file));
+
+ if (!tsk)
+ return -EINVAL;
+ ret = page_idle_proc_read(file, buf, count, ppos, tsk);
+ put_task_struct(tsk);
+ return ret;
+}
+
+static ssize_t proc_page_idle_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ int ret;
+ struct task_struct *tsk = get_proc_task(file_inode(file));
+
+ if (!tsk)
+ return -EINVAL;
+ ret = page_idle_proc_write(file, (char __user *)buf, count, ppos, tsk);
+ put_task_struct(tsk);
+ return ret;
+}
+
+static int proc_page_idle_open(struct inode *inode, struct file *file)
+{
+ struct mm_struct *mm;
+
+ mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR(mm))
+ return PTR_ERR(mm);
+ file->private_data = mm;
+ return 0;
+}
+
+static int proc_page_idle_release(struct inode *inode, struct file *file)
+{
+ struct mm_struct *mm = file->private_data;
+
+ if (mm)
+ mmdrop(mm);
+ return 0;
+}
+
+const struct file_operations proc_page_idle_operations = {
+ .llseek = mem_lseek, /* borrow this */
+ .read = proc_page_idle_read,
+ .write = proc_page_idle_write,
+ .open = proc_page_idle_open,
+ .release = proc_page_idle_release,
+};
+#endif /* CONFIG_IDLE_PAGE_TRACKING */
+
#endif /* CONFIG_PROC_PAGE_MONITOR */
#ifdef CONFIG_NUMA
diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
index 1e894d34bdce..f1bc2640d85e 100644
--- a/include/linux/page_idle.h
+++ b/include/linux/page_idle.h
@@ -106,6 +106,10 @@ static inline void clear_page_idle(struct page *page)
}
#endif /* CONFIG_64BIT */
+ssize_t page_idle_proc_write(struct file *file,
+ char __user *buf, size_t count, loff_t *ppos, struct task_struct *tsk);
+ssize_t page_idle_proc_read(struct file *file,
+ char __user *buf, size_t count, loff_t *ppos, struct task_struct *tsk);
#else /* !CONFIG_IDLE_PAGE_TRACKING */
static inline bool page_is_young(struct page *page)
diff --git a/mm/page_idle.c b/mm/page_idle.c
index 295512465065..86244f7f1faa 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -5,12 +5,15 @@
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/mm.h>
-#include <linux/mmzone.h>
-#include <linux/pagemap.h>
-#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
+#include <linux/mmzone.h>
#include <linux/page_ext.h>
#include <linux/page_idle.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/sched/mm.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#define BITMAP_CHUNK_SIZE sizeof(u64)
#define BITMAP_CHUNK_BITS (BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
@@ -25,18 +28,13 @@
* page tracking. With such an indicator of user pages we can skip isolated
* pages, but since there are not usually many of them, it will hardly affect
* the overall result.
- *
- * This function tries to get a user memory page by pfn as described above.
*/
-static struct page *page_idle_get_page(unsigned long pfn)
+static struct page *page_idle_get_page(struct page *page_in)
{
struct page *page;
pg_data_t *pgdat;
- if (!pfn_valid(pfn))
- return NULL;
-
- page = pfn_to_page(pfn);
+ page = page_in;
if (!page || !PageLRU(page) ||
!get_page_unless_zero(page))
return NULL;
@@ -51,6 +49,18 @@ static struct page *page_idle_get_page(unsigned long pfn)
return page;
}
+/*
+ * This function tries to get a user memory page by pfn as described above.
+ */
+static struct page *page_idle_get_page_pfn(unsigned long pfn)
+{
+
+ if (!pfn_valid(pfn))
+ return NULL;
+
+ return page_idle_get_page(pfn_to_page(pfn));
+}
+
static bool page_idle_clear_pte_refs_one(struct page *page,
struct vm_area_struct *vma,
unsigned long addr, void *arg)
@@ -118,6 +128,47 @@ static void page_idle_clear_pte_refs(struct page *page)
unlock_page(page);
}
+/* Helper to get the start and end frame given a pos and count */
+static int page_idle_get_frames(loff_t pos, size_t count, struct mm_struct *mm,
+ unsigned long *start, unsigned long *end)
+{
+ unsigned long max_frame;
+
+ /* If an mm is not given, assume we want physical frames */
+ max_frame = mm ? (mm->task_size >> PAGE_SHIFT) : max_pfn;
+
+ if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
+ return -EINVAL;
+
+ *start = pos * BITS_PER_BYTE;
+ if (*start >= max_frame)
+ return -ENXIO;
+
+ *end = *start + count * BITS_PER_BYTE;
+ if (*end > max_frame)
+ *end = max_frame;
+ return 0;
+}
+
+static bool page_really_idle(struct page *page)
+{
+ if (!page)
+ return false;
+
+ if (page_is_idle(page)) {
+ /*
+ * The page might have been referenced via a
+ * pte, in which case it is not idle. Clear
+ * refs and recheck.
+ */
+ page_idle_clear_pte_refs(page);
+ if (page_is_idle(page))
+ return true;
+ }
+
+ return false;
+}
+
static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
struct bin_attribute *attr, char *buf,
loff_t pos, size_t count)
@@ -125,35 +176,21 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
u64 *out = (u64 *)buf;
struct page *page;
unsigned long pfn, end_pfn;
- int bit;
+ int bit, ret;
- if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
- return -EINVAL;
-
- pfn = pos * BITS_PER_BYTE;
- if (pfn >= max_pfn)
- return 0;
-
- end_pfn = pfn + count * BITS_PER_BYTE;
- if (end_pfn > max_pfn)
- end_pfn = max_pfn;
+ ret = page_idle_get_frames(pos, count, NULL, &pfn, &end_pfn);
+ if (ret == -ENXIO)
+ return 0; /* Reads beyond max_pfn do nothing */
+ else if (ret)
+ return ret;
for (; pfn < end_pfn; pfn++) {
bit = pfn % BITMAP_CHUNK_BITS;
if (!bit)
*out = 0ULL;
- page = page_idle_get_page(pfn);
- if (page) {
- if (page_is_idle(page)) {
- /*
- * The page might have been referenced via a
- * pte, in which case it is not idle. Clear
- * refs and recheck.
- */
- page_idle_clear_pte_refs(page);
- if (page_is_idle(page))
- *out |= 1ULL << bit;
- }
+ page = page_idle_get_page_pfn(pfn);
+ if (page && page_really_idle(page)) {
+ *out |= 1ULL << bit;
put_page(page);
}
if (bit == BITMAP_CHUNK_BITS - 1)
@@ -170,23 +207,16 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
const u64 *in = (u64 *)buf;
struct page *page;
unsigned long pfn, end_pfn;
- int bit;
+ int bit, ret;
- if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
- return -EINVAL;
-
- pfn = pos * BITS_PER_BYTE;
- if (pfn >= max_pfn)
- return -ENXIO;
-
- end_pfn = pfn + count * BITS_PER_BYTE;
- if (end_pfn > max_pfn)
- end_pfn = max_pfn;
+ ret = page_idle_get_frames(pos, count, NULL, &pfn, &end_pfn);
+ if (ret)
+ return ret;
for (; pfn < end_pfn; pfn++) {
bit = pfn % BITMAP_CHUNK_BITS;
if ((*in >> bit) & 1) {
- page = page_idle_get_page(pfn);
+ page = page_idle_get_page_pfn(pfn);
if (page) {
page_idle_clear_pte_refs(page);
set_page_idle(page);
@@ -224,6 +254,226 @@ struct page_ext_operations page_idle_ops = {
};
#endif
+/* page_idle tracking for /proc/<pid>/page_idle */
+
+struct page_node {
+ struct page *page;
+ unsigned long addr;
+ struct list_head list;
+};
+
+struct page_idle_proc_priv {
+ unsigned long start_addr;
+ char *buffer;
+ int write;
+
+ /* Pre-allocate and provide nodes to add_page_idle_list() */
+ struct page_node *page_nodes;
+ int cur_page_node;
+ struct list_head *idle_page_list;
+};
+
+/*
+ * Add a page to the idle page list. page can be NULL if pte is
+ * from a swapped page.
+ */
+static void add_page_idle_list(struct page *page,
+ unsigned long addr, struct mm_walk *walk)
+{
+ struct page *page_get = NULL;
+ struct page_node *pn;
+ int bit;
+ unsigned long frames;
+ struct page_idle_proc_priv *priv = walk->private;
+ u64 *chunk = (u64 *)priv->buffer;
+
+ if (priv->write) {
+ /* Find whether this page was asked to be marked */
+ frames = (addr - priv->start_addr) >> PAGE_SHIFT;
+ bit = frames % BITMAP_CHUNK_BITS;
+ chunk = &chunk[frames / BITMAP_CHUNK_BITS];
+ if (((*chunk >> bit) & 1) == 0)
+ return;
+ }
+
+ if (page) {
+ page_get = page_idle_get_page(page);
+ if (!page_get)
+ return;
+ }
+
+ pn = &(priv->page_nodes[priv->cur_page_node++]);
+ pn->page = page_get;
+ pn->addr = addr;
+ list_add(&pn->list, priv->idle_page_list);
+}
+
+static int pte_page_idle_proc_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+ pte_t *pte;
+ spinlock_t *ptl;
+ struct page *page;
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
+ if (pmd_present(*pmd)) {
+ page = follow_trans_huge_pmd(vma, addr, pmd,
+ FOLL_DUMP|FOLL_WRITE);
+ if (!IS_ERR_OR_NULL(page))
+ add_page_idle_list(page, addr, walk);
+ }
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (; addr != end; pte++, addr += PAGE_SIZE) {
+ /*
+ * We add swapped pages to the idle_page_list so that we can
+ * reported to userspace that they are idle.
+ */
+ if (is_swap_pte(*pte)) {
+ add_page_idle_list(NULL, addr, walk);
+ continue;
+ }
+
+ if (!pte_present(*pte))
+ continue;
+
+ page = vm_normal_page(vma, addr, *pte);
+ if (page)
+ add_page_idle_list(page, addr, walk);
+ }
+
+ pte_unmap_unlock(pte - 1, ptl);
+ return 0;
+}
+
+ssize_t page_idle_proc_generic(struct file *file, char __user *ubuff,
+ size_t count, loff_t *pos,
+ struct task_struct *tsk, int write)
+{
+ int ret;
+ char *buffer;
+ u64 *out;
+ unsigned long start_addr, end_addr, start_frame, end_frame;
+ struct mm_struct *mm = file->private_data;
+ struct mm_walk walk = { .pmd_entry = pte_page_idle_proc_range, };
+ struct page_node *cur;
+ struct page_idle_proc_priv priv;
+ bool walk_error = false;
+ LIST_HEAD(idle_page_list);
+
+ if (!mm || !mmget_not_zero(mm))
+ return -EINVAL;
+
+ if (count > PAGE_SIZE)
+ count = PAGE_SIZE;
+
+ buffer = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!buffer) {
+ ret = -ENOMEM;
+ goto out_mmput;
+ }
+ out = (u64 *)buffer;
+
+ if (write && copy_from_user(buffer, ubuff, count)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret = page_idle_get_frames(*pos, count, mm, &start_frame, &end_frame);
+ if (ret)
+ goto out;
+
+ start_addr = (start_frame << PAGE_SHIFT);
+ end_addr = (end_frame << PAGE_SHIFT);
+ priv.buffer = buffer;
+ priv.start_addr = start_addr;
+ priv.write = write;
+
+ priv.idle_page_list = &idle_page_list;
+ priv.cur_page_node = 0;
+ priv.page_nodes = kzalloc(sizeof(struct page_node) *
+ (end_frame - start_frame), GFP_KERNEL);
+ if (!priv.page_nodes) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ walk.private = &priv;
+ walk.mm = mm;
+
+ down_read(&mm->mmap_sem);
+
+ /*
+ * idle_page_list is needed because walk_page_vma() holds ptlock which
+ * deadlocks with page_idle_clear_pte_refs(). So we have to collect all
+ * pages first, and then call page_idle_clear_pte_refs().
+ */
+ ret = walk_page_range(start_addr, end_addr, &walk);
+ if (ret)
+ walk_error = true;
+
+ list_for_each_entry(cur, &idle_page_list, list) {
+ int bit, index;
+ unsigned long off;
+ struct page *page = cur->page;
+
+ if (unlikely(walk_error))
+ goto remove_page;
+
+ if (write) {
+ if (page) {
+ page_idle_clear_pte_refs(page);
+ set_page_idle(page);
+ }
+ } else {
+ if (!page || page_really_idle(page)) {
+ off = ((cur->addr) >> PAGE_SHIFT) - start_frame;
+ bit = off % BITMAP_CHUNK_BITS;
+ index = off / BITMAP_CHUNK_BITS;
+ out[index] |= 1ULL << bit;
+ }
+ }
+remove_page:
+ if (page)
+ put_page(page);
+ }
+
+ if (!write && !walk_error)
+ ret = copy_to_user(ubuff, buffer, count);
+
+ up_read(&mm->mmap_sem);
+ kfree(priv.page_nodes);
+out:
+ kfree(buffer);
+out_mmput:
+ mmput(mm);
+ if (!ret)
+ ret = count;
+ return ret;
+
+}
+
+ssize_t page_idle_proc_read(struct file *file, char __user *ubuff,
+ size_t count, loff_t *pos, struct task_struct *tsk)
+{
+ return page_idle_proc_generic(file, ubuff, count, pos, tsk, 0);
+}
+
+ssize_t page_idle_proc_write(struct file *file, char __user *ubuff,
+ size_t count, loff_t *pos, struct task_struct *tsk)
+{
+ return page_idle_proc_generic(file, ubuff, count, pos, tsk, 1);
+}
+
static int __init page_idle_init(void)
{
int err;
--
2.22.0.709.g102302147b-goog
^ permalink raw reply related
* [PATCH v2 2/2] doc: Update documentation for page_idle virtual address indexing
From: Joel Fernandes (Google) @ 2019-07-26 15:08 UTC (permalink / raw)
To: linux-kernel
Cc: Joel Fernandes (Google), Alexey Dobriyan, Andrew Morton,
Brendan Gregg, Christian Hansen, dancol, fmayer, joaodias, joelaf,
Jonathan Corbet, Kees Cook, kernel-team, linux-api, linux-doc,
linux-fsdevel, linux-mm, Michal Hocko, Mike Rapoport, minchan,
namhyung, Roman Gushchin, Stephen Rothwell, surenb, tkjos,
Vladimir Davydov, Vlastimil Babka
In-Reply-To: <20190726150845.95720-1-joel@joelfernandes.org>
This patch updates the documentation with the new page_idle tracking
feature which uses virtual address indexing.
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
.../admin-guide/mm/idle_page_tracking.rst | 43 ++++++++++++++++---
1 file changed, 36 insertions(+), 7 deletions(-)
diff --git a/Documentation/admin-guide/mm/idle_page_tracking.rst b/Documentation/admin-guide/mm/idle_page_tracking.rst
index df9394fb39c2..1eeac78c94a7 100644
--- a/Documentation/admin-guide/mm/idle_page_tracking.rst
+++ b/Documentation/admin-guide/mm/idle_page_tracking.rst
@@ -19,10 +19,14 @@ It is enabled by CONFIG_IDLE_PAGE_TRACKING=y.
User API
========
+There are 2 ways to access the idle page tracking API. One uses physical
+address indexing, another uses a simpler virtual address indexing scheme.
-The idle page tracking API is located at ``/sys/kernel/mm/page_idle``.
-Currently, it consists of the only read-write file,
-``/sys/kernel/mm/page_idle/bitmap``.
+Physical address indexing
+-------------------------
+The idle page tracking API for physical address indexing using page frame
+numbers (PFN) is located at ``/sys/kernel/mm/page_idle``. Currently, it
+consists of the only read-write file, ``/sys/kernel/mm/page_idle/bitmap``.
The file implements a bitmap where each bit corresponds to a memory page. The
bitmap is represented by an array of 8-byte integers, and the page at PFN #i is
@@ -74,6 +78,31 @@ See :ref:`Documentation/admin-guide/mm/pagemap.rst <pagemap>` for more
information about ``/proc/pid/pagemap``, ``/proc/kpageflags``, and
``/proc/kpagecgroup``.
+Virtual address indexing
+------------------------
+The idle page tracking API for virtual address indexing using virtual page
+frame numbers (VFN) is located at ``/proc/<pid>/page_idle``. It is a bitmap
+that follows the same semantics as ``/sys/kernel/mm/page_idle/bitmap``
+except that it uses virtual instead of physical frame numbers.
+
+This idle page tracking API does not need deal with PFN so it does not require
+prior lookups of ``pagemap`` in order to find if page is idle or not. This is
+an advantage on some systems where looking up PFN is considered a security
+issue. Also in some cases, this interface could be slightly more reliable to
+use than physical address indexing, since in physical address indexing, address
+space changes can occur between reading the ``pagemap`` and reading the
+``bitmap``, while in virtual address indexing, the process's ``mmap_sem`` is
+held for the duration of the access.
+
+To estimate the amount of pages that are not used by a workload one should:
+
+ 1. Mark all the workload's pages as idle by setting corresponding bits in
+ ``/proc/<pid>/page_idle``.
+
+ 2. Wait until the workload accesses its working set.
+
+ 3. Read ``/proc/<pid>/page_idle`` and count the number of bits set.
+
.. _impl_details:
Implementation Details
@@ -99,10 +128,10 @@ When a dirty page is written to swap or disk as a result of memory reclaim or
exceeding the dirty memory limit, it is not marked referenced.
The idle memory tracking feature adds a new page flag, the Idle flag. This flag
-is set manually, by writing to ``/sys/kernel/mm/page_idle/bitmap`` (see the
-:ref:`User API <user_api>`
-section), and cleared automatically whenever a page is referenced as defined
-above.
+is set manually, by writing to ``/sys/kernel/mm/page_idle/bitmap`` for physical
+addressing or by writing to ``/proc/<pid>/page_idle`` for virtual
+addressing (see the :ref:`User API <user_api>` section), and cleared
+automatically whenever a page is referenced as defined above.
When a page is marked idle, the Accessed bit must be cleared in all PTEs it is
mapped to, otherwise we will not be able to detect accesses to the page coming
--
2.22.0.709.g102302147b-goog
^ permalink raw reply related
* [PATCH v2 1/2] mm/page_idle: Add per-pid idle page tracking using virtual indexing
From: Joel Fernandes (Google) @ 2019-07-26 15:08 UTC (permalink / raw)
To: linux-kernel
Cc: Joel Fernandes (Google), Alexey Dobriyan, Andrew Morton,
Brendan Gregg, Christian Hansen, dancol, fmayer, joaodias, joelaf,
Jonathan Corbet, Kees Cook, kernel-team, linux-api, linux-doc,
linux-fsdevel, linux-mm, Michal Hocko, Mike Rapoport, minchan,
namhyung, Roman Gushchin, Stephen Rothwell, surenb, tkjos,
Vladimir Davydov, Vlastimil Babka
The page_idle tracking feature currently requires looking up the pagemap
for a process followed by interacting with /sys/kernel/mm/page_idle.
Looking up PFN from pagemap in Android devices is not supported by
unprivileged process and requires SYS_ADMIN and gives 0 for the PFN.
This patch adds support to directly interact with page_idle tracking at
the PID level by introducing a /proc/<pid>/page_idle file. It follows
the exact same semantics as the global /sys/kernel/mm/page_idle, but now
looking up PFN through pagemap is not needed since the interface uses
virtual frame numbers, and at the same time also does not require
SYS_ADMIN.
In Android, we are using this for the heap profiler (heapprofd) which
profiles and pin points code paths which allocates and leaves memory
idle for long periods of time. This method solves the security issue
with userspace learning the PFN, and while at it is also shown to yield
better results than the pagemap lookup, the theory being that the window
where the address space can change is reduced by eliminating the
intermediate pagemap look up stage. In virtual address indexing, the
process's mmap_sem is held for the duration of the access.
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
v1 -> v2:
Mark swap ptes as idle (Minchan)
Avoid need for GFP_ATOMIC (Andrew)
Get rid of idle_page_list lock by moving list to stack
Internal review -> v1:
Fixes from Suren.
Corrections to change log, docs (Florian, Sandeep)
fs/proc/base.c | 3 +
fs/proc/internal.h | 1 +
fs/proc/task_mmu.c | 57 +++++++
include/linux/page_idle.h | 4 +
mm/page_idle.c | 342 +++++++++++++++++++++++++++++++++-----
5 files changed, 362 insertions(+), 45 deletions(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 77eb628ecc7f..a58dd74606e9 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3021,6 +3021,9 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("smaps", S_IRUGO, proc_pid_smaps_operations),
REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
REG("pagemap", S_IRUSR, proc_pagemap_operations),
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+ REG("page_idle", S_IRUSR|S_IWUSR, proc_page_idle_operations),
+#endif
#endif
#ifdef CONFIG_SECURITY
DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index cd0c8d5ce9a1..bc9371880c63 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -293,6 +293,7 @@ extern const struct file_operations proc_pid_smaps_operations;
extern const struct file_operations proc_pid_smaps_rollup_operations;
extern const struct file_operations proc_clear_refs_operations;
extern const struct file_operations proc_pagemap_operations;
+extern const struct file_operations proc_page_idle_operations;
extern unsigned long task_vsize(struct mm_struct *);
extern unsigned long task_statm(struct mm_struct *,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4d2b860dbc3f..11ccc53da38e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1642,6 +1642,63 @@ const struct file_operations proc_pagemap_operations = {
.open = pagemap_open,
.release = pagemap_release,
};
+
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+static ssize_t proc_page_idle_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ int ret;
+ struct task_struct *tsk = get_proc_task(file_inode(file));
+
+ if (!tsk)
+ return -EINVAL;
+ ret = page_idle_proc_read(file, buf, count, ppos, tsk);
+ put_task_struct(tsk);
+ return ret;
+}
+
+static ssize_t proc_page_idle_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ int ret;
+ struct task_struct *tsk = get_proc_task(file_inode(file));
+
+ if (!tsk)
+ return -EINVAL;
+ ret = page_idle_proc_write(file, (char __user *)buf, count, ppos, tsk);
+ put_task_struct(tsk);
+ return ret;
+}
+
+static int proc_page_idle_open(struct inode *inode, struct file *file)
+{
+ struct mm_struct *mm;
+
+ mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR(mm))
+ return PTR_ERR(mm);
+ file->private_data = mm;
+ return 0;
+}
+
+static int proc_page_idle_release(struct inode *inode, struct file *file)
+{
+ struct mm_struct *mm = file->private_data;
+
+ if (mm)
+ mmdrop(mm);
+ return 0;
+}
+
+const struct file_operations proc_page_idle_operations = {
+ .llseek = mem_lseek, /* borrow this */
+ .read = proc_page_idle_read,
+ .write = proc_page_idle_write,
+ .open = proc_page_idle_open,
+ .release = proc_page_idle_release,
+};
+#endif /* CONFIG_IDLE_PAGE_TRACKING */
+
#endif /* CONFIG_PROC_PAGE_MONITOR */
#ifdef CONFIG_NUMA
diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
index 1e894d34bdce..f1bc2640d85e 100644
--- a/include/linux/page_idle.h
+++ b/include/linux/page_idle.h
@@ -106,6 +106,10 @@ static inline void clear_page_idle(struct page *page)
}
#endif /* CONFIG_64BIT */
+ssize_t page_idle_proc_write(struct file *file,
+ char __user *buf, size_t count, loff_t *ppos, struct task_struct *tsk);
+ssize_t page_idle_proc_read(struct file *file,
+ char __user *buf, size_t count, loff_t *ppos, struct task_struct *tsk);
#else /* !CONFIG_IDLE_PAGE_TRACKING */
static inline bool page_is_young(struct page *page)
diff --git a/mm/page_idle.c b/mm/page_idle.c
index 295512465065..d8a14955c39d 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -5,12 +5,15 @@
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/mm.h>
-#include <linux/mmzone.h>
-#include <linux/pagemap.h>
-#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
+#include <linux/mmzone.h>
#include <linux/page_ext.h>
#include <linux/page_idle.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/sched/mm.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#define BITMAP_CHUNK_SIZE sizeof(u64)
#define BITMAP_CHUNK_BITS (BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
@@ -25,18 +28,13 @@
* page tracking. With such an indicator of user pages we can skip isolated
* pages, but since there are not usually many of them, it will hardly affect
* the overall result.
- *
- * This function tries to get a user memory page by pfn as described above.
*/
-static struct page *page_idle_get_page(unsigned long pfn)
+static struct page *page_idle_get_page(struct page *page_in)
{
struct page *page;
pg_data_t *pgdat;
- if (!pfn_valid(pfn))
- return NULL;
-
- page = pfn_to_page(pfn);
+ page = page_in;
if (!page || !PageLRU(page) ||
!get_page_unless_zero(page))
return NULL;
@@ -51,6 +49,18 @@ static struct page *page_idle_get_page(unsigned long pfn)
return page;
}
+/*
+ * This function tries to get a user memory page by pfn as described above.
+ */
+static struct page *page_idle_get_page_pfn(unsigned long pfn)
+{
+
+ if (!pfn_valid(pfn))
+ return NULL;
+
+ return page_idle_get_page(pfn_to_page(pfn));
+}
+
static bool page_idle_clear_pte_refs_one(struct page *page,
struct vm_area_struct *vma,
unsigned long addr, void *arg)
@@ -118,6 +128,47 @@ static void page_idle_clear_pte_refs(struct page *page)
unlock_page(page);
}
+/* Helper to get the start and end frame given a pos and count */
+static int page_idle_get_frames(loff_t pos, size_t count, struct mm_struct *mm,
+ unsigned long *start, unsigned long *end)
+{
+ unsigned long max_frame;
+
+ /* If an mm is not given, assume we want physical frames */
+ max_frame = mm ? (mm->task_size >> PAGE_SHIFT) : max_pfn;
+
+ if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
+ return -EINVAL;
+
+ *start = pos * BITS_PER_BYTE;
+ if (*start >= max_frame)
+ return -ENXIO;
+
+ *end = *start + count * BITS_PER_BYTE;
+ if (*end > max_frame)
+ *end = max_frame;
+ return 0;
+}
+
+static bool page_really_idle(struct page *page)
+{
+ if (!page)
+ return false;
+
+ if (page_is_idle(page)) {
+ /*
+ * The page might have been referenced via a
+ * pte, in which case it is not idle. Clear
+ * refs and recheck.
+ */
+ page_idle_clear_pte_refs(page);
+ if (page_is_idle(page))
+ return true;
+ }
+
+ return false;
+}
+
static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
struct bin_attribute *attr, char *buf,
loff_t pos, size_t count)
@@ -125,35 +176,21 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
u64 *out = (u64 *)buf;
struct page *page;
unsigned long pfn, end_pfn;
- int bit;
+ int bit, ret;
- if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
- return -EINVAL;
-
- pfn = pos * BITS_PER_BYTE;
- if (pfn >= max_pfn)
- return 0;
-
- end_pfn = pfn + count * BITS_PER_BYTE;
- if (end_pfn > max_pfn)
- end_pfn = max_pfn;
+ ret = page_idle_get_frames(pos, count, NULL, &pfn, &end_pfn);
+ if (ret == -ENXIO)
+ return 0; /* Reads beyond max_pfn do nothing */
+ else if (ret)
+ return ret;
for (; pfn < end_pfn; pfn++) {
bit = pfn % BITMAP_CHUNK_BITS;
if (!bit)
*out = 0ULL;
- page = page_idle_get_page(pfn);
- if (page) {
- if (page_is_idle(page)) {
- /*
- * The page might have been referenced via a
- * pte, in which case it is not idle. Clear
- * refs and recheck.
- */
- page_idle_clear_pte_refs(page);
- if (page_is_idle(page))
- *out |= 1ULL << bit;
- }
+ page = page_idle_get_page_pfn(pfn);
+ if (page && page_really_idle(page)) {
+ *out |= 1ULL << bit;
put_page(page);
}
if (bit == BITMAP_CHUNK_BITS - 1)
@@ -170,23 +207,16 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
const u64 *in = (u64 *)buf;
struct page *page;
unsigned long pfn, end_pfn;
- int bit;
+ int bit, ret;
- if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
- return -EINVAL;
-
- pfn = pos * BITS_PER_BYTE;
- if (pfn >= max_pfn)
- return -ENXIO;
-
- end_pfn = pfn + count * BITS_PER_BYTE;
- if (end_pfn > max_pfn)
- end_pfn = max_pfn;
+ ret = page_idle_get_frames(pos, count, NULL, &pfn, &end_pfn);
+ if (ret)
+ return ret;
for (; pfn < end_pfn; pfn++) {
bit = pfn % BITMAP_CHUNK_BITS;
if ((*in >> bit) & 1) {
- page = page_idle_get_page(pfn);
+ page = page_idle_get_page_pfn(pfn);
if (page) {
page_idle_clear_pte_refs(page);
set_page_idle(page);
@@ -224,6 +254,228 @@ struct page_ext_operations page_idle_ops = {
};
#endif
+/* page_idle tracking for /proc/<pid>/page_idle */
+
+struct page_node {
+ struct page *page;
+ unsigned long addr;
+ struct list_head list;
+};
+
+struct page_idle_proc_priv {
+ unsigned long start_addr;
+ char *buffer;
+ int write;
+
+ /* Pre-allocate and provide nodes to add_page_idle_list() */
+ struct page_node *page_nodes;
+ int cur_page_node;
+ struct list_head *idle_page_list;
+};
+
+/*
+ * Add a page to the idle page list. page can be NULL if pte is
+ * from a swapped page.
+ */
+static void add_page_idle_list(struct page *page,
+ unsigned long addr, struct mm_walk *walk)
+{
+ struct page *page_get = NULL;
+ struct page_node *pn;
+ int bit;
+ unsigned long frames;
+ struct page_idle_proc_priv *priv = walk->private;
+ u64 *chunk = (u64 *)priv->buffer;
+
+ if (priv->write) {
+ /* Find whether this page was asked to be marked */
+ frames = (addr - priv->start_addr) >> PAGE_SHIFT;
+ bit = frames % BITMAP_CHUNK_BITS;
+ chunk = &chunk[frames / BITMAP_CHUNK_BITS];
+ if (((*chunk >> bit) & 1) == 0)
+ return;
+ }
+
+ if (page) {
+ page_get = page_idle_get_page(page);
+ if (!page_get)
+ return;
+ }
+
+ pn = &(priv->page_nodes[priv->cur_page_node++]);
+ pn->page = page_get;
+ pn->addr = addr;
+ list_add(&pn->list, priv->idle_page_list);
+}
+
+static int pte_page_idle_proc_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+ pte_t *pte;
+ spinlock_t *ptl;
+ struct page *page;
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
+ if (pmd_present(*pmd)) {
+ page = follow_trans_huge_pmd(vma, addr, pmd,
+ FOLL_DUMP|FOLL_WRITE);
+ if (!IS_ERR_OR_NULL(page))
+ add_page_idle_list(page, addr, walk);
+ }
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (; addr != end; pte++, addr += PAGE_SIZE) {
+ /*
+ * We add swapped pages to the idle_page_list so that we can
+ * reported to userspace that they are idle.
+ */
+ if (is_swap_pte(*pte)) {
+ add_page_idle_list(NULL, addr, walk);
+ continue;
+ }
+
+ if (!pte_present(*pte))
+ continue;
+
+ page = vm_normal_page(vma, addr, *pte);
+ if (page)
+ add_page_idle_list(page, addr, walk);
+ }
+
+ pte_unmap_unlock(pte - 1, ptl);
+ return 0;
+}
+
+ssize_t page_idle_proc_generic(struct file *file, char __user *ubuff,
+ size_t count, loff_t *pos,
+ struct task_struct *tsk, int write)
+{
+ int ret;
+ char *buffer;
+ u64 *out;
+ unsigned long start_addr, end_addr, start_frame, end_frame;
+ struct mm_struct *mm = file->private_data;
+ struct mm_walk walk = { .pmd_entry = pte_page_idle_proc_range, };
+ struct page_node *cur, *next;
+ struct page_idle_proc_priv priv;
+ bool walk_error = false;
+ LIST_HEAD(idle_page_list);
+
+ if (!mm || !mmget_not_zero(mm))
+ return -EINVAL;
+
+ if (count > PAGE_SIZE)
+ count = PAGE_SIZE;
+
+ buffer = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!buffer) {
+ ret = -ENOMEM;
+ goto out_mmput;
+ }
+ out = (u64 *)buffer;
+
+ if (write && copy_from_user(buffer, ubuff, count)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret = page_idle_get_frames(*pos, count, mm, &start_frame, &end_frame);
+ if (ret)
+ goto out;
+
+ start_addr = (start_frame << PAGE_SHIFT);
+ end_addr = (end_frame << PAGE_SHIFT);
+ priv.buffer = buffer;
+ priv.start_addr = start_addr;
+ priv.write = write;
+
+ priv.idle_page_list = &idle_page_list;
+ priv.cur_page_node = 0;
+ priv.page_nodes = kzalloc(sizeof(struct page_node) *
+ (end_frame - start_frame), GFP_KERNEL);
+ if (!priv.page_nodes) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ walk.private = &priv;
+ walk.mm = mm;
+
+ down_read(&mm->mmap_sem);
+
+ /*
+ * idle_page_list is needed because walk_page_vma() holds ptlock which
+ * deadlocks with page_idle_clear_pte_refs(). So we have to collect all
+ * pages first, and then call page_idle_clear_pte_refs().
+ */
+ ret = walk_page_range(start_addr, end_addr, &walk);
+ if (ret)
+ walk_error = true;
+
+ list_for_each_entry_safe(cur, next, &idle_page_list, list) {
+ int bit, index;
+ unsigned long off;
+ struct page *page = cur->page;
+
+ if (unlikely(walk_error))
+ goto remove_page;
+
+ if (write) {
+ if (page) {
+ page_idle_clear_pte_refs(page);
+ set_page_idle(page);
+ }
+ } else {
+ if (!page || page_really_idle(page)) {
+ off = ((cur->addr) >> PAGE_SHIFT) - start_frame;
+ bit = off % BITMAP_CHUNK_BITS;
+ index = off / BITMAP_CHUNK_BITS;
+ out[index] |= 1ULL << bit;
+ }
+ }
+remove_page:
+ if (page)
+ put_page(page);
+ list_del(&cur->list);
+ kfree(cur);
+ }
+
+ if (!write && !walk_error)
+ ret = copy_to_user(ubuff, buffer, count);
+
+ up_read(&mm->mmap_sem);
+ kfree(priv.page_nodes);
+out:
+ kfree(buffer);
+out_mmput:
+ mmput(mm);
+ if (!ret)
+ ret = count;
+ return ret;
+
+}
+
+ssize_t page_idle_proc_read(struct file *file, char __user *ubuff,
+ size_t count, loff_t *pos, struct task_struct *tsk)
+{
+ return page_idle_proc_generic(file, ubuff, count, pos, tsk, 0);
+}
+
+ssize_t page_idle_proc_write(struct file *file, char __user *ubuff,
+ size_t count, loff_t *pos, struct task_struct *tsk)
+{
+ return page_idle_proc_generic(file, ubuff, count, pos, tsk, 1);
+}
+
static int __init page_idle_init(void)
{
int err;
--
2.22.0.709.g102302147b-goog
^ permalink raw reply related
* Regression in 5.3 for some FS_USERNS_MOUNT (aka user-namespace-mountable) filesystems
From: Christian Brauner @ 2019-07-26 11:59 UTC (permalink / raw)
To: Linux List Kernel Mailing, Al Viro, David Howells
Cc: Miklos Szeredi, Eric W. Biederman, Linus Torvalds, linux-fsdevel,
Linux API
Hey everyone,
We have another mount api regression. With current 5.3-rc1 it is not
possible anymore to mount filesystems that have FS_USERNS_MOUNT set and
their fs_context's global member set to true. At least sysfs is
affected, likely also cgroup{2}fs.
The commit that introduced the regression is:
commit 0ce0cf12fc4c6a089717ff613d76457052cf4303
Author: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun May 12 15:42:48 2019 -0400
consolidate the capability checks in sget_{fc,userns}()
... into a common helper - mount_capable(type, userns)
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
mount_capable() will select the user namespace in which to check for
CAP_SYS_ADMIN based on the global property of the filesystem's
fs_context.
Since sysfs has global set to true mount_capable() will check for
CAP_SYS_ADMIN in init_user_ns and fail the mount with EPERM for any
non-init userns root. The same check is present in sget_fc().
To me it looks like that global is overriding FS_USERNS_MOUNT which
seems odd. Afaict, there are two ways to fix this:
- remove global from sysfs
- remove the global check from mount_capable() and possibly sget_fc()
The latter feels more correct but I'm not sure *why* that global thing
got introduced. Seems there could be an additional flag on affected
filesystems instead of this "global" thing. But not sure.
I can whip up a patch in case that does make sense.
And it would probably be a good thing if we had some sort of test (if
there isn't one already) so that this doesn't happen again. It could be
as simple as:
unshare -U -m --map-root -n
mkdir whatever
mount -t sysfs sysfs ./whatever
Thanks!
Christian
^ permalink raw reply
* Re: [PATCH 2/5] pidfd: add pidfd_wait()
From: Arnd Bergmann @ 2019-07-26 9:57 UTC (permalink / raw)
To: Christian Brauner
Cc: Linux Kernel Mailing List, Oleg Nesterov, Eric W . Biederman,
Kees Cook, Joel Fernandes, Thomas Gleixner, Tejun Heo,
David Howells, Jann Horn, Andy Lutomirski, Andrew Morton,
Aleksa Sarai, Linus Torvalds, Al Viro, Android Kernel Team,
Linux API
In-Reply-To: <20190726082413.n7srvcrqxmvk67z7@brauner.io>
On Fri, Jul 26, 2019 at 10:24 AM Christian Brauner <christian@brauner.io> wrote:
>
> > It would be nice to introduce it in a separate patch, and then use it
> > to kill off
> > compat_sys_getrusage() and compat_sys_wait4(), and possibly even
> > compat_sys_waitid() in combination with your copy_siginfo_to_user_any().
> > That could be done as a cleanup patch afterwards, or as part of your series.
>
> Right, but we won't go the syscall route but instead go the P_PIDFD
> route for waitid(). :)
Ah, of course, nevermind then. It would still be a useful cleanup, but
many other things would be as well.
Arnd
^ permalink raw reply
* Re: [PATCH 2/5] pidfd: add pidfd_wait()
From: Christian Brauner @ 2019-07-26 8:24 UTC (permalink / raw)
To: Arnd Bergmann
Cc: Linux Kernel Mailing List, Oleg Nesterov, Eric W . Biederman,
Kees Cook, Joel Fernandes, Thomas Gleixner, Tejun Heo,
David Howells, Jann Horn, Andy Lutomirski, Andrew Morton,
Aleksa Sarai, Linus Torvalds, Al Viro, Android Kernel Team,
Linux API
In-Reply-To: <CAK8P3a0+3wqCzQv-A-QmWTtioFRGjYUvq6QiLysqi9OFs3kJsw@mail.gmail.com>
On Fri, Jul 26, 2019 at 10:19:55AM +0200, Arnd Bergmann wrote:
> On Wed, Jul 24, 2019 at 4:47 PM Christian Brauner <christian@brauner.io> wrote:
>
> > +
> > +static int copy_rusage_to_user_any(struct rusage *kru, struct rusage __user *ru)
> > +{
> > +#ifdef CONFIG_COMPAT
> > + if (in_compat_syscall())
> > + return put_compat_rusage(kru, (struct compat_rusage __user *)ru);
> > +#endif
> > + return copy_to_user(ru, kru, sizeof(*kru));
> > +}
>
> I think this code needs a check for COMPAT_USE_64BIT_TIME in order
> to handle x32 correctly.
Similar to waitid(), indeed.
>
> It would be nice to introduce it in a separate patch, and then use it
> to kill off
> compat_sys_getrusage() and compat_sys_wait4(), and possibly even
> compat_sys_waitid() in combination with your copy_siginfo_to_user_any().
> That could be done as a cleanup patch afterwards, or as part of your series.
Right, but we won't go the syscall route but instead go the P_PIDFD
route for waitid(). :)
Christian
^ permalink raw reply
* Re: [PATCH 2/5] pidfd: add pidfd_wait()
From: Arnd Bergmann @ 2019-07-26 8:19 UTC (permalink / raw)
To: Christian Brauner
Cc: Linux Kernel Mailing List, Oleg Nesterov, Eric W . Biederman,
Kees Cook, Joel Fernandes, Thomas Gleixner, Tejun Heo,
David Howells, Jann Horn, Andy Lutomirski, Andrew Morton,
Aleksa Sarai, Linus Torvalds, Al Viro, Android Kernel Team,
Linux API
In-Reply-To: <20190724144651.28272-3-christian@brauner.io>
On Wed, Jul 24, 2019 at 4:47 PM Christian Brauner <christian@brauner.io> wrote:
> +
> +static int copy_rusage_to_user_any(struct rusage *kru, struct rusage __user *ru)
> +{
> +#ifdef CONFIG_COMPAT
> + if (in_compat_syscall())
> + return put_compat_rusage(kru, (struct compat_rusage __user *)ru);
> +#endif
> + return copy_to_user(ru, kru, sizeof(*kru));
> +}
I think this code needs a check for COMPAT_USE_64BIT_TIME in order
to handle x32 correctly.
It would be nice to introduce it in a separate patch, and then use it
to kill off
compat_sys_getrusage() and compat_sys_wait4(), and possibly even
compat_sys_waitid() in combination with your copy_siginfo_to_user_any().
That could be done as a cleanup patch afterwards, or as part of your series.
Arnd
^ permalink raw reply
* Re: [PATCH v7 0/5] Introduce MADV_COLD and MADV_PAGEOUT
From: Minchan Kim @ 2019-07-26 2:42 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
Kirill A . Shutemov
In-Reply-To: <20190726023435.214162-1-minchan@kernel.org>
Hi Andrew,
It's the resend with fixing build errors kbuildbot reported.
Please take it this version to get more test coverage.
Thanks.
On Fri, Jul 26, 2019 at 11:34:30AM +0900, Minchan Kim wrote:
> This patch is part of previous series:
> https://lore.kernel.org/lkml/20190531064313.193437-1-minchan@kernel.org/
> Originally, it was created for external madvise hinting feature.
>
> https://lkml.org/lkml/2019/5/31/463
> Michal wanted to separte the discussion from external hinting interface
> so this patchset includes only first part of my entire patchset
>
> - introduce MADV_COLD and MADV_PAGEOUT hint to madvise.
>
> However, I keep entire description for others for easier understanding
> why this kinds of hint was born.
>
> Thanks.
>
> This patchset is against on mmotm-mmotm-2019-07-24-21-39.
>
> Below is description of previous entire patchset.
>
> ================= &< =====================
>
> - Background
>
> The Android terminology used for forking a new process and starting an app
> from scratch is a cold start, while resuming an existing app is a hot start.
> While we continually try to improve the performance of cold starts, hot
> starts will always be significantly less power hungry as well as faster so
> we are trying to make hot start more likely than cold start.
>
> To increase hot start, Android userspace manages the order that apps should
> be killed in a process called ActivityManagerService. ActivityManagerService
> tracks every Android app or service that the user could be interacting with
> at any time and translates that into a ranked list for lmkd(low memory
> killer daemon). They are likely to be killed by lmkd if the system has to
> reclaim memory. In that sense they are similar to entries in any other cache.
> Those apps are kept alive for opportunistic performance improvements but
> those performance improvements will vary based on the memory requirements of
> individual workloads.
>
> - Problem
>
> Naturally, cached apps were dominant consumers of memory on the system.
> However, they were not significant consumers of swap even though they are
> good candidate for swap. Under investigation, swapping out only begins
> once the low zone watermark is hit and kswapd wakes up, but the overall
> allocation rate in the system might trip lmkd thresholds and cause a cached
> process to be killed(we measured performance swapping out vs. zapping the
> memory by killing a process. Unsurprisingly, zapping is 10x times faster
> even though we use zram which is much faster than real storage) so kill
> from lmkd will often satisfy the high zone watermark, resulting in very
> few pages actually being moved to swap.
>
> - Approach
>
> The approach we chose was to use a new interface to allow userspace to
> proactively reclaim entire processes by leveraging platform information.
> This allowed us to bypass the inaccuracy of the kernel’s LRUs for pages
> that are known to be cold from userspace and to avoid races with lmkd
> by reclaiming apps as soon as they entered the cached state. Additionally,
> it could provide many chances for platform to use much information to
> optimize memory efficiency.
>
> To achieve the goal, the patchset introduce two new options for madvise.
> One is MADV_COLD which will deactivate activated pages and the other is
> MADV_PAGEOUT which will reclaim private pages instantly. These new options
> complement MADV_DONTNEED and MADV_FREE by adding non-destructive ways to
> gain some free memory space. MADV_PAGEOUT is similar to MADV_DONTNEED in a way
> that it hints the kernel that memory region is not currently needed and
> should be reclaimed immediately; MADV_COLD is similar to MADV_FREE in a way
> that it hints the kernel that memory region is not currently needed and
> should be reclaimed when memory pressure rises.
>
> * v6 - http://lore.kernel.org/lkml/20190723062539.198697-1-minchan@kernel.org
> * v5 - http://lore.kernel.org/lkml/20190714233401.36909-1-minchan@kernel.org
> * v4 - http://lore.kernel.org/lkml/20190711012528.176050-1-minchan@kernel.org
> * v3 - http://lore.kernel.org/lkml/20190627115405.255259-1-minchan@kernel.org
> * v2 - http://lore.kernel.org/lkml/20190610111252.239156-1-minchan@kernel.org
> * v1 - http://lore.kernel.org/lkml/20190603053655.127730-1-minchan@kernel.org
>
> Minchan Kim (5):
> mm: introduce MADV_COLD
> mm: change PAGEREF_RECLAIM_CLEAN with PAGE_REFRECLAIM
> mm: account nr_isolated_xxx in [isolate|putback]_lru_page
> mm: introduce MADV_PAGEOUT
> mm: factor out common parts between MADV_COLD and MADV_PAGEOUT
>
> arch/alpha/include/uapi/asm/mman.h | 3 +
> arch/mips/include/uapi/asm/mman.h | 3 +
> arch/parisc/include/uapi/asm/mman.h | 3 +
> arch/xtensa/include/uapi/asm/mman.h | 3 +
> include/linux/swap.h | 2 +
> include/uapi/asm-generic/mman-common.h | 3 +
> mm/compaction.c | 2 -
> mm/gup.c | 7 +-
> mm/internal.h | 2 +-
> mm/khugepaged.c | 3 -
> mm/madvise.c | 274 ++++++++++++++++++++++++-
> mm/memory-failure.c | 3 -
> mm/memory_hotplug.c | 4 -
> mm/mempolicy.c | 3 -
> mm/migrate.c | 37 +---
> mm/oom_kill.c | 2 +-
> mm/swap.c | 42 ++++
> mm/vmscan.c | 83 +++++++-
> 18 files changed, 416 insertions(+), 63 deletions(-)
>
> --
> 2.22.0.709.g102302147b-goog
>
^ permalink raw reply
* [PATCH v7 5/5] mm: factor out common parts between MADV_COLD and MADV_PAGEOUT
From: Minchan Kim @ 2019-07-26 2:34 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
Kirill A . Shutemov, Minchan Kim
In-Reply-To: <20190726023435.214162-1-minchan@kernel.org>
There are many common parts between MADV_COLD and MADV_PAGEOUT.
This patch factor them out to save code duplication.
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
mm/madvise.c | 194 ++++++++++++---------------------------------------
1 file changed, 46 insertions(+), 148 deletions(-)
diff --git a/mm/madvise.c b/mm/madvise.c
index 78aa6802b95ad..52f9bddbab19c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -30,6 +30,11 @@
#include "internal.h"
+struct madvise_walk_private {
+ struct mmu_gather *tlb;
+ bool pageout;
+};
+
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
* take mmap_sem for writing. Others, which simply traverse vmas, need
@@ -310,15 +315,22 @@ static long madvise_willneed(struct vm_area_struct *vma,
return 0;
}
-static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
- unsigned long end, struct mm_walk *walk)
+static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
{
- struct mmu_gather *tlb = walk->private;
+ struct madvise_walk_private *private = walk->private;
+ struct mmu_gather *tlb = private->tlb;
+ bool pageout = private->pageout;
struct mm_struct *mm = tlb->mm;
struct vm_area_struct *vma = walk->vma;
pte_t *orig_pte, *pte, ptent;
spinlock_t *ptl;
- struct page *page;
+ struct page *page = NULL;
+ LIST_HEAD(page_list);
+
+ if (fatal_signal_pending(current))
+ return -EINTR;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (pmd_trans_huge(*pmd)) {
@@ -366,10 +378,17 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
}
+ ClearPageReferenced(page);
test_and_clear_page_young(page);
- deactivate_page(page);
+ if (pageout) {
+ if (!isolate_lru_page(page))
+ list_add(&page->lru, &page_list);
+ } else
+ deactivate_page(page);
huge_unlock:
spin_unlock(ptl);
+ if (pageout)
+ reclaim_pages(&page_list);
return 0;
}
@@ -437,12 +456,19 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
* As a side effect, it makes confuse idle-page tracking
* because they will miss recent referenced history.
*/
+ ClearPageReferenced(page);
test_and_clear_page_young(page);
- deactivate_page(page);
+ if (pageout) {
+ if (!isolate_lru_page(page))
+ list_add(&page->lru, &page_list);
+ } else
+ deactivate_page(page);
}
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(orig_pte, ptl);
+ if (pageout)
+ reclaim_pages(&page_list);
cond_resched();
return 0;
@@ -452,10 +478,15 @@ static void madvise_cold_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
+ struct madvise_walk_private walk_private = {
+ .tlb = tlb,
+ .pageout = false,
+ };
+
struct mm_walk cold_walk = {
- .pmd_entry = madvise_cold_pte_range,
+ .pmd_entry = madvise_cold_or_pageout_pte_range,
.mm = vma->vm_mm,
- .private = tlb,
+ .private = &walk_private,
};
tlb_start_vma(tlb, vma);
@@ -482,152 +513,19 @@ static long madvise_cold(struct vm_area_struct *vma,
return 0;
}
-static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
- unsigned long end, struct mm_walk *walk)
-{
- struct mmu_gather *tlb = walk->private;
- struct mm_struct *mm = tlb->mm;
- struct vm_area_struct *vma = walk->vma;
- pte_t *orig_pte, *pte, ptent;
- spinlock_t *ptl;
- LIST_HEAD(page_list);
- struct page *page;
-
- if (fatal_signal_pending(current))
- return -EINTR;
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (pmd_trans_huge(*pmd)) {
- pmd_t orig_pmd;
- unsigned long next = pmd_addr_end(addr, end);
-
- tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
- ptl = pmd_trans_huge_lock(pmd, vma);
- if (!ptl)
- return 0;
-
- orig_pmd = *pmd;
- if (is_huge_zero_pmd(orig_pmd))
- goto huge_unlock;
-
- if (unlikely(!pmd_present(orig_pmd))) {
- VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(orig_pmd));
- goto huge_unlock;
- }
-
- page = pmd_page(orig_pmd);
- if (next - addr != HPAGE_PMD_SIZE) {
- int err;
-
- if (page_mapcount(page) != 1)
- goto huge_unlock;
- get_page(page);
- spin_unlock(ptl);
- lock_page(page);
- err = split_huge_page(page);
- unlock_page(page);
- put_page(page);
- if (!err)
- goto regular_page;
- return 0;
- }
-
- if (pmd_young(orig_pmd)) {
- pmdp_invalidate(vma, addr, pmd);
- orig_pmd = pmd_mkold(orig_pmd);
-
- set_pmd_at(mm, addr, pmd, orig_pmd);
- tlb_remove_tlb_entry(tlb, pmd, addr);
- }
-
- ClearPageReferenced(page);
- test_and_clear_page_young(page);
-
- if (!isolate_lru_page(page))
- list_add(&page->lru, &page_list);
-huge_unlock:
- spin_unlock(ptl);
- reclaim_pages(&page_list);
- return 0;
- }
-
- if (pmd_trans_unstable(pmd))
- return 0;
-regular_page:
-#endif
- tlb_change_page_size(tlb, PAGE_SIZE);
- orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- flush_tlb_batched_pending(mm);
- arch_enter_lazy_mmu_mode();
- for (; addr < end; pte++, addr += PAGE_SIZE) {
- ptent = *pte;
- if (!pte_present(ptent))
- continue;
-
- page = vm_normal_page(vma, addr, ptent);
- if (!page)
- continue;
-
- /*
- * creating a THP page is expensive so split it only if we
- * are sure it's worth. Split it if we are only owner.
- */
- if (PageTransCompound(page)) {
- if (page_mapcount(page) != 1)
- break;
- get_page(page);
- if (!trylock_page(page)) {
- put_page(page);
- break;
- }
- pte_unmap_unlock(orig_pte, ptl);
- if (split_huge_page(page)) {
- unlock_page(page);
- put_page(page);
- pte_offset_map_lock(mm, pmd, addr, &ptl);
- break;
- }
- unlock_page(page);
- put_page(page);
- pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
- pte--;
- addr -= PAGE_SIZE;
- continue;
- }
-
- VM_BUG_ON_PAGE(PageTransCompound(page), page);
-
- if (pte_young(ptent)) {
- ptent = ptep_get_and_clear_full(mm, addr, pte,
- tlb->fullmm);
- ptent = pte_mkold(ptent);
- set_pte_at(mm, addr, pte, ptent);
- tlb_remove_tlb_entry(tlb, pte, addr);
- }
- ClearPageReferenced(page);
- test_and_clear_page_young(page);
-
- if (!isolate_lru_page(page))
- list_add(&page->lru, &page_list);
- }
-
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(orig_pte, ptl);
- reclaim_pages(&page_list);
- cond_resched();
-
- return 0;
-}
-
static void madvise_pageout_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
+ struct madvise_walk_private walk_private = {
+ .pageout = true,
+ .tlb = tlb,
+ };
+
struct mm_walk pageout_walk = {
- .pmd_entry = madvise_pageout_pte_range,
+ .pmd_entry = madvise_cold_or_pageout_pte_range,
.mm = vma->vm_mm,
- .private = tlb,
+ .private = &walk_private,
};
tlb_start_vma(tlb, vma);
--
2.22.0.709.g102302147b-goog
^ permalink raw reply related
* [PATCH v7 4/5] mm: introduce MADV_PAGEOUT
From: Minchan Kim @ 2019-07-26 2:34 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
Kirill A . Shutemov, Minchan Kim, linux-arch,
James E . J . Bottomley, Richard Henderson, Ralf Baechle,
Chris Zankel
In-Reply-To: <20190726023435.214162-1-minchan@kernel.org>
When a process expects no accesses to a certain memory range
for a long time, it could hint kernel that the pages can be
reclaimed instantly but data should be preserved for future use.
This could reduce workingset eviction so it ends up increasing
performance.
This patch introduces the new MADV_PAGEOUT hint to madvise(2)
syscall. MADV_PAGEOUT can be used by a process to mark a memory
range as not expected to be used for a long time so that kernel
reclaims *any LRU* pages instantly. The hint can help kernel in
deciding which pages to evict proactively.
A note: It doesn't apply SWAP_CLUSTER_MAX LRU page isolation limit
intentionally because it's automatically bounded by PMD size.
If PMD size(e.g., 256) makes some trouble, we could fix it later
by limit it to SWAP_CLUSTER_MAX[1].
- man-page material
MADV_PAGEOUT (since Linux x.x)
Do not expect access in the near future so pages in the specified
regions could be reclaimed instantly regardless of memory pressure.
Thus, access in the range after successful operation could cause
major page fault but never lose the up-to-date contents unlike
MADV_DONTNEED. Pages belonging to a shared mapping are only processed
if a write access is allowed for the calling process.
MADV_PAGEOUT cannot be applied to locked pages, Huge TLB pages, or
VM_PFNMAP pages.
* v6
* Fix build error kbuildbot reported
* https://lore.kernel.org/linux-mm/201907251759.zSy10dLW%25lkp@intel.com/
* v4
* clear young bit regardless of success of page isolation - hannes
* v3
* man page material modification - mhocko
* remove using SWAP_CLUSTER_MAX - mhocko
* v2
* add comment about SWAP_CLUSTER_MAX - mhocko
* add permission check to prevent sidechannel attack - mhocko
* add man page stuff - dave
* v1
* change pte to old and rely on the other's reference - hannes
* remove page_mapcount to check shared page - mhocko
* RFC v2
* make reclaim_pages simple via factoring out isolate logic - hannes
* RFCv1
* rename from MADV_COLD to MADV_PAGEOUT - hannes
* bail out if process is being killed - Hillf
* fix reclaim_pages bugs - Hillf
[1] https://lore.kernel.org/lkml/20190710194719.GS29695@dhcp22.suse.cz/
Cc: linux-arch@vger.kernel.org
Cc: James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Chris Zankel <chris@zankel.net>
Reported-by: kbuild test robot <lkp@intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
arch/alpha/include/uapi/asm/mman.h | 1 +
arch/mips/include/uapi/asm/mman.h | 1 +
arch/parisc/include/uapi/asm/mman.h | 1 +
arch/xtensa/include/uapi/asm/mman.h | 1 +
include/linux/swap.h | 1 +
include/uapi/asm-generic/mman-common.h | 1 +
mm/madvise.c | 195 +++++++++++++++++++++++++
mm/vmscan.c | 55 +++++++
8 files changed, 256 insertions(+)
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index f3258fbf03d03..a18ec7f638880 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -69,6 +69,7 @@
#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
#define MADV_COLD 20 /* deactivate these pages */
+#define MADV_PAGEOUT 21 /* reclaim these pages */
/* compatibility flags */
#define MAP_FILE 0
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index 00ad09fc5eb16..57dc2ac4f8bda 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -96,6 +96,7 @@
#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
#define MADV_COLD 20 /* deactivate these pages */
+#define MADV_PAGEOUT 21 /* reclaim these pages */
/* compatibility flags */
#define MAP_FILE 0
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index eb14e3a7b8f37..6fd8871e4081e 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -49,6 +49,7 @@
#define MADV_DOFORK 11 /* do inherit across fork */
#define MADV_COLD 20 /* deactivate these pages */
+#define MADV_PAGEOUT 21 /* reclaim these pages */
#define MADV_MERGEABLE 65 /* KSM may merge identical pages */
#define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index f926b00ff11f9..e5e6437529475 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -104,6 +104,7 @@
#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
#define MADV_COLD 20 /* deactivate these pages */
+#define MADV_PAGEOUT 21 /* reclaim these pages */
/* compatibility flags */
#define MAP_FILE 0
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0ce997edb8bbc..063c0c1e112bd 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -365,6 +365,7 @@ extern int vm_swappiness;
extern int remove_mapping(struct address_space *mapping, struct page *page);
extern unsigned long vm_total_pages;
+extern unsigned long reclaim_pages(struct list_head *page_list);
#ifdef CONFIG_NUMA
extern int node_reclaim_mode;
extern int sysctl_min_unmapped_ratio;
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 23431faf0eb6e..c160a5354eb62 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -68,6 +68,7 @@
#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
#define MADV_COLD 20 /* deactivate these pages */
+#define MADV_PAGEOUT 21 /* reclaim these pages */
/* compatibility flags */
#define MAP_FILE 0
diff --git a/mm/madvise.c b/mm/madvise.c
index e724bce09d7ca..78aa6802b95ad 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -42,6 +42,7 @@ static int madvise_need_mmap_write(int behavior)
case MADV_WILLNEED:
case MADV_DONTNEED:
case MADV_COLD:
+ case MADV_PAGEOUT:
case MADV_FREE:
return 0;
default:
@@ -481,6 +482,197 @@ static long madvise_cold(struct vm_area_struct *vma,
return 0;
}
+static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct mmu_gather *tlb = walk->private;
+ struct mm_struct *mm = tlb->mm;
+ struct vm_area_struct *vma = walk->vma;
+ pte_t *orig_pte, *pte, ptent;
+ spinlock_t *ptl;
+ LIST_HEAD(page_list);
+ struct page *page;
+
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge(*pmd)) {
+ pmd_t orig_pmd;
+ unsigned long next = pmd_addr_end(addr, end);
+
+ tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
+ return 0;
+
+ orig_pmd = *pmd;
+ if (is_huge_zero_pmd(orig_pmd))
+ goto huge_unlock;
+
+ if (unlikely(!pmd_present(orig_pmd))) {
+ VM_BUG_ON(thp_migration_supported() &&
+ !is_pmd_migration_entry(orig_pmd));
+ goto huge_unlock;
+ }
+
+ page = pmd_page(orig_pmd);
+ if (next - addr != HPAGE_PMD_SIZE) {
+ int err;
+
+ if (page_mapcount(page) != 1)
+ goto huge_unlock;
+ get_page(page);
+ spin_unlock(ptl);
+ lock_page(page);
+ err = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (!err)
+ goto regular_page;
+ return 0;
+ }
+
+ if (pmd_young(orig_pmd)) {
+ pmdp_invalidate(vma, addr, pmd);
+ orig_pmd = pmd_mkold(orig_pmd);
+
+ set_pmd_at(mm, addr, pmd, orig_pmd);
+ tlb_remove_tlb_entry(tlb, pmd, addr);
+ }
+
+ ClearPageReferenced(page);
+ test_and_clear_page_young(page);
+
+ if (!isolate_lru_page(page))
+ list_add(&page->lru, &page_list);
+huge_unlock:
+ spin_unlock(ptl);
+ reclaim_pages(&page_list);
+ return 0;
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+regular_page:
+#endif
+ tlb_change_page_size(tlb, PAGE_SIZE);
+ orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ flush_tlb_batched_pending(mm);
+ arch_enter_lazy_mmu_mode();
+ for (; addr < end; pte++, addr += PAGE_SIZE) {
+ ptent = *pte;
+ if (!pte_present(ptent))
+ continue;
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page)
+ continue;
+
+ /*
+ * creating a THP page is expensive so split it only if we
+ * are sure it's worth. Split it if we are only owner.
+ */
+ if (PageTransCompound(page)) {
+ if (page_mapcount(page) != 1)
+ break;
+ get_page(page);
+ if (!trylock_page(page)) {
+ put_page(page);
+ break;
+ }
+ pte_unmap_unlock(orig_pte, ptl);
+ if (split_huge_page(page)) {
+ unlock_page(page);
+ put_page(page);
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ break;
+ }
+ unlock_page(page);
+ put_page(page);
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ pte--;
+ addr -= PAGE_SIZE;
+ continue;
+ }
+
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+ if (pte_young(ptent)) {
+ ptent = ptep_get_and_clear_full(mm, addr, pte,
+ tlb->fullmm);
+ ptent = pte_mkold(ptent);
+ set_pte_at(mm, addr, pte, ptent);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ }
+ ClearPageReferenced(page);
+ test_and_clear_page_young(page);
+
+ if (!isolate_lru_page(page))
+ list_add(&page->lru, &page_list);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(orig_pte, ptl);
+ reclaim_pages(&page_list);
+ cond_resched();
+
+ return 0;
+}
+
+static void madvise_pageout_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ struct mm_walk pageout_walk = {
+ .pmd_entry = madvise_pageout_pte_range,
+ .mm = vma->vm_mm,
+ .private = tlb,
+ };
+
+ tlb_start_vma(tlb, vma);
+ walk_page_range(addr, end, &pageout_walk);
+ tlb_end_vma(tlb, vma);
+}
+
+static inline bool can_do_pageout(struct vm_area_struct *vma)
+{
+ if (vma_is_anonymous(vma))
+ return true;
+ if (!vma->vm_file)
+ return false;
+ /*
+ * paging out pagecache only for non-anonymous mappings that correspond
+ * to the files the calling process could (if tried) open for writing;
+ * otherwise we'd be including shared non-exclusive mappings, which
+ * opens a side channel.
+ */
+ return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+ inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+}
+
+static long madvise_pageout(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
+
+ *prev = vma;
+ if (!can_madv_lru_vma(vma))
+ return -EINVAL;
+
+ if (!can_do_pageout(vma))
+ return 0;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+ madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
+ tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+ return 0;
+}
+
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
@@ -871,6 +1063,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
return madvise_willneed(vma, prev, start, end);
case MADV_COLD:
return madvise_cold(vma, prev, start, end);
+ case MADV_PAGEOUT:
+ return madvise_pageout(vma, prev, start, end);
case MADV_FREE:
case MADV_DONTNEED:
return madvise_dontneed_free(vma, prev, start, end, behavior);
@@ -893,6 +1087,7 @@ madvise_behavior_valid(int behavior)
case MADV_DONTNEED:
case MADV_FREE:
case MADV_COLD:
+ case MADV_PAGEOUT:
#ifdef CONFIG_KSM
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d1d7163c281de..47aa2158cfac2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2158,6 +2158,61 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_deactivate, nr_rotated, sc->priority, file);
}
+unsigned long reclaim_pages(struct list_head *page_list)
+{
+ int nid = -1;
+ unsigned long nr_reclaimed = 0;
+ LIST_HEAD(node_page_list);
+ struct reclaim_stat dummy_stat;
+ struct page *page;
+ struct scan_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .priority = DEF_PRIORITY,
+ .may_writepage = 1,
+ .may_unmap = 1,
+ .may_swap = 1,
+ };
+
+ while (!list_empty(page_list)) {
+ page = lru_to_page(page_list);
+ if (nid == -1) {
+ nid = page_to_nid(page);
+ INIT_LIST_HEAD(&node_page_list);
+ }
+
+ if (nid == page_to_nid(page)) {
+ list_move(&page->lru, &node_page_list);
+ continue;
+ }
+
+ nr_reclaimed += shrink_page_list(&node_page_list,
+ NODE_DATA(nid),
+ &sc, 0,
+ &dummy_stat, false);
+ while (!list_empty(&node_page_list)) {
+ page = lru_to_page(&node_page_list);
+ list_del(&page->lru);
+ putback_lru_page(page);
+ }
+
+ nid = -1;
+ }
+
+ if (!list_empty(&node_page_list)) {
+ nr_reclaimed += shrink_page_list(&node_page_list,
+ NODE_DATA(nid),
+ &sc, 0,
+ &dummy_stat, false);
+ while (!list_empty(&node_page_list)) {
+ page = lru_to_page(&node_page_list);
+ list_del(&page->lru);
+ putback_lru_page(page);
+ }
+ }
+
+ return nr_reclaimed;
+}
+
/*
* The inactive anon list should be small enough that the VM never has
* to do too much work.
--
2.22.0.709.g102302147b-goog
^ permalink raw reply related
* [PATCH v7 3/5] mm: account nr_isolated_xxx in [isolate|putback]_lru_page
From: Minchan Kim @ 2019-07-26 2:34 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
Kirill A . Shutemov, Minchan Kim
In-Reply-To: <20190726023435.214162-1-minchan@kernel.org>
The isolate counting is pecpu counter so it would be not huge gain
to work them by batch. Rather than complicating to make them batch,
let's make it more stright-foward via adding the counting logic
into [isolate|putback]_lru_page API.
* v1
* fix accounting bug - Hillf
Link: http://lkml.kernel.org/r/20190531165927.GA20067@cmpxchg.org
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
mm/compaction.c | 2 --
mm/gup.c | 7 +------
mm/khugepaged.c | 3 ---
mm/memory-failure.c | 3 ---
mm/memory_hotplug.c | 4 ----
mm/mempolicy.c | 3 ---
mm/migrate.c | 37 ++++++++-----------------------------
mm/vmscan.c | 22 ++++++++++++++++------
8 files changed, 25 insertions(+), 56 deletions(-)
diff --git a/mm/compaction.c b/mm/compaction.c
index d99d59412c755..ac4ead029b4a1 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -984,8 +984,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Successfully isolated */
del_page_from_lru_list(page, lruvec, page_lru(page));
- inc_node_page_state(page,
- NR_ISOLATED_ANON + page_is_file_cache(page));
isolate_success:
list_add(&page->lru, &cc->migratepages);
diff --git a/mm/gup.c b/mm/gup.c
index 012060efddf18..357cfc1ca37d1 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1460,13 +1460,8 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk,
drain_allow = false;
}
- if (!isolate_lru_page(head)) {
+ if (!isolate_lru_page(head))
list_add_tail(&head->lru, &cma_page_list);
- mod_node_page_state(page_pgdat(head),
- NR_ISOLATED_ANON +
- page_is_file_cache(head),
- hpage_nr_pages(head));
- }
}
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index eaaa21b232156..a8b517d6df4ab 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -503,7 +503,6 @@ void __khugepaged_exit(struct mm_struct *mm)
static void release_pte_page(struct page *page)
{
- dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
unlock_page(page);
putback_lru_page(page);
}
@@ -602,8 +601,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
result = SCAN_DEL_PAGE_LRU;
goto out;
}
- inc_node_page_state(page,
- NR_ISOLATED_ANON + page_is_file_cache(page));
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageLRU(page), page);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 7ef849da8278c..9900bb95d7740 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1791,9 +1791,6 @@ static int __soft_offline_page(struct page *page, int flags)
* so use !__PageMovable instead for LRU page's mapping
* cannot have PAGE_MAPPING_MOVABLE.
*/
- if (!__PageMovable(page))
- inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 5b8811945bbba..9a82e12bd0e73 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1373,10 +1373,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
if (!ret) { /* Success */
list_add_tail(&page->lru, &source);
- if (!__PageMovable(page))
- inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
-
} else {
pr_warn("failed to isolate pfn %lx\n", pfn);
dump_page(page, "isolation failed");
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 547cd403ed020..e8bbec6148dfe 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -977,9 +977,6 @@ static int migrate_page_add(struct page *page, struct list_head *pagelist,
if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
if (!isolate_lru_page(head)) {
list_add_tail(&head->lru, pagelist);
- mod_node_page_state(page_pgdat(head),
- NR_ISOLATED_ANON + page_is_file_cache(head),
- hpage_nr_pages(head));
} else if (flags & MPOL_MF_STRICT) {
/*
* Non-movable page may reach here. And, there may be
diff --git a/mm/migrate.c b/mm/migrate.c
index 92d346646ed55..84b89d2d69065 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -190,8 +190,6 @@ void putback_movable_pages(struct list_head *l)
unlock_page(page);
put_page(page);
} else {
- mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
- page_is_file_cache(page), -hpage_nr_pages(page));
putback_lru_page(page);
}
}
@@ -1177,10 +1175,17 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
return -ENOMEM;
if (page_count(page) == 1) {
+ bool is_lru = !__PageMovable(page);
+
/* page was freed from under us. So we are done. */
ClearPageActive(page);
ClearPageUnevictable(page);
- if (unlikely(__PageMovable(page))) {
+ if (likely(is_lru))
+ mod_node_page_state(page_pgdat(page),
+ NR_ISOLATED_ANON +
+ page_is_file_cache(page),
+ -hpage_nr_pages(page));
+ else {
lock_page(page);
if (!PageMovable(page))
__ClearPageIsolated(page);
@@ -1206,15 +1211,6 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
* restored.
*/
list_del(&page->lru);
-
- /*
- * Compaction can migrate also non-LRU pages which are
- * not accounted to NR_ISOLATED_*. They can be recognized
- * as __PageMovable
- */
- if (likely(!__PageMovable(page)))
- mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
- page_is_file_cache(page), -hpage_nr_pages(page));
}
/*
@@ -1568,9 +1564,6 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
err = 0;
list_add_tail(&head->lru, pagelist);
- mod_node_page_state(page_pgdat(head),
- NR_ISOLATED_ANON + page_is_file_cache(head),
- hpage_nr_pages(head));
}
out_putpage:
/*
@@ -1886,8 +1879,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
{
- int page_lru;
-
VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
/* Avoid migrating to a node that is nearly full */
@@ -1909,10 +1900,6 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
return 0;
}
- page_lru = page_is_file_cache(page);
- mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
- hpage_nr_pages(page));
-
/*
* Isolating the page has taken another reference, so the
* caller's reference can be safely dropped without the page
@@ -1967,8 +1954,6 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
if (nr_remaining) {
if (!list_empty(&migratepages)) {
list_del(&page->lru);
- dec_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
putback_lru_page(page);
}
isolated = 0;
@@ -1998,7 +1983,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
pg_data_t *pgdat = NODE_DATA(node);
int isolated = 0;
struct page *new_page = NULL;
- int page_lru = page_is_file_cache(page);
unsigned long start = address & HPAGE_PMD_MASK;
new_page = alloc_pages_node(node,
@@ -2044,8 +2028,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
/* Retake the callers reference and putback on LRU */
get_page(page);
putback_lru_page(page);
- mod_node_page_state(page_pgdat(page),
- NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
goto out_unlock;
}
@@ -2095,9 +2077,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
- mod_node_page_state(page_pgdat(page),
- NR_ISOLATED_ANON + page_lru,
- -HPAGE_PMD_NR);
return isolated;
out_fail:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 436577236dd3e..d1d7163c281de 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1021,6 +1021,9 @@ int remove_mapping(struct address_space *mapping, struct page *page)
void putback_lru_page(struct page *page)
{
lru_cache_add(page);
+ mod_node_page_state(page_pgdat(page),
+ NR_ISOLATED_ANON + page_is_file_cache(page),
+ -hpage_nr_pages(page));
put_page(page); /* drop ref from isolate */
}
@@ -1486,6 +1489,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
nr_reclaimed += nr_pages;
+ mod_node_page_state(pgdat, NR_ISOLATED_ANON +
+ page_is_file_cache(page),
+ -nr_pages);
/*
* Is there need to periodically free_page_list? It would
* appear not as the counts should be low
@@ -1561,7 +1567,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
TTU_IGNORE_ACCESS, &dummy_stat, true);
list_splice(&clean_pages, page_list);
- mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
return ret;
}
@@ -1637,6 +1642,9 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
*/
ClearPageLRU(page);
ret = 0;
+ __mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
+ page_is_file_cache(page),
+ hpage_nr_pages(page));
}
return ret;
@@ -1768,6 +1776,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
total_scan, skipped, nr_taken, mode, lru);
update_lru_sizes(lruvec, lru, nr_zone_taken);
+
return nr_taken;
}
@@ -1816,6 +1825,9 @@ int isolate_lru_page(struct page *page)
ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, lru);
ret = 0;
+ mod_node_page_state(pgdat, NR_ISOLATED_ANON +
+ page_is_file_cache(page),
+ hpage_nr_pages(page));
}
spin_unlock_irq(&pgdat->lru_lock);
}
@@ -1907,6 +1919,9 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
list_move(&page->lru, &lruvec->lists[lru]);
+ __mod_node_page_state(pgdat, NR_ISOLATED_ANON +
+ page_is_file_cache(page),
+ -hpage_nr_pages(page));
if (put_page_testzero(page)) {
__ClearPageLRU(page);
__ClearPageActive(page);
@@ -1984,7 +1999,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
&nr_scanned, sc, lru);
- __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
@@ -2010,8 +2024,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
move_pages_to_lru(lruvec, &page_list);
- __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
-
spin_unlock_irq(&pgdat->lru_lock);
mem_cgroup_uncharge_list(&page_list);
@@ -2070,7 +2082,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, lru);
- __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
__count_vm_events(PGREFILL, nr_scanned);
@@ -2139,7 +2150,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
__count_vm_events(PGDEACTIVATE, nr_deactivate);
__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
- __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&pgdat->lru_lock);
mem_cgroup_uncharge_list(&l_active);
--
2.22.0.709.g102302147b-goog
^ permalink raw reply related
* [PATCH v7 2/5] mm: change PAGEREF_RECLAIM_CLEAN with PAGE_REFRECLAIM
From: Minchan Kim @ 2019-07-26 2:34 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
Kirill A . Shutemov, Minchan Kim
In-Reply-To: <20190726023435.214162-1-minchan@kernel.org>
The local variable references in shrink_page_list is PAGEREF_RECLAIM_CLEAN
as default. It is for preventing to reclaim dirty pages when CMA try to
migrate pages. Strictly speaking, we don't need it because CMA didn't allow
to write out by .may_writepage = 0 in reclaim_clean_pages_from_list.
Moreover, it has a problem to prevent anonymous pages's swap out even
though force_reclaim = true in shrink_page_list on upcoming patch.
So this patch makes references's default value to PAGEREF_RECLAIM and
rename force_reclaim with ignore_references to make it more clear.
This is a preparatory work for next patch.
* RFCv1
* use ignore_referecnes as parameter name - hannes
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
mm/vmscan.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 82e1e229eef21..436577236dd3e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1124,7 +1124,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
struct scan_control *sc,
enum ttu_flags ttu_flags,
struct reclaim_stat *stat,
- bool force_reclaim)
+ bool ignore_references)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
@@ -1138,7 +1138,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
struct address_space *mapping;
struct page *page;
int may_enter_fs;
- enum page_references references = PAGEREF_RECLAIM_CLEAN;
+ enum page_references references = PAGEREF_RECLAIM;
bool dirty, writeback;
unsigned int nr_pages;
@@ -1269,7 +1269,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
}
- if (!force_reclaim)
+ if (!ignore_references)
references = page_check_references(page, sc);
switch (references) {
--
2.22.0.709.g102302147b-goog
^ permalink raw reply related
* [PATCH v7 1/5] mm: introduce MADV_COLD
From: Minchan Kim @ 2019-07-26 2:34 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
Kirill A . Shutemov, Minchan Kim, linux-arch,
James E . J . Bottomley, Richard Henderson, Ralf Baechle,
Chris Zankel
In-Reply-To: <20190726023435.214162-1-minchan@kernel.org>
When a process expects no accesses to a certain memory range, it could
give a hint to kernel that the pages can be reclaimed when memory pressure
happens but data should be preserved for future use. This could reduce
workingset eviction so it ends up increasing performance.
This patch introduces the new MADV_COLD hint to madvise(2) syscall.
MADV_COLD can be used by a process to mark a memory range as not expected
to be used in the near future. The hint can help kernel in deciding which
pages to evict early during memory pressure.
It works for every LRU pages like MADV_[DONTNEED|FREE]. IOW, It moves
active file page -> inactive file LRU
active anon page -> inacdtive anon LRU
Unlike MADV_FREE, it doesn't move active anonymous pages to inactive
file LRU's head because MADV_COLD is a little bit different symantic.
MADV_FREE means it's okay to discard when the memory pressure because
the content of the page is *garbage* so freeing such pages is almost zero
overhead since we don't need to swap out and access afterward causes just
minor fault. Thus, it would make sense to put those freeable pages in
inactive file LRU to compete other used-once pages. It makes sense for
implmentaion point of view, too because it's not swapbacked memory any
longer until it would be re-dirtied. Even, it could give a bonus to make
them be reclaimed on swapless system. However, MADV_COLD doesn't mean
garbage so reclaiming them requires swap-out/in in the end so it's bigger
cost. Since we have designed VM LRU aging based on cost-model, anonymous
cold pages would be better to position inactive anon's LRU list, not file
LRU. Furthermore, it would help to avoid unnecessary scanning if system
doesn't have a swap device. Let's start simpler way without adding
complexity at this moment. However, keep in mind, too that it's a caveat
that workloads with a lot of pages cache are likely to ignore MADV_COLD
on anonymous memory because we rarely age anonymous LRU lists.
* man-page material
MADV_COLD (since Linux x.x)
Pages in the specified regions will be treated as less-recently-accessed
compared to pages in the system with similar access frequencies.
In contrast to MADV_FREE, the contents of the region are preserved
regardless of subsequent writes to pages.
MADV_COLD cannot be applied to locked pages, Huge TLB pages, or VM_PFNMAP
pages.
* v6
* Fix build error kbuildbot reported
* https://lore.kernel.org/linux-mm/201907251647.fhJ6XzdA%25lkp@intel.com/
* https://lore.kernel.org/linux-mm/201907251529.kTj2FpcL%25lkp@intel.com/
* v5
* Fix typo and correct wrong lazy_mmu_mode pair use - surenb
* v2
* add up the warn with lots of page cache workload - mhocko
* add man page stuff - dave
* v1
* remove page_mapcount filter - hannes, mhocko
* remove idle page handling - joelaf
* RFCv2
* add more description - mhocko
* RFCv1
* renaming from MADV_COOL to MADV_COLD - hannes
* internal review
* use clear_page_youn in deactivate_page - joelaf
* Revise the description - surenb
* Renaming from MADV_WARM to MADV_COOL - surenb
Cc: linux-arch@vger.kernel.org
Cc: James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Chris Zankel <chris@zankel.net>
Reported-by: kbuild test robot <lkp@intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
arch/alpha/include/uapi/asm/mman.h | 2 +
arch/mips/include/uapi/asm/mman.h | 2 +
arch/parisc/include/uapi/asm/mman.h | 2 +
arch/xtensa/include/uapi/asm/mman.h | 2 +
include/linux/swap.h | 1 +
include/uapi/asm-generic/mman-common.h | 2 +
mm/internal.h | 2 +-
mm/madvise.c | 181 ++++++++++++++++++++++++-
mm/oom_kill.c | 2 +-
mm/swap.c | 42 ++++++
10 files changed, 234 insertions(+), 4 deletions(-)
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index ac23379b7a876..f3258fbf03d03 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -68,6 +68,8 @@
#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */
#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
+#define MADV_COLD 20 /* deactivate these pages */
+
/* compatibility flags */
#define MAP_FILE 0
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index c2b40969eb1fa..00ad09fc5eb16 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -95,6 +95,8 @@
#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */
#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
+#define MADV_COLD 20 /* deactivate these pages */
+
/* compatibility flags */
#define MAP_FILE 0
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index c98162f494dbb..eb14e3a7b8f37 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -48,6 +48,8 @@
#define MADV_DONTFORK 10 /* don't inherit across fork */
#define MADV_DOFORK 11 /* do inherit across fork */
+#define MADV_COLD 20 /* deactivate these pages */
+
#define MADV_MERGEABLE 65 /* KSM may merge identical pages */
#define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index ebbb48842190d..f926b00ff11f9 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -103,6 +103,8 @@
#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */
#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
+#define MADV_COLD 20 /* deactivate these pages */
+
/* compatibility flags */
#define MAP_FILE 0
diff --git a/include/linux/swap.h b/include/linux/swap.h
index de2c67a33b7e7..0ce997edb8bbc 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -340,6 +340,7 @@ extern void lru_add_drain_cpu(int cpu);
extern void lru_add_drain_all(void);
extern void rotate_reclaimable_page(struct page *page);
extern void deactivate_file_page(struct page *page);
+extern void deactivate_page(struct page *page);
extern void mark_page_lazyfree(struct page *page);
extern void swap_setup(void);
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 63b1f506ea678..23431faf0eb6e 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -67,6 +67,8 @@
#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */
#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
+#define MADV_COLD 20 /* deactivate these pages */
+
/* compatibility flags */
#define MAP_FILE 0
diff --git a/mm/internal.h b/mm/internal.h
index e32390802fd3f..0d5f720c75abf 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -39,7 +39,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
-static inline bool can_madv_dontneed_vma(struct vm_area_struct *vma)
+static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 968df3aa069fd..e724bce09d7ca 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,6 +11,7 @@
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
+#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
@@ -40,6 +41,7 @@ static int madvise_need_mmap_write(int behavior)
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_COLD:
case MADV_FREE:
return 0;
default:
@@ -307,6 +309,178 @@ static long madvise_willneed(struct vm_area_struct *vma,
return 0;
}
+static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct mmu_gather *tlb = walk->private;
+ struct mm_struct *mm = tlb->mm;
+ struct vm_area_struct *vma = walk->vma;
+ pte_t *orig_pte, *pte, ptent;
+ spinlock_t *ptl;
+ struct page *page;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge(*pmd)) {
+ pmd_t orig_pmd;
+ unsigned long next = pmd_addr_end(addr, end);
+
+ tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
+ return 0;
+
+ orig_pmd = *pmd;
+ if (is_huge_zero_pmd(orig_pmd))
+ goto huge_unlock;
+
+ if (unlikely(!pmd_present(orig_pmd))) {
+ VM_BUG_ON(thp_migration_supported() &&
+ !is_pmd_migration_entry(orig_pmd));
+ goto huge_unlock;
+ }
+
+ page = pmd_page(orig_pmd);
+ if (next - addr != HPAGE_PMD_SIZE) {
+ int err;
+
+ if (page_mapcount(page) != 1)
+ goto huge_unlock;
+
+ get_page(page);
+ spin_unlock(ptl);
+ lock_page(page);
+ err = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (!err)
+ goto regular_page;
+ return 0;
+ }
+
+ if (pmd_young(orig_pmd)) {
+ pmdp_invalidate(vma, addr, pmd);
+ orig_pmd = pmd_mkold(orig_pmd);
+
+ set_pmd_at(mm, addr, pmd, orig_pmd);
+ tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ }
+
+ test_and_clear_page_young(page);
+ deactivate_page(page);
+huge_unlock:
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+regular_page:
+#endif
+ tlb_change_page_size(tlb, PAGE_SIZE);
+ orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ flush_tlb_batched_pending(mm);
+ arch_enter_lazy_mmu_mode();
+ for (; addr < end; pte++, addr += PAGE_SIZE) {
+ ptent = *pte;
+
+ if (pte_none(ptent))
+ continue;
+
+ if (!pte_present(ptent))
+ continue;
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page)
+ continue;
+
+ /*
+ * Creating a THP page is expensive so split it only if we
+ * are sure it's worth. Split it if we are only owner.
+ */
+ if (PageTransCompound(page)) {
+ if (page_mapcount(page) != 1)
+ break;
+ get_page(page);
+ if (!trylock_page(page)) {
+ put_page(page);
+ break;
+ }
+ pte_unmap_unlock(orig_pte, ptl);
+ if (split_huge_page(page)) {
+ unlock_page(page);
+ put_page(page);
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ break;
+ }
+ unlock_page(page);
+ put_page(page);
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ pte--;
+ addr -= PAGE_SIZE;
+ continue;
+ }
+
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+ if (pte_young(ptent)) {
+ ptent = ptep_get_and_clear_full(mm, addr, pte,
+ tlb->fullmm);
+ ptent = pte_mkold(ptent);
+ set_pte_at(mm, addr, pte, ptent);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ }
+
+ /*
+ * We are deactivating a page for accelerating reclaiming.
+ * VM couldn't reclaim the page unless we clear PG_young.
+ * As a side effect, it makes confuse idle-page tracking
+ * because they will miss recent referenced history.
+ */
+ test_and_clear_page_young(page);
+ deactivate_page(page);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(orig_pte, ptl);
+ cond_resched();
+
+ return 0;
+}
+
+static void madvise_cold_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ struct mm_walk cold_walk = {
+ .pmd_entry = madvise_cold_pte_range,
+ .mm = vma->vm_mm,
+ .private = tlb,
+ };
+
+ tlb_start_vma(tlb, vma);
+ walk_page_range(addr, end, &cold_walk);
+ tlb_end_vma(tlb, vma);
+}
+
+static long madvise_cold(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
+
+ *prev = vma;
+ if (!can_madv_lru_vma(vma))
+ return -EINVAL;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+ madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
+ tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+ return 0;
+}
+
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
@@ -519,7 +693,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
int behavior)
{
*prev = vma;
- if (!can_madv_dontneed_vma(vma))
+ if (!can_madv_lru_vma(vma))
return -EINVAL;
if (!userfaultfd_remove(vma, start, end)) {
@@ -541,7 +715,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
*/
return -ENOMEM;
}
- if (!can_madv_dontneed_vma(vma))
+ if (!can_madv_lru_vma(vma))
return -EINVAL;
if (end > vma->vm_end) {
/*
@@ -695,6 +869,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
return madvise_remove(vma, prev, start, end);
case MADV_WILLNEED:
return madvise_willneed(vma, prev, start, end);
+ case MADV_COLD:
+ return madvise_cold(vma, prev, start, end);
case MADV_FREE:
case MADV_DONTNEED:
return madvise_dontneed_free(vma, prev, start, end, behavior);
@@ -716,6 +892,7 @@ madvise_behavior_valid(int behavior)
case MADV_WILLNEED:
case MADV_DONTNEED:
case MADV_FREE:
+ case MADV_COLD:
#ifdef CONFIG_KSM
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index a2a5edbf61789..493028ad865f1 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -522,7 +522,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
set_bit(MMF_UNSTABLE, &mm->flags);
for (vma = mm->mmap ; vma; vma = vma->vm_next) {
- if (!can_madv_dontneed_vma(vma))
+ if (!can_madv_lru_vma(vma))
continue;
/*
diff --git a/mm/swap.c b/mm/swap.c
index 0226c53465604..9c0c5d6286faa 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -47,6 +47,7 @@ int page_cluster;
static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
@@ -538,6 +539,22 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
update_page_reclaim_stat(lruvec, file, 0);
}
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ int file = page_is_file_cache(page);
+ int lru = page_lru_base_type(page);
+
+ del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+ ClearPageActive(page);
+ ClearPageReferenced(page);
+ add_page_to_lru_list(page, lruvec, lru);
+
+ __count_vm_events(PGDEACTIVATE, hpage_nr_pages(page));
+ update_page_reclaim_stat(lruvec, file, 0);
+ }
+}
static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
void *arg)
@@ -590,6 +607,10 @@ void lru_add_drain_cpu(int cpu)
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
+ pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+ if (pagevec_count(pvec))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+
pvec = &per_cpu(lru_lazyfree_pvecs, cpu);
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
@@ -623,6 +644,26 @@ void deactivate_file_page(struct page *page)
}
}
+/*
+ * deactivate_page - deactivate a page
+ * @page: page to deactivate
+ *
+ * deactivate_page() moves @page to the inactive list if @page was on the active
+ * list and was not an unevictable page. This is done to accelerate the reclaim
+ * of @page.
+ */
+void deactivate_page(struct page *page)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+
+ get_page(page);
+ if (!pagevec_add(pvec, page) || PageCompound(page))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+ put_cpu_var(lru_deactivate_pvecs);
+ }
+}
+
/**
* mark_page_lazyfree - make an anon page lazyfree
* @page: page to deactivate
@@ -687,6 +728,7 @@ void lru_add_drain_all(void)
if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
+ pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
need_activate_page_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
--
2.22.0.709.g102302147b-goog
^ permalink raw reply related
* [PATCH v7 0/5] Introduce MADV_COLD and MADV_PAGEOUT
From: Minchan Kim @ 2019-07-26 2:34 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
Kirill A . Shutemov, Minchan Kim
This patch is part of previous series:
https://lore.kernel.org/lkml/20190531064313.193437-1-minchan@kernel.org/
Originally, it was created for external madvise hinting feature.
https://lkml.org/lkml/2019/5/31/463
Michal wanted to separte the discussion from external hinting interface
so this patchset includes only first part of my entire patchset
- introduce MADV_COLD and MADV_PAGEOUT hint to madvise.
However, I keep entire description for others for easier understanding
why this kinds of hint was born.
Thanks.
This patchset is against on mmotm-mmotm-2019-07-24-21-39.
Below is description of previous entire patchset.
================= &< =====================
- Background
The Android terminology used for forking a new process and starting an app
from scratch is a cold start, while resuming an existing app is a hot start.
While we continually try to improve the performance of cold starts, hot
starts will always be significantly less power hungry as well as faster so
we are trying to make hot start more likely than cold start.
To increase hot start, Android userspace manages the order that apps should
be killed in a process called ActivityManagerService. ActivityManagerService
tracks every Android app or service that the user could be interacting with
at any time and translates that into a ranked list for lmkd(low memory
killer daemon). They are likely to be killed by lmkd if the system has to
reclaim memory. In that sense they are similar to entries in any other cache.
Those apps are kept alive for opportunistic performance improvements but
those performance improvements will vary based on the memory requirements of
individual workloads.
- Problem
Naturally, cached apps were dominant consumers of memory on the system.
However, they were not significant consumers of swap even though they are
good candidate for swap. Under investigation, swapping out only begins
once the low zone watermark is hit and kswapd wakes up, but the overall
allocation rate in the system might trip lmkd thresholds and cause a cached
process to be killed(we measured performance swapping out vs. zapping the
memory by killing a process. Unsurprisingly, zapping is 10x times faster
even though we use zram which is much faster than real storage) so kill
from lmkd will often satisfy the high zone watermark, resulting in very
few pages actually being moved to swap.
- Approach
The approach we chose was to use a new interface to allow userspace to
proactively reclaim entire processes by leveraging platform information.
This allowed us to bypass the inaccuracy of the kernel’s LRUs for pages
that are known to be cold from userspace and to avoid races with lmkd
by reclaiming apps as soon as they entered the cached state. Additionally,
it could provide many chances for platform to use much information to
optimize memory efficiency.
To achieve the goal, the patchset introduce two new options for madvise.
One is MADV_COLD which will deactivate activated pages and the other is
MADV_PAGEOUT which will reclaim private pages instantly. These new options
complement MADV_DONTNEED and MADV_FREE by adding non-destructive ways to
gain some free memory space. MADV_PAGEOUT is similar to MADV_DONTNEED in a way
that it hints the kernel that memory region is not currently needed and
should be reclaimed immediately; MADV_COLD is similar to MADV_FREE in a way
that it hints the kernel that memory region is not currently needed and
should be reclaimed when memory pressure rises.
* v6 - http://lore.kernel.org/lkml/20190723062539.198697-1-minchan@kernel.org
* v5 - http://lore.kernel.org/lkml/20190714233401.36909-1-minchan@kernel.org
* v4 - http://lore.kernel.org/lkml/20190711012528.176050-1-minchan@kernel.org
* v3 - http://lore.kernel.org/lkml/20190627115405.255259-1-minchan@kernel.org
* v2 - http://lore.kernel.org/lkml/20190610111252.239156-1-minchan@kernel.org
* v1 - http://lore.kernel.org/lkml/20190603053655.127730-1-minchan@kernel.org
Minchan Kim (5):
mm: introduce MADV_COLD
mm: change PAGEREF_RECLAIM_CLEAN with PAGE_REFRECLAIM
mm: account nr_isolated_xxx in [isolate|putback]_lru_page
mm: introduce MADV_PAGEOUT
mm: factor out common parts between MADV_COLD and MADV_PAGEOUT
arch/alpha/include/uapi/asm/mman.h | 3 +
arch/mips/include/uapi/asm/mman.h | 3 +
arch/parisc/include/uapi/asm/mman.h | 3 +
arch/xtensa/include/uapi/asm/mman.h | 3 +
include/linux/swap.h | 2 +
include/uapi/asm-generic/mman-common.h | 3 +
mm/compaction.c | 2 -
mm/gup.c | 7 +-
mm/internal.h | 2 +-
mm/khugepaged.c | 3 -
mm/madvise.c | 274 ++++++++++++++++++++++++-
mm/memory-failure.c | 3 -
mm/memory_hotplug.c | 4 -
mm/mempolicy.c | 3 -
mm/migrate.c | 37 +---
mm/oom_kill.c | 2 +-
mm/swap.c | 42 ++++
mm/vmscan.c | 83 +++++++-
18 files changed, 416 insertions(+), 63 deletions(-)
--
2.22.0.709.g102302147b-goog
^ permalink raw reply
* Re: [v4 PATCH 2/2] mm: mempolicy: handle vma with unmovable pages mapped correctly in mbind
From: Yang Shi @ 2019-07-25 17:38 UTC (permalink / raw)
To: Andrew Morton, Vlastimil Babka
Cc: mhocko, mgorman, linux-mm, linux-kernel, linux-api
In-Reply-To: <20190724174423.1826c92f72ce9c815ebc72d9@linux-foundation.org>
On 7/24/19 5:44 PM, Andrew Morton wrote:
> On Wed, 24 Jul 2019 10:19:34 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:
>
>> On 7/23/19 7:35 AM, Yang Shi wrote:
>>>
>>> On 7/22/19 6:02 PM, Andrew Morton wrote:
>>>> On Mon, 22 Jul 2019 09:25:09 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:
>>>>
>>>>>> since there may be pages off LRU temporarily. We should migrate other
>>>>>> pages if MPOL_MF_MOVE* is specified. Set has_unmovable flag if some
>>>>>> paged could not be not moved, then return -EIO for mbind() eventually.
>>>>>>
>>>>>> With this change the above test would return -EIO as expected.
>>>>>>
>>>>>> Cc: Vlastimil Babka <vbabka@suse.cz>
>>>>>> Cc: Michal Hocko <mhocko@suse.com>
>>>>>> Cc: Mel Gorman <mgorman@techsingularity.net>
>>>>>> Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com>
>>>>> Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
>>>> Thanks.
>>>>
>>>> I'm a bit surprised that this doesn't have a cc:stable. Did we
>>>> consider that?
>>> The VM_BUG just happens on 4.9, and it is enabled only by CONFIG_VM. For
>>> post-4.9 kernel, this fixes the semantics of mbind which should be not a
>>> regression IMHO.
>> 4.9 is a LTS kernel, so perhaps worth trying?
>>
> OK, I'll add cc:stable to
Thanks.
>
> mm-mempolicy-make-the-behavior-consistent-when-mpol_mf_move-and-mpol_mf_strict-were-specified.patch
>
> and
>
> mm-mempolicy-handle-vma-with-unmovable-pages-mapped-correctly-in-mbind.patch
>
> Do we have a Fixes: for these patches?
It looks the problem has existed since very beginning. The oldest commit
which I can find is dc9aa5b9d65fd11b1f5246b46ec610ee8b83c6dd ("[PATCH]
Swap Migration V5: MPOL_MF_MOVE interface"), which is a 2.6.16 commit.
^ permalink raw reply
* Re: [PATCH 4/5] pidfd: add CLONE_WAIT_PID
From: Eric W. Biederman @ 2019-07-25 16:57 UTC (permalink / raw)
To: Christian Brauner
Cc: Oleg Nesterov, linux-kernel, arnd, keescook, joel, tglx, tj,
dhowells, jannh, luto, akpm, cyphar, torvalds, viro, kernel-team,
Ingo Molnar, Peter Zijlstra, linux-api
In-Reply-To: <20190725122650.4i3arct5rpchqmyt@brauner.io>
Christian Brauner <christian@brauner.io> writes:
> On Thu, Jul 25, 2019 at 01:43:59PM +0200, Oleg Nesterov wrote:
>> Or. We can change wait_consider_task() to not clear ->notask_error if
>> WXXX and the child is PF_WAIT_PID.
>>
>> This way you can "safely" use wait() without WNOHANG, it won't block if
>> all the children which can report an even are PF_WAIT_PID.
>>
>> But I do not understand your use-cases, I have no idea if this can help
>
> One usecase (among others listed in the commit message) are shared
> libraries. P_ALL is usually something you can't really use in a shared
> library because you have no idea what else might be fork()ed off. Only
> the main program can use this but none of the auxiliary libraries that
> it uses.
> The other way around you want to be able fork() off something without
> affecting P_ALL in the main program.
> The key is that you want to be able to create child processes in a
> shared library without the main programing having to know about this so
> that it can use P_ALL and never get stuff from the library.
>
> Assume you have a project with a main loop with a million things
> happening in that mainloop like some gui app running an avi video. For
> example, gtk uses gstreamer which forks off all codecs in child
> processes which are sandboxed for security. So gstreamer is using helper
> processes in the background which are my children now. Now I'm creating
> four more additional helper processes as well. Now, in my (glib, qt
> whatever) mainloop on SIGCHLD some part of the app is checking with
> WNHOANG and finds a process has exited. It's cleaning this thing up now
> but it's not a process it wanted to clean up. The other part of the app
> is now doing waitid(P_PID, pid) but will find the process already gone
> it wanted to reap.
>
> I hope I'm expressing this well enough.
I think so.
A) I think Oleg is correct that you should test the flag in
do_wait_thread rather than elsewhere.
B) We have a deficiency in do_wait that should be addressed. The
do_wait function does not have a fast path for waiting on a
particular process. For adding this functionality I such a fast path
goes from a nice to have to a necessity for getting all of the
fiddly details correct.
C) I believe the semantics should be that while such a file descriptor
is open, only that file descriptor can be used to reap the process.
And that it should be allowed to pass the file descriptor between
processes. Which means the parent can die and the process be
reparted to init and we should still be able to reap the process with
the file descriptor.
D) I think it is a toss up how we should deal with such a process when
the file descriptor is closed. Setting the process to autoreap
or reparent to init and let init deal with it. My inclination is
that autoreap is the correct behavior.
Eric
^ permalink raw reply
* Re: [PATCH 4/5] pidfd: add CLONE_WAIT_PID
From: Christian Brauner @ 2019-07-25 16:56 UTC (permalink / raw)
To: Oleg Nesterov
Cc: linux-kernel, arnd, ebiederm, keescook, joel, tglx, tj, dhowells,
jannh, luto, akpm, cyphar, torvalds, viro, kernel-team,
Ingo Molnar, Peter Zijlstra, linux-api
In-Reply-To: <20190725161316.GI4707@redhat.com>
On Thu, Jul 25, 2019 at 06:13:17PM +0200, Oleg Nesterov wrote:
> On 07/25, Christian Brauner wrote:
> >
> > The key is that you want to be able to create child processes in a
> > shared library without the main programing having to know about this so
> > that it can use P_ALL and never get stuff from the library.
>
> OK, thanks...
>
> in this case you should probablu pass 0 in CSIGNAL to ensure that the main
> program won't be notified when that child exits.
Yes, that's the idea. So you'd turn off SIGCHLD and rely on pidfd polling
only. That's similar to how pdfork() on FreeBSD works. It's just that we
need to do:
struct clone_args args = {
.exit_signal = 0,
.flags = CLONE_PIDFD | CLONE_WAIT_PID,
};
Now the shared library can guarantee that noone else in the mainloop
gets woken by a SIGCHLD from the pidfd-based helper process (because
exit_signal is 0) but afaict it also needs CLONE_WAIT_PID. Since the
latter allows it to guarantee that if someone in the mainloop gets
SIGCHLD from another regular process and calls waitid(P_ALL) it won't
accidently reap a pidfd-based process that has exited.
Fyi, I'm splitting CLONE_WAIT_PID out into a separate patchset and only
keep P_PIDFD for now. So we can discuss this independently. You think
that's better Oleg?
Christian
^ permalink raw reply
* Re: [PATCH 4/5] pidfd: add CLONE_WAIT_PID
From: Oleg Nesterov @ 2019-07-25 16:13 UTC (permalink / raw)
To: Christian Brauner
Cc: linux-kernel, arnd, ebiederm, keescook, joel, tglx, tj, dhowells,
jannh, luto, akpm, cyphar, torvalds, viro, kernel-team,
Ingo Molnar, Peter Zijlstra, linux-api
In-Reply-To: <20190725122650.4i3arct5rpchqmyt@brauner.io>
On 07/25, Christian Brauner wrote:
>
> The key is that you want to be able to create child processes in a
> shared library without the main programing having to know about this so
> that it can use P_ALL and never get stuff from the library.
OK, thanks...
in this case you should probablu pass 0 in CSIGNAL to ensure that the main
program won't be notified when that child exits.
Oleg.
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox