From: Nikolay Borisov <kernel-6AxghH7DbtA@public.gmane.org>
To: ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org
Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org,
Nikolay Borisov <kernel-6AxghH7DbtA@public.gmane.org>,
operations-/eCPMmvKun9pLGFMi4vTTA@public.gmane.org
Subject: [PATCH 3/4] userns/inotify: Initial implementation of inotify per-userns
Date: Wed, 13 Jul 2016 15:14:12 +0300 [thread overview]
Message-ID: <1468412053-30130-4-git-send-email-kernel@kyup.com> (raw)
In-Reply-To: <1468412053-30130-1-git-send-email-kernel-6AxghH7DbtA@public.gmane.org>
So here is the first version of the hierarchical inotify limits. Changes
include:
* Added 2 new sysctls:
- inotify_reserved_user_instances and inotify_reserved_user_watches. These control how
instances/watches are distributed down the userns hierarchy. For example, if the
instances/watches limits are 1024/256 and the reserved instances/watches are set to
128/32, then at every level of the hierarchy the limits are reduced by 128/32, so at
userns level 1 (e.g. init_user_ns->level_1_user_ns) each user would have 896/224
respectively. Currently the defaults are calculated so that at least 8 levels of
nesting are allowed. These sysctls can be set only by the global root user.
* Changed the core userns code to support adding per-userns/per-user counters; this
is implemented in the nsuser_state structure.
* Add necessary functionality to inotify to make use of the newly added
userns infrastructure.
* Moved the initialization of the inotify_max_user_instances/watches to
user_namespaces_init so that it's initialised by the time inotify is
bootstrapped.
Signed-off-by: Nikolay Borisov <kernel-6AxghH7DbtA@public.gmane.org>
---
fs/notify/inotify/inotify.h | 2 +
fs/notify/inotify/inotify_user.c | 93 +++++++++++++++++++++++++++++++++-
include/linux/fsnotify_backend.h | 3 ++
include/linux/user_namespace.h | 45 +++++++++++++++++
kernel/user_namespace.c | 106 ++++++++++++++++++++++++++++++++++++++-
5 files changed, 246 insertions(+), 3 deletions(-)
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index ed855ef6f077..8ead0a1a3cdb 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -1,6 +1,8 @@
#include <linux/fsnotify_backend.h>
#include <linux/inotify.h>
#include <linux/slab.h> /* struct kmem_cache */
+#include <linux/page_counter.h>
+#include <linux/user_namespace.h>
struct inotify_event_info {
struct fsnotify_event fse;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index b8d08d0d0a4d..076a9990eff4 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -48,6 +48,8 @@
static int inotify_max_user_instances __read_mostly;
static int inotify_max_queued_events __read_mostly;
static int inotify_max_user_watches __read_mostly;
+int inotify_reserved_user_instances __read_mostly;
+int inotify_reserved_user_watches __read_mostly;
static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
@@ -82,10 +84,96 @@ struct ctl_table inotify_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero
},
+ {
+ .procname = "reserved_user_instances",
+ .data = &inotify_reserved_user_instances,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "reserved_user_watches",
+ .data = &inotify_reserved_user_watches,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
{ }
};
#endif /* CONFIG_SYSCTL */
+/*
+ * Initialise the inotify watch and instance counters of @state.
+ *
+ * For init_user_ns the counters are hierarchy roots (no parent counter);
+ * for any other namespace they are chained to the matching counters of
+ * @parent. In both cases the limits are taken from @ns.
+ */
+static inline void __init_counters(struct nsuser_state *state,
+				   struct nsuser_state *parent,
+				   struct user_namespace *ns)
+{
+	struct page_counter *watches_parent = NULL;
+	struct page_counter *instances_parent = NULL;
+
+	/* @parent is only looked at for non-initial namespaces. */
+	if (ns != &init_user_ns) {
+		watches_parent = &parent->inotify_watches;
+		instances_parent = &parent->inotify_instances;
+	}
+
+	page_counter_init(&state->inotify_watches, watches_parent);
+	page_counter_init(&state->inotify_instances, instances_parent);
+	page_counter_limit(&state->inotify_watches, ns->inotify_max_user_watches);
+	page_counter_limit(&state->inotify_instances, ns->inotify_max_user_instances);
+}
+
+/*
+ * Look up (or create) the per-user state of @uid in @ns and charge one
+ * inotify instance to it.
+ *
+ * Returns 0 on success, -ENOMEM if a new state cannot be allocated and
+ * -EMFILE if charging the instance fails.
+ */
+static noinline int inotify_init_state(struct user_namespace *ns, kuid_t uid)
+{
+	struct nsuser_state *state;
+	struct page_counter *cnt;
+
+	/* We can work with the data without the lock held, since liveness
+	 * of data is guaranteed as long as the namespace is alive
+	 */
+	spin_lock_bh(&nsuser_state_lock);
+	state = get_nsuser_state(ns, uid);
+	spin_unlock_bh(&nsuser_state_lock);
+
+	if (!state) {
+		struct nsuser_state *new_state;
+		struct nsuser_state *parent_state = NULL;
+
+		new_state = kzalloc(sizeof(*new_state), GFP_KERNEL);
+		if (!new_state)
+			return -ENOMEM;
+
+		new_state->uid = uid;
+		new_state->ns = ns;
+
+		/* init_user_ns has no parent state to chain the counters to */
+		if (ns != &init_user_ns) {
+			spin_lock_bh(&nsuser_state_lock);
+			parent_state = get_nsuser_state(ns->parent, ns->owner);
+			spin_unlock_bh(&nsuser_state_lock);
+
+			BUG_ON(!parent_state);
+		}
+
+		__init_counters(new_state, parent_state, ns);
+
+		/*
+		 * The lock was dropped for the allocation, so another task
+		 * may have inserted a state for the same (ns, uid) in the
+		 * meantime. Look again under the lock and only publish ours
+		 * if the slot is still empty; otherwise use the winner's
+		 * state and discard the unused allocation.
+		 */
+		spin_lock_bh(&nsuser_state_lock);
+		state = get_nsuser_state(ns, uid);
+		if (!state) {
+			hash_add(nsstate_hash, &new_state->node, __kuid_val(uid));
+			state = new_state;
+			new_state = NULL;
+		}
+		spin_unlock_bh(&nsuser_state_lock);
+
+		kfree(new_state);
+	}
+
+	/*
+	 * Always go through the limit-checking charge, including for the
+	 * very first instance of a user, so the configured limits cannot
+	 * be bypassed by a freshly created state.
+	 */
+	if (!page_counter_try_charge(&state->inotify_instances, 1, &cnt))
+		return -EMFILE;
+
+	return 0;
+}
+
+
static inline __u32 inotify_arg_to_mask(u32 arg)
{
__u32 mask;
@@ -819,8 +907,9 @@ static int __init inotify_user_setup(void)
inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
inotify_max_queued_events = 16384;
- inotify_max_user_instances = 128;
- inotify_max_user_watches = 8192;
+ /* These reserves should allow for 8 levels of nesting in userns */
+ inotify_reserved_user_instances = 32;
+ inotify_reserved_user_watches = 1024;
return 0;
}
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 29f917517299..eb83a10afac7 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -170,6 +170,9 @@ struct fsnotify_group {
spinlock_t idr_lock;
struct idr idr;
struct user_struct *user;
+ struct user_namespace *userns;
+ kuid_t uid; /* id in the userns this group is
+ associated with */
} inotify_data;
#endif
#ifdef CONFIG_FANOTIFY
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 8297e5b341d8..3116a2df1cee 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -6,6 +6,9 @@
#include <linux/ns_common.h>
#include <linux/sched.h>
#include <linux/err.h>
+#include <linux/hashtable.h>
+#include <linux/spinlock.h>
+#include <linux/page_counter.h>
#define UID_GID_MAP_MAX_EXTENTS 5
@@ -22,6 +25,21 @@ struct uid_gid_map { /* 64 bytes -- 1 cache line */
#define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
+#define NSSTATE_HASHTABLE_BITS 10
+extern DECLARE_HASHTABLE(nsstate_hash, NSSTATE_HASHTABLE_BITS);
+extern spinlock_t nsuser_state_lock;
+
+/*
+ * Generic struct to hold various per-user/per-ns state. One instance
+ * describes a single (namespace, uid) pair.
+ */
+struct nsuser_state {
+ struct hlist_node node; /* entry in nsstate_hash, hashed by __kuid_val(uid) */
+ void *ns; /* ns in which uid is valid; compared against struct user_namespace pointers on lookup */
+ kuid_t uid;
+#ifdef CONFIG_INOTIFY_USER
+ struct page_counter inotify_watches; /* How many inotify watches does this user have? */
+ struct page_counter inotify_instances; /* How many inotify devs does this user have opened? */
+#endif
+};
+
struct user_namespace {
struct uid_gid_map uid_map;
struct uid_gid_map gid_map;
@@ -39,11 +57,28 @@ struct user_namespace {
struct key *persistent_keyring_register;
struct rw_semaphore persistent_keyring_register_sem;
#endif
+
+#ifdef CONFIG_INOTIFY_USER
+ int inotify_max_user_instances;
+ int inotify_max_user_watches;
+#endif
};
extern struct user_namespace init_user_ns;
#ifdef CONFIG_USER_NS
+/*
+ * Find the per-user state of @uid in @ns.
+ *
+ * Must be called with nsuser_state_lock held. Returns the matching
+ * entry of nsstate_hash, or NULL if there is none.
+ */
+static inline struct nsuser_state *get_nsuser_state(struct user_namespace *ns,
+						     kuid_t uid)
+{
+	struct nsuser_state *state;
+
+	/*
+	 * spin_is_locked() always reports "unlocked" on uniprocessor
+	 * builds, which would make a WARN_ON(!spin_is_locked()) fire on
+	 * every call there; lockdep_assert_held() checks the same rule
+	 * and compiles away when lockdep is disabled.
+	 */
+	lockdep_assert_held(&nsuser_state_lock);
+
+	hash_for_each_possible(nsstate_hash, state, node, __kuid_val(uid))
+		if (state->ns == ns && uid_eq(state->uid, uid))
+			return state;
+	return NULL;
+}
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
@@ -74,6 +109,16 @@ extern int proc_setgroups_show(struct seq_file *m, void *v);
extern bool userns_may_setgroups(const struct user_namespace *ns);
#else
+/*
+ * Find the per-user state of @uid in @ns (variant used when
+ * CONFIG_USER_NS is disabled).
+ *
+ * Returns the matching entry of nsstate_hash, or NULL if there is none.
+ */
+static inline struct nsuser_state *get_nsuser_state(struct user_namespace *ns,
+						     kuid_t uid)
+{
+	struct nsuser_state *state;
+
+	/* Entries are inserted with __kuid_val(uid) as key; look up with the same key. */
+	hash_for_each_possible(nsstate_hash, state, node, __kuid_val(uid))
+		if (uid_eq(uid, state->uid) && state->ns == ns)
+			return state;
+	return NULL;
+}
+
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
return &init_user_ns;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 9bafc211930c..cb51e3607d2d 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -22,10 +22,20 @@
#include <linux/ctype.h>
#include <linux/projid.h>
#include <linux/fs_struct.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
static struct kmem_cache *user_ns_cachep __read_mostly;
static DEFINE_MUTEX(userns_state_mutex);
+DEFINE_HASHTABLE(nsstate_hash, NSSTATE_HASHTABLE_BITS);
+DEFINE_SPINLOCK(nsuser_state_lock);
+
+#ifdef CONFIG_INOTIFY_USER
+extern int inotify_reserved_user_instances;
+extern int inotify_reserved_user_watches;
+#endif
+
static bool new_idmap_permitted(const struct file *file,
struct user_namespace *ns, int cap_setid,
struct uid_gid_map *map);
@@ -60,10 +70,13 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
int create_user_ns(struct cred *new)
{
struct user_namespace *ns, *parent_ns = new->user_ns;
+ struct nsuser_state *state, *parent_state;
kuid_t owner = new->euid;
kgid_t group = new->egid;
int ret;
-
+#ifdef CONFIG_INOTIFY_USER
+ int tmp;
+#endif
if (parent_ns->level > 32)
return -EUSERS;
@@ -88,9 +101,16 @@ int create_user_ns(struct cred *new)
if (!ns)
return -ENOMEM;
+ state = kmalloc(sizeof(struct nsuser_state), GFP_KERNEL);
+ if (!state) {
+ kmem_cache_free(user_ns_cachep, ns);
+ return -ENOMEM;
+ }
+
ret = ns_alloc_inum(&ns->ns);
if (ret) {
kmem_cache_free(user_ns_cachep, ns);
+ kfree(state);
return ret;
}
ns->ns.ops = &userns_operations;
@@ -101,6 +121,13 @@ int create_user_ns(struct cred *new)
ns->level = parent_ns->level + 1;
ns->owner = owner;
ns->group = group;
+#ifdef CONFIG_INOTIFY_USER
+ tmp = parent_ns->inotify_max_user_instances - inotify_reserved_user_instances;
+ ns->inotify_max_user_instances = max(0, tmp);
+
+ tmp = parent_ns->inotify_max_user_watches - inotify_reserved_user_watches;
+ ns->inotify_max_user_watches = max(0, tmp);
+#endif
/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
mutex_lock(&userns_state_mutex);
@@ -112,8 +139,63 @@ int create_user_ns(struct cred *new)
#ifdef CONFIG_PERSISTENT_KEYRINGS
init_rwsem(&ns->persistent_keyring_register_sem);
#endif
+
+ spin_lock_bh(&nsuser_state_lock);
+ parent_state = get_nsuser_state(parent_ns, owner);
+ spin_unlock_bh(&nsuser_state_lock);
+ if (!parent_state) {
+ struct nsuser_state *grandfather_state;
+
+ spin_lock_bh(&nsuser_state_lock);
+ /* init_user_ns doesn't have a parent */
+ if (parent_ns == &init_user_ns)
+ grandfather_state = get_nsuser_state(parent_ns, parent_ns->owner);
+ else
+ grandfather_state = get_nsuser_state(parent_ns->parent, parent_ns->owner);
+ spin_unlock_bh(&nsuser_state_lock);
+
+ state->uid = owner;
+ state->ns = parent_ns;
+
+#ifdef CONFIG_INOTIFY_USER
+ page_counter_init(&state->inotify_watches,
+ &grandfather_state->inotify_watches);
+ page_counter_init(&state->inotify_instances,
+ &grandfather_state->inotify_instances);
+ page_counter_limit(&state->inotify_watches,
+ parent_ns->inotify_max_user_watches);
+ page_counter_limit(&state->inotify_instances,
+ parent_ns->inotify_max_user_instances);
+#endif
+
+ spin_lock_bh(&nsuser_state_lock);
+ hash_add(nsstate_hash, &state->node, __kuid_val(owner));
+ spin_unlock_bh(&nsuser_state_lock);
+ }
+
return 0;
}
+/*
+ * Delete all per-user state that belongs to @ns.
+ *
+ * All processes of the namespace should be dead by this time and no
+ * references to its per-user/per-ns state should be live. nsstate_hash
+ * is shared by all namespaces though, so entries of other namespaces can
+ * still be added to the same buckets; the caller in free_user_ns() holds
+ * nsuser_state_lock around this call.
+ */
+static void free_nsuser_state(struct user_namespace *ns)
+{
+	int bkt;
+	struct hlist_node *tmp;
+	struct nsuser_state *state;
+
+	hash_for_each_safe(nsstate_hash, bkt, tmp, state, node) {
+		if (state->ns != ns)
+			continue;
+
+#ifdef CONFIG_INOTIFY_USER
+		/*
+		 * The counters only exist when inotify is built in. A
+		 * non-zero count here means a charge was never released.
+		 */
+		BUG_ON(page_counter_read(&state->inotify_instances));
+		BUG_ON(page_counter_read(&state->inotify_watches));
+#endif
+		hash_del(&state->node);
+		kfree(state);
+	}
+}
int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
{
@@ -141,6 +223,10 @@ void free_user_ns(struct user_namespace *ns)
do {
parent = ns->parent;
+
+ spin_lock_bh(&nsuser_state_lock);
+ free_nsuser_state(ns);
+ spin_unlock_bh(&nsuser_state_lock);
#ifdef CONFIG_PERSISTENT_KEYRINGS
key_put(ns->persistent_keyring_register);
#endif
@@ -1000,7 +1086,25 @@ const struct proc_ns_operations userns_operations = {
+/*
+ * Boot-time setup of the user namespace code: registers the per-user
+ * state of the global root user in init_user_ns (including the default
+ * inotify limits when inotify is built in) and creates the slab cache
+ * for struct user_namespace.
+ */
static __init int user_namespaces_init(void)
{
+	struct nsuser_state *root_state;
+
+	/*
+	 * Zeroed allocation, so that every field not set below starts in
+	 * a known state. Nothing can work without the root state, hence
+	 * the panic on failure (the slab cache below is SLAB_PANIC too).
+	 */
+	root_state = kzalloc(sizeof(*root_state), GFP_KERNEL);
+	if (!root_state)
+		panic("user_namespaces_init: cannot allocate root nsuser_state");
+
+#ifdef CONFIG_INOTIFY_USER
+	/* The limit fields and the counters only exist with inotify built in */
+	init_user_ns.inotify_max_user_instances = 256;
+	init_user_ns.inotify_max_user_watches = 8192;
+
+	page_counter_init(&root_state->inotify_watches, NULL);
+	page_counter_init(&root_state->inotify_instances, NULL);
+	page_counter_limit(&root_state->inotify_watches,
+			   init_user_ns.inotify_max_user_watches);
+	page_counter_limit(&root_state->inotify_instances,
+			   init_user_ns.inotify_max_user_instances);
+#endif
+	root_state->uid = GLOBAL_ROOT_UID;
+	root_state->ns = &init_user_ns;
+	hash_add(nsstate_hash, &root_state->node, __kuid_val(GLOBAL_ROOT_UID));
user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
+
return 0;
}
subsys_initcall(user_namespaces_init);
--
2.5.0
next prev parent reply other threads:[~2016-07-13 12:14 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-07-13 12:14 [RFC PATCH 0/4 v3] Inotify limits per usernamespace Nikolay Borisov
[not found] ` <1468412053-30130-1-git-send-email-kernel-6AxghH7DbtA@public.gmane.org>
2016-07-13 12:14 ` [PATCH 1/4] hashtable: Add __HASHTABLE_INITIALIZER Nikolay Borisov
2016-07-13 12:14 ` [PATCH 2/4] misc: Rename the HASH_SIZE macro Nikolay Borisov
2016-07-13 12:14 ` Nikolay Borisov [this message]
2016-07-13 12:14 ` [PATCH 4/4] inotify: Convert to using new userns infrastructure Nikolay Borisov
2016-07-20 0:41 ` [RFC PATCH 0/4 v3] Inotify limits per usernamespace Eric W. Biederman
-- strict thread matches above, loose matches on Subject: below --
2016-06-29 13:37 [RFC PATCH 0/4 v2] " Nikolay Borisov
[not found] ` <1467207425-22072-1-git-send-email-kernel-6AxghH7DbtA@public.gmane.org>
2016-06-29 13:37 ` [PATCH 3/4] userns/inotify: Initial implementation of inotify per-userns Nikolay Borisov
[not found] ` <1467207425-22072-4-git-send-email-kernel-6AxghH7DbtA@public.gmane.org>
2016-07-06 17:29 ` Eric W. Biederman
[not found] ` <87mvluekun.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
2016-07-07 13:40 ` Nikolay Borisov
[not found] ` <577E5BC2.1000208-6AxghH7DbtA@public.gmane.org>
2016-07-07 15:27 ` Eric W. Biederman
[not found] ` <87inwh31v6.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
2016-07-08 11:43 ` Nikolay Borisov
[not found] ` <577F91C9.9060903-6AxghH7DbtA@public.gmane.org>
2016-07-08 15:08 ` Eric W. Biederman
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1468412053-30130-4-git-send-email-kernel@kyup.com \
--to=kernel-6axghh7dbta@public.gmane.org \
--cc=containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org \
--cc=ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org \
--cc=operations-/eCPMmvKun9pLGFMi4vTTA@public.gmane.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox