From: Al Viro <viro@ZenIV.linux.org.uk>
To: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: linux-kernel@vger.kernel.org,
Andrew Morton <akpm@linux-foundation.org>,
"Eric W. Biederman" <ebiederm@xmission.com>
Subject: Re: [PATCH] proc/sysctl: prune stale dentries during unregistering
Date: Sun, 19 Feb 2017 08:42:12 +0000 [thread overview]
Message-ID: <20170219084202.GJ29622@ZenIV.linux.org.uk> (raw)
In-Reply-To: <d358cb91-aa72-5fbe-47e0-b7b978ac65b5@yandex-team.ru>
On Sat, Feb 18, 2017 at 09:55:28PM +0300, Konstantin Khlebnikov wrote:
> This patch has locking problem. I've got lockdep splat under LTP.
>
> d_lock nests inside i_lock
> sysctl_lock nests inside d_lock in d_compare
>
> This patch adds i_lock nesting inside sysctl_lock.
Once ->unregistering is set, you can drop sysctl_lock just fine. So I'd
try something like this - use rcu_read_lock() in proc_sys_prune_dcache(),
drop sysctl_lock() before it and regain after. Make sure that no inodes
are added to the list ones ->unregistering has been set and use RCU list
primitives for modifying the inode list, with sysctl_lock still used to
serialize its modifications.
Freeing struct inode is RCU-delayed (see proc_destroy_inode()), so doing
igrab() is safe there. Since we don't drop inode reference until after we'd
passed beyond it in the list, list_for_each_entry_rcu() should be fine,
AFAICS. Below is a completely untested modification of your patch along
those lines:
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 842a5ff5b85c..7ad9ed7958af 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -43,10 +43,11 @@ static void proc_evict_inode(struct inode *inode)
de = PDE(inode);
if (de)
pde_put(de);
+
head = PROC_I(inode)->sysctl;
if (head) {
RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
- sysctl_head_put(head);
+ proc_sys_evict_inode(inode, head);
}
}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 2de5194ba378..ed1d762160e6 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -65,6 +65,7 @@ struct proc_inode {
struct proc_dir_entry *pde;
struct ctl_table_header *sysctl;
struct ctl_table *sysctl_entry;
+ struct list_head sysctl_inodes;
const struct proc_ns_operations *ns_ops;
struct inode vfs_inode;
};
@@ -249,10 +250,12 @@ extern void proc_thread_self_init(void);
*/
#ifdef CONFIG_PROC_SYSCTL
extern int proc_sys_init(void);
-extern void sysctl_head_put(struct ctl_table_header *);
+extern void proc_sys_evict_inode(struct inode *inode,
+ struct ctl_table_header *head);
#else
static inline void proc_sys_init(void) { }
-static inline void sysctl_head_put(struct ctl_table_header *head) { }
+static inline void proc_sys_evict_inode(struct inode *inode,
+ struct ctl_table_header *head) { }
#endif
/*
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 55313d994895..6477c4a2dc6c 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -190,6 +190,7 @@ static void init_header(struct ctl_table_header *head,
head->set = set;
head->parent = NULL;
head->node = node;
+ INIT_LIST_HEAD(&head->inodes);
if (node) {
struct ctl_table *entry;
for (entry = table; entry->procname; entry++, node++)
@@ -259,6 +260,26 @@ static void unuse_table(struct ctl_table_header *p)
complete(p->unregistering);
}
+static void proc_sys_prune_dcache(struct ctl_table_header *head)
+{
+ struct inode *inode, *prev = NULL;
+ struct proc_inode *ei;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ei, &head->inodes, sysctl_inodes) {
+ inode = igrab(&ei->vfs_inode);
+ if (inode) {
+ rcu_read_unlock();
+ iput(prev);
+ prev = inode;
+ d_prune_aliases(inode);
+ rcu_read_lock();
+ }
+ }
+ rcu_read_unlock();
+ iput(prev);
+}
+
/* called under sysctl_lock, will reacquire if has to wait */
static void start_unregistering(struct ctl_table_header *p)
{
@@ -272,31 +293,22 @@ static void start_unregistering(struct ctl_table_header *p)
p->unregistering = &wait;
spin_unlock(&sysctl_lock);
wait_for_completion(&wait);
- spin_lock(&sysctl_lock);
} else {
/* anything non-NULL; we'll never dereference it */
p->unregistering = ERR_PTR(-EINVAL);
+ spin_unlock(&sysctl_lock);
}
/*
+ * Prune dentries for unregistered sysctls: namespaced sysctls
+ * can have duplicate names and contaminate dcache very badly.
+ */
+ proc_sys_prune_dcache(p);
+ /*
* do not remove from the list until nobody holds it; walking the
* list in do_sysctl() relies on that.
*/
- erase_header(p);
-}
-
-static void sysctl_head_get(struct ctl_table_header *head)
-{
- spin_lock(&sysctl_lock);
- head->count++;
- spin_unlock(&sysctl_lock);
-}
-
-void sysctl_head_put(struct ctl_table_header *head)
-{
spin_lock(&sysctl_lock);
- if (!--head->count)
- kfree_rcu(head, rcu);
- spin_unlock(&sysctl_lock);
+ erase_header(p);
}
static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
@@ -440,10 +452,20 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
inode->i_ino = get_next_ino();
- sysctl_head_get(head);
ei = PROC_I(inode);
+
+ spin_lock(&sysctl_lock);
+ if (unlikely(head->unregistering)) {
+ spin_unlock(&sysctl_lock);
+ iput(inode);
+ inode = NULL;
+ goto out;
+ }
ei->sysctl = head;
ei->sysctl_entry = table;
+ list_add_rcu(&ei->sysctl_inodes, &head->inodes);
+ head->count++;
+ spin_unlock(&sysctl_lock);
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
inode->i_mode = table->mode;
@@ -466,6 +488,15 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
return inode;
}
+void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
+{
+ spin_lock(&sysctl_lock);
+ list_del_rcu(&PROC_I(inode)->sysctl_inodes);
+ if (!--head->count)
+ kfree_rcu(head, rcu);
+ spin_unlock(&sysctl_lock);
+}
+
static struct ctl_table_header *grab_header(struct inode *inode)
{
struct ctl_table_header *head = PROC_I(inode)->sysctl;
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index adf4e51cf597..b7e82049fec7 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -143,6 +143,7 @@ struct ctl_table_header
struct ctl_table_set *set;
struct ctl_dir *parent;
struct ctl_node *node;
+ struct list_head inodes; /* head for proc_inode->sysctl_inodes */
};
struct ctl_dir {
next prev parent reply other threads:[~2017-02-19 8:42 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-02-08 10:48 [PATCH] proc/sysctl: drop unregistered stale dentries as soon as possible Konstantin Khlebnikov
2017-02-08 21:48 ` Andrew Morton
2017-02-09 3:53 ` Al Viro
2017-02-09 7:36 ` Konstantin Khlebnikov
2017-02-09 8:40 ` Al Viro
2017-02-10 7:35 ` [PATCH] proc/sysctl: prune stale dentries during unregistering Konstantin Khlebnikov
2017-02-10 7:47 ` Al Viro
2017-02-10 7:54 ` Konstantin Khlebnikov
2017-02-13 9:54 ` Eric W. Biederman
2017-02-18 18:55 ` Konstantin Khlebnikov
2017-02-19 8:42 ` Al Viro [this message]
2017-02-21 1:41 ` [REVIEW][PATCH] proc/sysctl: Don't grab i_lock under sysctl_lock Eric W. Biederman
2017-02-21 8:40 ` Konstantin Khlebnikov
2017-02-21 19:29 ` Eric W. Biederman
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20170219084202.GJ29622@ZenIV.linux.org.uk \
--to=viro@zeniv.linux.org.uk \
--cc=akpm@linux-foundation.org \
--cc=ebiederm@xmission.com \
--cc=khlebnikov@yandex-team.ru \
--cc=linux-kernel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.