From: Stephen Hemminger <stephen@networkplumber.org>
To: Thomas Graf <tgraf@suug.ch>
Cc: davem@davemloft.net, netdev@vger.kernel.org,
linux-kernel@vger.kernel.org, herbert@gondor.apana.org.au,
paulmck@linux.vnet.ibm.com, edumazet@google.com,
john.r.fastabend@intel.com, josh@joshtriplett.org
Subject: [RFC] netlink: get rid of nl_table_lock
Date: Sat, 3 Jan 2015 11:02:11 -0800 [thread overview]
Message-ID: <20150103110211.18b11f0f@urahara> (raw)
In-Reply-To: <daae84cd33bfde8518b849176e25dcf3eea8014b.1420230585.git.tgraf@suug.ch>
As a follow on to Thomas's patch I think this would complete the
transistion to RCU for netlink.
Compile tested only.
This patch gets rid of the reader/writer nl_table_lock and replaces it
with exclusively using RCU for reading, and a mutex for writing.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
--- a/include/net/sock.h 2015-01-01 10:05:35.805253771 -0800
+++ b/include/net/sock.h 2015-01-03 10:45:29.661737618 -0800
@@ -666,6 +666,8 @@ static inline void sk_add_bind_node(stru
hlist_for_each_entry_safe(__sk, tmp, list, sk_node)
#define sk_for_each_bound(__sk, list) \
hlist_for_each_entry(__sk, list, sk_bind_node)
+#define sk_for_each_bound_rcu(__sk, list) \
+ hlist_for_each_entry_rcu(__sk, list, sk_bind_node)
/**
* sk_nulls_for_each_entry_offset - iterate over a list at a given struct offset
--- a/net/netlink/af_netlink.c 2015-01-03 10:04:37.738319202 -0800
+++ b/net/netlink/af_netlink.c 2015-01-03 10:53:29.568387253 -0800
@@ -100,15 +100,14 @@ static void netlink_skb_destructor(struc
* Lookup and traversal are protected with an RCU read-side lock. Insertion
* and removal are protected with nl_sk_hash_lock while using RCU list
* modification primitives and may run in parallel to RCU protected lookups.
- * Destruction of the Netlink socket may only occur *after* nl_table_lock has
+ * Destruction of the Netlink socket may only occur *after* nl_table_mutex has
* been acquired * either during or after the socket has been removed from
* the list and after an RCU grace period.
*/
-DEFINE_RWLOCK(nl_table_lock);
-EXPORT_SYMBOL_GPL(nl_table_lock);
-static atomic_t nl_table_users = ATOMIC_INIT(0);
+static DEFINE_MUTEX(nl_table_mutex);
-#define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock));
+#define nl_deref_protected(X) \
+ rcu_dereference_protected(X, lockdep_is_held(&nl_table_mutex))
/* Protects netlink socket hash table mutations */
DEFINE_MUTEX(nl_sk_hash_lock);
@@ -118,7 +117,8 @@ EXPORT_SYMBOL_GPL(nl_sk_hash_lock);
static int lockdep_nl_sk_hash_is_held(void *parent)
{
if (debug_locks)
- return lockdep_is_held(&nl_sk_hash_lock) || lockdep_is_held(&nl_table_lock);
+ return lockdep_is_held(&nl_sk_hash_lock) ||
+ lockdep_is_held(&nl_table_mutex);
return 1;
}
#endif
@@ -925,59 +925,14 @@ static void netlink_sock_destruct(struct
WARN_ON(nlk_sk(sk)->groups);
}
-/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
- * SMP. Look, when several writers sleep and reader wakes them up, all but one
- * immediately hit write lock and grab all the cpus. Exclusive sleep solves
- * this, _but_ remember, it adds useless work on UP machines.
- */
-
void netlink_table_grab(void)
- __acquires(nl_table_lock)
{
- might_sleep();
-
- write_lock_irq(&nl_table_lock);
-
- if (atomic_read(&nl_table_users)) {
- DECLARE_WAITQUEUE(wait, current);
-
- add_wait_queue_exclusive(&nl_table_wait, &wait);
- for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (atomic_read(&nl_table_users) == 0)
- break;
- write_unlock_irq(&nl_table_lock);
- schedule();
- write_lock_irq(&nl_table_lock);
- }
-
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&nl_table_wait, &wait);
- }
+ mutex_lock(&nl_table_mutex);
}
void netlink_table_ungrab(void)
- __releases(nl_table_lock)
-{
- write_unlock_irq(&nl_table_lock);
- wake_up(&nl_table_wait);
-}
-
-static inline void
-netlink_lock_table(void)
{
- /* read_lock() synchronizes us to netlink_table_grab */
-
- read_lock(&nl_table_lock);
- atomic_inc(&nl_table_users);
- read_unlock(&nl_table_lock);
-}
-
-static inline void
-netlink_unlock_table(void)
-{
- if (atomic_dec_and_test(&nl_table_users))
- wake_up(&nl_table_wait);
+ mutex_unlock(&nl_table_mutex);
}
struct netlink_compare_arg
@@ -1151,12 +1106,12 @@ static int netlink_create(struct net *ne
if (protocol < 0 || protocol >= MAX_LINKS)
return -EPROTONOSUPPORT;
- netlink_lock_table();
+ mutex_lock(&nl_table_mutex);
#ifdef CONFIG_MODULES
if (!nl_table[protocol].registered) {
- netlink_unlock_table();
+ mutex_unlock(&nl_table_mutex);
request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
- netlink_lock_table();
+ mutex_lock(&nl_table_mutex);
}
#endif
if (nl_table[protocol].registered &&
@@ -1167,7 +1122,7 @@ static int netlink_create(struct net *ne
cb_mutex = nl_table[protocol].cb_mutex;
bind = nl_table[protocol].bind;
unbind = nl_table[protocol].unbind;
- netlink_unlock_table();
+ mutex_unlock(&nl_table_mutex);
if (err < 0)
goto out;
@@ -1982,17 +1937,13 @@ int netlink_broadcast_filtered(struct so
info.tx_filter = filter;
info.tx_data = filter_data;
- /* While we sleep in clone, do not allow to change socket list */
-
- netlink_lock_table();
-
- sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
+ rcu_read_lock();
+ sk_for_each_bound_rcu(sk, &nl_table[ssk->sk_protocol].mc_list)
do_one_broadcast(sk, &info);
+ rcu_read_unlock();
consume_skb(skb);
- netlink_unlock_table();
-
if (info.delivery_failure) {
kfree_skb(info.skb2);
return -ENOBUFS;
@@ -2071,12 +2022,11 @@ int netlink_set_err(struct sock *ssk, u3
/* sk->sk_err wants a positive error value */
info.code = -code;
- read_lock(&nl_table_lock);
-
- sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
+ rcu_read_lock();
+ sk_for_each_bound_rcu(sk, &nl_table[ssk->sk_protocol].mc_list)
ret += do_one_set_err(sk, &info);
+ rcu_read_unlock();
- read_unlock(&nl_table_lock);
return ret;
}
EXPORT_SYMBOL(netlink_set_err);
--- a/net/netlink/diag.c 2014-08-09 08:39:57.756179454 -0700
+++ b/net/netlink/diag.c 2015-01-03 10:57:31.113791535 -0800
@@ -136,7 +136,7 @@ static int __netlink_diag_dump(struct sk
}
}
- sk_for_each_bound(sk, &tbl->mc_list) {
+ sk_for_each_bound_rcu(sk, &tbl->mc_list) {
if (sk_hashed(sk))
continue;
if (!net_eq(sock_net(sk), net))
@@ -171,7 +171,7 @@ static int netlink_diag_dump(struct sk_b
req = nlmsg_data(cb->nlh);
mutex_lock(&nl_sk_hash_lock);
- read_lock(&nl_table_lock);
+ rcu_read_lock();
if (req->sdiag_protocol == NDIAG_PROTO_ALL) {
int i;
@@ -183,7 +183,7 @@ static int netlink_diag_dump(struct sk_b
}
} else {
if (req->sdiag_protocol >= MAX_LINKS) {
- read_unlock(&nl_table_lock);
+ rcu_read_unlock();
mutex_unlock(&nl_sk_hash_lock);
return -ENOENT;
}
@@ -191,7 +191,7 @@ static int netlink_diag_dump(struct sk_b
__netlink_diag_dump(skb, cb, req->sdiag_protocol, s_num);
}
- read_unlock(&nl_table_lock);
+ rcu_read_unlock();
mutex_unlock(&nl_sk_hash_lock);
return skb->len;
next prev parent reply other threads:[~2015-01-03 19:02 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-01-02 22:00 [PATCH 0/9 net-next v2] rhashtable: Per bucket locks & deferred table resizing Thomas Graf
2015-01-02 22:00 ` [PATCH 1/9] rhashtable: Do hashing inside of rhashtable_lookup_compare() Thomas Graf
2015-01-16 15:37 ` Patrick McHardy
2015-01-16 16:00 ` Thomas Graf
2015-01-02 22:00 ` [PATCH 2/9] rhashtable: Use rht_obj() instead of manual offset calculation Thomas Graf
2015-01-02 22:00 ` [PATCH 3/9] rhashtable: Convert bucket iterators to take table and index Thomas Graf
2015-01-02 22:00 ` [PATCH 4/9] rhashtable: Factor out bucket_tail() function Thomas Graf
2015-01-02 22:00 ` [PATCH 5/9] nft_hash: Remove rhashtable_remove_pprev() Thomas Graf
2015-01-02 22:00 ` [PATCH 6/9] spinlock: Add spin_lock_bh_nested() Thomas Graf
2015-01-02 22:00 ` [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking Thomas Graf
2015-01-02 22:00 ` [PATCH 8/9] rhashtable: Supports for nulls marker Thomas Graf
2015-01-02 22:00 ` [PATCH 9/9] netlink: Lockless lookup with RCU grace period in socket release Thomas Graf
2015-01-03 19:02 ` Stephen Hemminger [this message]
2015-01-06 22:19 ` [RFC] netlink: get rid of nl_table_lock David Miller
2015-01-06 23:00 ` Thomas Graf
2015-01-03 19:44 ` [PATCH 0/9 net-next v2] rhashtable: Per bucket locks & deferred table resizing David Miller
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20150103110211.18b11f0f@urahara \
--to=stephen@networkplumber.org \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=herbert@gondor.apana.org.au \
--cc=john.r.fastabend@intel.com \
--cc=josh@joshtriplett.org \
--cc=linux-kernel@vger.kernel.org \
--cc=netdev@vger.kernel.org \
--cc=paulmck@linux.vnet.ibm.com \
--cc=tgraf@suug.ch \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.