netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] netlink: convert DIY reader/writer to mutex and RCU
@ 2010-03-08 21:32 Stephen Hemminger
  2010-03-08 22:00 ` [PATCH] netlink: convert DIY reader/writer to mutex and RCU (v2) Stephen Hemminger
  0 siblings, 1 reply; 4+ messages in thread
From: Stephen Hemminger @ 2010-03-08 21:32 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

The netlink table locking was an open-coded version of a
reader/writer sleeping lock.  Change to using a mutex and RCU,
which makes the code clearer, shorter, and simpler.

The genetlink code is simplified because it is allowable
to acquire the mutex while holding the RCU lock.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
 include/linux/netlink.h  |    3 
 include/net/sock.h       |    2 
 net/netlink/af_netlink.c |  146 ++++++++++++++---------------------------------
 net/netlink/genetlink.c  |    4 -
 4 files changed, 47 insertions(+), 108 deletions(-)

--- a/net/netlink/af_netlink.c	2010-03-08 09:07:15.104547875 -0800
+++ b/net/netlink/af_netlink.c	2010-03-08 11:02:01.263297712 -0800
@@ -128,15 +128,11 @@ struct netlink_table {
 };
 
 static struct netlink_table *nl_table;
-
-static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);
+static DEFINE_MUTEX(nltable_mutex);
 
 static int netlink_dump(struct sock *sk);
 static void netlink_destroy_callback(struct netlink_callback *cb);
 
-static DEFINE_RWLOCK(nl_table_lock);
-static atomic_t nl_table_users = ATOMIC_INIT(0);
-
 static ATOMIC_NOTIFIER_HEAD(netlink_chain);
 
 static u32 netlink_group_mask(u32 group)
@@ -171,61 +167,6 @@ static void netlink_sock_destruct(struct
 	WARN_ON(nlk_sk(sk)->groups);
 }
 
-/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
- * SMP. Look, when several writers sleep and reader wakes them up, all but one
- * immediately hit write lock and grab all the cpus. Exclusive sleep solves
- * this, _but_ remember, it adds useless work on UP machines.
- */
-
-void netlink_table_grab(void)
-	__acquires(nl_table_lock)
-{
-	might_sleep();
-
-	write_lock_irq(&nl_table_lock);
-
-	if (atomic_read(&nl_table_users)) {
-		DECLARE_WAITQUEUE(wait, current);
-
-		add_wait_queue_exclusive(&nl_table_wait, &wait);
-		for (;;) {
-			set_current_state(TASK_UNINTERRUPTIBLE);
-			if (atomic_read(&nl_table_users) == 0)
-				break;
-			write_unlock_irq(&nl_table_lock);
-			schedule();
-			write_lock_irq(&nl_table_lock);
-		}
-
-		__set_current_state(TASK_RUNNING);
-		remove_wait_queue(&nl_table_wait, &wait);
-	}
-}
-
-void netlink_table_ungrab(void)
-	__releases(nl_table_lock)
-{
-	write_unlock_irq(&nl_table_lock);
-	wake_up(&nl_table_wait);
-}
-
-static inline void
-netlink_lock_table(void)
-{
-	/* read_lock() synchronizes us to netlink_table_grab */
-
-	read_lock(&nl_table_lock);
-	atomic_inc(&nl_table_users);
-	read_unlock(&nl_table_lock);
-}
-
-static inline void
-netlink_unlock_table(void)
-{
-	if (atomic_dec_and_test(&nl_table_users))
-		wake_up(&nl_table_wait);
-}
-
 static inline struct sock *netlink_lookup(struct net *net, int protocol,
 					  u32 pid)
 {
@@ -234,9 +175,9 @@ static inline struct sock *netlink_looku
 	struct sock *sk;
 	struct hlist_node *node;
 
-	read_lock(&nl_table_lock);
+	rcu_read_lock();
 	head = nl_pid_hashfn(hash, pid);
-	sk_for_each(sk, node, head) {
+	sk_for_each_rcu(sk, node, head) {
 		if (net_eq(sock_net(sk), net) && (nlk_sk(sk)->pid == pid)) {
 			sock_hold(sk);
 			goto found;
@@ -244,7 +185,7 @@ static inline struct sock *netlink_looku
 	}
 	sk = NULL;
 found:
-	read_unlock(&nl_table_lock);
+	rcu_read_unlock();
 	return sk;
 }
 
@@ -353,7 +294,7 @@ static int netlink_insert(struct sock *s
 	struct hlist_node *node;
 	int len;
 
-	netlink_table_grab();
+	mutex_lock(&nltable_mutex);
 	head = nl_pid_hashfn(hash, pid);
 	len = 0;
 	sk_for_each(osk, node, head) {
@@ -380,18 +321,18 @@ static int netlink_insert(struct sock *s
 	err = 0;
 
 err:
-	netlink_table_ungrab();
+	mutex_unlock(&nltable_mutex);
 	return err;
 }
 
 static void netlink_remove(struct sock *sk)
 {
-	netlink_table_grab();
+	mutex_lock(&nltable_mutex);
 	if (sk_del_node_init(sk))
 		nl_table[sk->sk_protocol].hash.entries--;
 	if (nlk_sk(sk)->subscriptions)
 		__sk_del_bind_node(sk);
-	netlink_table_ungrab();
+	mutex_unlock(&nltable_mutex);
 }
 
 static struct proto netlink_proto = {
@@ -444,12 +385,14 @@ static int netlink_create(struct net *ne
 	if (protocol < 0 || protocol >= MAX_LINKS)
 		return -EPROTONOSUPPORT;
 
-	netlink_lock_table();
+	mutex_lock(&nltable_mutex);
 #ifdef CONFIG_MODULES
 	if (!nl_table[protocol].registered) {
-		netlink_unlock_table();
+		mutex_unlock(&nltable_mutex);
+
 		request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
-		netlink_lock_table();
+
+		mutex_lock(&nltable_mutex);
 	}
 #endif
 	if (nl_table[protocol].registered &&
@@ -458,7 +401,7 @@ static int netlink_create(struct net *ne
 	else
 		err = -EPROTONOSUPPORT;
 	cb_mutex = nl_table[protocol].cb_mutex;
-	netlink_unlock_table();
+	mutex_unlock(&nltable_mutex);
 
 	if (err < 0)
 		goto out;
@@ -515,7 +458,7 @@ static int netlink_release(struct socket
 
 	module_put(nlk->module);
 
-	netlink_table_grab();
+	mutex_lock(&nltable_mutex);
 	if (netlink_is_kernel(sk)) {
 		BUG_ON(nl_table[sk->sk_protocol].registered == 0);
 		if (--nl_table[sk->sk_protocol].registered == 0) {
@@ -525,7 +468,7 @@ static int netlink_release(struct socket
 		}
 	} else if (nlk->subscriptions)
 		netlink_update_listeners(sk);
-	netlink_table_ungrab();
+	mutex_unlock(&nltable_mutex);
 
 	kfree(nlk->groups);
 	nlk->groups = NULL;
@@ -533,6 +476,8 @@ static int netlink_release(struct socket
 	local_bh_disable();
 	sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
 	local_bh_enable();
+
+	synchronize_rcu();
 	sock_put(sk);
 	return 0;
 }
@@ -551,7 +496,7 @@ static int netlink_autobind(struct socke
 
 retry:
 	cond_resched();
-	netlink_table_grab();
+	mutex_lock(&nltable_mutex);
 	head = nl_pid_hashfn(hash, pid);
 	sk_for_each(osk, node, head) {
 		if (!net_eq(sock_net(osk), net))
@@ -561,11 +506,11 @@ retry:
 			pid = rover--;
 			if (rover > -4097)
 				rover = -4097;
-			netlink_table_ungrab();
+			mutex_unlock(&nltable_mutex);
 			goto retry;
 		}
 	}
-	netlink_table_ungrab();
+	mutex_unlock(&nltable_mutex);
 
 	err = netlink_insert(sk, net, pid);
 	if (err == -EADDRINUSE)
@@ -603,7 +548,7 @@ static int netlink_realloc_groups(struct
 	unsigned long *new_groups;
 	int err = 0;
 
-	netlink_table_grab();
+	mutex_lock(&nltable_mutex);
 
 	groups = nl_table[sk->sk_protocol].groups;
 	if (!nl_table[sk->sk_protocol].registered) {
@@ -625,7 +570,7 @@ static int netlink_realloc_groups(struct
 	nlk->groups = new_groups;
 	nlk->ngroups = groups;
  out_unlock:
-	netlink_table_ungrab();
+	mutex_unlock(&nltable_mutex);
 	return err;
 }
 
@@ -664,13 +609,13 @@ static int netlink_bind(struct socket *s
 	if (!nladdr->nl_groups && (nlk->groups == NULL || !(u32)nlk->groups[0]))
 		return 0;
 
-	netlink_table_grab();
+	mutex_lock(&nltable_mutex);
 	netlink_update_subscriptions(sk, nlk->subscriptions +
 					 hweight32(nladdr->nl_groups) -
 					 hweight32(nlk->groups[0]));
 	nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups;
 	netlink_update_listeners(sk);
-	netlink_table_ungrab();
+	mutex_unlock(&nltable_mutex);
 
 	return 0;
 }
@@ -1059,15 +1004,12 @@ int netlink_broadcast(struct sock *ssk, 
 
 	/* While we sleep in clone, do not allow to change socket list */
 
-	netlink_lock_table();
-
-	sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list)
+	rcu_read_lock();
+	sk_for_each_bound_rcu(sk, node, &nl_table[ssk->sk_protocol].mc_list)
 		do_one_broadcast(sk, &info);
+	rcu_read_unlock();
 
 	kfree_skb(skb);
-
-	netlink_unlock_table();
-
 	kfree_skb(info.skb2);
 
 	if (info.delivery_failure)
@@ -1129,12 +1071,12 @@ void netlink_set_err(struct sock *ssk, u
 	/* sk->sk_err wants a positive error value */
 	info.code = -code;
 
-	read_lock(&nl_table_lock);
+	rcu_read_lock();
 
-	sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list)
+	sk_for_each_bound_rcu(sk, node, &nl_table[ssk->sk_protocol].mc_list)
 		do_one_set_err(sk, &info);
 
-	read_unlock(&nl_table_lock);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL(netlink_set_err);
 
@@ -1187,10 +1129,10 @@ static int netlink_setsockopt(struct soc
 			return err;
 		if (!val || val - 1 >= nlk->ngroups)
 			return -EINVAL;
-		netlink_table_grab();
+		mutex_lock(&nltable_mutex);
 		netlink_update_socket_mc(nlk, val,
 					 optname == NETLINK_ADD_MEMBERSHIP);
-		netlink_table_ungrab();
+		mutex_unlock(&nltable_mutex);
 		err = 0;
 		break;
 	}
@@ -1515,7 +1457,7 @@ netlink_kernel_create(struct net *net, i
 	nlk = nlk_sk(sk);
 	nlk->flags |= NETLINK_KERNEL_SOCKET;
 
-	netlink_table_grab();
+	mutex_lock(&nltable_mutex);
 	if (!nl_table[unit].registered) {
 		nl_table[unit].groups = groups;
 		nl_table[unit].listeners = listeners;
@@ -1526,7 +1468,7 @@ netlink_kernel_create(struct net *net, i
 		kfree(listeners);
 		nl_table[unit].registered++;
 	}
-	netlink_table_ungrab();
+	mutex_unlock(&nltable_mutex);
 	return sk;
 
 out_sock_release:
@@ -1557,7 +1499,7 @@ static void netlink_free_old_listeners(s
 	kfree(lrh->ptr);
 }
 
-int __netlink_change_ngroups(struct sock *sk, unsigned int groups)
+static int __netlink_change_ngroups(struct sock *sk, unsigned int groups)
 {
 	unsigned long *listeners, *old = NULL;
 	struct listeners_rcu_head *old_rcu_head;
@@ -1608,14 +1550,14 @@ int netlink_change_ngroups(struct sock *
 {
 	int err;
 
-	netlink_table_grab();
+	mutex_lock(&nltable_mutex);
 	err = __netlink_change_ngroups(sk, groups);
-	netlink_table_ungrab();
+	mutex_unlock(&nltable_mutex);
 
 	return err;
 }
 
-void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
+static void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
 {
 	struct sock *sk;
 	struct hlist_node *node;
@@ -1635,9 +1577,9 @@ void __netlink_clear_multicast_users(str
  */
 void netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
 {
-	netlink_table_grab();
+	mutex_lock(&nltable_mutex);
 	__netlink_clear_multicast_users(ksk, group);
-	netlink_table_ungrab();
+	mutex_unlock(&nltable_mutex);
 }
 
 void netlink_set_nonroot(int protocol, unsigned int flags)
@@ -1902,7 +1844,7 @@ static struct sock *netlink_seq_socket_i
 		struct nl_pid_hash *hash = &nl_table[i].hash;
 
 		for (j = 0; j <= hash->mask; j++) {
-			sk_for_each(s, node, &hash->table[j]) {
+			sk_for_each_rcu(s, node, &hash->table[j]) {
 				if (sock_net(s) != seq_file_net(seq))
 					continue;
 				if (off == pos) {
@@ -1918,9 +1860,9 @@ static struct sock *netlink_seq_socket_i
 }
 
 static void *netlink_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(nl_table_lock)
+	__acquires(RCU)
 {
-	read_lock(&nl_table_lock);
+	rcu_read_lock();
 	return *pos ? netlink_seq_socket_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 }
 
@@ -1967,9 +1909,9 @@ static void *netlink_seq_next(struct seq
 }
 
 static void netlink_seq_stop(struct seq_file *seq, void *v)
-	__releases(nl_table_lock)
+	__releases(RCU)
 {
-	read_unlock(&nl_table_lock);
+	rcu_read_unlock();
 }
 
 
--- a/include/net/sock.h	2010-03-08 10:52:34.113924084 -0800
+++ b/include/net/sock.h	2010-03-08 10:52:50.222986495 -0800
@@ -507,6 +507,8 @@ static __inline__ void sk_add_bind_node(
 	hlist_for_each_entry_safe(__sk, node, tmp, list, sk_node)
 #define sk_for_each_bound(__sk, node, list) \
 	hlist_for_each_entry(__sk, node, list, sk_bind_node)
+#define sk_for_each_bound_rcu(__sk, node, list) \
+	hlist_for_each_entry_rcu(__sk, node, list, sk_bind_node)
 
 /* Sock flags */
 enum sock_flags {
--- a/include/linux/netlink.h	2010-03-08 10:56:35.314235687 -0800
+++ b/include/linux/netlink.h	2010-03-08 11:04:33.414547616 -0800
@@ -170,18 +170,13 @@ struct netlink_skb_parms {
 #define NETLINK_CREDS(skb)	(&NETLINK_CB((skb)).creds)
 
 
-extern void netlink_table_grab(void);
-extern void netlink_table_ungrab(void);
-
 extern struct sock *netlink_kernel_create(struct net *net,
 					  int unit,unsigned int groups,
 					  void (*input)(struct sk_buff *skb),
 					  struct mutex *cb_mutex,
 					  struct module *module);
 extern void netlink_kernel_release(struct sock *sk);
-extern int __netlink_change_ngroups(struct sock *sk, unsigned int groups);
 extern int netlink_change_ngroups(struct sock *sk, unsigned int groups);
-extern void __netlink_clear_multicast_users(struct sock *sk, unsigned int group);
 extern void netlink_clear_multicast_users(struct sock *sk, unsigned int group);
 extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err);
 extern int netlink_has_listeners(struct sock *sk, unsigned int group);
--- a/net/netlink/genetlink.c	2010-03-08 10:56:02.194547526 -0800
+++ b/net/netlink/genetlink.c	2010-03-08 11:01:05.865177057 -0800
@@ -168,10 +168,9 @@ int genl_register_mc_group(struct genl_f
 	if (family->netnsok) {
 		struct net *net;
 
-		netlink_table_grab();
 		rcu_read_lock();
 		for_each_net_rcu(net) {
-			err = __netlink_change_ngroups(net->genl_sock,
+			err = netlink_change_ngroups(net->genl_sock,
 					mc_groups_longs * BITS_PER_LONG);
 			if (err) {
 				/*
@@ -181,12 +180,10 @@ int genl_register_mc_group(struct genl_f
 				 * increased on some sockets which is ok.
 				 */
 				rcu_read_unlock();
-				netlink_table_ungrab();
 				goto out;
 			}
 		}
 		rcu_read_unlock();
-		netlink_table_ungrab();
 	} else {
 		err = netlink_change_ngroups(init_net.genl_sock,
 					     mc_groups_longs * BITS_PER_LONG);
@@ -212,12 +209,10 @@ static void __genl_unregister_mc_group(s
 	struct net *net;
 	BUG_ON(grp->family != family);
 
-	netlink_table_grab();
 	rcu_read_lock();
 	for_each_net_rcu(net)
-		__netlink_clear_multicast_users(net->genl_sock, grp->id);
+		netlink_clear_multicast_users(net->genl_sock, grp->id);
 	rcu_read_unlock();
-	netlink_table_ungrab();
 
 	clear_bit(grp->id, mc_groups);
 	list_del(&grp->list);

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH] netlink: convert DIY reader/writer to mutex and RCU (v2)
  2010-03-08 21:32 [PATCH] netlink: convert DIY reader/writer to mutex and RCU Stephen Hemminger
@ 2010-03-08 22:00 ` Stephen Hemminger
  2010-03-16 15:00   ` Eric Dumazet
  0 siblings, 1 reply; 4+ messages in thread
From: Stephen Hemminger @ 2010-03-08 22:00 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

The netlink table locking was an open-coded version of a
reader/writer sleeping lock.  Change to using a mutex and RCU,
which makes the code clearer, shorter, and simpler.

Could use sk_list nulls but then would have to have kmem_cache
for netlink handles and that seems like unnecessary bloat.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
v1 -> v2 do RCU correctly...
  * use spinlock not mutex (not safe to sleep in normal RCU)
  * use _rcu variants of add/delete


 include/linux/netlink.h  |    5 -
 include/net/sock.h       |   12 +++
 net/netlink/af_netlink.c |  161 +++++++++++++++--------------------------------
 net/netlink/genetlink.c  |    9 --
 4 files changed, 66 insertions(+), 121 deletions(-)

--- a/net/netlink/af_netlink.c	2010-03-08 09:07:15.104547875 -0800
+++ b/net/netlink/af_netlink.c	2010-03-08 13:43:46.026097658 -0800
@@ -128,15 +128,11 @@ struct netlink_table {
 };
 
 static struct netlink_table *nl_table;
-
-static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);
+static DEFINE_SPINLOCK(nltable_lock);
 
 static int netlink_dump(struct sock *sk);
 static void netlink_destroy_callback(struct netlink_callback *cb);
 
-static DEFINE_RWLOCK(nl_table_lock);
-static atomic_t nl_table_users = ATOMIC_INIT(0);
-
 static ATOMIC_NOTIFIER_HEAD(netlink_chain);
 
 static u32 netlink_group_mask(u32 group)
@@ -171,61 +167,6 @@ static void netlink_sock_destruct(struct
 	WARN_ON(nlk_sk(sk)->groups);
 }
 
-/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
- * SMP. Look, when several writers sleep and reader wakes them up, all but one
- * immediately hit write lock and grab all the cpus. Exclusive sleep solves
- * this, _but_ remember, it adds useless work on UP machines.
- */
-
-void netlink_table_grab(void)
-	__acquires(nl_table_lock)
-{
-	might_sleep();
-
-	write_lock_irq(&nl_table_lock);
-
-	if (atomic_read(&nl_table_users)) {
-		DECLARE_WAITQUEUE(wait, current);
-
-		add_wait_queue_exclusive(&nl_table_wait, &wait);
-		for (;;) {
-			set_current_state(TASK_UNINTERRUPTIBLE);
-			if (atomic_read(&nl_table_users) == 0)
-				break;
-			write_unlock_irq(&nl_table_lock);
-			schedule();
-			write_lock_irq(&nl_table_lock);
-		}
-
-		__set_current_state(TASK_RUNNING);
-		remove_wait_queue(&nl_table_wait, &wait);
-	}
-}
-
-void netlink_table_ungrab(void)
-	__releases(nl_table_lock)
-{
-	write_unlock_irq(&nl_table_lock);
-	wake_up(&nl_table_wait);
-}
-
-static inline void
-netlink_lock_table(void)
-{
-	/* read_lock() synchronizes us to netlink_table_grab */
-
-	read_lock(&nl_table_lock);
-	atomic_inc(&nl_table_users);
-	read_unlock(&nl_table_lock);
-}
-
-static inline void
-netlink_unlock_table(void)
-{
-	if (atomic_dec_and_test(&nl_table_users))
-		wake_up(&nl_table_wait);
-}
-
 static inline struct sock *netlink_lookup(struct net *net, int protocol,
 					  u32 pid)
 {
@@ -234,9 +175,9 @@ static inline struct sock *netlink_looku
 	struct sock *sk;
 	struct hlist_node *node;
 
-	read_lock(&nl_table_lock);
+	rcu_read_lock();
 	head = nl_pid_hashfn(hash, pid);
-	sk_for_each(sk, node, head) {
+	sk_for_each_rcu(sk, node, head) {
 		if (net_eq(sock_net(sk), net) && (nlk_sk(sk)->pid == pid)) {
 			sock_hold(sk);
 			goto found;
@@ -244,7 +185,7 @@ static inline struct sock *netlink_looku
 	}
 	sk = NULL;
 found:
-	read_unlock(&nl_table_lock);
+	rcu_read_unlock();
 	return sk;
 }
 
@@ -299,7 +240,8 @@ static int nl_pid_hash_rehash(struct nl_
 		struct hlist_node *node, *tmp;
 
 		sk_for_each_safe(sk, node, tmp, &otable[i])
-			__sk_add_node(sk, nl_pid_hashfn(hash, nlk_sk(sk)->pid));
+			__sk_add_node_rcu(sk,
+					  nl_pid_hashfn(hash, nlk_sk(sk)->pid));
 	}
 
 	nl_pid_hash_free(otable, osize);
@@ -353,7 +295,7 @@ static int netlink_insert(struct sock *s
 	struct hlist_node *node;
 	int len;
 
-	netlink_table_grab();
+	spin_lock(&nltable_lock);
 	head = nl_pid_hashfn(hash, pid);
 	len = 0;
 	sk_for_each(osk, node, head) {
@@ -376,22 +318,22 @@ static int netlink_insert(struct sock *s
 		head = nl_pid_hashfn(hash, pid);
 	hash->entries++;
 	nlk_sk(sk)->pid = pid;
-	sk_add_node(sk, head);
+	sk_add_node_rcu(sk, head);
 	err = 0;
 
 err:
-	netlink_table_ungrab();
+	spin_unlock(&nltable_lock);
 	return err;
 }
 
 static void netlink_remove(struct sock *sk)
 {
-	netlink_table_grab();
+	spin_lock(&nltable_lock);
 	if (sk_del_node_init(sk))
 		nl_table[sk->sk_protocol].hash.entries--;
 	if (nlk_sk(sk)->subscriptions)
-		__sk_del_bind_node(sk);
-	netlink_table_ungrab();
+		__sk_del_bind_node_rcu(sk);
+	spin_unlock(&nltable_lock);
 }
 
 static struct proto netlink_proto = {
@@ -444,12 +386,14 @@ static int netlink_create(struct net *ne
 	if (protocol < 0 || protocol >= MAX_LINKS)
 		return -EPROTONOSUPPORT;
 
-	netlink_lock_table();
+	spin_lock(&nltable_lock);
 #ifdef CONFIG_MODULES
 	if (!nl_table[protocol].registered) {
-		netlink_unlock_table();
+		spin_unlock(&nltable_lock);
+
 		request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
-		netlink_lock_table();
+
+		spin_lock(&nltable_lock);
 	}
 #endif
 	if (nl_table[protocol].registered &&
@@ -458,7 +402,7 @@ static int netlink_create(struct net *ne
 	else
 		err = -EPROTONOSUPPORT;
 	cb_mutex = nl_table[protocol].cb_mutex;
-	netlink_unlock_table();
+	spin_unlock(&nltable_lock);
 
 	if (err < 0)
 		goto out;
@@ -515,7 +459,7 @@ static int netlink_release(struct socket
 
 	module_put(nlk->module);
 
-	netlink_table_grab();
+	spin_lock(&nltable_lock);
 	if (netlink_is_kernel(sk)) {
 		BUG_ON(nl_table[sk->sk_protocol].registered == 0);
 		if (--nl_table[sk->sk_protocol].registered == 0) {
@@ -525,7 +469,7 @@ static int netlink_release(struct socket
 		}
 	} else if (nlk->subscriptions)
 		netlink_update_listeners(sk);
-	netlink_table_ungrab();
+	spin_unlock(&nltable_lock);
 
 	kfree(nlk->groups);
 	nlk->groups = NULL;
@@ -533,6 +477,8 @@ static int netlink_release(struct socket
 	local_bh_disable();
 	sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
 	local_bh_enable();
+
+	synchronize_rcu();
 	sock_put(sk);
 	return 0;
 }
@@ -551,7 +497,7 @@ static int netlink_autobind(struct socke
 
 retry:
 	cond_resched();
-	netlink_table_grab();
+	spin_lock(&nltable_lock);
 	head = nl_pid_hashfn(hash, pid);
 	sk_for_each(osk, node, head) {
 		if (!net_eq(sock_net(osk), net))
@@ -561,11 +507,11 @@ retry:
 			pid = rover--;
 			if (rover > -4097)
 				rover = -4097;
-			netlink_table_ungrab();
+			spin_unlock(&nltable_lock);
 			goto retry;
 		}
 	}
-	netlink_table_ungrab();
+	spin_unlock(&nltable_lock);
 
 	err = netlink_insert(sk, net, pid);
 	if (err == -EADDRINUSE)
@@ -590,9 +536,9 @@ netlink_update_subscriptions(struct sock
 	struct netlink_sock *nlk = nlk_sk(sk);
 
 	if (nlk->subscriptions && !subscriptions)
-		__sk_del_bind_node(sk);
+		__sk_del_bind_node_rcu(sk);
 	else if (!nlk->subscriptions && subscriptions)
-		sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
+		sk_add_bind_node_rcu(sk, &nl_table[sk->sk_protocol].mc_list);
 	nlk->subscriptions = subscriptions;
 }
 
@@ -603,7 +549,7 @@ static int netlink_realloc_groups(struct
 	unsigned long *new_groups;
 	int err = 0;
 
-	netlink_table_grab();
+	spin_lock(&nltable_lock);
 
 	groups = nl_table[sk->sk_protocol].groups;
 	if (!nl_table[sk->sk_protocol].registered) {
@@ -625,7 +571,7 @@ static int netlink_realloc_groups(struct
 	nlk->groups = new_groups;
 	nlk->ngroups = groups;
  out_unlock:
-	netlink_table_ungrab();
+	spin_unlock(&nltable_lock);
 	return err;
 }
 
@@ -664,13 +610,13 @@ static int netlink_bind(struct socket *s
 	if (!nladdr->nl_groups && (nlk->groups == NULL || !(u32)nlk->groups[0]))
 		return 0;
 
-	netlink_table_grab();
+	spin_lock(&nltable_lock);
 	netlink_update_subscriptions(sk, nlk->subscriptions +
 					 hweight32(nladdr->nl_groups) -
 					 hweight32(nlk->groups[0]));
 	nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups;
 	netlink_update_listeners(sk);
-	netlink_table_ungrab();
+	spin_unlock(&nltable_lock);
 
 	return 0;
 }
@@ -1059,15 +1005,12 @@ int netlink_broadcast(struct sock *ssk, 
 
 	/* While we sleep in clone, do not allow to change socket list */
 
-	netlink_lock_table();
-
-	sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list)
+	rcu_read_lock();
+	sk_for_each_bound_rcu(sk, node, &nl_table[ssk->sk_protocol].mc_list)
 		do_one_broadcast(sk, &info);
+	rcu_read_unlock();
 
 	kfree_skb(skb);
-
-	netlink_unlock_table();
-
 	kfree_skb(info.skb2);
 
 	if (info.delivery_failure)
@@ -1129,12 +1072,12 @@ void netlink_set_err(struct sock *ssk, u
 	/* sk->sk_err wants a positive error value */
 	info.code = -code;
 
-	read_lock(&nl_table_lock);
+	rcu_read_lock();
 
-	sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list)
+	sk_for_each_bound_rcu(sk, node, &nl_table[ssk->sk_protocol].mc_list)
 		do_one_set_err(sk, &info);
 
-	read_unlock(&nl_table_lock);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL(netlink_set_err);
 
@@ -1187,10 +1130,10 @@ static int netlink_setsockopt(struct soc
 			return err;
 		if (!val || val - 1 >= nlk->ngroups)
 			return -EINVAL;
-		netlink_table_grab();
+		spin_lock(&nltable_lock);
 		netlink_update_socket_mc(nlk, val,
 					 optname == NETLINK_ADD_MEMBERSHIP);
-		netlink_table_ungrab();
+		spin_unlock(&nltable_lock);
 		err = 0;
 		break;
 	}
@@ -1515,7 +1458,7 @@ netlink_kernel_create(struct net *net, i
 	nlk = nlk_sk(sk);
 	nlk->flags |= NETLINK_KERNEL_SOCKET;
 
-	netlink_table_grab();
+	spin_lock(&nltable_lock);
 	if (!nl_table[unit].registered) {
 		nl_table[unit].groups = groups;
 		nl_table[unit].listeners = listeners;
@@ -1526,7 +1469,7 @@ netlink_kernel_create(struct net *net, i
 		kfree(listeners);
 		nl_table[unit].registered++;
 	}
-	netlink_table_ungrab();
+	spin_unlock(&nltable_lock);
 	return sk;
 
 out_sock_release:
@@ -1557,7 +1500,7 @@ static void netlink_free_old_listeners(s
 	kfree(lrh->ptr);
 }
 
-int __netlink_change_ngroups(struct sock *sk, unsigned int groups)
+static int __netlink_change_ngroups(struct sock *sk, unsigned int groups)
 {
 	unsigned long *listeners, *old = NULL;
 	struct listeners_rcu_head *old_rcu_head;
@@ -1608,14 +1551,14 @@ int netlink_change_ngroups(struct sock *
 {
 	int err;
 
-	netlink_table_grab();
+	spin_lock(&nltable_lock);
 	err = __netlink_change_ngroups(sk, groups);
-	netlink_table_ungrab();
+	spin_unlock(&nltable_lock);
 
 	return err;
 }
 
-void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
+static void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
 {
 	struct sock *sk;
 	struct hlist_node *node;
@@ -1635,9 +1578,9 @@ void __netlink_clear_multicast_users(str
  */
 void netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
 {
-	netlink_table_grab();
+	spin_lock(&nltable_lock);
 	__netlink_clear_multicast_users(ksk, group);
-	netlink_table_ungrab();
+	spin_unlock(&nltable_lock);
 }
 
 void netlink_set_nonroot(int protocol, unsigned int flags)
@@ -1902,7 +1845,7 @@ static struct sock *netlink_seq_socket_i
 		struct nl_pid_hash *hash = &nl_table[i].hash;
 
 		for (j = 0; j <= hash->mask; j++) {
-			sk_for_each(s, node, &hash->table[j]) {
+			sk_for_each_rcu(s, node, &hash->table[j]) {
 				if (sock_net(s) != seq_file_net(seq))
 					continue;
 				if (off == pos) {
@@ -1918,9 +1861,9 @@ static struct sock *netlink_seq_socket_i
 }
 
 static void *netlink_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(nl_table_lock)
+	__acquires(RCU)
 {
-	read_lock(&nl_table_lock);
+	rcu_read_lock();
 	return *pos ? netlink_seq_socket_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 }
 
@@ -1967,9 +1910,9 @@ static void *netlink_seq_next(struct seq
 }
 
 static void netlink_seq_stop(struct seq_file *seq, void *v)
-	__releases(nl_table_lock)
+	__releases(RCU)
 {
-	read_unlock(&nl_table_lock);
+	rcu_read_unlock();
 }
 
 
--- a/include/net/sock.h	2010-03-08 10:52:34.113924084 -0800
+++ b/include/net/sock.h	2010-03-08 13:54:27.815939404 -0800
@@ -451,6 +451,10 @@ static __inline__ void __sk_add_node(str
 {
 	hlist_add_head(&sk->sk_node, list);
 }
+static __inline__ void __sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
+{
+	hlist_add_head_rcu(&sk->sk_node, list);
+}
 
 static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list)
 {
@@ -479,12 +483,18 @@ static __inline__ void __sk_del_bind_nod
 {
 	__hlist_del(&sk->sk_bind_node);
 }
+#define __sk_del_bind_node_rcu(sk) __sk_del_bind_node(sk)
 
 static __inline__ void sk_add_bind_node(struct sock *sk,
 					struct hlist_head *list)
 {
 	hlist_add_head(&sk->sk_bind_node, list);
 }
+static __inline__ void sk_add_bind_node_rcu(struct sock *sk,
+					struct hlist_head *list)
+{
+	hlist_add_head_rcu(&sk->sk_bind_node, list);
+}
 
 #define sk_for_each(__sk, node, list) \
 	hlist_for_each_entry(__sk, node, list, sk_node)
@@ -507,6 +517,8 @@ static __inline__ void sk_add_bind_node(
 	hlist_for_each_entry_safe(__sk, node, tmp, list, sk_node)
 #define sk_for_each_bound(__sk, node, list) \
 	hlist_for_each_entry(__sk, node, list, sk_bind_node)
+#define sk_for_each_bound_rcu(__sk, node, list) \
+	hlist_for_each_entry_rcu(__sk, node, list, sk_bind_node)
 
 /* Sock flags */
 enum sock_flags {
--- a/include/linux/netlink.h	2010-03-08 10:56:35.314235687 -0800
+++ b/include/linux/netlink.h	2010-03-08 11:04:33.414547616 -0800
@@ -170,18 +170,13 @@ struct netlink_skb_parms {
 #define NETLINK_CREDS(skb)	(&NETLINK_CB((skb)).creds)
 
 
-extern void netlink_table_grab(void);
-extern void netlink_table_ungrab(void);
-
 extern struct sock *netlink_kernel_create(struct net *net,
 					  int unit,unsigned int groups,
 					  void (*input)(struct sk_buff *skb),
 					  struct mutex *cb_mutex,
 					  struct module *module);
 extern void netlink_kernel_release(struct sock *sk);
-extern int __netlink_change_ngroups(struct sock *sk, unsigned int groups);
 extern int netlink_change_ngroups(struct sock *sk, unsigned int groups);
-extern void __netlink_clear_multicast_users(struct sock *sk, unsigned int group);
 extern void netlink_clear_multicast_users(struct sock *sk, unsigned int group);
 extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err);
 extern int netlink_has_listeners(struct sock *sk, unsigned int group);
--- a/net/netlink/genetlink.c	2010-03-08 10:56:02.194547526 -0800
+++ b/net/netlink/genetlink.c	2010-03-08 11:01:05.865177057 -0800
@@ -168,10 +168,9 @@ int genl_register_mc_group(struct genl_f
 	if (family->netnsok) {
 		struct net *net;
 
-		netlink_table_grab();
 		rcu_read_lock();
 		for_each_net_rcu(net) {
-			err = __netlink_change_ngroups(net->genl_sock,
+			err = netlink_change_ngroups(net->genl_sock,
 					mc_groups_longs * BITS_PER_LONG);
 			if (err) {
 				/*
@@ -181,12 +180,10 @@ int genl_register_mc_group(struct genl_f
 				 * increased on some sockets which is ok.
 				 */
 				rcu_read_unlock();
-				netlink_table_ungrab();
 				goto out;
 			}
 		}
 		rcu_read_unlock();
-		netlink_table_ungrab();
 	} else {
 		err = netlink_change_ngroups(init_net.genl_sock,
 					     mc_groups_longs * BITS_PER_LONG);
@@ -212,12 +209,10 @@ static void __genl_unregister_mc_group(s
 	struct net *net;
 	BUG_ON(grp->family != family);
 
-	netlink_table_grab();
 	rcu_read_lock();
 	for_each_net_rcu(net)
-		__netlink_clear_multicast_users(net->genl_sock, grp->id);
+		netlink_clear_multicast_users(net->genl_sock, grp->id);
 	rcu_read_unlock();
-	netlink_table_ungrab();
 
 	clear_bit(grp->id, mc_groups);
 	list_del(&grp->list);

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] netlink: convert DIY reader/writer to mutex and RCU (v2)
  2010-03-08 22:00 ` [PATCH] netlink: convert DIY reader/writer to mutex and RCU (v2) Stephen Hemminger
@ 2010-03-16 15:00   ` Eric Dumazet
  2010-03-19  0:44     ` Stephen Hemminger
  0 siblings, 1 reply; 4+ messages in thread
From: Eric Dumazet @ 2010-03-16 15:00 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

Le lundi 08 mars 2010 à 14:00 -0800, Stephen Hemminger a écrit :
> The netlink table locking was open coded version of reader/writer
> sleeping lock.  Change to using mutex and RCU which makes
> code clearer, shorter, and simpler.
> 
> Could use sk_list nulls but then would have to have kmem_cache
> for netlink handles and that seems like unnecessary bloat.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> 
> ---
> v1 -> v2 do RCU correctly...
>   * use spinlock not mutex (not safe to sleep in normal RCU)
>   * use _rcu variants of add/delete


>  	kfree(nlk->groups);
>  	nlk->groups = NULL;
> @@ -533,6 +477,8 @@ static int netlink_release(struct socket
>  	local_bh_disable();
>  	sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
>  	local_bh_enable();
> +
> +	synchronize_rcu();
>  	sock_put(sk);
>  	return 0;

I am a bit scared by synchronize_rcu() proliferation. This can slow down
some workloads (some scripts invoking netlink commands, not using batch
mode)

We could change sk_prot_free() behavior and optionaly call a callback to
free a socket (and the module_put()) after rcu grace period. As the
rcu_head is not inside struct sock, I could not find a generic way to
code this.

For TCP/UDP sockets this was considered not a viable alternative, but
for other sockets its certainly better than synchronize_rcu() ?

diff --git a/include/net/sock.h b/include/net/sock.h
index 092b055..3a2d598 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -693,6 +693,8 @@ struct proto {
 	int			(*backlog_rcv) (struct sock *sk, 
 						struct sk_buff *skb);
 
+	void			(*free_socket)(struct sock *sk);
+
 	/* Keeping track of sk's, looking them up, and port selection methods. */
 	void			(*hash)(struct sock *sk);
 	void			(*unhash)(struct sock *sk);
diff --git a/net/core/sock.c b/net/core/sock.c
index c5812bb..3060c9b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1027,18 +1027,20 @@ out_free:
 
 static void sk_prot_free(struct proto *prot, struct sock *sk)
 {
-	struct kmem_cache *slab;
-	struct module *owner;
+	security_sk_free(sk);
+	if (prot->free_socket)
+		prot->free_socket(sk);
+	else {
+		struct kmem_cache *slab;
 
-	owner = prot->owner;
-	slab = prot->slab;
+		slab = prot->slab;
 
-	security_sk_free(sk);
-	if (slab != NULL)
-		kmem_cache_free(slab, sk);
-	else
-		kfree(sk);
-	module_put(owner);
+		if (slab != NULL)
+			kmem_cache_free(slab, sk);
+		else
+			kfree(sk);
+		module_put(prot->owner);
+	}
 }
 
 /**
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 320d042..fae2344 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -81,6 +81,7 @@ struct netlink_sock {
 	struct mutex		cb_def_mutex;
 	void			(*netlink_rcv)(struct sk_buff *skb);
 	struct module		*module;
+	struct rcu_head		rcu;
 };
 
 struct listeners_rcu_head {
@@ -394,10 +395,22 @@ static void netlink_remove(struct sock *sk)
 	netlink_table_ungrab();
 }
 
+static void rcu_free_socket(struct rcu_head *rcu)
+{
+	kfree(container_of(rcu, struct netlink_sock, rcu));
+	module_put(THIS_MODULE);
+}
+
+static void free_socket(struct sock *sk)
+{
+	call_rcu(&nlk_sk(sk)->rcu, rcu_free_socket);
+}
+
 static struct proto netlink_proto = {
 	.name	  = "NETLINK",
 	.owner	  = THIS_MODULE,
 	.obj_size = sizeof(struct netlink_sock),
+	.free_socket = free_socket,
 };
 
 static int __netlink_create(struct net *net, struct socket *sock,



^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] netlink: convert DIY reader/writer to mutex and RCU (v2)
  2010-03-16 15:00   ` Eric Dumazet
@ 2010-03-19  0:44     ` Stephen Hemminger
  0 siblings, 0 replies; 4+ messages in thread
From: Stephen Hemminger @ 2010-03-19  0:44 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev

On Tue, 16 Mar 2010 16:00:35 +0100
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> Le lundi 08 mars 2010 à 14:00 -0800, Stephen Hemminger a écrit :
> > The netlink table locking was open coded version of reader/writer
> > sleeping lock.  Change to using mutex and RCU which makes
> > code clearer, shorter, and simpler.
> > 
> > Could use sk_list nulls but then would have to have kmem_cache
> > for netlink handles and that seems like unnecessary bloat.
> > 
> > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> > 
> > ---

Maybe using rcu list_nulls would be better.
The problem is what about the bind_node hash list.

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2010-03-19  0:45 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-03-08 21:32 [PATCH] netlink: convert DIY reader/writer to mutex and RCU Stephen Hemminger
2010-03-08 22:00 ` [PATCH] netlink: convert DIY reader/writer to mutex and RCU (v2) Stephen Hemminger
2010-03-16 15:00   ` Eric Dumazet
2010-03-19  0:44     ` Stephen Hemminger

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).