From: Stephen Hemminger <stephen@networkplumber.org>
To: davem@davemloft.net
Cc: netdev@vger.kernel.org, Stephen Hemminger <stephen@networkplumber.org>
Subject: [PATCH net-next 05/12] vxlan: fix race caused by dropping rtnl_unlock
Date: Mon, 10 Jun 2013 13:24:50 -0700 [thread overview]
Message-ID: <20130610132450.7ff7236a@nehalam.linuxnetplumber.net> (raw)
In-Reply-To: <20130610200524.721617349@vyatta.com>
It is possible for two CPUs to race when creating a vxlan device.
In most cases this is harmless, but the ability to assign the "next
available vxlan device" relies on the rtnl lock being held across the
whole operation. Therefore two instances of calling:
ip li add vxlan%d vxlan ...
could collide and create two devices with the same name.
To fix this, defer creation of the socket to a work queue, and
handle possible races there. Introduce a lock to ensure that
changes to the vxlan socket hash list are SMP safe.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
--- a/drivers/net/vxlan.c 2013-06-10 12:19:59.002097934 -0700
+++ b/drivers/net/vxlan.c 2013-06-10 12:19:59.522092022 -0700
@@ -94,6 +94,7 @@ struct vxlan_sock {
struct vxlan_net {
struct list_head vxlan_list;
struct hlist_head sock_list[PORT_HASH_SIZE];
+ spinlock_t sock_lock;
};
struct vxlan_rdst {
@@ -131,7 +132,9 @@ struct vxlan_dev {
__u8 ttl;
u32 flags; /* VXLAN_F_* below */
+ struct work_struct sock_work;
struct work_struct igmp_work;
+
unsigned long age_interval;
struct timer_list age_timer;
spinlock_t hash_lock;
@@ -151,6 +154,8 @@ struct vxlan_dev {
static u32 vxlan_salt __read_mostly;
static struct workqueue_struct *vxlan_wq;
+static void vxlan_sock_work(struct work_struct *work);
+
/* Virtual Network hash table head */
static inline struct hlist_head *vni_head(struct vxlan_sock *vs, u32 id)
{
@@ -670,12 +675,15 @@ static void vxlan_sock_hold(struct vxlan
atomic_inc(&vs->refcnt);
}
-static void vxlan_sock_release(struct vxlan_sock *vs)
+static void vxlan_sock_release(struct vxlan_net *vn, struct vxlan_sock *vs)
{
if (!atomic_dec_and_test(&vs->refcnt))
return;
+ spin_lock(&vn->sock_lock);
hlist_del_rcu(&vs->hlist);
+ spin_unlock(&vn->sock_lock);
+
queue_work(vxlan_wq, &vs->del_work);
}
@@ -700,7 +708,7 @@ static void vxlan_igmp_work(struct work_
ip_mc_leave_group(sk, &mreq);
release_sock(sk);
- vxlan_sock_release(vs);
+ vxlan_sock_release(vn, vs);
dev_put(vxlan->dev);
}
@@ -1238,10 +1246,29 @@ static void vxlan_cleanup(unsigned long
/* Setup stats when device is created */
static int vxlan_init(struct net_device *dev)
{
+ struct vxlan_dev *vxlan = netdev_priv(dev);
+ struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
+ struct vxlan_sock *vs;
+ __u32 vni = vxlan->default_dst.remote_vni;
+
dev->tstats = alloc_percpu(struct pcpu_tstats);
if (!dev->tstats)
return -ENOMEM;
+ spin_lock(&vn->sock_lock);
+ vs = vxlan_find_port(dev_net(dev), vxlan->dst_port);
+ if (vs) {
+ /* If we have a socket with same port already, reuse it */
+ atomic_inc(&vs->refcnt);
+ vxlan->vn_sock = vs;
+ hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni));
+ } else {
+ /* otherwise make new socket outside of RTNL */
+ dev_hold(dev);
+ queue_work(vxlan_wq, &vxlan->sock_work);
+ }
+ spin_unlock(&vn->sock_lock);
+
return 0;
}
@@ -1249,9 +1276,14 @@ static int vxlan_init(struct net_device
static int vxlan_open(struct net_device *dev)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
+ struct vxlan_sock *vs = vxlan->vn_sock;
+
+ /* socket hasn't been created */
+ if (!vs)
+ return -ENOTCONN;
if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) {
- vxlan_sock_hold(vxlan->vn_sock);
+ vxlan_sock_hold(vs);
dev_hold(dev);
queue_work(vxlan_wq, &vxlan->igmp_work);
}
@@ -1283,9 +1315,10 @@ static void vxlan_flush(struct vxlan_dev
static int vxlan_stop(struct net_device *dev)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
+ struct vxlan_sock *vs = vxlan->vn_sock;
- if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) {
- vxlan_sock_hold(vxlan->vn_sock);
+ if (vs && IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) {
+ vxlan_sock_hold(vs);
dev_hold(dev);
queue_work(vxlan_wq, &vxlan->igmp_work);
}
@@ -1358,6 +1391,7 @@ static void vxlan_setup(struct net_devic
INIT_LIST_HEAD(&vxlan->next);
spin_lock_init(&vxlan->hash_lock);
INIT_WORK(&vxlan->igmp_work, vxlan_igmp_work);
+ INIT_WORK(&vxlan->sock_work, vxlan_sock_work);
init_timer_deferrable(&vxlan->age_timer);
vxlan->age_timer.function = vxlan_cleanup;
@@ -1449,7 +1483,6 @@ static void vxlan_del_work(struct work_s
kfree_rcu(vs, rcu);
}
-/* Create new listen socket if needed */
static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port)
{
struct vxlan_sock *vs;
@@ -1506,13 +1539,52 @@ static struct vxlan_sock *vxlan_socket_c
return vs;
}
+/* Scheduled at device creation to bind to a socket */
+static void vxlan_sock_work(struct work_struct *work)
+{
+ struct vxlan_dev *vxlan
+ = container_of(work, struct vxlan_dev, sock_work);
+ struct net_device *dev = vxlan->dev;
+ struct net *net = dev_net(dev);
+ __u32 vni = vxlan->default_dst.remote_vni;
+ __be16 port = vxlan->dst_port;
+ struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+ struct vxlan_sock *nvs, *ovs;
+
+ nvs = vxlan_socket_create(net, port);
+ if (IS_ERR(nvs)) {
+ netdev_err(vxlan->dev, "Can not create UDP socket, %ld\n",
+ PTR_ERR(nvs));
+ goto out;
+ }
+
+ spin_lock(&vn->sock_lock);
+ /* Look again to see if can reuse socket */
+ ovs = vxlan_find_port(net, port);
+ if (ovs) {
+ atomic_inc(&ovs->refcnt);
+ vxlan->vn_sock = ovs;
+ hlist_add_head_rcu(&vxlan->hlist, vni_head(ovs, vni));
+ spin_unlock(&vn->sock_lock);
+
+ sk_release_kernel(nvs->sock->sk);
+ kfree(nvs);
+ } else {
+ vxlan->vn_sock = nvs;
+ hlist_add_head_rcu(&nvs->hlist, vs_head(net, port));
+ hlist_add_head_rcu(&vxlan->hlist, vni_head(nvs, vni));
+ spin_unlock(&vn->sock_lock);
+ }
+out:
+ dev_put(dev);
+}
+
static int vxlan_newlink(struct net *net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
struct vxlan_net *vn = net_generic(net, vxlan_net_id);
struct vxlan_dev *vxlan = netdev_priv(dev);
struct vxlan_rdst *dst = &vxlan->default_dst;
- struct vxlan_sock *vs;
__u32 vni;
int err;
@@ -1590,31 +1662,13 @@ static int vxlan_newlink(struct net *net
return -EEXIST;
}
- vs = vxlan_find_port(net, vxlan->dst_port);
- if (vs)
- atomic_inc(&vs->refcnt);
- else {
- /* Drop lock because socket create acquires RTNL lock */
- rtnl_unlock();
- vs = vxlan_socket_create(net, vxlan->dst_port);
- rtnl_lock();
- if (IS_ERR(vs))
- return PTR_ERR(vs);
-
- hlist_add_head_rcu(&vs->hlist, vs_head(net, vxlan->dst_port));
- }
- vxlan->vn_sock = vs;
-
SET_ETHTOOL_OPS(dev, &vxlan_ethtool_ops);
err = register_netdevice(dev);
- if (err) {
- vxlan_sock_release(vs);
+ if (err)
return err;
- }
list_add(&vxlan->next, &vn->vxlan_list);
- hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni));
return 0;
}
@@ -1622,12 +1676,14 @@ static int vxlan_newlink(struct net *net
static void vxlan_dellink(struct net_device *dev, struct list_head *head)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
+ struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
struct vxlan_sock *vs = vxlan->vn_sock;
hlist_del_rcu(&vxlan->hlist);
list_del(&vxlan->next);
unregister_netdevice_queue(dev, head);
- vxlan_sock_release(vs);
+ if (vs)
+ vxlan_sock_release(vn, vs);
}
static size_t vxlan_get_size(const struct net_device *dev)
@@ -1716,6 +1772,7 @@ static __net_init int vxlan_init_net(str
unsigned int h;
INIT_LIST_HEAD(&vn->vxlan_list);
+ spin_lock_init(&vn->sock_lock);
for (h = 0; h < PORT_HASH_SIZE; ++h)
INIT_HLIST_HEAD(&vn->sock_list[h]);
next prev parent reply other threads:[~2013-06-10 20:25 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <20130610200524.721617349@vyatta.com>
2013-06-10 20:24 ` [PATCH net-next 12/12] vxlan: bump version Stephen Hemminger
2013-06-10 20:24 ` [PATCH net-next 11/12] vxlan whitespace cleanup Stephen Hemminger
2013-06-10 20:24 ` [PATCH net-next 10/12] vxlan: use initializer for dummy structures Stephen Hemminger
2013-06-10 20:24 ` [PATCH net-next 09/12] vxlan: port module param should be ushort Stephen Hemminger
2013-06-10 20:24 ` [PATCH net-next 07/12] vxlan: make vxlan_xmit_one void Stephen Hemminger
2013-06-11 1:30 ` Cong Wang
2013-06-11 3:14 ` Stephen Hemminger
2013-06-10 20:24 ` [PATCH net-next 06/12] vxlan: move freecpu to uninit Stephen Hemminger
2013-06-10 20:24 ` Stephen Hemminger [this message]
2013-06-10 20:24 ` [PATCH net-next 04/12] vxlan: send notification when MAC migrates Stephen Hemminger
2013-06-10 20:24 ` [PATCH net-next 03/12] vxlan: move IGMP join/leave to work queue Stephen Hemminger
2013-06-11 2:01 ` Cong Wang
2013-06-10 20:24 ` [PATCH net-next 02/12] vxlan: fix race between flush and incoming learning Stephen Hemminger
2013-06-11 0:46 ` Cong Wang
2013-06-11 2:00 ` Cong Wang
2013-06-10 20:24 ` [PATCH net-next 01/12] vxlan: fix crash from work pending on module removal Stephen Hemminger
2013-06-11 2:00 ` Cong Wang
2013-06-10 20:25 ` [PATCH net-next 08/12] vxlan: convert remotes list to list_rcu Stephen Hemminger
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20130610132450.7ff7236a@nehalam.linuxnetplumber.net \
--to=stephen@networkplumber.org \
--cc=davem@davemloft.net \
--cc=netdev@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.