* [PATCH net-next 12/12] vxlan: bump version
       [not found] <20130610200524.721617349@vyatta.com>
@ 2013-06-10 20:24 ` Stephen Hemminger
  2013-06-10 20:24 ` [PATCH net-next 11/12] vxlan whitespace cleanup Stephen Hemminger
                   ` (10 subsequent siblings)
  11 siblings, 0 replies; 18+ messages in thread
From: Stephen Hemminger @ 2013-06-10 20:24 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
--- a/drivers/net/vxlan.c	2013-06-10 12:20:04.974030028 -0700
+++ b/drivers/net/vxlan.c	2013-06-10 12:20:05.754021159 -0700
@@ -42,7 +42,7 @@
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 
-#define VXLAN_VERSION	"0.1"
+#define VXLAN_VERSION	"0.2"
 
 #define PORT_HASH_BITS	8
 #define PORT_HASH_SIZE  (1<<PORT_HASH_BITS)
^ permalink raw reply	[flat|nested] 18+ messages in thread
* [PATCH net-next 11/12] vxlan whitespace cleanup
       [not found] <20130610200524.721617349@vyatta.com>
  2013-06-10 20:24 ` [PATCH net-next 12/12] vxlan: bump version Stephen Hemminger
@ 2013-06-10 20:24 ` Stephen Hemminger
  2013-06-10 20:24 ` [PATCH net-next 10/12] vxlan: use initializer for dummy structures Stephen Hemminger
                   ` (9 subsequent siblings)
  11 siblings, 0 replies; 18+ messages in thread
From: Stephen Hemminger @ 2013-06-10 20:24 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger, David L Stevens
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Acked-by: David L Stevens <dlstevens@us.ibm.com>
--- a/drivers/net/vxlan.c	2013-06-10 12:20:04.266038079 -0700
+++ b/drivers/net/vxlan.c	2013-06-10 12:20:04.974030028 -0700
@@ -210,9 +210,9 @@ static struct vxlan_dev *vxlan_find_vni(
 
 /* Fill in neighbour message in skbuff. */
 static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
-			   const struct vxlan_fdb *fdb,
-			   u32 portid, u32 seq, int type, unsigned int flags,
-			   const struct vxlan_rdst *rdst)
+			  const struct vxlan_fdb *fdb,
+			  u32 portid, u32 seq, int type, unsigned int flags,
+			  const struct vxlan_rdst *rdst)
 {
 	unsigned long now = jiffies;
 	struct nda_cacheinfo ci;
@@ -1032,7 +1032,7 @@ static void vxlan_xmit_one(struct sk_buf
 	struct flowi4 fl4;
 	__be32 dst;
 	__be16 src_port, dst_port;
-        u32 vni;
+	u32 vni;
 	__be16 df = 0;
 	__u8 tos, ttl;
 
^ permalink raw reply	[flat|nested] 18+ messages in thread
* [PATCH net-next 10/12] vxlan:  use initializer for dummy structures
       [not found] <20130610200524.721617349@vyatta.com>
  2013-06-10 20:24 ` [PATCH net-next 12/12] vxlan: bump version Stephen Hemminger
  2013-06-10 20:24 ` [PATCH net-next 11/12] vxlan whitespace cleanup Stephen Hemminger
@ 2013-06-10 20:24 ` Stephen Hemminger
  2013-06-10 20:24 ` [PATCH net-next 09/12] vxlan: port module param should be ushort Stephen Hemminger
                   ` (8 subsequent siblings)
  11 siblings, 0 replies; 18+ messages in thread
From: Stephen Hemminger @ 2013-06-10 20:24 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger
For the notification code, a couple of places build fdb entries on
the stack; use structure initialization instead and fix formatting.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
--- a/drivers/net/vxlan.c	2013-06-10 12:20:03.034052087 -0700
+++ b/drivers/net/vxlan.c	2013-06-10 12:20:04.266038079 -0700
@@ -311,14 +311,13 @@ errout:
 static void vxlan_ip_miss(struct net_device *dev, __be32 ipa)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
-	struct vxlan_fdb f;
-	struct vxlan_rdst remote;
-
-	memset(&f, 0, sizeof f);
-	f.state = NUD_STALE;
-
-	remote.remote_ip = ipa; /* goes to NDA_DST */
-	remote.remote_vni = VXLAN_N_VID;
+	struct vxlan_fdb f = {
+		.state = NUD_STALE,
+	};
+	struct vxlan_rdst remote = {
+		.remote_ip = ipa, /* goes to NDA_DST */
+		.remote_vni = VXLAN_N_VID,
+	};
 
 	INIT_LIST_HEAD(&f.remotes);
 	list_add_rcu(&remote.list, &f.remotes);
@@ -328,11 +327,11 @@ static void vxlan_ip_miss(struct net_dev
 
 static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN])
 {
-	struct vxlan_fdb	f;
+	struct vxlan_fdb f = {
+		.state = NUD_STALE,
+	};
 
-	memset(&f, 0, sizeof f);
 	INIT_LIST_HEAD(&f.remotes);
-	f.state = NUD_STALE;
 	memcpy(f.eth_addr, eth_addr, ETH_ALEN);
 
 	vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH);
@@ -1501,6 +1500,7 @@ static struct vxlan_sock *vxlan_socket_c
 	struct sockaddr_in vxlan_addr = {
 		.sin_family = AF_INET,
 		.sin_addr.s_addr = htonl(INADDR_ANY),
+		.sin_port = port,
 	};
 	int rc;
 	unsigned int h;
@@ -1526,8 +1526,6 @@ static struct vxlan_sock *vxlan_socket_c
 	sk = vs->sock->sk;
 	sk_change_net(sk, net);
 
-	vxlan_addr.sin_port = port;
-
 	rc = kernel_bind(vs->sock, (struct sockaddr *) &vxlan_addr,
 			 sizeof(vxlan_addr));
 	if (rc < 0) {
^ permalink raw reply	[flat|nested] 18+ messages in thread
* [PATCH net-next 09/12] vxlan: port module param should be ushort
       [not found] <20130610200524.721617349@vyatta.com>
                   ` (2 preceding siblings ...)
  2013-06-10 20:24 ` [PATCH net-next 10/12] vxlan: use initializer for dummy structures Stephen Hemminger
@ 2013-06-10 20:24 ` Stephen Hemminger
  2013-06-10 20:24 ` [PATCH net-next 07/12] vxlan: make vxlan_xmit_one void Stephen Hemminger
                   ` (7 subsequent siblings)
  11 siblings, 0 replies; 18+ messages in thread
From: Stephen Hemminger @ 2013-06-10 20:24 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger
UDP ports are limited to 16 bits.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
--- a/drivers/net/vxlan.c	2013-06-10 12:20:02.174061866 -0700
+++ b/drivers/net/vxlan.c	2013-06-10 12:20:03.034052087 -0700
@@ -70,8 +70,8 @@ struct vxlanhdr {
  * The IANA assigned port is 4789, but the Linux default is 8472
  * for compatability with early adopters.
  */
-static unsigned int vxlan_port __read_mostly = 8472;
-module_param_named(udp_port, vxlan_port, uint, 0444);
+static unsigned short vxlan_port __read_mostly = 8472;
+module_param_named(udp_port, vxlan_port, ushort, 0444);
 MODULE_PARM_DESC(udp_port, "Destination UDP port");
 
 static bool log_ecn_error = true;
^ permalink raw reply	[flat|nested] 18+ messages in thread
* [PATCH net-next 07/12] vxlan: make vxlan_xmit_one void
       [not found] <20130610200524.721617349@vyatta.com>
                   ` (3 preceding siblings ...)
  2013-06-10 20:24 ` [PATCH net-next 09/12] vxlan: port module param should be ushort Stephen Hemminger
@ 2013-06-10 20:24 ` Stephen Hemminger
  2013-06-11  1:30   ` Cong Wang
  2013-06-10 20:24 ` [PATCH net-next 06/12] vxlan: move freecpu to uninit Stephen Hemminger
                   ` (6 subsequent siblings)
  11 siblings, 1 reply; 18+ messages in thread
From: Stephen Hemminger @ 2013-06-10 20:24 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger, David L Stevens
The function vxlan_xmit_one always returns NETDEV_TX_OK, so there
is no point in keeping track of return values etc.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Acked-by: David L Stevens <dlstevens@us.ibm.com>
--- a/drivers/net/vxlan.c	2013-06-10 12:20:00.590079878 -0700
+++ b/drivers/net/vxlan.c	2013-06-10 12:20:01.314071646 -0700
@@ -1008,8 +1008,8 @@ static void vxlan_encap_bypass(struct sk
 	}
 }
 
-static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
-				  struct vxlan_rdst *rdst, bool did_rsc)
+static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
+			   struct vxlan_rdst *rdst, bool did_rsc)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct rtable *rt;
@@ -1032,7 +1032,7 @@ static netdev_tx_t vxlan_xmit_one(struct
 		if (did_rsc) {
 			/* short-circuited back to local bridge */
 			vxlan_encap_bypass(skb, vxlan, vxlan);
-			return NETDEV_TX_OK;
+			return;
 		}
 		goto drop;
 	}
@@ -1088,7 +1088,7 @@ static netdev_tx_t vxlan_xmit_one(struct
 		if (!dst_vxlan)
 			goto tx_error;
 		vxlan_encap_bypass(skb, vxlan, dst_vxlan);
-		return NETDEV_TX_OK;
+		return;
 	}
 
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
@@ -1132,7 +1132,7 @@ static netdev_tx_t vxlan_xmit_one(struct
 		goto drop;
 
 	iptunnel_xmit(skb, dev);
-	return NETDEV_TX_OK;
+	return;
 
 drop:
 	dev->stats.tx_dropped++;
@@ -1142,7 +1142,6 @@ tx_error:
 	dev->stats.tx_errors++;
 tx_free:
 	dev_kfree_skb(skb);
-	return NETDEV_TX_OK;
 }
 
 /* Transmit local packets over Vxlan
@@ -1158,7 +1157,6 @@ static netdev_tx_t vxlan_xmit(struct sk_
 	bool did_rsc = false;
 	struct vxlan_rdst *rdst0, *rdst;
 	struct vxlan_fdb *f;
-	int rc1, rc;
 
 	skb_reset_mac_header(skb);
 	eth = eth_hdr(skb);
@@ -1186,24 +1184,18 @@ static netdev_tx_t vxlan_xmit(struct sk_
 	} else
 		rdst0 = &f->remote;
 
-	rc = NETDEV_TX_OK;
 
 	/* if there are multiple destinations, send copies */
 	for (rdst = rdst0->remote_next; rdst; rdst = rdst->remote_next) {
 		struct sk_buff *skb1;
 
 		skb1 = skb_clone(skb, GFP_ATOMIC);
-		if (skb1) {
-			rc1 = vxlan_xmit_one(skb1, dev, rdst, did_rsc);
-			if (rc == NETDEV_TX_OK)
-				rc = rc1;
-		}
+		if (skb1)
+			vxlan_xmit_one(skb1, dev, rdst, did_rsc);
 	}
 
-	rc1 = vxlan_xmit_one(skb, dev, rdst0, did_rsc);
-	if (rc == NETDEV_TX_OK)
-		rc = rc1;
-	return rc;
+	vxlan_xmit_one(skb, dev, rdst0, did_rsc);
+	return NETDEV_TX_OK;
 }
 
 /* Walk the forwarding table and purge stale entries */
^ permalink raw reply	[flat|nested] 18+ messages in thread
* [PATCH net-next 06/12] vxlan: move freecpu to uninit
       [not found] <20130610200524.721617349@vyatta.com>
                   ` (4 preceding siblings ...)
  2013-06-10 20:24 ` [PATCH net-next 07/12] vxlan: make vxlan_xmit_one void Stephen Hemminger
@ 2013-06-10 20:24 ` Stephen Hemminger
  2013-06-10 20:24 ` [PATCH net-next 05/12] vxlan: fix race caused by dropping rtnl_unlock Stephen Hemminger
                   ` (5 subsequent siblings)
  11 siblings, 0 replies; 18+ messages in thread
From: Stephen Hemminger @ 2013-06-10 20:24 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger
Use ndo_uninit to avoid having to override destructor.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
--- a/drivers/net/vxlan.c	2013-06-10 12:19:59.522092022 -0700
+++ b/drivers/net/vxlan.c	2013-06-10 12:20:00.590079878 -0700
@@ -1272,6 +1272,17 @@ static int vxlan_init(struct net_device
 	return 0;
 }
 
+static void vxlan_uninit(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
+	struct vxlan_sock *vs = vxlan->vn_sock;
+
+	if (vs)
+		vxlan_sock_release(vn, vs);
+	free_percpu(dev->tstats);
+}
+
 /* Start ageing timer and join group when device is brought up */
 static int vxlan_open(struct net_device *dev)
 {
@@ -1337,6 +1348,7 @@ static void vxlan_set_multicast_list(str
 
 static const struct net_device_ops vxlan_netdev_ops = {
 	.ndo_init		= vxlan_init,
+	.ndo_uninit		= vxlan_uninit,
 	.ndo_open		= vxlan_open,
 	.ndo_stop		= vxlan_stop,
 	.ndo_start_xmit		= vxlan_xmit,
@@ -1355,12 +1367,6 @@ static struct device_type vxlan_type = {
 	.name = "vxlan",
 };
 
-static void vxlan_free(struct net_device *dev)
-{
-	free_percpu(dev->tstats);
-	free_netdev(dev);
-}
-
 /* Initialize the device structure. */
 static void vxlan_setup(struct net_device *dev)
 {
@@ -1373,7 +1379,7 @@ static void vxlan_setup(struct net_devic
 	dev->hard_header_len = ETH_HLEN + VXLAN_HEADROOM;
 
 	dev->netdev_ops = &vxlan_netdev_ops;
-	dev->destructor = vxlan_free;
+	dev->destructor = free_netdev;
 	SET_NETDEV_DEVTYPE(dev, &vxlan_type);
 
 	dev->tx_queue_len = 0;
@@ -1676,14 +1682,10 @@ static int vxlan_newlink(struct net *net
 static void vxlan_dellink(struct net_device *dev, struct list_head *head)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
-	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
-	struct vxlan_sock *vs = vxlan->vn_sock;
 
 	hlist_del_rcu(&vxlan->hlist);
 	list_del(&vxlan->next);
 	unregister_netdevice_queue(dev, head);
-	if (vs)
-		vxlan_sock_release(vn, vs);
 }
 
 static size_t vxlan_get_size(const struct net_device *dev)
^ permalink raw reply	[flat|nested] 18+ messages in thread
* [PATCH net-next 05/12] vxlan: fix race caused by dropping rtnl_unlock
       [not found] <20130610200524.721617349@vyatta.com>
                   ` (5 preceding siblings ...)
  2013-06-10 20:24 ` [PATCH net-next 06/12] vxlan: move freecpu to uninit Stephen Hemminger
@ 2013-06-10 20:24 ` Stephen Hemminger
  2013-06-10 20:24 ` [PATCH net-next 04/12] vxlan: send notification when MAC migrates Stephen Hemminger
                   ` (4 subsequent siblings)
  11 siblings, 0 replies; 18+ messages in thread
From: Stephen Hemminger @ 2013-06-10 20:24 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger
It is possible for two CPUs to race creating a vxlan device.
For most cases this is harmless, but the ability to assign "next
available vxlan device" relies on the rtnl lock being held across the
whole operation. Therefore two instances of calling:
  ip li add vxlan%d vxlan ...
could collide and create two devices with same name.
To fix this, defer creation of the socket to a work queue, and
handle possible races there. Introduce a lock to ensure that
changes to the vxlan socket hash list are SMP safe.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
--- a/drivers/net/vxlan.c	2013-06-10 12:19:59.002097934 -0700
+++ b/drivers/net/vxlan.c	2013-06-10 12:19:59.522092022 -0700
@@ -94,6 +94,7 @@ struct vxlan_sock {
 struct vxlan_net {
 	struct list_head  vxlan_list;
 	struct hlist_head sock_list[PORT_HASH_SIZE];
+	spinlock_t	  sock_lock;
 };
 
 struct vxlan_rdst {
@@ -131,7 +132,9 @@ struct vxlan_dev {
 	__u8		  ttl;
 	u32		  flags;	/* VXLAN_F_* below */
 
+	struct work_struct sock_work;
 	struct work_struct igmp_work;
+
 	unsigned long	  age_interval;
 	struct timer_list age_timer;
 	spinlock_t	  hash_lock;
@@ -151,6 +154,8 @@ struct vxlan_dev {
 static u32 vxlan_salt __read_mostly;
 static struct workqueue_struct *vxlan_wq;
 
+static void vxlan_sock_work(struct work_struct *work);
+
 /* Virtual Network hash table head */
 static inline struct hlist_head *vni_head(struct vxlan_sock *vs, u32 id)
 {
@@ -670,12 +675,15 @@ static void vxlan_sock_hold(struct vxlan
 	atomic_inc(&vs->refcnt);
 }
 
-static void vxlan_sock_release(struct vxlan_sock *vs)
+static void vxlan_sock_release(struct vxlan_net *vn, struct vxlan_sock *vs)
 {
 	if (!atomic_dec_and_test(&vs->refcnt))
 		return;
 
+	spin_lock(&vn->sock_lock);
 	hlist_del_rcu(&vs->hlist);
+	spin_unlock(&vn->sock_lock);
+
 	queue_work(vxlan_wq, &vs->del_work);
 }
 
@@ -700,7 +708,7 @@ static void vxlan_igmp_work(struct work_
 		ip_mc_leave_group(sk, &mreq);
 	release_sock(sk);
 
-	vxlan_sock_release(vs);
+	vxlan_sock_release(vn, vs);
 	dev_put(vxlan->dev);
 }
 
@@ -1238,10 +1246,29 @@ static void vxlan_cleanup(unsigned long
 /* Setup stats when device is created */
 static int vxlan_init(struct net_device *dev)
 {
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
+	struct vxlan_sock *vs;
+	__u32 vni = vxlan->default_dst.remote_vni;
+
 	dev->tstats = alloc_percpu(struct pcpu_tstats);
 	if (!dev->tstats)
 		return -ENOMEM;
 
+	spin_lock(&vn->sock_lock);
+	vs = vxlan_find_port(dev_net(dev), vxlan->dst_port);
+	if (vs) {
+		/* If we have a socket with same port already, reuse it */
+		atomic_inc(&vs->refcnt);
+		vxlan->vn_sock = vs;
+		hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni));
+	} else {
+		/* otherwise make new socket outside of RTNL */
+		dev_hold(dev);
+		queue_work(vxlan_wq, &vxlan->sock_work);
+	}
+	spin_unlock(&vn->sock_lock);
+
 	return 0;
 }
 
@@ -1249,9 +1276,14 @@ static int vxlan_init(struct net_device
 static int vxlan_open(struct net_device *dev)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_sock *vs = vxlan->vn_sock;
+
+	/* socket hasn't been created */
+	if (!vs)
+		return -ENOTCONN;
 
 	if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) {
-		vxlan_sock_hold(vxlan->vn_sock);
+		vxlan_sock_hold(vs);
 		dev_hold(dev);
 		queue_work(vxlan_wq, &vxlan->igmp_work);
 	}
@@ -1283,9 +1315,10 @@ static void vxlan_flush(struct vxlan_dev
 static int vxlan_stop(struct net_device *dev)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_sock *vs = vxlan->vn_sock;
 
-	if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) {
-		vxlan_sock_hold(vxlan->vn_sock);
+	if (vs && IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) {
+		vxlan_sock_hold(vs);
 		dev_hold(dev);
 		queue_work(vxlan_wq, &vxlan->igmp_work);
 	}
@@ -1358,6 +1391,7 @@ static void vxlan_setup(struct net_devic
 	INIT_LIST_HEAD(&vxlan->next);
 	spin_lock_init(&vxlan->hash_lock);
 	INIT_WORK(&vxlan->igmp_work, vxlan_igmp_work);
+	INIT_WORK(&vxlan->sock_work, vxlan_sock_work);
 
 	init_timer_deferrable(&vxlan->age_timer);
 	vxlan->age_timer.function = vxlan_cleanup;
@@ -1449,7 +1483,6 @@ static void vxlan_del_work(struct work_s
 	kfree_rcu(vs, rcu);
 }
 
-/* Create new listen socket if needed */
 static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port)
 {
 	struct vxlan_sock *vs;
@@ -1506,13 +1539,52 @@ static struct vxlan_sock *vxlan_socket_c
 	return vs;
 }
 
+/* Scheduled at device creation to bind to a socket */
+static void vxlan_sock_work(struct work_struct *work)
+{
+	struct vxlan_dev *vxlan
+		= container_of(work, struct vxlan_dev, sock_work);
+	struct net_device *dev = vxlan->dev;
+	struct net *net = dev_net(dev);
+	__u32 vni = vxlan->default_dst.remote_vni;
+	__be16 port = vxlan->dst_port;
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+	struct vxlan_sock *nvs, *ovs;
+
+	nvs = vxlan_socket_create(net, port);
+	if (IS_ERR(nvs)) {
+		netdev_err(vxlan->dev, "Can not create UDP socket, %ld\n",
+			   PTR_ERR(nvs));
+		goto out;
+	}
+
+	spin_lock(&vn->sock_lock);
+	/* Look again to see if can reuse socket */
+	ovs = vxlan_find_port(net, port);
+	if (ovs) {
+		atomic_inc(&ovs->refcnt);
+		vxlan->vn_sock = ovs;
+		hlist_add_head_rcu(&vxlan->hlist, vni_head(ovs, vni));
+		spin_unlock(&vn->sock_lock);
+
+		sk_release_kernel(nvs->sock->sk);
+		kfree(nvs);
+	} else {
+		vxlan->vn_sock = nvs;
+		hlist_add_head_rcu(&nvs->hlist, vs_head(net, port));
+		hlist_add_head_rcu(&vxlan->hlist, vni_head(nvs, vni));
+		spin_unlock(&vn->sock_lock);
+	}
+out:
+	dev_put(dev);
+}
+
 static int vxlan_newlink(struct net *net, struct net_device *dev,
 			 struct nlattr *tb[], struct nlattr *data[])
 {
 	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_rdst *dst = &vxlan->default_dst;
-	struct vxlan_sock *vs;
 	__u32 vni;
 	int err;
 
@@ -1590,31 +1662,13 @@ static int vxlan_newlink(struct net *net
 		return -EEXIST;
 	}
 
-	vs = vxlan_find_port(net, vxlan->dst_port);
-	if (vs)
-		atomic_inc(&vs->refcnt);
-	else {
-		/* Drop lock because socket create acquires RTNL lock */
-		rtnl_unlock();
-		vs = vxlan_socket_create(net, vxlan->dst_port);
-		rtnl_lock();
-		if (IS_ERR(vs))
-			return PTR_ERR(vs);
-
-		hlist_add_head_rcu(&vs->hlist, vs_head(net, vxlan->dst_port));
-	}
-	vxlan->vn_sock = vs;
-
 	SET_ETHTOOL_OPS(dev, &vxlan_ethtool_ops);
 
 	err = register_netdevice(dev);
-	if (err) {
-		vxlan_sock_release(vs);
+	if (err)
 		return err;
-	}
 
 	list_add(&vxlan->next, &vn->vxlan_list);
-	hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni));
 
 	return 0;
 }
@@ -1622,12 +1676,14 @@ static int vxlan_newlink(struct net *net
 static void vxlan_dellink(struct net_device *dev, struct list_head *head)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
 	struct vxlan_sock *vs = vxlan->vn_sock;
 
 	hlist_del_rcu(&vxlan->hlist);
 	list_del(&vxlan->next);
 	unregister_netdevice_queue(dev, head);
-	vxlan_sock_release(vs);
+	if (vs)
+		vxlan_sock_release(vn, vs);
 }
 
 static size_t vxlan_get_size(const struct net_device *dev)
@@ -1716,6 +1772,7 @@ static __net_init int vxlan_init_net(str
 	unsigned int h;
 
 	INIT_LIST_HEAD(&vn->vxlan_list);
+	spin_lock_init(&vn->sock_lock);
 
 	for (h = 0; h < PORT_HASH_SIZE; ++h)
 		INIT_HLIST_HEAD(&vn->sock_list[h]);
^ permalink raw reply	[flat|nested] 18+ messages in thread
* [PATCH net-next 04/12] vxlan: send notification when MAC migrates
       [not found] <20130610200524.721617349@vyatta.com>
                   ` (6 preceding siblings ...)
  2013-06-10 20:24 ` [PATCH net-next 05/12] vxlan: fix race caused by dropping rtnl_unlock Stephen Hemminger
@ 2013-06-10 20:24 ` Stephen Hemminger
  2013-06-10 20:24 ` [PATCH net-next 03/12] vxlan: move IGMP join/leave to work queue Stephen Hemminger
                   ` (3 subsequent siblings)
  11 siblings, 0 replies; 18+ messages in thread
From: Stephen Hemminger @ 2013-06-10 20:24 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger
When a learned entry migrates to another IP, send a notification
that the entry has changed.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
--- a/drivers/net/vxlan.c	2013-06-10 12:19:58.398104802 -0700
+++ b/drivers/net/vxlan.c	2013-06-10 12:19:59.002097934 -0700
@@ -629,6 +629,7 @@ static bool vxlan_snoop(struct net_devic
 
 		f->remote.remote_ip = src_ip;
 		f->updated = jiffies;
+		vxlan_fdb_notify(vxlan, f, RTM_NEWNEIGH);
 	} else {
 		/* learned new entry */
 		spin_lock(&vxlan->hash_lock);
^ permalink raw reply	[flat|nested] 18+ messages in thread
* [PATCH net-next 03/12] vxlan: move IGMP join/leave to work queue
       [not found] <20130610200524.721617349@vyatta.com>
                   ` (7 preceding siblings ...)
  2013-06-10 20:24 ` [PATCH net-next 04/12] vxlan: send notification when MAC migrates Stephen Hemminger
@ 2013-06-10 20:24 ` Stephen Hemminger
  2013-06-11  2:01   ` Cong Wang
  2013-06-10 20:24 ` [PATCH net-next 02/12] vxlan: fix race between flush and incoming learning Stephen Hemminger
                   ` (2 subsequent siblings)
  11 siblings, 1 reply; 18+ messages in thread
From: Stephen Hemminger @ 2013-06-10 20:24 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger
Do join/leave from work queue to avoid lock inversion problems
between normal socket and RTNL. The code comes out cleaner
as well.
Uses Cong Wang's suggestion to turn refcnt into a real atomic,
since we now need to handle the case where the last use of the
socket is the IGMP worker.
Also fixes race where vxlan_stop could be called after
device was deleted on module removal. The call to rtnl_link_unregister
would call dellink while vxlan device was still up. Reordering
the calls fixes it.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
--- a/drivers/net/vxlan.c	2013-06-10 12:19:57.722112490 -0700
+++ b/drivers/net/vxlan.c	2013-06-10 12:19:58.398104802 -0700
@@ -85,7 +85,7 @@ struct vxlan_sock {
 	struct hlist_node hlist;
 	struct rcu_head	  rcu;
 	struct work_struct del_work;
-	unsigned int	  refcnt;
+	atomic_t	  refcnt;
 	struct socket	  *sock;
 	struct hlist_head vni_list[VNI_HASH_SIZE];
 };
@@ -131,6 +131,7 @@ struct vxlan_dev {
 	__u8		  ttl;
 	u32		  flags;	/* VXLAN_F_* below */
 
+	struct work_struct igmp_work;
 	unsigned long	  age_interval;
 	struct timer_list age_timer;
 	spinlock_t	  hash_lock;
@@ -648,76 +649,58 @@ static bool vxlan_snoop(struct net_devic
 
 
 /* See if multicast group is already in use by other ID */
-static bool vxlan_group_used(struct vxlan_net *vn,
-			     const struct vxlan_dev *this)
+static bool vxlan_group_used(struct vxlan_net *vn, __be32 remote_ip)
 {
 	struct vxlan_dev *vxlan;
 
 	list_for_each_entry(vxlan, &vn->vxlan_list, next) {
-		if (vxlan == this)
-			continue;
-
 		if (!netif_running(vxlan->dev))
 			continue;
 
-		if (vxlan->default_dst.remote_ip == this->default_dst.remote_ip)
+		if (vxlan->default_dst.remote_ip == remote_ip)
 			return true;
 	}
 
 	return false;
 }
 
-/* kernel equivalent to IP_ADD_MEMBERSHIP */
-static int vxlan_join_group(struct net_device *dev)
+static void vxlan_sock_hold(struct vxlan_sock *vs)
 {
-	struct vxlan_dev *vxlan = netdev_priv(dev);
-	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
-	struct sock *sk = vxlan->vn_sock->sock->sk;
-	struct ip_mreqn mreq = {
-		.imr_multiaddr.s_addr	= vxlan->default_dst.remote_ip,
-		.imr_ifindex		= vxlan->default_dst.remote_ifindex,
-	};
-	int err;
-
-	/* Already a member of group */
-	if (vxlan_group_used(vn, vxlan))
-		return 0;
+	atomic_inc(&vs->refcnt);
+}
 
-	/* Need to drop RTNL to call multicast join */
-	rtnl_unlock();
-	lock_sock(sk);
-	err = ip_mc_join_group(sk, &mreq);
-	release_sock(sk);
-	rtnl_lock();
+static void vxlan_sock_release(struct vxlan_sock *vs)
+{
+	if (!atomic_dec_and_test(&vs->refcnt))
+		return;
 
-	return err;
+	hlist_del_rcu(&vs->hlist);
+	queue_work(vxlan_wq, &vs->del_work);
 }
 
-
-/* kernel equivalent to IP_DROP_MEMBERSHIP */
-static int vxlan_leave_group(struct net_device *dev)
+/* Callback to update multicast group membership.
+ * Scheduled when vxlan goes up/down.
+ */
+static void vxlan_igmp_work(struct work_struct *work)
 {
-	struct vxlan_dev *vxlan = netdev_priv(dev);
-	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
-	int err = 0;
-	struct sock *sk = vxlan->vn_sock->sock->sk;
+	struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_work);
+	struct vxlan_net *vn = net_generic(dev_net(vxlan->dev), vxlan_net_id);
+	struct vxlan_sock *vs = vxlan->vn_sock;
+	struct sock *sk = vs->sock->sk;
 	struct ip_mreqn mreq = {
 		.imr_multiaddr.s_addr	= vxlan->default_dst.remote_ip,
 		.imr_ifindex		= vxlan->default_dst.remote_ifindex,
 	};
 
-	/* Only leave group when last vxlan is done. */
-	if (vxlan_group_used(vn, vxlan))
-		return 0;
-
-	/* Need to drop RTNL to call multicast leave */
-	rtnl_unlock();
 	lock_sock(sk);
-	err = ip_mc_leave_group(sk, &mreq);
+	if (vxlan_group_used(vn, vxlan->default_dst.remote_ip))
+		ip_mc_join_group(sk, &mreq);
+	else
+		ip_mc_leave_group(sk, &mreq);
 	release_sock(sk);
-	rtnl_lock();
 
-	return err;
+	vxlan_sock_release(vs);
+	dev_put(vxlan->dev);
 }
 
 /* Callback from net/ipv4/udp.c to receive packets */
@@ -1265,12 +1248,11 @@ static int vxlan_init(struct net_device
 static int vxlan_open(struct net_device *dev)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
-	int err;
 
 	if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) {
-		err = vxlan_join_group(dev);
-		if (err)
-			return err;
+		vxlan_sock_hold(vxlan->vn_sock);
+		dev_hold(dev);
+		queue_work(vxlan_wq, &vxlan->igmp_work);
 	}
 
 	if (vxlan->age_interval)
@@ -1301,8 +1283,11 @@ static int vxlan_stop(struct net_device
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 
-	if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip)))
-		vxlan_leave_group(dev);
+	if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) {
+		vxlan_sock_hold(vxlan->vn_sock);
+		dev_hold(dev);
+		queue_work(vxlan_wq, &vxlan->igmp_work);
+	}
 
 	del_timer_sync(&vxlan->age_timer);
 
@@ -1371,6 +1356,7 @@ static void vxlan_setup(struct net_devic
 
 	INIT_LIST_HEAD(&vxlan->next);
 	spin_lock_init(&vxlan->hash_lock);
+	INIT_WORK(&vxlan->igmp_work, vxlan_igmp_work);
 
 	init_timer_deferrable(&vxlan->age_timer);
 	vxlan->age_timer.function = vxlan_cleanup;
@@ -1514,8 +1500,8 @@ static struct vxlan_sock *vxlan_socket_c
 	udp_sk(sk)->encap_type = 1;
 	udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
 	udp_encap_enable();
+	atomic_set(&vs->refcnt, 1);
 
-	vs->refcnt = 1;
 	return vs;
 }
 
@@ -1605,7 +1591,7 @@ static int vxlan_newlink(struct net *net
 
 	vs = vxlan_find_port(net, vxlan->dst_port);
 	if (vs)
-		++vs->refcnt;
+		atomic_inc(&vs->refcnt);
 	else {
 		/* Drop lock because socket create acquires RTNL lock */
 		rtnl_unlock();
@@ -1622,12 +1608,7 @@ static int vxlan_newlink(struct net *net
 
 	err = register_netdevice(dev);
 	if (err) {
-		if (--vs->refcnt == 0) {
-			rtnl_unlock();
-			sk_release_kernel(vs->sock->sk);
-			kfree(vs);
-			rtnl_lock();
-		}
+		vxlan_sock_release(vs);
 		return err;
 	}
 
@@ -1645,11 +1626,7 @@ static void vxlan_dellink(struct net_dev
 	hlist_del_rcu(&vxlan->hlist);
 	list_del(&vxlan->next);
 	unregister_netdevice_queue(dev, head);
-
-	if (--vs->refcnt == 0) {
-		hlist_del_rcu(&vs->hlist);
-		queue_work(vxlan_wq, &vs->del_work);
-	}
+	vxlan_sock_release(vs);
 }
 
 static size_t vxlan_get_size(const struct net_device *dev)
^ permalink raw reply	[flat|nested] 18+ messages in thread
* [PATCH net-next 02/12] vxlan: fix race between flush and incoming learning
       [not found] <20130610200524.721617349@vyatta.com>
                   ` (8 preceding siblings ...)
  2013-06-10 20:24 ` [PATCH net-next 03/12] vxlan: move IGMP join/leave to work queue Stephen Hemminger
@ 2013-06-10 20:24 ` Stephen Hemminger
  2013-06-11  0:46   ` Cong Wang
  2013-06-11  2:00   ` Cong Wang
  2013-06-10 20:24 ` [PATCH net-next 01/12] vxlan: fix crash from work pending on module removal Stephen Hemminger
  2013-06-10 20:25 ` [PATCH net-next 08/12] vxlan: convert remotes list to list_rcu Stephen Hemminger
  11 siblings, 2 replies; 18+ messages in thread
From: Stephen Hemminger @ 2013-06-10 20:24 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger
It is possible for a packet to arrive during vxlan_stop(), and
have a dynamic entry created. Close this by checking if device
is up.
 CPU1                             CPU2   
vxlan_stop
  vxlan_flush
     hash_lock acquired
                                  vxlan_encap_recv
                                     vxlan_snoop
                                        waiting for hash_lock
     hash_lock released
  vxlan_flush done
                                        hash_lock acquired
                                        vxlan_fdb_create
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
--- a/drivers/net/vxlan.c	2013-06-10 12:19:51.290185627 -0700
+++ b/drivers/net/vxlan.c	2013-06-10 12:19:57.722112490 -0700
@@ -611,7 +611,6 @@ static bool vxlan_snoop(struct net_devic
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_fdb *f;
-	int err;
 
 	f = vxlan_find_mac(vxlan, src_mac);
 	if (likely(f)) {
@@ -632,12 +631,15 @@ static bool vxlan_snoop(struct net_devic
 	} else {
 		/* learned new entry */
 		spin_lock(&vxlan->hash_lock);
-		err = vxlan_fdb_create(vxlan, src_mac, src_ip,
-				       NUD_REACHABLE,
-				       NLM_F_EXCL|NLM_F_CREATE,
-				       vxlan->dst_port,
-				       vxlan->default_dst.remote_vni,
-				       0, NTF_SELF);
+
+		/* close off race between vxlan_flush and incoming packets */
+		if (netif_running(dev))
+			vxlan_fdb_create(vxlan, src_mac, src_ip,
+					 NUD_REACHABLE,
+					 NLM_F_EXCL|NLM_F_CREATE,
+					 vxlan->dst_port,
+					 vxlan->default_dst.remote_vni,
+					 0, NTF_SELF);
 		spin_unlock(&vxlan->hash_lock);
 	}
 
^ permalink raw reply	[flat|nested] 18+ messages in thread
* [PATCH net-next 01/12] vxlan: fix crash from work pending on module removal
       [not found] <20130610200524.721617349@vyatta.com>
                   ` (9 preceding siblings ...)
  2013-06-10 20:24 ` [PATCH net-next 02/12] vxlan: fix race between flush and incoming learning Stephen Hemminger
@ 2013-06-10 20:24 ` Stephen Hemminger
  2013-06-11  2:00   ` Cong Wang
  2013-06-10 20:25 ` [PATCH net-next 08/12] vxlan: convert remotes list to list_rcu Stephen Hemminger
  11 siblings, 1 reply; 18+ messages in thread
From: Stephen Hemminger @ 2013-06-10 20:24 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger
Switch to using a per module work queue so that all the socket
deletion callbacks are done when module is removed.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
--- a/drivers/net/vxlan.c	2013-06-10 12:19:51.278185763 -0700
+++ b/drivers/net/vxlan.c	2013-06-10 12:19:51.290185627 -0700
@@ -148,6 +148,7 @@ struct vxlan_dev {
 
 /* salt for hash table */
 static u32 vxlan_salt __read_mostly;
+static struct workqueue_struct *vxlan_wq;
 
 /* Virtual Network hash table head */
 static inline struct hlist_head *vni_head(struct vxlan_sock *vs, u32 id)
@@ -1645,7 +1646,7 @@ static void vxlan_dellink(struct net_dev
 
 	if (--vs->refcnt == 0) {
 		hlist_del_rcu(&vs->hlist);
-		schedule_work(&vs->del_work);
+		queue_work(vxlan_wq, &vs->del_work);
 	}
 }
 
@@ -1764,6 +1765,10 @@ static int __init vxlan_init_module(void
 {
 	int rc;
 
+	vxlan_wq = alloc_workqueue("vxlan", 0, 0);
+	if (!vxlan_wq)
+		return -ENOMEM;
+
 	get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
 
 	rc = register_pernet_device(&vxlan_net_ops);
@@ -1779,6 +1784,7 @@ static int __init vxlan_init_module(void
 out2:
 	unregister_pernet_device(&vxlan_net_ops);
 out1:
+	destroy_workqueue(vxlan_wq);
 	return rc;
 }
 late_initcall(vxlan_init_module);
^ permalink raw reply	[flat|nested] 18+ messages in thread
* [PATCH net-next 08/12] vxlan: convert remotes list to list_rcu
       [not found] <20130610200524.721617349@vyatta.com>
                   ` (10 preceding siblings ...)
  2013-06-10 20:24 ` [PATCH net-next 01/12] vxlan: fix crash from work pending on module removal Stephen Hemminger
@ 2013-06-10 20:25 ` Stephen Hemminger
  11 siblings, 0 replies; 18+ messages in thread
From: Stephen Hemminger @ 2013-06-10 20:25 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger
Based on initial work by Mike Rapoport <mike.rapoport@ravellosystems.com>
Use list macros and RCU for tracking multiple remotes.
Note: this code assumes the list always has at least one entry,
because deleting remotes is not supported.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 drivers/net/vxlan.c |   99 +++++++++++++++++++++++++++++----------------------
 1 file changed, 56 insertions(+), 43 deletions(-)
--- a/drivers/net/vxlan.c	2013-06-10 12:20:01.314071646 -0700
+++ b/drivers/net/vxlan.c	2013-06-10 12:20:02.174061866 -0700
@@ -102,7 +102,7 @@ struct vxlan_rdst {
 	__be16			 remote_port;
 	u32			 remote_vni;
 	u32			 remote_ifindex;
-	struct vxlan_rdst	*remote_next;
+	struct list_head	 list;
 };
 
 /* Forwarding table entry */
@@ -111,7 +111,7 @@ struct vxlan_fdb {
 	struct rcu_head	  rcu;
 	unsigned long	  updated;	/* jiffies */
 	unsigned long	  used;
-	struct vxlan_rdst remote;
+	struct list_head  remotes;
 	u16		  state;	/* see ndm_state */
 	u8		  flags;	/* see ndm_flags */
 	u8		  eth_addr[ETH_ALEN];
@@ -170,6 +170,14 @@ static inline struct hlist_head *vs_head
 	return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
 }
 
+/* First remote destination for a forwarding entry.
+ * Guaranteed to be non-NULL because remotes are never deleted.
+ */
+static inline struct vxlan_rdst *first_remote(struct vxlan_fdb *fdb)
+{
+	return list_first_or_null_rcu(&fdb->remotes, struct vxlan_rdst, list);
+}
+
 /* Find VXLAN socket based on network namespace and UDP port */
 static struct vxlan_sock *vxlan_find_port(struct net *net, __be16 port)
 {
@@ -275,7 +283,7 @@ static inline size_t vxlan_nlmsg_size(vo
 }
 
 static void vxlan_fdb_notify(struct vxlan_dev *vxlan,
-			     const struct vxlan_fdb *fdb, int type)
+			     struct vxlan_fdb *fdb, int type)
 {
 	struct net *net = dev_net(vxlan->dev);
 	struct sk_buff *skb;
@@ -285,7 +293,7 @@ static void vxlan_fdb_notify(struct vxla
 	if (skb == NULL)
 		goto errout;
 
-	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, &fdb->remote);
+	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, first_remote(fdb));
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
@@ -304,11 +312,16 @@ static void vxlan_ip_miss(struct net_dev
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_fdb f;
+	struct vxlan_rdst remote;
 
 	memset(&f, 0, sizeof f);
 	f.state = NUD_STALE;
-	f.remote.remote_ip = ipa; /* goes to NDA_DST */
-	f.remote.remote_vni = VXLAN_N_VID;
+
+	remote.remote_ip = ipa; /* goes to NDA_DST */
+	remote.remote_vni = VXLAN_N_VID;
+
+	INIT_LIST_HEAD(&f.remotes);
+	list_add_rcu(&remote.list, &f.remotes);
 
 	vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH);
 }
@@ -318,6 +331,7 @@ static void vxlan_fdb_miss(struct vxlan_
 	struct vxlan_fdb	f;
 
 	memset(&f, 0, sizeof f);
+	INIT_LIST_HEAD(&f.remotes);
 	f.state = NUD_STALE;
 	memcpy(f.eth_addr, eth_addr, ETH_ALEN);
 
@@ -377,17 +391,17 @@ static struct vxlan_fdb *vxlan_find_mac(
 static int vxlan_fdb_append(struct vxlan_fdb *f,
 			    __be32 ip, __be16 port, __u32 vni, __u32 ifindex)
 {
-	struct vxlan_rdst *rd_prev, *rd;
+	struct vxlan_rdst *rd;
 
-	rd_prev = NULL;
-	for (rd = &f->remote; rd; rd = rd->remote_next) {
+	/* protected by vxlan->hash_lock */
+	list_for_each_entry(rd, &f->remotes, list) {
 		if (rd->remote_ip == ip &&
 		    rd->remote_port == port &&
 		    rd->remote_vni == vni &&
 		    rd->remote_ifindex == ifindex)
 			return 0;
-		rd_prev = rd;
 	}
+
 	rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
 	if (rd == NULL)
 		return -ENOBUFS;
@@ -395,8 +409,9 @@ static int vxlan_fdb_append(struct vxlan
 	rd->remote_port = port;
 	rd->remote_vni = vni;
 	rd->remote_ifindex = ifindex;
-	rd->remote_next = NULL;
-	rd_prev->remote_next = rd;
+
+	list_add_tail_rcu(&rd->list, &f->remotes);
+
 	return 1;
 }
 
@@ -448,16 +463,14 @@ static int vxlan_fdb_create(struct vxlan
 			return -ENOMEM;
 
 		notify = 1;
-		f->remote.remote_ip = ip;
-		f->remote.remote_port = port;
-		f->remote.remote_vni = vni;
-		f->remote.remote_ifindex = ifindex;
-		f->remote.remote_next = NULL;
 		f->state = state;
 		f->flags = ndm_flags;
 		f->updated = f->used = jiffies;
+		INIT_LIST_HEAD(&f->remotes);
 		memcpy(f->eth_addr, mac, ETH_ALEN);
 
+		vxlan_fdb_append(f, ip, port, vni, ifindex);
+
 		++vxlan->addrcnt;
 		hlist_add_head_rcu(&f->hlist,
 				   vxlan_fdb_head(vxlan, mac));
@@ -472,13 +485,10 @@ static int vxlan_fdb_create(struct vxlan
 static void vxlan_fdb_free(struct rcu_head *head)
 {
 	struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu);
+	struct vxlan_rdst *rd, *nd;
 
-	while (f->remote.remote_next) {
-		struct vxlan_rdst *rd = f->remote.remote_next;
-
-		f->remote.remote_next = rd->remote_next;
+	list_for_each_entry_safe(rd, nd, &f->remotes, list)
 		kfree(rd);
-	}
 	kfree(f);
 }
 
@@ -588,23 +598,24 @@ static int vxlan_fdb_dump(struct sk_buff
 
 		hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) {
 			struct vxlan_rdst *rd;
-			for (rd = &f->remote; rd; rd = rd->remote_next) {
-				if (idx < cb->args[0])
-					goto skip;
 
+			if (idx < cb->args[0])
+				goto skip;
+
+			list_for_each_entry_rcu(rd, &f->remotes, list) {
 				err = vxlan_fdb_info(skb, vxlan, f,
 						     NETLINK_CB(cb->skb).portid,
 						     cb->nlh->nlmsg_seq,
 						     RTM_NEWNEIGH,
 						     NLM_F_MULTI, rd);
 				if (err < 0)
-					break;
-skip:
-				++idx;
+					goto out;
 			}
+skip:
+			++idx;
 		}
 	}
-
+out:
 	return idx;
 }
 
@@ -620,19 +631,21 @@ static bool vxlan_snoop(struct net_devic
 
 	f = vxlan_find_mac(vxlan, src_mac);
 	if (likely(f)) {
-		if (likely(f->remote.remote_ip == src_ip))
-			return false;
+		struct vxlan_rdst *rdst = first_remote(f);
 
 		/* Don't migrate static entries, drop packets */
 		if (!(f->flags & NTF_SELF))
 			return true;
 
+		if (likely(rdst->remote_ip == src_ip))
+			return false;
+
 		if (net_ratelimit())
 			netdev_info(dev,
 				    "%pM migrated from %pI4 to %pI4\n",
-				    src_mac, &f->remote.remote_ip, &src_ip);
+				    src_mac, &rdst->remote_ip, &src_ip);
 
-		f->remote.remote_ip = src_ip;
+		rdst->remote_ip = src_ip;
 		f->updated = jiffies;
 		vxlan_fdb_notify(vxlan, f, RTM_NEWNEIGH);
 	} else {
@@ -866,7 +879,7 @@ static int arp_reduce(struct net_device
 		}
 
 		f = vxlan_find_mac(vxlan, n->ha);
-		if (f && f->remote.remote_ip == htonl(INADDR_ANY)) {
+		if (f && first_remote(f)->remote_ip == htonl(INADDR_ANY)) {
 			/* bridge-local neighbor */
 			neigh_release(n);
 			goto out;
@@ -1181,17 +1194,17 @@ static netdev_tx_t vxlan_xmit(struct sk_
 		    (vxlan->flags & VXLAN_F_L2MISS) &&
 		    !is_multicast_ether_addr(eth->h_dest))
 			vxlan_fdb_miss(vxlan, eth->h_dest);
-	} else
-		rdst0 = &f->remote;
-
-
-	/* if there are multiple destinations, send copies */
-	for (rdst = rdst0->remote_next; rdst; rdst = rdst->remote_next) {
-		struct sk_buff *skb1;
+	} else {
+		rdst = rdst0 = first_remote(f);
 
-		skb1 = skb_clone(skb, GFP_ATOMIC);
-		if (skb1)
-			vxlan_xmit_one(skb1, dev, rdst, did_rsc);
+		/* if there are multiple destinations, send copies */
+		list_for_each_entry_continue_rcu(rdst, &f->remotes, list) {
+			struct sk_buff *skb1;
+
+			skb1 = skb_clone(skb, GFP_ATOMIC);
+			if (skb1)
+				vxlan_xmit_one(skb1, dev, rdst, did_rsc);
+		}
 	}
 
 	vxlan_xmit_one(skb, dev, rdst0, did_rsc);
^ permalink raw reply	[flat|nested] 18+ messages in thread
* Re: [PATCH net-next 02/12] vxlan: fix race between flush and incoming learning
  2013-06-10 20:24 ` [PATCH net-next 02/12] vxlan: fix race between flush and incoming learning Stephen Hemminger
@ 2013-06-11  0:46   ` Cong Wang
  2013-06-11  2:00   ` Cong Wang
  1 sibling, 0 replies; 18+ messages in thread
From: Cong Wang @ 2013-06-11  0:46 UTC (permalink / raw)
  To: netdev
On Mon, 10 Jun 2013 at 20:24 GMT, Stephen Hemminger <stephen@networkplumber.org> wrote:
> It is possible for a packet to arrive during vxlan_stop(), and
> have a dynamic entry created. Close this by checking if device
> is up.
>
>  CPU1                             CPU2   
> vxlan_stop
>   vxlan_flush
>      hash_lock acquired
>                                   vxlan_encap_recv
>                                      vxlan_snoop
>                                         waiting for hash_lock
>      hash_lock released
>   vxlan_flush done
>                                         hash_lock acquired
>                                         vxlan_fdb_create
>
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
>
This doesn't apply with git am.
% git am /tmp/vxlan-fix-02.diff
Applying: vxlan: fix race between flush and incoming learning
error: patch failed: drivers/net/vxlan.c:632
error: drivers/net/vxlan.c: patch does not apply
Patch failed at 0001 vxlan: fix race between flush and incoming learning
When you have resolved this problem run "git am --resolved".
If you would prefer to skip this patch, instead run "git am --skip".
To restore the original branch and stop patching run "git am --abort".
But it can be applied manually... just FYI.
^ permalink raw reply	[flat|nested] 18+ messages in thread
* Re: [PATCH net-next 07/12] vxlan: make vxlan_xmit_one void
  2013-06-10 20:24 ` [PATCH net-next 07/12] vxlan: make vxlan_xmit_one void Stephen Hemminger
@ 2013-06-11  1:30   ` Cong Wang
  2013-06-11  3:14     ` Stephen Hemminger
  0 siblings, 1 reply; 18+ messages in thread
From: Cong Wang @ 2013-06-11  1:30 UTC (permalink / raw)
  To: netdev
On Mon, 10 Jun 2013 at 20:24 GMT, Stephen Hemminger <stephen@networkplumber.org> wrote:
>  		skb1 = skb_clone(skb, GFP_ATOMIC);
> -		if (skb1) {
> -			rc1 = vxlan_xmit_one(skb1, dev, rdst, did_rsc);
> -			if (rc == NETDEV_TX_OK)
> -				rc = rc1;
> -		}
None of your previous patches in _this_ patchset fixes the return value
of skb_clone(), therefore this patch can't be applied directly.
> +		if (skb1)
> +			vxlan_xmit_one(skb1, dev, rdst, did_rsc);
Thanks!
^ permalink raw reply	[flat|nested] 18+ messages in thread
* Re: [PATCH net-next 01/12] vxlan: fix crash from work pending on module removal
  2013-06-10 20:24 ` [PATCH net-next 01/12] vxlan: fix crash from work pending on module removal Stephen Hemminger
@ 2013-06-11  2:00   ` Cong Wang
  0 siblings, 0 replies; 18+ messages in thread
From: Cong Wang @ 2013-06-11  2:00 UTC (permalink / raw)
  To: netdev
On Mon, 10 Jun 2013 at 20:24 GMT, Stephen Hemminger <stephen@networkplumber.org> wrote:
> Switch to using a per module work queue so that all the socket
> deletion callbacks are done when module is removed.
>
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
^ permalink raw reply	[flat|nested] 18+ messages in thread
* Re: [PATCH net-next 02/12] vxlan: fix race between flush and incoming learning
  2013-06-10 20:24 ` [PATCH net-next 02/12] vxlan: fix race between flush and incoming learning Stephen Hemminger
  2013-06-11  0:46   ` Cong Wang
@ 2013-06-11  2:00   ` Cong Wang
  1 sibling, 0 replies; 18+ messages in thread
From: Cong Wang @ 2013-06-11  2:00 UTC (permalink / raw)
  To: netdev
On Mon, 10 Jun 2013 at 20:24 GMT, Stephen Hemminger <stephen@networkplumber.org> wrote:
> It is possible for a packet to arrive during vxlan_stop(), and
> have a dynamic entry created. Close this by checking if device
> is up.
>
>  CPU1                             CPU2   
> vxlan_stop
>   vxlan_flush
>      hash_lock acquired
>                                   vxlan_encap_recv
>                                      vxlan_snoop
>                                         waiting for hash_lock
>      hash_lock released
>   vxlan_flush done
>                                         hash_lock acquired
>                                         vxlan_fdb_create
>
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
^ permalink raw reply	[flat|nested] 18+ messages in thread
* Re: [PATCH net-next 03/12] vxlan: move IGMP join/leave to work queue
  2013-06-10 20:24 ` [PATCH net-next 03/12] vxlan: move IGMP join/leave to work queue Stephen Hemminger
@ 2013-06-11  2:01   ` Cong Wang
  0 siblings, 0 replies; 18+ messages in thread
From: Cong Wang @ 2013-06-11  2:01 UTC (permalink / raw)
  To: netdev
On Mon, 10 Jun 2013 at 20:24 GMT, Stephen Hemminger <stephen@networkplumber.org> wrote:
> Do join/leave from work queue to avoid lock inversion problems
> between normal socket and RTNL. The code comes out cleaner
> as well.
>
> Uses Cong Wang's suggestion to turn refcnt into a real atomic
> since now need to handle case where last use of socket is IGMP
> worker.
>
> Also fixes race where vxlan_stop could be called after
> device was deleted on module removal. The call to rtnl_link_unregister
> would call dellink while vxlan device was still up. Reordering
> the calls fixes it.
>
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
>
Tested-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
^ permalink raw reply	[flat|nested] 18+ messages in thread
* Re: [PATCH net-next 07/12] vxlan: make vxlan_xmit_one void
  2013-06-11  1:30   ` Cong Wang
@ 2013-06-11  3:14     ` Stephen Hemminger
  0 siblings, 0 replies; 18+ messages in thread
From: Stephen Hemminger @ 2013-06-11  3:14 UTC (permalink / raw)
  To: Cong Wang; +Cc: netdev
Huh? You are looking at an old version; the current one works fine.
^ permalink raw reply	[flat|nested] 18+ messages in thread
end of thread, other threads:[~2013-06-11  3:15 UTC | newest]
Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20130610200524.721617349@vyatta.com>
2013-06-10 20:24 ` [PATCH net-next 12/12] vxlan: bump version Stephen Hemminger
2013-06-10 20:24 ` [PATCH net-next 11/12] vxlan whitespace cleanup Stephen Hemminger
2013-06-10 20:24 ` [PATCH net-next 10/12] vxlan: use initializer for dummy structures Stephen Hemminger
2013-06-10 20:24 ` [PATCH net-next 09/12] vxlan: port module param should be ushort Stephen Hemminger
2013-06-10 20:24 ` [PATCH net-next 07/12] vxlan: make vxlan_xmit_one void Stephen Hemminger
2013-06-11  1:30   ` Cong Wang
2013-06-11  3:14     ` Stephen Hemminger
2013-06-10 20:24 ` [PATCH net-next 06/12] vxlan: move freecpu to uninit Stephen Hemminger
2013-06-10 20:24 ` [PATCH net-next 05/12] vxlan: fix race caused by dropping rtnl_unlock Stephen Hemminger
2013-06-10 20:24 ` [PATCH net-next 04/12] vxlan: send notification when MAC migrates Stephen Hemminger
2013-06-10 20:24 ` [PATCH net-next 03/12] vxlan: move IGMP join/leave to work queue Stephen Hemminger
2013-06-11  2:01   ` Cong Wang
2013-06-10 20:24 ` [PATCH net-next 02/12] vxlan: fix race between flush and incoming learning Stephen Hemminger
2013-06-11  0:46   ` Cong Wang
2013-06-11  2:00   ` Cong Wang
2013-06-10 20:24 ` [PATCH net-next 01/12] vxlan: fix crash from work pending on module removal Stephen Hemminger
2013-06-11  2:00   ` Cong Wang
2013-06-10 20:25 ` [PATCH net-next 08/12] vxlan: convert remotes list to list_rcu Stephen Hemminger
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).