Netdev List
 help / color / mirror / Atom feed
* [PATCH 5/9] AF_UNIX: Deliver message to several recipients in case of multicast
From: Alban Crequy @ 2010-11-22 18:36 UTC (permalink / raw)
  To: Alban Crequy
  Cc: David S. Miller, Eric Dumazet, Stephen Hemminger, Cyrill Gorcunov,
	Alexey Dobriyan, Lennart Poettering, Kay Sievers, Ian Molton,
	netdev, linux-kernel, Alban Crequy
In-Reply-To: <20101122183447.124afce5@chocolatine.cbg.collabora.co.uk>

unix_dgram_sendmsg() implements the delivery both for SOCK_DGRAM and
SOCK_SEQPACKET Unix sockets.

The delivery is done in an atomic way: either the message is delivered to all
recipients or none, even in case of interruptions or errors.

Signed-off-by: Alban Crequy <alban.crequy@collabora.co.uk>
---
 net/unix/af_unix.c |  247 +++++++++++++++++++++++++++++++++++++++-------------
 1 files changed, 188 insertions(+), 59 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 3cc9695..9207393 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1553,16 +1553,17 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
 {
 	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
 	struct sock *sk = sock->sk;
-	struct net *net = sock_net(sk);
 	struct unix_sock *u = unix_sk(sk);
 	struct sockaddr_un *sunaddr = msg->msg_name;
-	struct sock *other = NULL;
+	struct sock_set *others_set = NULL;
 	int namelen = 0; /* fake GCC */
 	int err;
 	unsigned hash;
 	struct sk_buff *skb;
+	int i;
 	long timeo;
 	struct scm_cookie tmp_scm;
+	int multicast_delivery = !!u->mcast_subscriptions_cnt;
 
 	if (NULL == siocb->scm)
 		siocb->scm = &tmp_scm;
@@ -1580,12 +1581,30 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		if (err < 0)
 			goto out;
 		namelen = err;
-	} else {
+	} else if (!multicast_delivery) {
+		struct sock *other;
 		sunaddr = NULL;
 		err = -ENOTCONN;
 		other = unix_peer_get(sk);
 		if (!other)
 			goto out;
+		err = -ENOMEM;
+		others_set = kmalloc(sizeof(struct sock_set)
+				     + sizeof(struct sock_item),
+				     GFP_KERNEL);
+		if (!others_set)
+			goto out;
+		others_set->cnt = 1;
+		sock_hold(other);
+		others_set->items[0].s = other;
+		others_set->items[0].skb = NULL;
+		others_set->items[0].to_deliver = 1;
+	} else {
+		sunaddr = NULL;
+		err = -ENOTCONN;
+		others_set = unix_find_multicast_recipients(sk, NULL, &err);
+		if (!others_set)
+			goto out;
 	}
 
 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
@@ -1613,90 +1632,200 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
 
 restart:
-	if (!other) {
+	if (!others_set) {
+		struct sock *other;
+		struct unix_sock *otheru;
 		err = -ECONNRESET;
 		if (sunaddr == NULL)
 			goto out_free;
 
-		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
-					hash, &err);
-		if (other == NULL)
+		other = unix_find_other(sock_net(sk), sunaddr, namelen,
+					sk->sk_type, hash, &err);
+		if (!other)
 			goto out_free;
+		otheru = unix_sk(other);
+
+		if (otheru->is_mcast_addr) {
+			/* FIXME: we should send to the requested recipient
+			 * specified in sendto(...dest_addr) instead of the
+			 * recipient specified by setsockopt... */
+			sock_put(other);
+			others_set = unix_find_multicast_recipients(sk, other,
+								    &err);
+			if (!others_set)
+				goto out_free;
+		} else {
+			others_set = kmalloc(sizeof(struct sock_set)
+					     + sizeof(struct sock_item),
+					     GFP_KERNEL);
+			if (!others_set)
+				goto out_free;
+			others_set->cnt = 1;
+			others_set->items[0].s = other;
+			others_set->items[0].skb = NULL;
+			others_set->items[0].to_deliver = 1;
+		}
 	}
 
-	unix_state_lock(other);
-	err = -EPERM;
-	if (!unix_may_send(sk, other))
-		goto out_unlock;
+	for (i = 0 ; i < others_set->cnt ; i++) {
+		struct sock *cur = others_set->items[i].s;
 
-	if (sock_flag(other, SOCK_DEAD)) {
-		/*
-		 *	Check with 1003.1g - what should
-		 *	datagram error
-		 */
-		unix_state_unlock(other);
-		sock_put(other);
+		others_set->items[i].skb = skb_clone(skb, GFP_KERNEL);
+		if (!others_set->items[i].skb) {
+			err = -ENOMEM;
+			goto out_free;
+		}
+		skb_set_owner_w(others_set->items[i].skb, sk);
+	}
 
-		err = 0;
-		unix_state_lock(sk);
-		if (unix_peer(sk) == other) {
-			unix_peer(sk) = NULL;
-			unix_state_unlock(sk);
+	for (i = 0 ; i < others_set->cnt ; i++) {
+		struct sock *cur = others_set->items[i].s;
 
-			unix_dgram_disconnected(sk, other);
-			sock_put(other);
-			err = -ECONNREFUSED;
-		} else {
-			unix_state_unlock(sk);
+		if (!others_set->items[i].to_deliver)
+			continue;
+
+		unix_state_lock(cur);
+		err = -EPERM;
+		if (!multicast_delivery && !unix_may_send(sk, cur)) {
+			others_set->items[i].to_deliver = 0;
+			unix_state_unlock(cur);
+			kfree_skb(others_set->items[i].skb);
+			if (multicast_delivery)
+				continue;
+			else
+				goto out_free;
 		}
 
-		other = NULL;
-		if (err)
-			goto out_free;
-		goto restart;
+		if (sock_flag(cur, SOCK_DEAD)) {
+			/*
+			 *	Check with 1003.1g - what should
+			 *	datagram error
+			 */
+			unix_state_unlock(cur);
+
+			err = 0;
+			unix_state_lock(sk);
+			if (unix_peer(sk) == cur) {
+				unix_peer(sk) = NULL;
+				unix_state_unlock(sk);
+
+				unix_dgram_disconnected(sk, cur);
+				sock_put(cur);
+				err = -ECONNREFUSED;
+			} else {
+				unix_state_unlock(sk);
+			}
+
+			kfree_skb(others_set->items[i].skb);
+			if (err)
+				goto out_free;
+
+			if (multicast_delivery) {
+				others_set->items[i].to_deliver = 0;
+				continue;
+			} else {
+				kfree_sock_set(others_set);
+				others_set = NULL;
+				goto restart;
+			}
+		}
+
+		err = -EPIPE;
+		if (cur->sk_shutdown & RCV_SHUTDOWN) {
+			unix_state_unlock(cur);
+			kfree_skb(others_set->items[i].skb);
+			if (multicast_delivery) {
+				others_set->items[i].to_deliver = 0;
+				continue;
+			} else {
+				goto out_free;
+			}
+		}
+
+		if (sk->sk_type != SOCK_SEQPACKET) {
+			err = security_unix_may_send(sk->sk_socket,
+						     cur->sk_socket);
+			if (err) {
+				unix_state_unlock(cur);
+				kfree_skb(others_set->items[i].skb);
+				if (multicast_delivery) {
+					others_set->items[i].to_deliver = 0;
+					continue;
+				} else {
+					goto out_free;
+				}
+			}
+		}
+
+		if (unix_peer(cur) != sk && unix_recvq_full(cur)) {
+			kfree_skb(others_set->items[i].skb);
+ 
+			if (multicast_delivery) {
+				unix_state_unlock(cur);
+				others_set->items[i].to_deliver = 0;
+				continue;
+			} else {
+				if (!timeo) {
+					unix_state_unlock(cur);
+					err = -EAGAIN;
+					goto out_free;
+				}
+
+				timeo = unix_wait_for_peer(cur, timeo);
+
+				err = sock_intr_errno(timeo);
+				if (signal_pending(current))
+					goto out_free;
+
+				kfree_sock_set(others_set);
+				others_set = NULL;
+				goto restart;
+			}
+		}
 	}
 
-	err = -EPIPE;
-	if (other->sk_shutdown & RCV_SHUTDOWN)
-		goto out_unlock;
+	for (i = 0 ; i < others_set->cnt ; i++) {
+		struct sock *cur = others_set->items[i].s;
 
-	if (sk->sk_type != SOCK_SEQPACKET) {
-		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
-		if (err)
-			goto out_unlock;
+		if (!others_set->items[i].to_deliver)
+			continue;
+
+		if (sock_flag(cur, SOCK_RCVTSTAMP))
+			__net_timestamp(others_set->items[i].skb);
+
+		skb_queue_tail(&cur->sk_receive_queue,
+			       others_set->items[i].skb);
 	}
 
-	if (unix_peer(other) != sk && unix_recvq_full(other)) {
-		if (!timeo) {
-			err = -EAGAIN;
-			goto out_unlock;
-		}
+	for (i = 0 ; i < others_set->cnt ; i++) {
+		struct sock *cur = others_set->items[i].s;
 
-		timeo = unix_wait_for_peer(other, timeo);
+		if (!others_set->items[i].to_deliver)
+			continue;
 
-		err = sock_intr_errno(timeo);
-		if (signal_pending(current))
-			goto out_free;
+		unix_state_unlock(cur);
+	}
 
-		goto restart;
+	for (i = 0 ; i < others_set->cnt ; i++) {
+		struct sock *cur = others_set->items[i].s;
+
+		if (!others_set->items[i].to_deliver)
+			continue;
+
+		cur->sk_data_ready(cur, len);
 	}
 
-	if (sock_flag(other, SOCK_RCVTSTAMP))
-		__net_timestamp(skb);
-	skb_queue_tail(&other->sk_receive_queue, skb);
-	unix_state_unlock(other);
-	other->sk_data_ready(other, len);
-	sock_put(other);
+	kfree_skb(skb);
 	scm_destroy(siocb->scm);
+	if (others_set)
+		kfree_sock_set(others_set);
 	return len;
 
-out_unlock:
-	unix_state_unlock(other);
 out_free:
 	kfree_skb(skb);
 out:
-	if (other)
-		sock_put(other);
+	if (others_set)
+		kfree_sock_set(others_set);
 	scm_destroy(siocb->scm);
 	return err;
 }
-- 
1.7.1

^ permalink raw reply related

* [PATCH 6/9] AF_UNIX: Apply Linux Socket Filtering to Unix sockets
From: Alban Crequy @ 2010-11-22 18:36 UTC (permalink / raw)
  To: Alban Crequy
  Cc: David S. Miller, Eric Dumazet, Stephen Hemminger, Cyrill Gorcunov,
	Alexey Dobriyan, Lennart Poettering, Kay Sievers, Ian Molton,
	netdev, linux-kernel, Alban Crequy
In-Reply-To: <20101122183447.124afce5@chocolatine.cbg.collabora.co.uk>

Linux Socket Filters can already be attached to Unix sockets with
setsockopt(sockfd, SOL_SOCKET, SO_{ATTACH,DETACH}_FILTER, ...), but the filter
was never applied on Unix sockets, so it had no effect. This patch uses
sk_filter() to filter buffers before delivery.

Signed-off-by: Alban Crequy <alban.crequy@collabora.co.uk>
---
 net/unix/af_unix.c |    8 ++++++++
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 9207393..52e2aa2 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1669,6 +1669,7 @@ restart:
 
 	for (i = 0 ; i < others_set->cnt ; i++) {
 		struct sock *cur = others_set->items[i].s;
+		unsigned int pkt_len;
 
 		others_set->items[i].skb = skb_clone(skb, GFP_KERNEL);
 		if (!others_set->items[i].skb) {
@@ -1676,6 +1677,13 @@ restart:
 			goto out_free;
 		}
 		skb_set_owner_w(others_set->items[i].skb, sk);
+
+		pkt_len = sk_filter(cur, others_set->items[i].skb);
+		if (pkt_len != 0) {
+			others_set->items[i].to_deliver = 0;
+			kfree_skb(others_set->items[i].skb);
+			continue;
+		}
 	}
 
 	for (i = 0 ; i < others_set->cnt ; i++) {
-- 
1.7.1

^ permalink raw reply related

* [PATCH 7/9] AF_UNIX: Documentation on multicast Unix Sockets
From: Alban Crequy @ 2010-11-22 18:36 UTC (permalink / raw)
  To: Alban Crequy
  Cc: David S. Miller, Eric Dumazet, Stephen Hemminger, Cyrill Gorcunov,
	Alexey Dobriyan, Lennart Poettering, Kay Sievers, Ian Molton,
	netdev, linux-kernel, Alban Crequy
In-Reply-To: <20101122183447.124afce5@chocolatine.cbg.collabora.co.uk>

Signed-off-by: Alban Crequy <alban.crequy@collabora.co.uk>
---
 .../networking/multicast-unix-sockets.txt          |   76 ++++++++++++++++++++
 1 files changed, 76 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/networking/multicast-unix-sockets.txt

diff --git a/Documentation/networking/multicast-unix-sockets.txt b/Documentation/networking/multicast-unix-sockets.txt
new file mode 100644
index 0000000..b9882a1
--- /dev/null
+++ b/Documentation/networking/multicast-unix-sockets.txt
@@ -0,0 +1,76 @@
+Multicast Unix sockets
+======================
+
+Multicast group memberships are stored in struct unix_mcast nodes. A Unix
+socket can join several multicast groups. Struct unix_mcast nodes are doubly
+linked:
+- In (struct unix_sock)->mcast_subscriptions
+- In (struct unix_sock)->mcast_members
+
+Example
+=======
+
+                   Addr1             Addr2
+                     |                 |
+                     v                 v
+Socket1  ---->  mcast node  ----> mcast node
+                     |
+                     v
+Socket2  ---->  mcast node
+                     |
+                     v
+Socket3  ---->  mcast node
+
+
+Addr1 and Addr2 are struct unix_sock with is_mcast_addr set to 1. They are
+bound to a multicast address with:
+  setsockopt(sockfd, SOL_UNIX, UNIX_CREATE_GROUP, ...).
+
+Socket1, Socket2 and Socket3 are also struct unix_sock. They are associated to
+a multicast address with:
+  setsockopt(sockfd, SOL_UNIX, UNIX_JOIN_GROUP, ...).
+
+Socket1 joined two multicast groups. Socket2 and Socket3 joined one multicast
+group. The multicast group Addr1 has 3 members. Addr2 has one member.
+
+Atomic delivery and ordering
+============================
+
+Each message sent is delivered atomically to either none of the recipients or
+all the recipients, even with interruptions and errors.
+
+The locking is done to keep the ordering consistent on all recipients. We want
+to avoid the following scenario. Two emitters A and B, and 2 recipients C and
+D:
+
+           C    D
+A -------->|    |    Step 1: A's message is delivered to C
+B -------->|    |    Step 2: B's message is delivered to C
+B ---------|--->|    Step 3: B's message is delivered to D
+A ---------|--->|    Step 4: A's message is delivered to D
+
+Although A and B had a list of recipients (C, D) in the same order, C and D
+received the messages in a different order.
+
+
+SOCK_SEQPACKET semantics
+========================
+
+When a connection is performed on a SOCK_SEQPACKET multicast socket, a new
+socket is created and its file descriptor is received with accept(). The new
+socket could join the multicast group from userspace with setsockopt() but
+there would be a race: it could lose the first messages sent by an application
+after connect() returns but before setsockopt() is executed.
+
+To avoid that race, the application should use the flag UNIX_MREQ_AUTOJOIN when
+creating the multicast group.
+
+When several connections are established to a SOCK_SEQPACKET multicast socket,
+the creator of the multicast group using UNIX_MREQ_AUTOJOIN would receive the
+messages several times: one time on each accepted socket. To avoid that, the
+creator of the group may prefer to use UNIX_MREQ_SEND_TO_PEER. Then, the
+accepted socket will not be part of the group but will still receive messages
+from its peer.
+
+
+
-- 
1.7.1

^ permalink raw reply related

* [PATCH 8/9] AF_UNIX: add options on multicast connected socket
From: Alban Crequy @ 2010-11-22 18:36 UTC (permalink / raw)
  To: Alban Crequy
  Cc: David S. Miller, Eric Dumazet, Stephen Hemminger, Cyrill Gorcunov,
	Alexey Dobriyan, Lennart Poettering, Kay Sievers, Ian Molton,
	netdev, linux-kernel, Alban Crequy
In-Reply-To: <20101122183447.124afce5@chocolatine.cbg.collabora.co.uk>

Add the UNIX_MREQ_AUTOJOIN and UNIX_MREQ_SEND_TO_PEER flags (autojoin and send-to-peer).

Signed-off-by: Alban Crequy <alban.crequy@collabora.co.uk>
---
 include/net/af_unix.h |   27 +++++++++++++++++++++------
 net/unix/af_unix.c    |   44 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 64 insertions(+), 7 deletions(-)

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index bf114d5..c82b5f8 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -40,18 +40,31 @@ struct unix_skb_parms {
 				spin_lock_nested(&unix_sk(s)->lock, \
 				SINGLE_DEPTH_NESTING)
 
-#define UNIX_MREQ_LOOPBACK	0x01
+/* UNIX socket options */
+#define UNIX_CREATE_GROUP	1
+#define UNIX_JOIN_GROUP		2
+#define UNIX_LEAVE_GROUP	3
+
+/* Flags on unix_mreq */
+
+/* On UNIX_JOIN_GROUP: the socket will receive its own messages
+ * On UNIX_CREATE_GROUP: the accepted sockets will receive their own messages
+ */
+#define UNIX_MREQ_LOOPBACK		0x01
+
+/* On UNIX_CREATE_GROUP: the accepted socket will be member of the multicast
+ * group */
+#define UNIX_MREQ_AUTOJOIN		0x02
+
+/* ON UNIX_JOIN_GROUP: the messages will also be received by the peer */
+#define UNIX_MREQ_SEND_TO_PEER		0x04
+
 struct unix_mreq
 {
 	struct sockaddr_un	address;
 	unsigned int		flags;
 };
 
-/* UNIX socket options */
-#define UNIX_CREATE_GROUP	1
-#define UNIX_JOIN_GROUP		2
-#define UNIX_LEAVE_GROUP	3
-
 #ifdef __KERNEL__
 /* The AF_UNIX socket */
 struct unix_sock {
@@ -69,6 +82,8 @@ struct unix_sock {
 	unsigned int		gc_candidate : 1;
 	unsigned int		gc_maybe_cycle : 1;
 	unsigned int		is_mcast_addr : 1;
+	unsigned int		mcast_auto_join : 1;
+	unsigned int		mcast_send_to_peer : 1;
 
 	/* These multicast fields are protected by the global spinlock
 	 * unix_multicast_lock */
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 52e2aa2..d3d6270 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -878,6 +878,17 @@ static int unix_find_multicast_members(struct sock_set *set,
 		set->items[set->cnt].to_deliver = 1;
 		set->cnt++;
 	}
+
+	if (unix_peer(sender) && unix_sk(sender)->mcast_send_to_peer) {
+		if (set->cnt + 1 > recipient_cnt)
+			return -ENOMEM;
+		sock_hold(unix_peer(sender));
+		set->items[set->cnt].s = unix_peer(sender);
+		set->items[set->cnt].skb = NULL;
+		set->items[set->cnt].to_deliver = 1;
+		set->cnt++;
+	}
+
 	return 0;
 }
 
@@ -1226,6 +1237,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	int st;
 	int err;
 	long timeo;
+	struct unix_mcast *node = NULL;
 
 	err = unix_mkname(sunaddr, addr_len, &hash);
 	if (err < 0)
@@ -1245,6 +1257,12 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 
 	err = -ENOMEM;
 
+	node = kmalloc(sizeof(struct unix_mcast), GFP_KERNEL);
+	if (!node) {
+		err = -ENOMEM;
+		goto out;
+	}
+
 	/* create new sock for complete connection */
 	newsk = unix_create1(sock_net(sk), NULL);
 	if (newsk == NULL)
@@ -1261,6 +1279,8 @@ restart:
 	if (!other)
 		goto out;
 
+	otheru = unix_sk(other);
+
 	/* Latch state of peer */
 	unix_state_lock(other);
 
@@ -1332,6 +1352,21 @@ restart:
 		goto out_unlock;
 	}
 
+	/* Multicast sockets */
+	spin_lock(&unix_multicast_lock);
+	if (otheru->is_mcast_addr && otheru->mcast_auto_join) {
+		node->member = unix_sk(newsk);
+		node->addr = otheru;
+		node->flags = 0;
+
+		hlist_add_head(&node->member_node, &otheru->mcast_members);
+		hlist_add_head(&node->subscription_node,
+			       &unix_sk(newsk)->mcast_subscriptions);
+		otheru->mcast_members_cnt++;
+		u->mcast_subscriptions_cnt++;
+	}
+	spin_unlock(&unix_multicast_lock);
+
 	/* The way is open! Fastly set all the necessary fields... */
 
 	sock_hold(sk);
@@ -1341,7 +1376,6 @@ restart:
 	init_peercred(newsk);
 	newu = unix_sk(newsk);
 	newsk->sk_wq		= &newu->peer_wq;
-	otheru = unix_sk(other);
 
 	/* copy address information from listening to new sock*/
 	if (otheru->addr) {
@@ -1380,6 +1414,8 @@ out_unlock:
 
 out:
 	kfree_skb(skb);
+	if (node)
+		kfree(node);
 	if (newsk)
 		unix_release_sock(newsk, 0);
 	if (other)
@@ -1868,6 +1904,8 @@ static int unix_mc_create(struct socket *sock, struct unix_mreq *mreq)
 
 	unix_state_lock(sock->sk);
 	unix_sk(sock->sk)->is_mcast_addr = 1;
+	if (mreq->flags & UNIX_MREQ_AUTOJOIN)
+		unix_sk(sock->sk)->mcast_auto_join = 1;
 	unix_state_unlock(sock->sk);
 
 	return 0;
@@ -1918,6 +1956,10 @@ static int unix_mc_join(struct socket *sock, struct unix_mreq *mreq)
 	node->addr = otheru;
 	node->flags = mreq->flags;
 
+	unix_state_lock(sock->sk);
+	unix_sk(sock->sk)->mcast_send_to_peer = !!(mreq->flags & UNIX_MREQ_SEND_TO_PEER);
+	unix_state_unlock(sock->sk);
+
 	spin_lock(&unix_multicast_lock);
 	hlist_add_head(&node->member_node, &otheru->mcast_members);
 	hlist_add_head(&node->subscription_node, &u->mcast_subscriptions);
-- 
1.7.1

^ permalink raw reply related

* [PATCH 0/9] RFC v2: Multicast and filtering features on AF_UNIX
From: Alban Crequy @ 2010-11-22 18:34 UTC (permalink / raw)
  To: Alban Crequy, David S. Miller, Eric Dumazet, Stephen Hemminger,
	Cyrill 

Hi,

This is a new series of patches, following my first request for comments
here: http://marc.info/?l=linux-netdev&m=128534977610124

It implements new multicast features on AF_UNIX datagram and
seqpacket sockets.

My motivation is to use it for D-Bus. The kernel code here does not
contain anything specific to D-Bus, so it could be used for other IPC
mechanisms too.

The patches apply on linux-next-20101122 and can be pulled from:

  git://git.collabora.co.uk/git/user/alban/linux-2.6.35.y/.git unix-multicast5

Comments & questions welcome! I would appreciate a review of the design
and would like to know whether it is going in the right direction.

Regards,
Alban Crequy

^ permalink raw reply

* [PATCH 4/9] AF_UNIX: find the recipients for multicast messages
From: Alban Crequy @ 2010-11-22 18:36 UTC (permalink / raw)
  To: Alban Crequy
  Cc: David S. Miller, Eric Dumazet, Stephen Hemminger, Cyrill Gorcunov,
	Alexey Dobriyan, Lennart Poettering, Kay Sievers, Ian Molton,
	netdev, linux-kernel, Alban Crequy
In-Reply-To: <20101122183447.124afce5@chocolatine.cbg.collabora.co.uk>

unix_find_multicast_recipients() builds an array of recipients. It can either
find the peers of a specific multicast address, or find all the peers of all
multicast groups the sender is part of.

Signed-off-by: Alban Crequy <alban.crequy@collabora.co.uk>
---
 net/unix/af_unix.c |  144 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 144 insertions(+), 0 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 2278829..3cc9695 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -114,15 +114,48 @@
 #include <linux/mount.h>
 #include <net/checksum.h>
 #include <linux/security.h>
+#include <linux/sort.h>
 
 static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
 static DEFINE_SPINLOCK(unix_table_lock);
+static DEFINE_SPINLOCK(unix_multicast_lock);
 static atomic_long_t unix_nr_socks;
 
 #define unix_sockets_unbound	(&unix_socket_table[UNIX_HASH_SIZE])
 
 #define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash != UNIX_HASH_SIZE)
 
+struct sock_item {
+	struct sock *s;
+	struct sk_buff *skb;
+	int to_deliver;
+};
+
+struct sock_set {
+	int cnt;
+	struct sock_item items[0];
+};
+
+static void kfree_sock_set(struct sock_set *set)
+{
+	int i;
+	for (i = 0 ; i < set->cnt ; i++)
+		sock_put(set->items[i].s);
+	kfree(set);
+}
+
+static int sock_item_compare(const void *_a, const void *_b)
+{
+	const struct sock_item *a = _a;
+	const struct sock_item *b = _b;
+	if (a->s > b->s)
+		return 1;
+	else if (a->s < b->s)
+		return -1;
+	else
+		return 0;
+}
+
 #ifdef CONFIG_SECURITY_NETWORK
 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 {
@@ -824,6 +857,117 @@ fail:
 	return NULL;
 }
 
+static int unix_find_multicast_members(struct sock_set *set,
+				       int recipient_cnt,
+				       struct sock *sender,
+				       struct hlist_head *list)
+{
+	struct unix_mcast *node;
+	struct hlist_node *pos;
+	hlist_for_each_entry(node, pos, list,
+			     member_node) {
+		if (set->cnt + 1 > recipient_cnt)
+			return -ENOMEM;
+		if (node->member == unix_sk(sender) &&
+		    !(node->flags & UNIX_MREQ_LOOPBACK))
+			continue;
+
+		sock_hold(&node->member->sk);
+		set->items[set->cnt].s = &node->member->sk;
+		set->items[set->cnt].skb = NULL;
+		set->items[set->cnt].to_deliver = 1;
+		set->cnt++;
+	}
+	return 0;
+}
+
+/* Find the recipients for a message sent by 'sender' to 'dest'. If 'dest' is
+ * NULL, the recipients are peers of all subscribed groups.
+ */
+static struct sock_set *unix_find_multicast_recipients(struct sock *sender,
+						       struct sock *dest,
+						       int *err)
+{
+	struct unix_sock *u = unix_sk(sender);
+	struct unix_mcast *node;
+	struct hlist_node *pos;
+	struct sock_set *set;
+	int recipient_cnt;
+
+	/* We cannot allocate in the spin lock. First, count the recipients */
+try_again:
+	spin_lock(&unix_multicast_lock);
+	if (dest != NULL) {
+		if (unix_sk(dest)->is_mcast_addr) {
+			recipient_cnt = unix_sk(dest)->mcast_members_cnt;
+		} else {
+			recipient_cnt = 1;
+		}
+	} else {
+		recipient_cnt = 0;
+		hlist_for_each_entry(node, pos, &u->mcast_subscriptions,
+				     subscription_node) {
+			recipient_cnt += node->addr->mcast_members_cnt;
+		}
+	}
+	spin_unlock(&unix_multicast_lock);
+
+        /* Allocate for the set and hope the number of recipients does not
+	 * change while the lock is released. If it changes, we have to try
+	 * again... We allocate a bit more than needed, so if a _few_ members
+	 * are added in a multicast group meanwhile, we don't always need to
+	 * try again. */
+	recipient_cnt += 5;
+
+	set = kmalloc(sizeof(struct sock_set)
+		      + sizeof(struct sock_item) * recipient_cnt,
+	    GFP_KERNEL);
+	if (!set) {
+		*err = -ENOMEM;
+		return NULL;
+	}
+	set->cnt = 0;
+
+	spin_lock(&unix_multicast_lock);
+	if (dest && unix_sk(dest)->is_mcast_addr) {
+		/* Message sent to a multicast address */
+		if (unix_find_multicast_members(set, recipient_cnt,
+				sender,
+				&unix_sk(dest)->mcast_members)) {
+			spin_unlock(&unix_multicast_lock);
+			kfree_sock_set(set);
+			goto try_again;
+		}
+	} else if (!dest) {
+		/* Destination not specified, sending to all peers of
+		 * subscribed groups */
+		hlist_for_each_entry(node, pos, &u->mcast_subscriptions,
+				     subscription_node) {
+			if (unix_find_multicast_members(set, recipient_cnt,
+					sender,
+					&node->addr->mcast_members)) {
+				spin_unlock(&unix_multicast_lock);
+				kfree_sock_set(set);
+				goto try_again;
+			}
+		}
+	} else {
+		/* Message sent to a non-multicast address */
+		BUG_ON(recipient_cnt < 1);
+		set->cnt = 1;
+		sock_hold(dest);
+		set->items[0].s = dest;
+		set->items[0].skb = NULL;
+		set->items[0].to_deliver = 1;
+	}
+	spin_unlock(&unix_multicast_lock);
+
+	/* Keep the array ordered to prevent deadlocks on circular waits */
+	sort(set->items, set->cnt, sizeof(struct sock_item),
+	     sock_item_compare, NULL);
+	return set;
+}
+
 
 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 {
-- 
1.7.1


^ permalink raw reply related

* [PATCH 9/9] AF_UNIX: implement poll(POLLOUT) for multicast sockets
From: Alban Crequy @ 2010-11-22 18:36 UTC (permalink / raw)
  To: Alban Crequy
  Cc: David S. Miller, Eric Dumazet, Stephen Hemminger, Cyrill Gorcunov,
	Alexey Dobriyan, Lennart Poettering, Kay Sievers, Ian Molton,
	netdev, linux-kernel, Alban Crequy
In-Reply-To: <20101122183447.124afce5@chocolatine.cbg.collabora.co.uk>

When a socket subscribed to a multicast group has its incoming queue full, it
can either block the emission to the multicast group or let the messages be
dropped. The latter is useful to monitor all messages without slowing down the
traffic.

It is specified with the flag UNIX_MREQ_DROP_WHEN_FULL when the multicast group
is joined.

poll(POLLOUT) is implemented by checking the receiving queues of all subscribed
sockets. If at least one of them has its receiving queue full and does not have
UNIX_MREQ_DROP_WHEN_FULL, the multicast socket is not writable.

Signed-off-by: Alban Crequy <alban.crequy@collabora.co.uk>
---
 include/net/af_unix.h |    5 +++++
 net/unix/af_unix.c    |   38 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index c82b5f8..d18499a 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -59,6 +59,10 @@ struct unix_skb_parms {
 /* ON UNIX_JOIN_GROUP: the messages will also be received by the peer */
 #define UNIX_MREQ_SEND_TO_PEER		0x04
 
+/* ON UNIX_JOIN_GROUP: just drop the message instead of blocking if the
+ * receiving queue is full */
+#define UNIX_MREQ_DROP_WHEN_FULL	0x08
+
 struct unix_mreq
 {
 	struct sockaddr_un	address;
@@ -84,6 +88,7 @@ struct unix_sock {
 	unsigned int		is_mcast_addr : 1;
 	unsigned int		mcast_auto_join : 1;
 	unsigned int		mcast_send_to_peer : 1;
+	unsigned int		mcast_drop_when_peer_full : 1;
 
 	/* These multicast fields are protected by the global spinlock
 	 * unix_multicast_lock */
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index d3d6270..36ee1fe 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -128,7 +128,8 @@ static atomic_long_t unix_nr_socks;
 struct sock_item {
 	struct sock *s;
 	struct sk_buff *skb;
-	int to_deliver;
+	unsigned int to_deliver : 1;
+	unsigned int drop_when_full : 1;
 };
 
 struct sock_set {
@@ -876,6 +877,8 @@ static int unix_find_multicast_members(struct sock_set *set,
 		set->items[set->cnt].s = &node->member->sk;
 		set->items[set->cnt].skb = NULL;
 		set->items[set->cnt].to_deliver = 1;
+		set->items[set->cnt].drop_when_full =
+			!!(node->flags & UNIX_MREQ_DROP_WHEN_FULL);
 		set->cnt++;
 	}
 
@@ -886,6 +889,8 @@ static int unix_find_multicast_members(struct sock_set *set,
 		set->items[set->cnt].s = unix_peer(sender);
 		set->items[set->cnt].skb = NULL;
 		set->items[set->cnt].to_deliver = 1;
+		set->items[set->cnt].drop_when_full =
+			unix_sk(sender)->mcast_drop_when_peer_full;
 		set->cnt++;
 	}
 
@@ -970,6 +975,7 @@ try_again:
 		set->items[0].s = dest;
 		set->items[0].skb = NULL;
 		set->items[0].to_deliver = 1;
+		set->items[0].drop_when_full = 0;
 	}
 	spin_unlock(&unix_multicast_lock);
 
@@ -1805,6 +1811,7 @@ restart:
 			kfree_skb(others_set->items[i].skb);
  
 			if (multicast_delivery) {
+				/* FIXME: check drop_when_full */
 				unix_state_unlock(cur);
 				others_set->items[i].to_deliver = 0;
 				continue;
@@ -1957,7 +1964,10 @@ static int unix_mc_join(struct socket *sock, struct unix_mreq *mreq)
 	node->flags = mreq->flags;
 
 	unix_state_lock(sock->sk);
-	unix_sk(sock->sk)->mcast_send_to_peer = !!(mreq->flags & UNIX_MREQ_SEND_TO_PEER);
+	unix_sk(sock->sk)->mcast_send_to_peer =
+		!!(mreq->flags & UNIX_MREQ_SEND_TO_PEER);
+	unix_sk(sock->sk)->mcast_drop_when_peer_full =
+		!!(mreq->flags & UNIX_MREQ_DROP_WHEN_FULL);
 	unix_state_unlock(sock->sk);
 
 	spin_lock(&unix_multicast_lock);
@@ -2258,6 +2268,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 		goto out_unlock;
 	}
 
+	/* FIXME: wake up peers on the multicast group too */
 	wake_up_interruptible_sync_poll(&u->peer_wait,
 					POLLOUT | POLLWRNORM | POLLWRBAND);
 
@@ -2613,6 +2624,9 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
 {
 	struct sock *sk = sock->sk, *other;
 	unsigned int mask, writable;
+	struct sock_set *others;
+	int err = 0;
+	int i;
 
 	sock_poll_wait(file, sk_sleep(sk), wait);
 	mask = 0;
@@ -2652,6 +2666,26 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
 		}
 		sock_put(other);
 	}
+	/*
+	 * On multicast sockets, we need to check if the receiving queue is
+	 * full on all peers who don't have UNIX_MREQ_DROP_WHEN_FULL.
+	 */
+	others = unix_find_multicast_recipients(sk, NULL, &err);
+	if (!others)
+		goto skip_multicast;
+	for (i = 0 ; i < others->cnt ; i++) {
+		if (others->items[i].drop_when_full)
+			continue;
+		if (unix_peer(others->items[i].s) != sk) {
+			sock_poll_wait(file,
+				&unix_sk(others->items[i].s)->peer_wait, wait);
+			if (unix_recvq_full(others->items[i].s))
+				writable = 0;
+		}
+	}
+	kfree_sock_set(others);
+
+skip_multicast:
 
 	if (writable)
 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
-- 
1.7.1


^ permalink raw reply related

* Re: [PATCH 3/9] AF_UNIX: create, join and leave multicast groups with setsockopt
From: David Miller @ 2010-11-22 19:00 UTC (permalink / raw)
  To: alban.crequy
  Cc: eric.dumazet, shemminger, gorcunov, adobriyan, lennart,
	kay.sievers, ian.molton, netdev, linux-kernel
In-Reply-To: <1290450982-17480-3-git-send-email-alban.crequy@collabora.co.uk>

From: Alban Crequy <alban.crequy@collabora.co.uk>
Date: Mon, 22 Nov 2010 18:36:16 +0000

> +	other = unix_find_other(sock_net(sock->sk), &mreq->address, namelen,
> +				sock->type, hash, &err);
> +	if (other)
> +		return -EADDRINUSE;

Leaks 'other'.

^ permalink raw reply

* Re: [PATCH 4/9] AF_UNIX: find the recipients for multicast messages
From: David Miller @ 2010-11-22 19:05 UTC (permalink / raw)
  To: alban.crequy
  Cc: eric.dumazet, shemminger, gorcunov, adobriyan, lennart,
	kay.sievers, ian.molton, netdev, linux-kernel
In-Reply-To: <1290450982-17480-4-git-send-email-alban.crequy@collabora.co.uk>

From: Alban Crequy <alban.crequy@collabora.co.uk>
Date: Mon, 22 Nov 2010 18:36:17 +0000

> unix_find_multicast_recipients() builds an array of recipients. It can either
> find the peers of a specific multicast address, or find all the peers of all
> multicast group the sender is part of.
> 
> Signed-off-by: Alban Crequy <alban.crequy@collabora.co.uk>

You really should use RCU to lock this stuff, this way sends run
lockless and have less worries wrt. the memory allocation.  You'll
also only take a spinlock in the write paths which change the
multicast groups, which ought to be rare.

Although to be honest you should optimize the case of small numbers of
recipients, in the same way we optimize small numbers of iovecs on
sends.  Have an on-stack array that holds a small number of entries
and use that if the set fits, otherwise dynamic allocation.

^ permalink raw reply

* Re: [PATCH 7/9] AF_UNIX: Documentation on multicast Unix Sockets
From: Rémi Denis-Courmont @ 2010-11-22 19:07 UTC (permalink / raw)
  To: Alban Crequy, netdev; +Cc: linux-kernel
In-Reply-To: <1290450982-17480-7-git-send-email-alban.crequy@collabora.co.uk>

Le lundi 22 novembre 2010 20:36:20 Alban Crequy, vous avez écrit :
> +Multicast Unix sockets
> +======================
> +
> +Multicast group memberships are stored in struct unix_mcast nodes. An Unix
> +socket can join several multicast groups. Struct unix_mcast nodes are
> doubly +linked:
> +- In (struct unix_sock)->mcast_subscriptions
> +- In (struct unix_sock)->mcast_members

I may be stupid, but I found this whole documentation very confusing, and so 
the API it tries to describe. Traditionally:
- Senders may or may not be part of the group and are not kept track of.
- Receivers join to the group then receive message sent to it.
- Loopback defines whether a sender receives its own echo if it sends to a
group that it has joined.
- If connected to a multicast group, messages from the socket are routed to 
the group (in absence of a contradictory socket address). This has no effect on 
membership to the multicast group under any circumstance.

You cannot 'listen' or 'accept' on a multicast group.

So I am not entirely clear what semantics your patchset is following. But it 
does not seem like "multicast" to me and therefore seems not very well 
documented :-(

-- 
Rémi Denis-Courmont
http://www.remlab.net/
http://fi.linkedin.com/in/remidenis

^ permalink raw reply

* [GIT] Networking
From: David Miller @ 2010-11-22 20:10 UTC (permalink / raw)
  To: torvalds; +Cc: akpm, netdev, linux-kernel


1) Fix screaming IRQ in e1000, from Anupam Chanda.

2) Fix module parameter bustage in qlge, from Sonny Rao.

3) Interface address leak in ipv6, fix from John Fastabend.

   This would have been merged sooner except that I erroneously
   put this into my net-next-2.6 tree, oops.

4) Support for more Marvell PHY variants, from David Daney.

5) Chip variant checking fix in atl1c from Ben Hutchings.

6) Baud rate correction fix in SH-irda driver, from Nicolas Kaiser.

Please pull, thanks a lot!

The following changes since commit b86db4744230c94e480de56f1b7f31117edbf193:

  Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4 (2010-11-19 19:46:45 -0800)

are available in the git repository at:

  master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6.git master

Anupam Chanda (1):
      e1000: fix screaming IRQ

Ben Hutchings (1):
      atl1c: Fix hardware type check for enabling OTP CLK

David Daney (3):
      phylib: Use common page register definition for Marvell PHYs.
      phylib: Add support for Marvell 88E1149R devices.
      of/phylib: Use device tree properties to initialize Marvell PHYs.

Eric Dumazet (1):
      net: allow GFP_HIGHMEM in __vmalloc()

John Fastabend (1):
      ipv6: fix missing in6_ifa_put in addrconf

Nicolas Kaiser (1):
      SuperH IrDA: correct Baud rate error correction

Simon Horman (1):
      bonding: change list contact to netdev@vger.kernel.org

Sonny Rao (1):
      qlge: Fix incorrect usage of module parameters and netdev msg level

 MAINTAINERS                    |    2 +-
 drivers/net/atl1c/atl1c_hw.c   |    2 +-
 drivers/net/e1000/e1000_main.c |   12 ++-
 drivers/net/irda/sh_sir.c      |    2 +-
 drivers/net/phy/marvell.c      |  164 ++++++++++++++++++++++++++++++++++++---
 drivers/net/qlge/qlge_main.c   |    6 +-
 include/linux/marvell_phy.h    |    1 +
 net/ceph/buffer.c              |    2 +-
 net/core/request_sock.c        |    4 +-
 net/ipv4/fib_trie.c            |    2 +-
 net/ipv6/addrconf.c            |    6 +-
 net/xfrm/xfrm_hash.c           |    2 +-
 12 files changed, 173 insertions(+), 32 deletions(-)

^ permalink raw reply

* Re: [PATCH 7/9] AF_UNIX: Documentation on multicast Unix Sockets
From: Alban Crequy @ 2010-11-22 20:09 UTC (permalink / raw)
  To: Rémi Denis-Courmont; +Cc: netdev, linux-kernel
In-Reply-To: <201011222107.41548.remi@remlab.net>

Le Mon, 22 Nov 2010 21:07:40 +0200,
"Rémi Denis-Courmont" <remi@remlab.net> a écrit :

> Le lundi 22 novembre 2010 20:36:20 Alban Crequy, vous avez écrit :
> > +Multicast Unix sockets
> > +======================
> > +
> > +Multicast group memberships are stored in struct unix_mcast nodes.
> > An Unix +socket can join several multicast groups. Struct
> > unix_mcast nodes are doubly +linked:
> > +- In (struct unix_sock)->mcast_subscriptions
> > +- In (struct unix_sock)->mcast_members
> 
> I may be stupid, but I found this whole documentation very confusing,
> and so the API it tries to describe. Traditionally:
> - Senders may or may not be part of the group and are not kept track
> of.
> - Receivers join to the group then receive message sent to it.
> - Loopback defines whether a sender receives its own echo if it sends
> to a group that it has joined.
> - If connected to a multicast group, messages from the socket are
> routed to the group (in absence of a contradictory socket address).
> This has no effect on membership to the multicast group under any
> circumstance.

I keep these traditional properties for multicast on Unix sockets.

> You cannot 'listen' or 'accept' on a multicast group.

Datagram sockets cannot listen() or accept() but seqpacket sockets can.
I would like multicast to work on seqpacket sockets too. In this case,
there is a central daemon that calls listen(), and accept() returns a new
socket. The central daemon controls the lifetime of the multicast
group and can receive the messages from the peers on the socket
returned by accept() if UNIX_MREQ_SEND_TO_PEER is set.

The accepted socket could join the multicast group (and then receive
messages addressed to the group) with the setsockopt() call, but then
there would be a race that it may not receive the first messages if a
peer calls connect() and sends a message immediately afterwards. connect() can
return in the peer process before the daemon calls accept() and runs
setsockopt(). I added the flag UNIX_MREQ_AUTOJOIN (to be set when
creating the multicast group) to prevent that race.

Using connected sockets (seqpacket) is useful for D-Bus because a
central daemon can know when members are connecting and disconnecting
and then emit the D-Bus signal 'NameOwnerChanged'.

> So I am not entirely clear what semantics your patchset is following.
> But it does not seem like "multicast" to me and therefore seems not
> very well documented :-(

I am willing to improve it.

-- 
Alban

^ permalink raw reply

* Re: [PATCH 4/9] AF_UNIX: find the recipients for multicast messages
From: Andi Kleen @ 2010-11-22 20:14 UTC (permalink / raw)
  To: Alban Crequy
  Cc: David S. Miller, Eric Dumazet, Stephen Hemminger, Cyrill Gorcunov,
	Alexey Dobriyan, Lennart Poettering, Kay Sievers, Ian Molton,
	netdev, linux-kernel
In-Reply-To: <1290450982-17480-4-git-send-email-alban.crequy@collabora.co.uk>

Alban Crequy <alban.crequy@collabora.co.uk> writes:

>+static DEFINE_SPINLOCK(unix_multicast_lock);

For DBUS it's probably ok, but I suspect for other usages
the global lock in the multipath fast path is going to hurt
sooner or later.

> +
> +        /* Allocate for the set and hope the number of recipients does not
> +	 * change while the lock is released. If it changes, we have to try
> +	 * again... We allocate a bit more than needed, so if a _few_ members
> +	 * are added in a multicast group meanwhile, we don't always need to
> +	 * try again. */
> +	recipient_cnt += 5;
> +
> +	set = kmalloc(sizeof(struct sock_set)
> +		      + sizeof(struct sock_item) * recipient_cnt,
> +	    GFP_KERNEL);

FWIW for a large number of sockets this will likely run into
memory fragmentation issues. There are various workarounds like
fallback to vmalloc or use something like flex_arrays.


-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply

* Re: pc300too on a modern kernel?
From: Krzysztof Halasa @ 2010-11-22 21:20 UTC (permalink / raw)
  To: Bernie Innocenti; +Cc: Ward Vandewege, lkml, Jan Seiffert, netdev
In-Reply-To: <1290442675.5515.92.camel@giskard.codewiz.org>

(added Cc: netdev)

Bernie Innocenti <bernie@codewiz.org> writes:

> Now the question is: why do we get so many spurious interrupts?

Let me see... we call sca_tx_done() on (isr0 & 0x2020) which are DMIB3
and DMIB1, which in turn are (EOT & (EOTE = 0) | EOM & (EOME = 1)), i.e.
the interrupt is generated on EOM (end of message = packet).

It seems TN-PSC-339A/E is the answer: the interrupt is generated at the
end of the last DMA access filling the TX buffer. Only then the status
is written to the descriptor (=RAM). I guess it didn't make a difference
on older, slower machines, with slower paths from PCI to CPU.
Also I don't know if the descriptor status is being written in the same
DMA transfer (between the chip and on-board SRAM) as the last data
transfer. Perhaps it's another DMA request and arbitration, and perhaps
the chip has to wait for another transfer to finish.

> With this workaround applied, we're still seeing occasional clusters of
> packet loss. We're working to graph the ping loss alongside traffic to
> see if there's any correlation.

That's interesting. I remember seeing some TX underruns at higher
speeds, though nothing alike at 2 Mb/s. What bit rate are you using?
Does "ifconfig hdlc0" show any errors?
-- 
Krzysztof Halasa

^ permalink raw reply

* Re: Fail Transfer of Large Files
From: Francois Romieu @ 2010-11-22 21:48 UTC (permalink / raw)
  To: Michael D. Berger; +Cc: netdev
In-Reply-To: <ic726a$dep$1@dough.gmane.org>

Michael D. Berger <m_d_berger_1900@yahoo.com> :
[...]
> So what do you think ?

You forgot to specify the kernel version and the involved driver.

> Any other suggestions ?

Use a recent kernel ?

What does dmesg say ?

-- 
Ueimor

^ permalink raw reply

* Re: [PATCH 00/62] drivers/net: Use static const
From: Joe Perches @ 2010-11-22 22:10 UTC (permalink / raw)
  To: David Miller
  Cc: ath5k-devel, libertas-dev, e1000-devel, netdev, linux-usb,
	linux-wireless, users, linux-kernel, socketcan-core, ath9k-devel
In-Reply-To: <20101120.205059.193725921.davem@davemloft.net>

On Sat, 2010-11-20 at 20:50 -0800, David Miller wrote: 
> From: Joe Perches <joe@perches.com>
> Date: Sat, 20 Nov 2010 18:38:01 -0800
> 
> > Using static const generally increases object text and decreases data size.
> > It also generally decreases overall object size.
> 
> Joe, I'm going to be frank and say that I'm not going to review and
> apply such a large chunk of networking patches.

Hi David.  No worries.

> Separation is cool, splitting up patches is cool to make review
> perhaps easier and more distributed.

I know you have ownership of netdev.  Thanks for doing all that
work.  You may not get enough feedback on the very good job that
you do at it.

I posted these patches not so much to get you to immediately pick
them up, but to get review from and/or notify the maintainers of
each of these subsystems.  I would have submitted them to Jiri
Kosina/trivial, but he asked me to post and track these style
patches separately and wait for a few weeks before submitting any
remaining patches not picked up by maintainers to him.

> It severely negatively affects my mood,

Can't have that...

> You also make this more difficult for me by not using GIT.

I do of course use git locally.  I have a public tree as well,
but I hardly update it.  I updated it recently for this.

These patches are available in:

  git://repo.or.cz/linux-2.6/trivial-mods.git 20101121_net_next_static_const

Perhaps I should ask Stephen Rothwell to include some trivial tree
branch like this in next.

> Even with patchwork helping me significantly, it's still a lot of work
> to apply large sets of patches.

I think that cc'ing netdev is always appropriate for netdev patches, but
maybe trivial@kernel.org could become linux-trivial@vger.kernel.org or
something like that.

Perhaps there could be some way to automatically mark these exceptionally
trivial patches as something like "not applicable" so they don't appear
on your personal queue.

> If I had to take in John Linville's
> wireless stuff without GIT I'd be banging my head on a wall.
> 
> And these incessant huge patch bombs also take time away from me for
> the things I'd like to at least occasionally work on that involve
> more intellect than monkeying around with such mindless patches.

Establishing trust is always a long term thing.
Breaking trust is easy too.

Anyway, I think these patches are obvious and correct and can be
directly applied without significant risk.

I think the issues are:

o should the nominal maintainers of the subsystems pick them up
o should these subsystem maintainers be bypassed
o should the subsystem mailing lists be cc'd
o should these be pulled as a single changeset or multiples

I think that getting the nominal subsystem maintainers involved
is good, but perhaps not too necessary for these sorts of patches.

btw: it doesn't seem that, other than John Linville for wireless,
you currently pull from many (any?) other people.  Do let me know
if you'd consider pulling these sorts of changes from me.

$ git log --merges --since=2-years-ago drivers/net | grep "^Author: " | sort | uniq -c | sort -rn
    259 Author: David S. Miller <davem@davemloft.net>
    201 Author: Linus Torvalds <torvalds@linux-foundation.org>
     45 Author: John W. Linville <linville@tuxdriver.com>
     15 Author: Russell King <rmk@dyn-67.arm.linux.org.uk>
     15 Author: Ingo Molnar <mingo@elte.hu>
     14 Author: Benjamin Herrenschmidt <benh@kernel.crashing.org>
      7 Author: Russell King <rmk+kernel@arm.linux.org.uk>
      6 Author: Jiri Kosina <jkosina@suse.cz>
      4 Author: Reinette Chatre <reinette.chatre@intel.com>
      3 Author: Roland Dreier <rolandd@cisco.com>
      3 Author: James Morris <jmorris@namei.org>
      2 Author: Tony Lindgren <tony@atomide.com>
      2 Author: Grant Likely <grant.likely@secretlab.ca>
      1 Author: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
      1 Author: root <root@dyn-67.arm.linux.org.uk>
      1 Author: Paul Mackerras <paulus@samba.org>
      1 Author: Kevin Hilman <khilman@deeprootsystems.com>
      1 Author: James Bottomley <James.Bottomley@HansenPartnership.com>

> Thanks for your understanding.  Feel free to submit this stuff in
> smaller chunks, say ~10 patches at a time.

Let me know if you want to pull or want them in smaller chunks in
say a month from now.

cheers, Joe


------------------------------------------------------------------------------
Increase Visibility of Your 3D Game App & Earn a Chance To Win $500!
Tap into the largest installed PC base & get more eyes on your game by
optimizing for Intel(R) Graphics Technology. Get started today with the
Intel(R) Software Partner Program. Five $500 cash prizes are up for grabs.
http://p.sf.net/sfu/intelisp-dev2dev
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel® Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply

* Re: [PATCH 00/62] drivers/net: Use static const
From: David Miller @ 2010-11-22 22:19 UTC (permalink / raw)
  To: joe-6d6DIl74uiNBDgjK7y7TUQ
  Cc: ath5k-devel-xDcbHBWguxEUs3QNXV6qNA,
	libertas-dev-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	e1000-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	netdev-u79uwXL29TY76Z2rM5mHXA, linux-usb-u79uwXL29TY76Z2rM5mHXA,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	users-poMEt7QlJxcwIE2E9O76wjtx2kNaKg5H,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	socketcan-core-0fE9KPoRgkgATYTw5x5z8w,
	ath9k-devel-xDcbHBWguxHbcTqmT+pZeQ
In-Reply-To: <1290463846.27683.36.camel@Joe-Laptop>

From: Joe Perches <joe-6d6DIl74uiNBDgjK7y7TUQ@public.gmane.org>
Date: Mon, 22 Nov 2010 14:10:46 -0800

> btw: it doesn't seem that, other than John Linville for wireless,
> you currently pull from many (any?) other people.  Do let me know
> if you'd consider pulling these sorts of changes from me.

Expand your horizon beyond drivers/net/ and into places like net/
and you'll see I also pull from people like Patrick McHardy, for
example.

^ permalink raw reply

* Re: [PATCH 00/62] drivers/net: Use static const
From: Joe Perches @ 2010-11-22 22:40 UTC (permalink / raw)
  To: David Miller
  Cc: ath5k-devel, libertas-dev, e1000-devel, netdev, linux-usb,
	linux-wireless, users, linux-kernel, socketcan-core, ath9k-devel
In-Reply-To: <20101122.141939.226774888.davem@davemloft.net>

On Mon, 2010-11-22 at 14:19 -0800, David Miller wrote:
> From: Joe Perches <joe@perches.com>
> Date: Mon, 22 Nov 2010 14:10:46 -0800
> > Do let me know if you'd consider pulling these sorts of
> > changes from me.
> Expand your horizon beyond drivers/net/ and into places like net/
> and you'll see I also pull from people like Patrick McHardy, for
> example.

I'll see who picks up or acks what and send a pull request to
you for the remainder of these patches mid December or so.

Do you want micro patches or a single patch?


------------------------------------------------------------------------------
Increase Visibility of Your 3D Game App & Earn a Chance To Win $500!
Tap into the largest installed PC base & get more eyes on your game by
optimizing for Intel(R) Graphics Technology. Get started today with the
Intel(R) Software Partner Program. Five $500 cash prizes are up for grabs.
http://p.sf.net/sfu/intelisp-dev2dev
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel® Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply

* Re: ixgbe dump
From: Yinghai Lu @ 2010-11-22 22:41 UTC (permalink / raw)
  To: Skidmore, Donald C
  Cc: Brandeburg, Jesse, David Miller, NetDev, Kirsher, Jeffrey T
In-Reply-To: <29F4ED941D916B48B88B4D2A4F3D1B9C01CBFC68C5@orsmsx509.amr.corp.intel.com>

On Thu, Nov 18, 2010 at 11:30 AM, Skidmore, Donald C
<donald.c.skidmore@intel.com> wrote:
>>-----Original Message-----
>>From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org] On
>>Behalf Of Yinghai Lu
>>Sent: Wednesday, November 17, 2010 5:22 PM
>>To: Brandeburg, Jesse; David Miller
>>Cc: NetDev
>>Subject: ixgbe dump
>>
>>[ 1546.287521] md: stopping all md devices.
>>[ 1547.283729] kvm: exiting hardware virtualization
>>[ 1547.292876] sd 2:2:1:0: [sdb] Synchronizing SCSI cache
>>[ 1547.293831] sd 2:2:0:0: [sda] Synchronizing SCSI cache
>>[ 1547.299627] BUG: unable to handle kernel NULL pointer dereference at
>>0000000000000033
>>[ 1547.315819] IP: [<ffffffff81746273>] ixgbe_set_rx_mode+0x265/0x38e
>>[ 1547.316448] PGD 3ff4487067 PUD 3ff216b067 PMD 0
>>[ 1547.335626] Oops: 0000 [#1] SMP
>>[ 1547.335941] last sysfs file: /sys/kernel/kexec_loaded
>>[ 1547.336381] CPU 0
>>[ 1547.336548] Modules linked in:
>>[ 1547.355798]
>>[ 1547.355968] Pid: 25630, comm: kexec Not tainted 2.6.37-rc2-tip-yh-01961-
>>g6034289-dirty #281      /Sun Fire X4800
>>[ 1547.375849] RIP: 0010:[<ffffffff81746273>]  [<ffffffff81746273>]
>>ixgbe_set_rx_mode+0x265/0x38e
>>[ 1547.395543] RSP: 0018:ffff881fb9d49ce8  EFLAGS: 00010287
>>[ 1547.396080] RAX: 0000000000000000 RBX: ffff88dffe5a0940 RCX:
>>ffff88dffe5a0940
>>[ 1547.415635] RDX: 0000000000000000 RSI: 0000000000000001 RDI:
>>ffffc90077780000
>>[ 1547.416299] RBP: ffff881fb9d49d48 R08: 0000000000000000 R09:
>>0000000000000000
>>[ 1547.435860] R10: 000000000000a608 R11: 0000000000000000 R12:
>>0000000000000000
>>[ 1547.455516] R13: ffff88dffe5a0000 R14: 0000000000003400 R15:
>>ffff881fb9d49db7
>>[ 1547.456257] FS:  00000000006e0850(0063) GS:ffff88207d600000(0000)
>>knlGS:0000000000000000
>>[ 1547.475937] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
>>[ 1547.495330] CR2: 0000000000000033 CR3: 0000003ffa77e000 CR4:
>>00000000000006f0
>>[ 1547.496009] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
>>0000000000000000
>>[ 1547.515594] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
>>0000000000000400
>>[ 1547.516229] Process kexec (pid: 25630, threadinfo ffff881fb9d48000, task
>>ffff881f8a57a2d0)
>>[ 1547.536039] Stack:
>>[ 1547.536244]  0000000000000040 0000002b00000000 ffff881ffedc1000
>>0000000000000006
>>[ 1547.555894]  ffff88dffe5a1b40 0b00000081454ec8 ffff881fb9d49d48
>>ffff88dffe5a0940
>>[ 1547.575411]  ffff881ffedc1000 000000000000001e 0000000000000000
>>ffff881fb9d49db7
>>[ 1547.576131] Call Trace:
>>[ 1547.595251]  [<ffffffff8174a8f4>] __ixgbe_shutdown+0x9d/0x153
>>[ 1547.595770]  [<ffffffff8174a9c4>] ixgbe_shutdown+0x1a/0x43
>>[ 1547.615250]  [<ffffffff814574bb>] pci_device_shutdown+0x2c/0x40
>>[ 1547.615887]  [<ffffffff8151c5e9>] device_shutdown+0x75/0xb0
>>[ 1547.635268]  [<ffffffff8108f461>] kernel_restart_prepare+0x2c/0x33
>>[ 1547.635927]  [<ffffffff810bc821>] kernel_kexec+0x38/0x6b
>>[ 1547.655252]  [<ffffffff8108f618>] sys_reboot+0x156/0x194
>>[ 1547.655765]  [<ffffffff8114422a>] ? __d_free+0x59/0x5e
>>[ 1547.675145]  [<ffffffff81144283>] ? d_free+0x54/0x66
>>[ 1547.675612]  [<ffffffff8114439d>] ? d_kill+0x3b/0x43
>>[ 1547.695025]  [<ffffffff81144a1c>] ? dput+0x40/0x140
>>[ 1547.695539]  [<ffffffff811351a9>] ? fput+0x1d7/0x1e6
>>[ 1547.696045]  [<ffffffff81036c0c>] ? sysret_check+0x27/0x62
>>[ 1547.715426]  [<ffffffff81cd9772>] ? trace_hardirqs_on_thunk+0x3a/0x3f
>>[ 1547.734897]  [<ffffffff81036bdb>] system_call_fastpath+0x16/0x1b
>>[ 1547.735428] Code: d2 e9 81 00 00 00 48 8b 83 00 12 00 00 8b 80 88 50 00
>>00 0d 00 00 00 80 e9 a7 00 00 00 48 8b 81 00 0a 00 00 48 8b bb 00 12 00 00
>><0f> b6 40 33 83 f8 3f 7f 0d 89 c6 c1 e6 06 81 c6 28 10 00 00 eb
>>[ 1547.775262] RIP  [<ffffffff81746273>] ixgbe_set_rx_mode+0x265/0x38e
>>[ 1547.775909]  RSP <ffff881fb9d49ce8>
>>[ 1547.794997] CR2: 0000000000000033
>>[ 1547.795987] ---[ end trace 4ed9616adc45007c ]---
>>--
>>To unsubscribe from this list: send the line "unsubscribe netdev" in
>>the body of a message to majordomo@vger.kernel.org
>>More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
> Thanks for the dump.
>
> I believe I've found the problem and will get a patch to Jeff shortly.

Did you have that patch posted already?

Thanks

Yinghai

^ permalink raw reply

* [PATCH v2] tcp: restrict net.ipv4.tcp_adv_win_scale (#20312)
From: Alexey Dobriyan @ 2010-11-22 22:54 UTC (permalink / raw)
  To: davem; +Cc: Eric Dumazet, shemminger, netdev, Ben Hutchings
In-Reply-To: <1289830722.2586.5.camel@bwh-desktop>

tcp_win_from_space() does the following:

      if (sysctl_tcp_adv_win_scale <= 0)
              return space >> (-sysctl_tcp_adv_win_scale);
      else
              return space - (space >> sysctl_tcp_adv_win_scale);

"space" is int.

As per C99 6.5.7 (3) shifting int for 32 or more bits is
undefined behaviour.

Indeed, if sysctl_tcp_adv_win_scale is exactly 32,
space >> 32 equals space and function returns 0.

Which means we busyloop in tcp_fixup_rcvbuf().

Restrict net.ipv4.tcp_adv_win_scale to [-31, 31].

Fix https://bugzilla.kernel.org/show_bug.cgi?id=20312

Steps to reproduce:

      echo 32 >/proc/sys/net/ipv4/tcp_adv_win_scale
      wget www.kernel.org
      [softlockup]

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---

 Documentation/networking/ip-sysctl.txt |    1 +
 net/ipv4/sysctl_net_ipv4.c             |    6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -144,6 +144,7 @@ tcp_adv_win_scale - INTEGER
 	Count buffering overhead as bytes/2^tcp_adv_win_scale
 	(if tcp_adv_win_scale > 0) or bytes-bytes/2^(-tcp_adv_win_scale),
 	if it is <= 0.
+	Possible values are [-31, 31], inclusive.
 	Default: 2
 
 tcp_allowed_congestion_control - STRING
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -26,6 +26,8 @@ static int zero;
 static int tcp_retr1_max = 255;
 static int ip_local_port_range_min[] = { 1, 1 };
 static int ip_local_port_range_max[] = { 65535, 65535 };
+static int tcp_adv_win_scale_min = -31;
+static int tcp_adv_win_scale_max = 31;
 
 /* Update system visible IP port range */
 static void set_local_port_range(int range[2])
@@ -426,7 +428,9 @@ static struct ctl_table ipv4_table[] = {
 		.data		= &sysctl_tcp_adv_win_scale,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &tcp_adv_win_scale_min,
+		.extra2		= &tcp_adv_win_scale_max,
 	},
 	{
 		.procname	= "tcp_tw_reuse",

^ permalink raw reply

* Re: ixgbe dump
From: Jeff Kirsher @ 2010-11-22 23:22 UTC (permalink / raw)
  To: Yinghai Lu; +Cc: Skidmore, Donald C, Brandeburg, Jesse, David Miller, NetDev
In-Reply-To: <AANLkTinQ-6CaUY3bYd+SMBCaNvU5YiXL2P+gh7N1tM-x@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 4825 bytes --]

On Mon, 2010-11-22 at 14:41 -0800, Yinghai Lu wrote:
> On Thu, Nov 18, 2010 at 11:30 AM, Skidmore, Donald C
> <donald.c.skidmore@intel.com> wrote:
> >>-----Original Message-----
> >>From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org] On
> >>Behalf Of Yinghai Lu
> >>Sent: Wednesday, November 17, 2010 5:22 PM
> >>To: Brandeburg, Jesse; David Miller
> >>Cc: NetDev
> >>Subject: ixgbe dump
> >>
> >>[ 1546.287521] md: stopping all md devices.
> >>[ 1547.283729] kvm: exiting hardware virtualization
> >>[ 1547.292876] sd 2:2:1:0: [sdb] Synchronizing SCSI cache
> >>[ 1547.293831] sd 2:2:0:0: [sda] Synchronizing SCSI cache
> >>[ 1547.299627] BUG: unable to handle kernel NULL pointer dereference at
> >>0000000000000033
> >>[ 1547.315819] IP: [<ffffffff81746273>] ixgbe_set_rx_mode+0x265/0x38e
> >>[ 1547.316448] PGD 3ff4487067 PUD 3ff216b067 PMD 0
> >>[ 1547.335626] Oops: 0000 [#1] SMP
> >>[ 1547.335941] last sysfs file: /sys/kernel/kexec_loaded
> >>[ 1547.336381] CPU 0
> >>[ 1547.336548] Modules linked in:
> >>[ 1547.355798]
> >>[ 1547.355968] Pid: 25630, comm: kexec Not tainted 2.6.37-rc2-tip-yh-01961-
> >>g6034289-dirty #281      /Sun Fire X4800
> >>[ 1547.375849] RIP: 0010:[<ffffffff81746273>]  [<ffffffff81746273>]
> >>ixgbe_set_rx_mode+0x265/0x38e
> >>[ 1547.395543] RSP: 0018:ffff881fb9d49ce8  EFLAGS: 00010287
> >>[ 1547.396080] RAX: 0000000000000000 RBX: ffff88dffe5a0940 RCX:
> >>ffff88dffe5a0940
> >>[ 1547.415635] RDX: 0000000000000000 RSI: 0000000000000001 RDI:
> >>ffffc90077780000
> >>[ 1547.416299] RBP: ffff881fb9d49d48 R08: 0000000000000000 R09:
> >>0000000000000000
> >>[ 1547.435860] R10: 000000000000a608 R11: 0000000000000000 R12:
> >>0000000000000000
> >>[ 1547.455516] R13: ffff88dffe5a0000 R14: 0000000000003400 R15:
> >>ffff881fb9d49db7
> >>[ 1547.456257] FS:  00000000006e0850(0063) GS:ffff88207d600000(0000)
> >>knlGS:0000000000000000
> >>[ 1547.475937] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> >>[ 1547.495330] CR2: 0000000000000033 CR3: 0000003ffa77e000 CR4:
> >>00000000000006f0
> >>[ 1547.496009] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
> >>0000000000000000
> >>[ 1547.515594] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
> >>0000000000000400
> >>[ 1547.516229] Process kexec (pid: 25630, threadinfo ffff881fb9d48000, task
> >>ffff881f8a57a2d0)
> >>[ 1547.536039] Stack:
> >>[ 1547.536244]  0000000000000040 0000002b00000000 ffff881ffedc1000
> >>0000000000000006
> >>[ 1547.555894]  ffff88dffe5a1b40 0b00000081454ec8 ffff881fb9d49d48
> >>ffff88dffe5a0940
> >>[ 1547.575411]  ffff881ffedc1000 000000000000001e 0000000000000000
> >>ffff881fb9d49db7
> >>[ 1547.576131] Call Trace:
> >>[ 1547.595251]  [<ffffffff8174a8f4>] __ixgbe_shutdown+0x9d/0x153
> >>[ 1547.595770]  [<ffffffff8174a9c4>] ixgbe_shutdown+0x1a/0x43
> >>[ 1547.615250]  [<ffffffff814574bb>] pci_device_shutdown+0x2c/0x40
> >>[ 1547.615887]  [<ffffffff8151c5e9>] device_shutdown+0x75/0xb0
> >>[ 1547.635268]  [<ffffffff8108f461>] kernel_restart_prepare+0x2c/0x33
> >>[ 1547.635927]  [<ffffffff810bc821>] kernel_kexec+0x38/0x6b
> >>[ 1547.655252]  [<ffffffff8108f618>] sys_reboot+0x156/0x194
> >>[ 1547.655765]  [<ffffffff8114422a>] ? __d_free+0x59/0x5e
> >>[ 1547.675145]  [<ffffffff81144283>] ? d_free+0x54/0x66
> >>[ 1547.675612]  [<ffffffff8114439d>] ? d_kill+0x3b/0x43
> >>[ 1547.695025]  [<ffffffff81144a1c>] ? dput+0x40/0x140
> >>[ 1547.695539]  [<ffffffff811351a9>] ? fput+0x1d7/0x1e6
> >>[ 1547.696045]  [<ffffffff81036c0c>] ? sysret_check+0x27/0x62
> >>[ 1547.715426]  [<ffffffff81cd9772>] ? trace_hardirqs_on_thunk+0x3a/0x3f
> >>[ 1547.734897]  [<ffffffff81036bdb>] system_call_fastpath+0x16/0x1b
> >>[ 1547.735428] Code: d2 e9 81 00 00 00 48 8b 83 00 12 00 00 8b 80 88 50 00
> >>00 0d 00 00 00 80 e9 a7 00 00 00 48 8b 81 00 0a 00 00 48 8b bb 00 12 00 00
> >><0f> b6 40 33 83 f8 3f 7f 0d 89 c6 c1 e6 06 81 c6 28 10 00 00 eb
> >>[ 1547.775262] RIP  [<ffffffff81746273>] ixgbe_set_rx_mode+0x265/0x38e
> >>[ 1547.775909]  RSP <ffff881fb9d49ce8>
> >>[ 1547.794997] CR2: 0000000000000033
> >>[ 1547.795987] ---[ end trace 4ed9616adc45007c ]---
> >>--
> >>To unsubscribe from this list: send the line "unsubscribe netdev" in
> >>the body of a message to majordomo@vger.kernel.org
> >>More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >
> > Thanks for the dump.
> >
> > I believe I've found the problem and will get a patch to Jeff shortly.
> 
> Did you have that patch posted already?
> 
> Thanks
> 
> Yinghai

It is not posted yet. Don got the patch to our testers and they are
doing a quick validation on the patch before I post it to netdev.

I can send you the patch, so that you can assist in letting us know if
it resolves the issue.

Cheers,
Jeff

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 490 bytes --]

^ permalink raw reply

* Re: ixgbe dump
From: Yinghai Lu @ 2010-11-22 23:28 UTC (permalink / raw)
  To: jeffrey.t.kirsher
  Cc: Skidmore, Donald C, Brandeburg, Jesse, David Miller, NetDev
In-Reply-To: <1290468133.2603.2.camel@jtkirshe-MOBL1>

On Mon, Nov 22, 2010 at 3:22 PM, Jeff Kirsher
<jeffrey.t.kirsher@intel.com> wrote:
>
> It is not posted yet. Don got the patch to our testers and they are
> doing a quick validation on the patch before I post it to netdev.
>
> I can send you the patch, so that you can assist in letting us know if
> it resolves the issue.

sure. please send that to me.

Thanks

^ permalink raw reply

* Re: [PATCH] arch/tile: fix rwlock so would-be write lockers don't block new readers
From: Cypher Wu @ 2010-11-23  1:36 UTC (permalink / raw)
  To: Chris Metcalf; +Cc: linux-kernel, Américo Wang, Eric Dumazet, netdev
In-Reply-To: <4CEA71AD.5010606@tilera.com>

2010/11/22 Chris Metcalf <cmetcalf@tilera.com>:
> On 11/22/2010 12:39 AM, Cypher Wu wrote:
>> 2010/11/15 Chris Metcalf <cmetcalf@tilera.com>:
>>> This avoids a deadlock in the IGMP code where one core gets a read
>>> lock, another core starts trying to get a write lock (thus blocking
>>> new readers), and then the first core tries to recursively re-acquire
>>> the read lock.
>>>
>>> We still try to preserve some degree of balance by giving priority
>>> to additional write lockers that come along while the lock is held
>>> for write, so they can all complete quickly and return the lock to
>>> the readers.
>>>
>>> Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
>>> ---
>>> This should apply relatively cleanly to 2.6.26.7 source code too.
>>> [...]
>>
>> I've finished my business trip and tested that patch for more than an
>> hour and it works. The test is still running now.
>>
>> But it seems there still has a potential problem: we used ticket lock
>> for write_lock(), and if there are so many write_lock() occurred, is
>> 256 ticket enough for 64 or even more cores to avoiding overflow?
>> Since is we try to write_unlock() and there's already write_lock()
>> waiting we'll only adding current ticket.
>
> This is OK, since each core can issue at most one (blocking) write_lock(),
> and we have only 64 cores.  Future >256 core machines will be based on
> TILE-Gx anyway, which doesn't have the 256-core limit since it doesn't use
> the spinlock_32.c implementation.
>
> --
> Chris Metcalf, Tilera Corp.
> http://www.tilera.com
>
>

Say core A tries to write_lock() the rwlock while current_ticket_ is 0, and
it sets next_ticket_ to 1. While it is still holding the lock, core B tries
to write_lock() as well and sets next_ticket_ to 2. Then, when A calls
write_unlock(), it sees that (current_ticket_+1) is not equal to
next_ticket_, so it increments current_ticket_, and core B gets the
lock. If core A tries to write_lock() again before core B calls
write_unlock(), it will increment next_ticket_ to 3. And so on.
This may happen only rarely; I tested it yesterday for several hours and
it held up very well under pressure.


-- 
Cyberman Wu
http://www.meganovo.com

^ permalink raw reply

* Re: [PATCH] via-rhine: hardware VLAN support
From: Jesse Gross @ 2010-11-23  2:02 UTC (permalink / raw)
  To: Roger Luethi; +Cc: netdev, David S. Miller
In-Reply-To: <20101121131756.GA18509@core.hellgate.ch>

On Sun, Nov 21, 2010 at 5:17 AM, Roger Luethi <rl@hellgate.ch> wrote:
> On Mon, 08 Nov 2010 12:53:57 -0800, Jesse Gross wrote:
>> On Mon, Nov 8, 2010 at 8:21 AM, Roger Luethi <rl@hellgate.ch> wrote:
>> > On Fri, 05 Nov 2010 11:31:56 -0700, Jesse Gross wrote:
>> > Can you point me to a driver that has been switched to use the new methods
>> > already? Is there some other form of documentation?
>>
>> bnx2 is an example of a driver that has been converted.  The commit
>> that actually made the change was
>> 7d0fd2117e3d0550d7987b3aff2bfbc0244cf7c6, which should highlight the
>> differences.  A key point is that drivers should no longer reference
>> vlan groups at all.
>
> bnx2 does not support hardware VLAN filters, but ixgbe does (converted by
> commit f62bbb5e62c6e4a91fb222d22bc46e8d4d7e59ef). ixgbe keeps a list of
> configured VLANs in a device private data structure (active_vlans). Is that
> the model to follow?

Yes, that's right.  The driver should store whatever information it
requires to manage the CAM or restore the state after a board reset.

^ permalink raw reply

* Re: possible kernel oops from user MSS
From: Li Yewang @ 2010-11-23  2:48 UTC (permalink / raw)
  To: David Miller; +Cc: mzhang, netdev
In-Reply-To: <20101112.152607.193708973.davem@davemloft.net>



At 2010-11-13 7:26, David Miller wrote:
> From: Min Zhang<mzhang@mvista.com>
> Date: Fri, 12 Nov 2010 14:59:58 -0800
>
>> Regarding commit 7a1abd08d52fdeddb3e9a5a33f2f15cc6a5674d2 ("tcp:
>> Increase TCP_MAXSEG socket option minimum"). What is the reason
>> TCP_MAXSEG minimum be 64? Isn't the exact be 40 which is
>> TCPOLEN_MD5SIG_ALIGNED(20) + TCPOLEN_TSTAMP_ALIGNED(12) + 8?
>>
>> Or is it better to use TCP_MIN_MSS from tcp.h:
>>
>> /* Minimal accepted MSS. It is (60+60+8) - (20+20). */
>> #define TCP_MIN_MSS        88U
>
> I suppose TCP_MIN_MSS would be better to use, I'll make that
> change, thanks.

   David, do you have a plan to fix this bug using TCP_MIN_MSS?




^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox