Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 01/11] netlink: add symbolic value for congested state
From: kaber @ 2011-09-03 17:26 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1315070771-18576-1-git-send-email-kaber@trash.net>

From: Patrick McHardy <kaber@trash.net>

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netlink/af_netlink.c |   18 +++++++++++-------
 1 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 0a4db02..fc63ca5 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -88,6 +88,10 @@ struct listeners {
 	unsigned long		masks[0];
 };
 
+/* state bits */
+#define NETLINK_CONGESTED	0x0
+
+/* flags */
 #define NETLINK_KERNEL_SOCKET	0x1
 #define NETLINK_RECV_PKTINFO	0x2
 #define NETLINK_BROADCAST_SEND_ERROR	0x4
@@ -737,7 +741,7 @@ static void netlink_overrun(struct sock *sk)
 	struct netlink_sock *nlk = nlk_sk(sk);
 
 	if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) {
-		if (!test_and_set_bit(0, &nlk_sk(sk)->state)) {
+		if (!test_and_set_bit(NETLINK_CONGESTED, &nlk_sk(sk)->state)) {
 			sk->sk_err = ENOBUFS;
 			sk->sk_error_report(sk);
 		}
@@ -798,7 +802,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
 	nlk = nlk_sk(sk);
 
 	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-	    test_bit(0, &nlk->state)) {
+	    test_bit(NETLINK_CONGESTED, &nlk->state)) {
 		DECLARE_WAITQUEUE(wait, current);
 		if (!*timeo) {
 			if (!ssk || netlink_is_kernel(ssk))
@@ -812,7 +816,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
 		add_wait_queue(&nlk->wait, &wait);
 
 		if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-		     test_bit(0, &nlk->state)) &&
+		     test_bit(NETLINK_CONGESTED, &nlk->state)) &&
 		    !sock_flag(sk, SOCK_DEAD))
 			*timeo = schedule_timeout(*timeo);
 
@@ -876,8 +880,8 @@ static inline void netlink_rcv_wake(struct sock *sk)
 	struct netlink_sock *nlk = nlk_sk(sk);
 
 	if (skb_queue_empty(&sk->sk_receive_queue))
-		clear_bit(0, &nlk->state);
-	if (!test_bit(0, &nlk->state))
+		clear_bit(NETLINK_CONGESTED, &nlk->state);
+	if (!test_bit(NETLINK_CONGESTED, &nlk->state))
 		wake_up_interruptible(&nlk->wait);
 }
 
@@ -958,7 +962,7 @@ static inline int netlink_broadcast_deliver(struct sock *sk,
 	struct netlink_sock *nlk = nlk_sk(sk);
 
 	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
-	    !test_bit(0, &nlk->state)) {
+	    !test_bit(NETLINK_CONGESTED, &nlk->state)) {
 		skb_set_owner_r(skb, sk);
 		skb_queue_tail(&sk->sk_receive_queue, skb);
 		sk->sk_data_ready(sk, skb->len);
@@ -1236,7 +1240,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 	case NETLINK_NO_ENOBUFS:
 		if (val) {
 			nlk->flags |= NETLINK_RECV_NO_ENOBUFS;
-			clear_bit(0, &nlk->state);
+			clear_bit(NETLINK_CONGESTED, &nlk->state);
 			wake_up_interruptible(&nlk->wait);
 		} else
 			nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS;
-- 
1.7.4.4

^ permalink raw reply related

* [PATCH 05/11] netlink: add netlink_skb_set_owner_r()
From: kaber @ 2011-09-03 17:26 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1315070771-18576-1-git-send-email-kaber@trash.net>

From: Patrick McHardy <kaber@trash.net>

For mmap'ed I/O a netlink specific skb destructor needs to be invoked after the
final kfree_skb() to clean up state in the memory mapped frames. This doesn't
work currently since the skb's ownership is transferred to the receiving
socket using skb_set_owner_r(), which orphans the skb, thereby invoking the
destructor prematurely.

Since netlink doesn't account skbs to the originating socket, there's no need
to orphan the skb. Add a netlink specific skb_set_owner_r() variant that does
not orphan the skb and use a netlink specific destructor to call sock_rfree().

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netlink/af_netlink.c |   18 ++++++++++++++++--
 1 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 7b9d7d0..1402acf 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -161,6 +161,20 @@ static void netlink_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	sk->sk_data_ready(sk, len);
 }
 
+static void netlink_skb_destructor(struct sk_buff *skb)
+{
+	sock_rfree(skb);
+}
+
+static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
+{
+	WARN_ON(skb->sk != NULL);
+	skb->sk = sk;
+	skb->destructor = netlink_skb_destructor;
+	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+	sk_mem_charge(sk, skb->truesize);
+}
+
 static void netlink_sock_destruct(struct sock *sk)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
@@ -838,7 +852,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
 		}
 		return 1;
 	}
-	skb_set_owner_r(skb, sk);
+	netlink_skb_set_owner_r(skb, sk);
 	return 0;
 }
 
@@ -900,7 +914,7 @@ static inline int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb)
 	ret = -ECONNREFUSED;
 	if (nlk->netlink_rcv != NULL) {
 		ret = skb->len;
-		skb_set_owner_r(skb, sk);
+		netlink_skb_set_owner_r(skb, sk);
 		nlk->netlink_rcv(skb);
 	}
 	kfree_skb(skb);
-- 
1.7.4.4

^ permalink raw reply related

* [PATCH 06/11] netlink: memory mapped netlink: ring setup
From: kaber @ 2011-09-03 17:26 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1315070771-18576-1-git-send-email-kaber@trash.net>

From: Patrick McHardy <kaber@trash.net>

Add support for memory mapped RX and TX ring setup and teardown based on
the af_packet.c code. The following patches will use this to add the real
memory mapped receive and transmit functionality.

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netlink.h  |   32 +++++
 net/Kconfig              |    9 ++
 net/netlink/af_netlink.c |  287 +++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 326 insertions(+), 2 deletions(-)

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 2e17c5d..969b95e 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -1,6 +1,7 @@
 #ifndef __LINUX_NETLINK_H
 #define __LINUX_NETLINK_H
 
+#include <linux/kernel.h> /* for __ALIGN_KERNEL */
 #include <linux/socket.h> /* for sa_family_t */
 #include <linux/types.h>
 
@@ -102,11 +103,42 @@ struct nlmsgerr {
 #define NETLINK_PKTINFO		3
 #define NETLINK_BROADCAST_ERROR	4
 #define NETLINK_NO_ENOBUFS	5
+#define NETLINK_RX_RING		6
+#define NETLINK_TX_RING		7
 
 struct nl_pktinfo {
 	__u32	group;
 };
 
+struct nl_mmap_req {
+	unsigned int	nm_block_size;
+	unsigned int	nm_block_nr;
+	unsigned int	nm_frame_size;
+	unsigned int	nm_frame_nr;
+};
+
+struct nl_mmap_hdr {
+	unsigned int	nm_status;
+	unsigned int	nm_len;
+	__u32		nm_group;
+	/* credentials */
+	__u32		nm_pid;
+	__u32		nm_uid;
+	__u32		nm_gid;
+};
+
+enum nl_mmap_status {
+	NL_MMAP_STATUS_UNUSED,
+	NL_MMAP_STATUS_RESERVED,
+	NL_MMAP_STATUS_VALID,
+	NL_MMAP_STATUS_COPY,
+	NL_MMAP_STATUS_SKIP,
+};
+
+#define NL_MMAP_MSG_ALIGNMENT		NLMSG_ALIGNTO
+#define NL_MMAP_MSG_ALIGN(sz)		__ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT)
+#define NL_MMAP_HDRLEN			NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr))
+
 #define NET_MAJOR 36		/* Major 36 is reserved for networking 						*/
 
 enum {
diff --git a/net/Kconfig b/net/Kconfig
index a073148..93599e0 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -23,6 +23,15 @@ menuconfig NET
 
 if NET
 
+config NETLINK_MMAP
+	bool "Netlink: mmaped I/O"
+	help
+	  This option enables support for memory mapped netlink I/O. This
+	  reduces overhead by avoiding copying data between kernel- and
+	  userspace.
+
+	  If unsure, say N.
+
 config WANT_COMPAT_NETLINK_MESSAGES
 	bool
 	help
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 1402acf..6d4db46 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -55,6 +55,7 @@
 #include <linux/types.h>
 #include <linux/audit.h>
 #include <linux/mutex.h>
+#include <linux/vmalloc.h>
 
 #include <net/net_namespace.h>
 #include <net/sock.h>
@@ -64,6 +65,20 @@
 #define NLGRPSZ(x)	(ALIGN(x, sizeof(unsigned long) * 8) / 8)
 #define NLGRPLONGS(x)	(NLGRPSZ(x)/sizeof(unsigned long))
 
+struct netlink_ring {
+	void			**pg_vec;
+	unsigned int		head;
+	unsigned int		frames_per_block;
+	unsigned int		frame_size;
+	unsigned int		frame_max;
+
+	unsigned int		pg_vec_order;
+	unsigned int		pg_vec_pages;
+	unsigned int		pg_vec_len;
+
+	atomic_t		pending;
+};
+
 struct netlink_sock {
 	/* struct sock has to be the first member of netlink_sock */
 	struct sock		sk;
@@ -81,6 +96,12 @@ struct netlink_sock {
 	struct mutex		cb_def_mutex;
 	void			(*netlink_rcv)(struct sk_buff *skb);
 	struct module		*module;
+#ifdef CONFIG_NETLINK_MMAP
+	struct mutex		pg_vec_lock;
+	struct netlink_ring	rx_ring;
+	struct netlink_ring	tx_ring;
+	atomic_t		mapped;
+#endif /* CONFIG_NETLINK_MMAP */
 };
 
 struct listeners {
@@ -153,6 +174,234 @@ static struct hlist_head *nl_pid_hashfn(struct nl_pid_hash *hash, u32 pid)
 	return &hash->table[jhash_1word(pid, hash->rnd) & hash->mask];
 }
 
+#ifdef CONFIG_NETLINK_MMAP
+static __pure struct page *pgvec_to_page(const void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		return vmalloc_to_page(addr);
+	else
+		return virt_to_page(addr);
+}
+
+static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len)
+{
+	unsigned int i;
+
+	for (i = 0; i < len; i++) {
+		if (pg_vec[i] != NULL) {
+			if (is_vmalloc_addr(pg_vec[i]))
+				vfree(pg_vec[i]);
+			else
+				free_pages((unsigned long)pg_vec[i], order);
+		}
+	}
+	kfree(pg_vec);
+}
+
+static void *alloc_one_pg_vec_page(unsigned long order)
+{
+	void *buffer;
+	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO |
+			  __GFP_NOWARN | __GFP_NORETRY;
+
+	buffer = (void *)__get_free_pages(gfp_flags, order);
+	if (buffer != NULL)
+		return buffer;
+
+	buffer = vzalloc((1 << order) * PAGE_SIZE);
+	if (buffer != NULL)
+		return buffer;
+
+	gfp_flags &= ~__GFP_NORETRY;
+	return (void *)__get_free_pages(gfp_flags, order);
+}
+
+static void **alloc_pg_vec(struct netlink_sock *nlk,
+			   struct nl_mmap_req *req, unsigned int order)
+{
+	unsigned int block_nr = req->nm_block_nr;
+	unsigned int i;
+	void **pg_vec, *ptr;
+
+	pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL);
+	if (pg_vec == NULL)
+		return NULL;
+
+	for (i = 0; i < block_nr; i++) {
+		pg_vec[i] = ptr = alloc_one_pg_vec_page(order);
+		if (pg_vec[i] == NULL)
+			goto err1;
+	}
+
+	return pg_vec;
+err1:
+	free_pg_vec(pg_vec, order, block_nr);
+	return NULL;
+}
+
+static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
+			    bool closing, bool tx_ring)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+	struct netlink_ring *ring;
+	struct sk_buff_head *queue;
+	void **pg_vec = NULL;
+	unsigned int order = 0;
+	int err;
+
+	ring  = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
+	queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
+
+	if (!closing) {
+		if (atomic_read(&nlk->mapped))
+			return -EBUSY;
+		if (atomic_read(&ring->pending))
+			return -EBUSY;
+	}
+
+	if (req->nm_block_nr) {
+		if (ring->pg_vec != NULL)
+			return -EBUSY;
+
+		if ((int)req->nm_block_size <= 0)
+			return -EINVAL;
+		if (!IS_ALIGNED(req->nm_block_size, PAGE_SIZE))
+			return -EINVAL;
+		if (req->nm_frame_size < NL_MMAP_HDRLEN)
+			return -EINVAL;
+		if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT))
+			return -EINVAL;
+
+		ring->frames_per_block = req->nm_block_size /
+					 req->nm_frame_size;
+		if (ring->frames_per_block == 0)
+			return -EINVAL;
+		if (ring->frames_per_block * req->nm_block_nr !=
+		    req->nm_frame_nr)
+			return -EINVAL;
+
+		order = get_order(req->nm_block_size);
+		pg_vec = alloc_pg_vec(nlk, req, order);
+		if (pg_vec == NULL)
+			return -ENOMEM;
+	} else {
+		if (req->nm_frame_nr)
+			return -EINVAL;
+	}
+
+	err = -EBUSY;
+	mutex_lock(&nlk->pg_vec_lock);
+	if (closing || atomic_read(&nlk->mapped) == 0) {
+		err = 0;
+		spin_lock_bh(&queue->lock);
+
+		ring->frame_max		= req->nm_frame_nr - 1;
+		ring->head		= 0;
+		ring->frame_size	= req->nm_frame_size;
+		ring->pg_vec_pages	= req->nm_block_size / PAGE_SIZE;
+
+		swap(ring->pg_vec_len, req->nm_block_nr);
+		swap(ring->pg_vec_order, order);
+		swap(ring->pg_vec, pg_vec);
+
+		__skb_queue_purge(queue);
+		spin_unlock_bh(&queue->lock);
+
+		WARN_ON(atomic_read(&nlk->mapped));
+	}
+	mutex_unlock(&nlk->pg_vec_lock);
+
+	if (pg_vec)
+		free_pg_vec(pg_vec, order, req->nm_block_nr);
+	return err;
+}
+
+static void netlink_mm_open(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	struct socket *sock = file->private_data;
+	struct sock *sk = sock->sk;
+
+	if (sk)
+		atomic_inc(&nlk_sk(sk)->mapped);
+}
+
+static void netlink_mm_close(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	struct socket *sock = file->private_data;
+	struct sock *sk = sock->sk;
+
+	if (sk)
+		atomic_dec(&nlk_sk(sk)->mapped);
+}
+
+static const struct vm_operations_struct netlink_mmap_ops = {
+	.open	= netlink_mm_open,
+	.close	= netlink_mm_close,
+};
+
+static int netlink_mmap(struct file *file, struct socket *sock,
+			struct vm_area_struct *vma)
+{
+	struct sock *sk = sock->sk;
+	struct netlink_sock *nlk = nlk_sk(sk);
+	struct netlink_ring *ring;
+	unsigned long start, size, expected;
+	unsigned int i;
+	int err = -EINVAL;	/* result if no ring is set up or size mismatches */
+
+	if (vma->vm_pgoff)
+		return -EINVAL;
+
+	mutex_lock(&nlk->pg_vec_lock);
+
+	expected = 0;		/* total byte size of all configured rings */
+	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
+		if (ring->pg_vec == NULL)
+			continue;
+		expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE;
+	}
+
+	if (expected == 0)
+		goto out;
+
+	size = vma->vm_end - vma->vm_start;
+	if (size != expected)
+		goto out;
+
+	start = vma->vm_start;	/* RX ring pages are inserted first, then TX */
+	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
+		if (ring->pg_vec == NULL)
+			continue;
+
+		for (i = 0; i < ring->pg_vec_len; i++) {
+			struct page *page;
+			void *kaddr = ring->pg_vec[i];
+			unsigned int pg_num;
+
+			for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) {
+				page = pgvec_to_page(kaddr);
+				err = vm_insert_page(vma, start, page);
+				if (err < 0)
+					goto out;
+				start += PAGE_SIZE;
+				kaddr += PAGE_SIZE;
+			}
+		}
+	}
+
+	atomic_inc(&nlk->mapped);
+	vma->vm_ops = &netlink_mmap_ops;
+	err = 0;
+out:
+	mutex_unlock(&nlk->pg_vec_lock);
+	return err;	/* 0, -EINVAL or the vm_insert_page() error */
+}
+#else /* CONFIG_NETLINK_MMAP */
+#define netlink_mmap			sock_no_mmap
+#endif /* CONFIG_NETLINK_MMAP */
+
 static void netlink_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	unsigned int len = skb->len;
@@ -186,6 +435,17 @@ static void netlink_sock_destruct(struct sock *sk)
 	}
 
 	skb_queue_purge(&sk->sk_receive_queue);
+#ifdef CONFIG_NETLINK_MMAP
+	if (1) {
+		struct nl_mmap_req req;
+
+		memset(&req, 0, sizeof(req));
+		if (nlk->rx_ring.pg_vec)
+			netlink_set_ring(sk, &req, true, false);
+		if (nlk->tx_ring.pg_vec)
+			netlink_set_ring(sk, &req, true, true);
+	}
+#endif /* CONFIG_NETLINK_MMAP */
 
 	if (!sock_flag(sk, SOCK_DEAD)) {
 		printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
@@ -448,6 +708,9 @@ static int __netlink_create(struct net *net, struct socket *sock,
 		mutex_init(nlk->cb_mutex);
 	}
 	init_waitqueue_head(&nlk->wait);
+#ifdef CONFIG_NETLINK_MMAP
+	mutex_init(&nlk->pg_vec_lock);
+#endif
 
 	sk->sk_destruct = netlink_sock_destruct;
 	sk->sk_protocol = protocol;
@@ -1222,7 +1485,8 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 	if (level != SOL_NETLINK)
 		return -ENOPROTOOPT;
 
-	if (optlen >= sizeof(int) &&
+	if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING &&
+	    optlen >= sizeof(int) &&
 	    get_user(val, (unsigned int __user *)optval))
 		return -EFAULT;
 
@@ -1266,6 +1530,25 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 			nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS;
 		err = 0;
 		break;
+#ifdef CONFIG_NETLINK_MMAP
+	case NETLINK_RX_RING:
+	case NETLINK_TX_RING: {
+		struct nl_mmap_req req;
+
+		/* Rings might consume more memory than queue limits, require
+		 * CAP_NET_ADMIN.
+		 */
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		if (optlen < sizeof(req))
+			return -EINVAL;
+		if (copy_from_user(&req, optval, sizeof(req)))
+			return -EFAULT;
+		err = netlink_set_ring(sk, &req, false,
+				       optname == NETLINK_TX_RING);
+		break;
+	}
+#endif /* CONFIG_NETLINK_MMAP */
 	default:
 		err = -ENOPROTOOPT;
 	}
@@ -2081,7 +2364,7 @@ static const struct proto_ops netlink_ops = {
 	.getsockopt =	netlink_getsockopt,
 	.sendmsg =	netlink_sendmsg,
 	.recvmsg =	netlink_recvmsg,
-	.mmap =		sock_no_mmap,
+	.mmap =		netlink_mmap,
 	.sendpage =	sock_no_sendpage,
 };
 
-- 
1.7.4.4

^ permalink raw reply related

* [PATCH 08/11] netlink: implement memory mapped sendmsg()
From: kaber @ 2011-09-03 17:26 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1315070771-18576-1-git-send-email-kaber@trash.net>

From: Patrick McHardy <kaber@trash.net>

Add support for memory mapped sendmsg() to netlink. Userspace queues the
frames to be processed into the TX ring and invokes sendmsg with
msg.iov.iov_base = NULL to trigger processing of all pending messages.

Since the kernel usually performs full message validation before beginning
processing, userspace must be prevented from modifying the message
contents while the kernel is processing them. In order to do so, the
frames' contents are copied to an allocated skb in case the ring is
mapped more than once or the file descriptor is shared (e.g. through
AF_UNIX file descriptor passing).

Otherwise an skb without a data area is allocated, the data pointer set
to point to the data area of the ring frame and the skb is processed.
Once the skb is freed, the destructor releases the frame back to userspace
by setting the status to NL_MMAP_STATUS_UNUSED.

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netlink/af_netlink.c |  129 +++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 123 insertions(+), 6 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 229bc03..9b6400f 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -182,6 +182,11 @@ static bool netlink_skb_is_mmaped(const struct sk_buff *skb)
 	return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED;
 }
 
+static bool netlink_tx_is_mmaped(struct sock *sk)
+{
+	return nlk_sk(sk)->tx_ring.pg_vec != NULL;
+}
+
 static __pure struct page *pgvec_to_page(const void *addr)
 {
 	if (is_vmalloc_addr(addr))
@@ -548,10 +553,108 @@ void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk,
 	NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED;
 	NETLINK_CB(skb).sk = sk;
 }
+
+static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg,
+				u32 dst_pid, u32 dst_group,
+				struct sock_iocb *siocb)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+	struct netlink_ring *ring;
+	struct nl_mmap_hdr *hdr;
+	struct sk_buff *skb;
+	unsigned int maxlen;
+	bool excl = true;
+	int err = 0, len = 0;
+
+	/* Netlink messages are validated by the receiver before processing.
+	 * In order to avoid userspace changing the contents of the message
+	 * after validation, the socket and the ring may only be used by a
+	 * single process, otherwise we fall back to copying.
+	 */
+	if (atomic_long_read(&sk->sk_socket->file->f_count) > 2 ||
+	    atomic_read(&nlk->mapped) > 1)
+		excl = false;
+
+	mutex_lock(&nlk->pg_vec_lock);
+
+	ring   = &nlk->tx_ring;
+	maxlen = ring->frame_size - NL_MMAP_HDRLEN;
+
+	do {
+		hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID);
+		if (hdr == NULL) {
+			if (!(msg->msg_flags & MSG_DONTWAIT) &&
+			    atomic_read(&nlk->tx_ring.pending))
+				schedule();
+			continue;
+		}
+		if (hdr->nm_len > maxlen) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		netlink_frame_flush_dcache(hdr);
+
+		if (likely(dst_pid == 0 && dst_group == 0 && excl)) {
+			skb = alloc_skb_head(GFP_KERNEL);
+			if (skb == NULL) {
+				err = -ENOBUFS;
+				goto out;
+			}
+			sock_hold(sk);
+			netlink_ring_setup_skb(skb, sk, ring, hdr);
+			NETLINK_CB(skb).flags |= NETLINK_SKB_TX;
+			__skb_put(skb, hdr->nm_len);
+			netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
+			atomic_inc(&ring->pending);
+		} else {
+			skb = alloc_skb(hdr->nm_len, GFP_KERNEL);
+			if (skb == NULL) {
+				err = -ENOBUFS;
+				goto out;
+			}
+			__skb_put(skb, hdr->nm_len);
+			memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, hdr->nm_len);
+			netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
+		}
+
+		netlink_increment_head(ring);
+
+		NETLINK_CB(skb).pid	  = nlk->pid;
+		NETLINK_CB(skb).dst_group = dst_group;
+		NETLINK_CB(skb).creds	  = siocb->scm->creds;
+
+		err = security_netlink_send(sk, skb);
+		if (err) {
+			kfree_skb(skb);
+			goto out;
+		}
+
+		if (unlikely(dst_group)) {
+			atomic_inc(&skb->users);
+			netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL);
+		}
+		err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags & MSG_DONTWAIT);
+		if (err < 0)
+			goto out;
+		len += err;
+
+	} while (hdr != NULL ||
+		 (!(msg->msg_flags & MSG_DONTWAIT) &&
+		  atomic_read(&nlk->tx_ring.pending)));
+
+	if (len > 0)
+		err = len;
+out:
+	mutex_unlock(&nlk->pg_vec_lock);
+	return err;
+}
 #else /* CONFIG_NETLINK_MMAP */
 #define netlink_skb_is_mmaped(skb)	false
+#define netlink_tx_is_mmaped(sk)	false
 #define netlink_mmap			sock_no_mmap
 #define netlink_poll			datagram_poll
+#define netlink_mmap_sendmsg(sk, msg, dst_pid, dst_group, siocb)	0
 #endif /* CONFIG_NETLINK_MMAP */
 
 static void netlink_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
@@ -578,11 +681,16 @@ static void netlink_skb_destructor(struct sk_buff *skb)
 		hdr = netlink_mmap_hdr(skb);
 		sk = NETLINK_CB(skb).sk;
 
-		if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
-			hdr->nm_len = 0;
-			netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
+		if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) {
+			netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
+			ring = &nlk_sk(sk)->tx_ring;
+		} else {
+			if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
+				hdr->nm_len = 0;
+				netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
+			}
+			ring = &nlk_sk(sk)->rx_ring;
 		}
-		ring = &nlk_sk(sk)->rx_ring;
 
 		WARN_ON(atomic_read(&ring->pending) == 0);
 		atomic_dec(&ring->pending);
@@ -1266,8 +1374,9 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
 
 	nlk = nlk_sk(sk);
 
-	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-	    test_bit(NETLINK_CONGESTED, &nlk->state)) {
+	if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+	     test_bit(NETLINK_CONGESTED, &nlk->state)) &&
+	    !netlink_skb_is_mmaped(skb)) {
 		DECLARE_WAITQUEUE(wait, current);
 		if (!*timeo) {
 			if (!ssk || netlink_is_kernel(ssk))
@@ -1320,6 +1429,8 @@ static inline struct sk_buff *netlink_trim(struct sk_buff *skb,
 	int delta;
 
 	WARN_ON(skb->sk != NULL);
+	if (netlink_skb_is_mmaped(skb))
+		return skb;
 
 	delta = skb->end - skb->tail;
 	if (delta * 2 < skb->truesize)
@@ -1839,6 +1950,12 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 			goto out;
 	}
 
+	if (netlink_tx_is_mmaped(sk) &&
+	    msg->msg_iov->iov_base == NULL) {
+		err = netlink_mmap_sendmsg(sk, msg, dst_pid, dst_group, siocb);
+		goto out;
+	}
+
 	err = -EMSGSIZE;
 	if (len > sk->sk_sndbuf - 32)
 		goto out;
-- 
1.7.4.4

^ permalink raw reply related

* [PATCH 03/11] netlink: add helper function for queueing skbs to the receive queue
From: kaber @ 2011-09-03 17:26 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1315070771-18576-1-git-send-email-kaber@trash.net>

From: Patrick McHardy <kaber@trash.net>

Consolidate skb receive queue code to allow overloading it for memory
mapped sockets.

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netlink/af_netlink.c |   26 ++++++++++++++------------
 1 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index fc63ca5..a9f876b 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -153,6 +153,14 @@ static struct hlist_head *nl_pid_hashfn(struct nl_pid_hash *hash, u32 pid)
 	return &hash->table[jhash_1word(pid, hash->rnd) & hash->mask];
 }
 
+static void netlink_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	unsigned int len = skb->len;
+
+	skb_queue_tail(&sk->sk_receive_queue, skb);
+	sk->sk_data_ready(sk, len);
+}
+
 static void netlink_sock_destruct(struct sock *sk)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
@@ -838,8 +846,7 @@ int netlink_sendskb(struct sock *sk, struct sk_buff *skb)
 {
 	int len = skb->len;
 
-	skb_queue_tail(&sk->sk_receive_queue, skb);
-	sk->sk_data_ready(sk, len);
+	netlink_queue_rcv_skb(sk, skb);
 	sock_put(sk);
 	return len;
 }
@@ -964,8 +971,7 @@ static inline int netlink_broadcast_deliver(struct sock *sk,
 	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
 	    !test_bit(NETLINK_CONGESTED, &nlk->state)) {
 		skb_set_owner_r(skb, sk);
-		skb_queue_tail(&sk->sk_receive_queue, skb);
-		sk->sk_data_ready(sk, skb->len);
+		netlink_queue_rcv_skb(sk, skb);
 		return atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf;
 	}
 	return -1;
@@ -1689,10 +1695,8 @@ static int netlink_dump(struct sock *sk)
 
 		if (sk_filter(sk, skb))
 			kfree_skb(skb);
-		else {
-			skb_queue_tail(&sk->sk_receive_queue, skb);
-			sk->sk_data_ready(sk, skb->len);
-		}
+		else
+			netlink_queue_rcv_skb(sk, skb);
 		return 0;
 	}
 
@@ -1706,10 +1710,8 @@ static int netlink_dump(struct sock *sk)
 
 	if (sk_filter(sk, skb))
 		kfree_skb(skb);
-	else {
-		skb_queue_tail(&sk->sk_receive_queue, skb);
-		sk->sk_data_ready(sk, skb->len);
-	}
+	else
+		netlink_queue_rcv_skb(sk, skb);
 
 	if (cb->done)
 		cb->done(cb);
-- 
1.7.4.4

^ permalink raw reply related

* [PATCH 02/11] net: add function to allocate skbuff head without data area
From: kaber @ 2011-09-03 17:26 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1315070771-18576-1-git-send-email-kaber@trash.net>

From: Patrick McHardy <kaber@trash.net>

Add a function to allocate a skbuff head without any data. This will be
used by memory mapped netlink to attach data from the mmaped area to the
skb.

Additionally change skb_release_all() to check whether the skb has a
data area to allow the skb destructor to clear the data pointer in
case only a head has been allocated.

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/skbuff.h |    6 ++++++
 net/core/skbuff.c      |   31 ++++++++++++++++++++++++++++++-
 2 files changed, 36 insertions(+), 1 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7b996ed..8cfc285 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -521,6 +521,12 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
 	return __alloc_skb(size, priority, 1, NUMA_NO_NODE);
 }
 
+extern struct sk_buff *__alloc_skb_head(gfp_t priority, int node);
+static inline struct sk_buff *alloc_skb_head(gfp_t priority)
+{
+	return __alloc_skb_head(priority, -1);
+}
+
 extern bool skb_recycle_check(struct sk_buff *skb, int skb_size);
 
 extern struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2beda82..d632de9 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -153,6 +153,34 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
  *
  */
 
+struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node)
+{
+	struct sk_buff *skb;
+
+	/* Get the HEAD */
+	skb = kmem_cache_alloc_node(skbuff_head_cache,
+				    gfp_mask & ~__GFP_DMA, node);
+	if (!skb)
+		goto out;
+	prefetchw(skb);
+
+	/*
+	 * Only clear those fields we need to clear, not those that we will
+	 * actually initialise below. Hence, don't put any more fields after
+	 * the tail pointer in struct sk_buff!
+	 */
+	memset(skb, 0, offsetof(struct sk_buff, tail));
+	skb->data = NULL;
+	skb->truesize = sizeof(struct sk_buff);
+	atomic_set(&skb->users, 1);
+
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+	skb->mac_header = ~0U;
+#endif
+out:
+	return skb;
+}
+
 /**
  *	__alloc_skb	-	allocate a network buffer
  *	@size: size to allocate
@@ -414,7 +442,8 @@ static void skb_release_head_state(struct sk_buff *skb)
 static void skb_release_all(struct sk_buff *skb)
 {
 	skb_release_head_state(skb);
-	skb_release_data(skb);
+	if (likely(skb->data))
+		skb_release_data(skb);
 }
 
 /**
-- 
1.7.4.4

^ permalink raw reply related

* [PATCH RFC 0/11] netlink: memory mapped I/O
From: kaber @ 2011-09-03 17:26 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev

The following RFC patches contain an implementation of memory mapped I/O
for netlink. The implementation is modelled after AF_PACKET memory mapped
I/O with a few differences:

- In order to perform memory mapped I/O to userspace, the kernel allocates
  skbs with the data area pointing to the data area of the mapped frames.
  All netlink subsystems assume a linear data area, so for the sake of
  simplicity, the mapped data area is not attached to the paged area but
  to skb->data. This requires introduction of a special skb allocation
  function that just allocates an skb head without the data area. Since this
  is a quite rare use case, I introduced a new function based on __alloc_skb
  instead of splitting it up into head and data allocation. The alternative
  would be to introduce an __alloc_skb_head and __alloc_skb_data function,
  which would actually be useful for a specific error case in memory mapped
  netlink, but would require a couple of extra instructions for the common
  skb allocation case, so it doesn't really seem worth it.

  In order to get the destination memory area for skb->data before message
  construction, memory mapped netlink I/O needs to look up the destination
  socket during allocation instead of during transmission because the
  ring is owned by the receiving socket/process. A special skb allocation
  function (netlink_alloc_skb) taking the destination pid as an argument is
  used for this, all subsystems that want to support memory mapped I/O need
  to use this function, automatic fallback to the receive queue happens
  for unconverted subsystems. Dumps automatically use memory mapped I/O if
  the receiving socket has enabled it.

  The visible effect of looking up the destination socket during allocation
  instead of transmission is that message ordering in userspace might
  change in case allocation and transmission aren't performed atomically.
  This usually doesn't matter since most subsystems have a BKL-like lock
  like the rtnl mutex. To my knowledge the only currently existing case
  where it might matter is nfnetlink_queue combined with the recently
  introduced batched verdicts, but a) that subsystem already includes
  sequence numbers which allow userspace to reorder messages in case it
  cares to, and the reordering window is quite small, and b) with memory
  mapped transmission batching can be performed in a subsystem independent
  manner.

- AF_NETLINK contains flow control for database dumps. With regular I/O,
  dump continuations are triggered based on the socket's receive queue space
  and by recvmsg() calls. Since with memory mapped I/O there are no
  recvmsg() calls under normal operation, this is done in netlink_poll(),
  under the assumption that userspace has processed all pending frames
  before invoking poll(), thus the ring is expected to have room for new
  messages. Dumps currently don't benefit as much as they could from
  memory mapped I/O because each single continuation requires a poll()
  call. A more aggressive approach seems like a good idea to me, especially
  in case the socket is not subscribed to any multicast groups (IOW only
  receiving explicitly requested data).

Besides that, the memory mapped netlink implementation extends the states
defined by AF_PACKET between userspace and the kernel by a SKIP status, this
is intended for the case that userspace wants to queue frames (specifically
when using nfnetlink_queue, an IDS and stream reassembly, requested by
Eric Leblond) for a longer period of time. The kernel skips over all frames
marked with SKIP when looking for unused frames and only fails when not finding
a free frame or when having skipped the entire ring.

Also noteworthy is memory mapped sendmsg: the kernel performs validation
of messages before accepting and processing them. In order to prevent
userspace from changing the message contents after validation, the
kernel checks that the ring is only mapped once and the file descriptor
is not shared (in order to avoid having userspace set up another mapping
after the first mentioned check). If either of the two is not true, the
message is copied to an allocated skb and processed as with regular I/O.
I'd especially appreciate review of this part since I'm not really versed
in memory, file and process management.

The remaining interesting details are included in the changelogs of the
individual patches and the documentation, so I won't repeat them here.

As an example, nfnetlink_queue is converted to support memory mapped
I/O. Other subsystems that would probably benefit are nfnetlink_log,
audit and maybe ISCSI, not sure. Since I don't own sufficiently powerful
hardware for real testing, my testcases were based on iperf over loopback.
Depending on the MTU, the latest patchset shows a 900% improvement for
an MTU of 1500 and a roughly 300% improvement for an MTU of 15000.

Jesper is taking benchmarks on real hardware sometime soon, for now I'd
just like to get the basic concept reviewed. An example implementation
for userspace-queueing is available at:

git://git.kernel.org/pub/scm/linux/kernel/git/kaber/libmnl-mmap.git

once master.kernel.org is up again. My git tree of the kernel parts
is not up to date anymore, the latest patches show way better
performance in my limited test setup.

Comments and review highly welcome!

Cheers,
Patrick

^ permalink raw reply

* [PATCH 11/11] nfnetlink: add support for memory mapped netlink
From: kaber @ 2011-09-03 17:26 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1315070771-18576-1-git-send-email-kaber@trash.net>

From: Patrick McHardy <kaber@trash.net>

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/nfnetlink.h |    2 ++
 net/netfilter/nfnetlink.c           |    7 +++++++
 net/netfilter/nfnetlink_log.c       |    9 +++++----
 net/netfilter/nfnetlink_queue.c     |    2 +-
 4 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index 74d3386..07b48cf 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -78,6 +78,8 @@ extern int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n);
 extern int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n);
 
 extern int nfnetlink_has_listeners(struct net *net, unsigned int group);
+extern struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size,
+					   u32 dst_pid, gfp_t gfp_mask);
 extern int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group,
 			  int echo, gfp_t flags);
 extern int nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error);
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 1905976..de0d9b3 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -103,6 +103,13 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group)
 }
 EXPORT_SYMBOL_GPL(nfnetlink_has_listeners);
 
+struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size,
+				    u32 dst_pid, gfp_t gfp_mask)
+{
+	return netlink_alloc_skb(net->nfnl, size, dst_pid, gfp_mask);
+}
+EXPORT_SYMBOL_GPL(nfnetlink_alloc_skb);
+
 int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid,
 		   unsigned group, int echo, gfp_t flags)
 {
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 2d8158a..c37fb0c 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -296,7 +296,7 @@ nfulnl_set_flags(struct nfulnl_instance *inst, u_int16_t flags)
 }
 
 static struct sk_buff *
-nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size)
+nfulnl_alloc_skb(u32 peer_pid, unsigned int inst_size, unsigned int pkt_size)
 {
 	struct sk_buff *skb;
 	unsigned int n;
@@ -305,7 +305,7 @@ nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size)
 	 * message.  WARNING: has to be <= 128k due to slab restrictions */
 
 	n = max(inst_size, pkt_size);
-	skb = alloc_skb(n, GFP_ATOMIC);
+	skb = nfnetlink_alloc_skb(&init_net, n, peer_pid, GFP_ATOMIC);
 	if (!skb) {
 		pr_notice("nfnetlink_log: can't alloc whole buffer (%u bytes)\n",
 			inst_size);
@@ -314,7 +314,8 @@ nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size)
 			/* try to allocate only as much as we need for current
 			 * packet */
 
-			skb = alloc_skb(pkt_size, GFP_ATOMIC);
+			skb = nfnetlink_alloc_skb(&init_net, pkt_size, peer_pid,
+						  GFP_ATOMIC);
 			if (!skb)
 				pr_err("nfnetlink_log: can't even alloc %u "
 				       "bytes\n", pkt_size);
@@ -642,7 +643,7 @@ nfulnl_log_packet(u_int8_t pf,
 	}
 
 	if (!inst->skb) {
-		inst->skb = nfulnl_alloc_skb(inst->nlbufsiz, size);
+		inst->skb = nfulnl_alloc_skb(inst->peer_pid, inst->nlbufsiz, size);
 		if (!inst->skb)
 			goto alloc_failure;
 	}
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 00bd475..2bb29a3 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -266,7 +266,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 	}
 
 
-	skb = alloc_skb(size, GFP_ATOMIC);
+	skb = nfnetlink_alloc_skb(&init_net, size, queue->peer_pid, GFP_ATOMIC);
 	if (!skb)
 		goto nlmsg_failure;
 
-- 
1.7.4.4


^ permalink raw reply related

* [PATCH 07/11] netlink: add memory mapped netlink helper functions
From: kaber @ 2011-09-03 17:26 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1315070771-18576-1-git-send-email-kaber@trash.net>

From: Patrick McHardy <kaber@trash.net>

Add helper functions for looking up memory mapped frame headers, reading and
writing their status, setting up skbs with memory mapped data areas and
cleaning up state again and a poll function.

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netlink.h  |    8 ++
 net/netlink/af_netlink.c |  184 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 190 insertions(+), 2 deletions(-)

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 969b95e..955adc1 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -190,10 +190,18 @@ static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb)
 	return (struct nlmsghdr *)skb->data;
 }
 
+enum netlink_skb_flags {
+	NETLINK_SKB_MMAPED	= 0x1,		/* Packet data is mmapped */
+	NETLINK_SKB_TX		= 0x2,		/* Packet was sent by userspace */
+	NETLINK_SKB_DELIVERED	= 0x4,		/* Packet was delivered */
+};
+
 struct netlink_skb_parms {
 	struct ucred		creds;		/* Skb credentials	*/
 	__u32			pid;
 	__u32			dst_group;
+	__u32			flags;
+	struct sock		*sk;		/* socket owning mmaped ring */
 };
 
 #define NETLINK_CB(skb)		(*(struct netlink_skb_parms*)&((skb)->cb))
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 6d4db46..229bc03 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -56,6 +56,7 @@
 #include <linux/audit.h>
 #include <linux/mutex.h>
 #include <linux/vmalloc.h>
+#include <asm/cacheflush.h>
 
 #include <net/net_namespace.h>
 #include <net/sock.h>
@@ -158,6 +159,7 @@ static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);
 
 static int netlink_dump(struct sock *sk);
 static void netlink_destroy_callback(struct netlink_callback *cb);
+static void netlink_skb_destructor(struct sk_buff *skb);
 
 static DEFINE_RWLOCK(nl_table_lock);
 static atomic_t nl_table_users = ATOMIC_INIT(0);
@@ -175,6 +177,11 @@ static struct hlist_head *nl_pid_hashfn(struct nl_pid_hash *hash, u32 pid)
 }
 
 #ifdef CONFIG_NETLINK_MMAP
+static bool netlink_skb_is_mmaped(const struct sk_buff *skb)
+{
+	return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED;
+}
+
 static __pure struct page *pgvec_to_page(const void *addr)
 {
 	if (is_vmalloc_addr(addr))
@@ -398,8 +405,153 @@ out:
 	mutex_unlock(&nlk->pg_vec_lock);
 	return 0;
 }
+
+static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr)
+{
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
+	struct page *p_start, *p_end;
+
+	/* First page is flushed through netlink_{get,set}_status */
+	p_start = pgvec_to_page(hdr + PAGE_SIZE);
+	p_end   = pgvec_to_page((void *)hdr + NL_MMAP_MSG_HDRLEN + hdr->nm_len - 1);
+	while (p_start <= p_end) {
+		flush_dcache_page(p_start);
+		p_start++;
+	}
+#endif
+}
+
+static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr)
+{
+	smp_rmb();
+	flush_dcache_page(pgvec_to_page(hdr));
+	return hdr->nm_status;
+}
+
+static void netlink_set_status(struct nl_mmap_hdr *hdr,
+			       enum nl_mmap_status status)
+{
+	hdr->nm_status = status;
+	flush_dcache_page(pgvec_to_page(hdr));
+	smp_wmb();
+}
+
+static struct nl_mmap_hdr *
+__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos)
+{
+	unsigned int pg_vec_pos, frame_off;
+
+	pg_vec_pos = pos / ring->frames_per_block;
+	frame_off  = pos % ring->frames_per_block;
+
+	return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size);
+}
+
+static struct nl_mmap_hdr *
+netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos,
+		     enum nl_mmap_status status)
+{
+	struct nl_mmap_hdr *hdr;
+
+	hdr = __netlink_lookup_frame(ring, pos);
+	if (netlink_get_status(hdr) != status)
+		return NULL;
+
+	return hdr;
+}
+
+static struct nl_mmap_hdr *
+netlink_current_frame(const struct netlink_ring *ring,
+		      enum nl_mmap_status status)
+{
+	return netlink_lookup_frame(ring, ring->head, status);
+}
+
+static struct nl_mmap_hdr *
+netlink_previous_frame(const struct netlink_ring *ring,
+		       enum nl_mmap_status status)
+{
+	unsigned int prev;
+
+	prev = ring->head ? ring->head - 1 : ring->frame_max;
+	return netlink_lookup_frame(ring, prev, status);
+}
+
+static void netlink_increment_head(struct netlink_ring *ring)
+{
+	ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0;
+}
+
+static void netlink_forward_ring(struct netlink_ring *ring)
+{
+	unsigned int head = ring->head, pos = head;
+	const struct nl_mmap_hdr *hdr;
+
+	do {
+		hdr = __netlink_lookup_frame(ring, pos);
+		if (hdr->nm_status == NL_MMAP_STATUS_UNUSED)
+			break;
+		if (hdr->nm_status != NL_MMAP_STATUS_SKIP)
+			break;
+		netlink_increment_head(ring);
+	} while (ring->head != head);
+}
+
+static unsigned int netlink_poll(struct file *file, struct socket *sock,
+				 poll_table *wait)
+{
+	struct sock *sk = sock->sk;
+	struct netlink_sock *nlk = nlk_sk(sk);
+	unsigned int mask;
+
+	mask = datagram_poll(file, sock, wait);
+
+	spin_lock_bh(&sk->sk_receive_queue.lock);
+	if (nlk->rx_ring.pg_vec) {
+		netlink_forward_ring(&nlk->rx_ring);
+		if (!netlink_previous_frame(&nlk->rx_ring, NL_MMAP_STATUS_UNUSED))
+			mask |= POLLIN | POLLRDNORM;
+	}
+	spin_unlock_bh(&sk->sk_receive_queue.lock);
+
+	spin_lock_bh(&sk->sk_write_queue.lock);
+	if (nlk->tx_ring.pg_vec) {
+		if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED))
+			mask |= POLLOUT | POLLWRNORM;
+	}
+	spin_unlock_bh(&sk->sk_write_queue.lock);
+
+	return mask;
+}
+
+static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb)
+{
+	return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN);
+}
+
+void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk,
+			    struct netlink_ring *ring, struct nl_mmap_hdr *hdr)
+{
+	unsigned int size;
+	void *data;
+
+	size = ring->frame_size - NL_MMAP_HDRLEN;
+	data = (void *)hdr + NL_MMAP_HDRLEN;
+
+	skb->head	= data;
+	skb->data	= data;
+	skb_reset_tail_pointer(skb);
+	skb->end	= skb->tail + size;
+	skb->len	= 0;
+
+	skb->destructor	= netlink_skb_destructor;
+	NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED;
+	NETLINK_CB(skb).sk = sk;
+}
 #else /* CONFIG_NETLINK_MMAP */
+#define netlink_skb_is_mmaped(skb)	false
 #define netlink_mmap			sock_no_mmap
+#define netlink_poll			datagram_poll
 #endif /* CONFIG_NETLINK_MMAP */
 
 static void netlink_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
@@ -412,7 +564,35 @@ static void netlink_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 
 static void netlink_skb_destructor(struct sk_buff *skb)
 {
-	sock_rfree(skb);
+#ifdef CONFIG_NETLINK_MMAP
+	struct nl_mmap_hdr *hdr;
+	struct netlink_ring *ring;
+	struct sock *sk;
+
+	/* If a packet from the kernel to userspace was freed because of an
+	 * error without being delivered to userspace, the kernel must reset
+	 * the status. In the direction userspace to kernel, the status is
+	 * always reset here after the packet was processed and freed.
+	 */
+	if (netlink_skb_is_mmaped(skb)) {
+		hdr = netlink_mmap_hdr(skb);
+		sk = NETLINK_CB(skb).sk;
+
+		if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
+			hdr->nm_len = 0;
+			netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
+		}
+		ring = &nlk_sk(sk)->rx_ring;
+
+		WARN_ON(atomic_read(&ring->pending) == 0);
+		atomic_dec(&ring->pending);
+		sock_put(sk);
+
+		skb->data = NULL;
+	}
+#endif
+	if (skb->sk != NULL)
+		sock_rfree(skb);
 }
 
 static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
@@ -2356,7 +2536,7 @@ static const struct proto_ops netlink_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	netlink_getname,
-	.poll =		datagram_poll,
+	.poll =		netlink_poll,
 	.ioctl =	sock_no_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
-- 
1.7.4.4


^ permalink raw reply related

* [PATCH 04/11] netlink: don't orphan skb in netlink_trim()
From: kaber @ 2011-09-03 17:26 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1315070771-18576-1-git-send-email-kaber@trash.net>

From: Patrick McHardy <kaber@trash.net>

Netlink doesn't account skbs to the sending socket, so there's no
need to orphan the skb before trimming it.

Removing the skb_orphan() call is required for mmap'ed netlink, which uses
a netlink specific skb destructor that must not be invoked before the
final freeing of the skb.

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netlink/af_netlink.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index a9f876b..7b9d7d0 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -862,7 +862,7 @@ static inline struct sk_buff *netlink_trim(struct sk_buff *skb,
 {
 	int delta;
 
-	skb_orphan(skb);
+	WARN_ON(skb->sk != NULL);
 
 	delta = skb->end - skb->tail;
 	if (delta * 2 < skb->truesize)
-- 
1.7.4.4


^ permalink raw reply related

* Re: [PATCH] usbnet: ignore get interface retval of -EINPROGRESS
From: Oliver Neukum @ 2011-09-03 16:37 UTC (permalink / raw)
  To: Jim Wylder; +Cc: netdev
In-Reply-To: <CAPopfEUUaEChuh1mdZnHHT_9uRfsQmwMu7=gjnYr=AmFWik2HA@mail.gmail.com>

Am Samstag, 3. September 2011, 16:21:00 schrieb Jim Wylder:
> When calling pm_runtime_get, usb_autopm_get_interface_async
> treats a return value of -EINPROGRESS as a success and
> increments the usage count.  Since the interface is resuming,
> it is safe for usbnet_start_xmit to submit the urb.

usbnet_start_xmit() is exported, so simply stating that it is
called when the interface is resuming isn't enough. It seems to
me that the later check for DEV_ASLEEP will save us, but have
you checked for that?

	Regards
		Oliver

^ permalink raw reply

* [PATCH -next] unix stream: Fix use-after-free crashes
From: Yan, Zheng @ 2011-09-03 16:25 UTC (permalink / raw)
  To: netdev; +Cc: davem, sfr, tim.c.chen, jirislaby, sedat.dilek

Commit 0856a30409 (Scm: Remove unnecessary pid & credential references
in Unix socket's send and receive path) introduced a use-after-free bug.
It happens if the skb is consumed and destructed by the receive side
before unix_stream_sendmsg finishes its job.

Signed-off-by: Zheng Yan <zheng.z.yan@intel.com>
Reported-by: Jiri Slaby <jirislaby@gmail.com>
Tested-by: Sedat Dilek <sedat.dilek@googlemail.com>

---
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e6d9d10..70cf1f9 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1577,6 +1577,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	int sent = 0;
 	struct scm_cookie tmp_scm;
 	bool fds_sent = false;
+	bool scm_ref = true;
 	int max_level;
 
 	if (NULL == siocb->scm)
@@ -1637,12 +1638,21 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		 */
 		size = min_t(int, size, skb_tailroom(skb));
 
+		/*
+		 * If a single skb is large enough to hold all data, pass the
+		 * scm reference to the skb. Otherwise we should hold a scm
+		 * reference because the skb can be consumed at any time after
+		 * we queue it into sk_receive_queue.
+		 */
+		if (!fds_sent && sent + size >= len)
+			scm_ref = false;
 
-		/* Only send the fds and no ref to pid in the first buffer */
-		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent, fds_sent);
+		/* Only send the fds in the first buffer */
+		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent,
+					fds_sent || scm_ref);
 		if (err < 0) {
 			kfree_skb(skb);
-			goto out;
+			goto out_err;
 		}
 		max_level = err + 1;
 		fds_sent = true;
@@ -1650,7 +1660,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
 		if (err) {
 			kfree_skb(skb);
-			goto out;
+			goto out_err;
 		}
 
 		unix_state_lock(other);
@@ -1667,10 +1677,10 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		sent += size;
 	}
 
-	if (skb)
-		scm_release(siocb->scm);
-	else
+	if (scm_ref)
 		scm_destroy(siocb->scm);
+	else
+		scm_release(siocb->scm);
 	siocb->scm = NULL;
 
 	return sent;
@@ -1683,9 +1693,10 @@ pipe_err:
 		send_sig(SIGPIPE, current, 0);
 	err = -EPIPE;
 out_err:
-	if (skb == NULL)
+	if (scm_ref)
 		scm_destroy(siocb->scm);
-out:
+	else
+		scm_release(siocb->scm);
 	siocb->scm = NULL;
 	return sent ? : err;
 }

^ permalink raw reply related

* Re: [next] unix stream crashes
From: Yan, Zheng  @ 2011-09-03 15:38 UTC (permalink / raw)
  To: sedat.dilek
  Cc: Jiri Slaby, Valdis.Kletnieks, Tim Chen, David S. Miller,
	ML netdev, LKML, Stephen Rothwell
In-Reply-To: <CA+icZUWTS4fM9ZjEvjpPUsEzSdtKnF9yZPMpLwQmOGDL6Z9D_Q@mail.gmail.com>

On Sat, Sep 3, 2011 at 10:46 PM, Sedat Dilek <sedat.dilek@googlemail.com> wrote:
> On Sat, Sep 3, 2011 at 3:47 PM, Sedat Dilek <sedat.dilek@googlemail.com> wrote:
>> On Sat, Sep 3, 2011 at 2:30 PM, Yan, Zheng <yanzheng@21cn.com> wrote:
>>> The skb can be destructed before the while loop in unix_stream_sendmsg stops.
>>> please try below patch.
>>>
> [...]
>>
>> I have tested your patch on i386 against:
>>
>> 1. linux-next/patch-v3.1-rc3-next-20110826
>> 2. scm-fix/0001-Revert-Scm-Remove-unnecessary-pid-credential-referen.patch
>> 3. scm-fix-2/scm_send.patch
>> 4. scm-fix-3/0001-Fix-unix-stream-crashes.patch
>>
>> So the BROKEN scm-send path seems to be fixed, now!
>>
> [...]
>> Doing now a 2nd run with:
>>
>> 1. linux-next/patch-v3.1-rc3-next-20110826
>> 2. scm-fix-3/0001-Fix-unix-stream-crashes.patch
>>
>
> 2nd run is fine, too.
>

Thank you, will send a new patch later.

> - Sedat -
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply

* Re: [next] unix stream crashes
From: Sedat Dilek @ 2011-09-03 14:46 UTC (permalink / raw)
  To: Yan, Zheng
  Cc: Jiri Slaby, Valdis.Kletnieks, Tim Chen, David S. Miller,
	ML netdev, LKML, Stephen Rothwell
In-Reply-To: <CA+icZUXULisfr6_EOrj8+q36UMo2mudcoJu3z0SX4T3x_OZQSg@mail.gmail.com>

On Sat, Sep 3, 2011 at 3:47 PM, Sedat Dilek <sedat.dilek@googlemail.com> wrote:
> On Sat, Sep 3, 2011 at 2:30 PM, Yan, Zheng <yanzheng@21cn.com> wrote:
>> The skb can be destructed before the while loop in unix_stream_sendmsg stops.
>> please try below patch.
>>
[...]
>
> I have tested your patch on i386 against:
>
> 1. linux-next/patch-v3.1-rc3-next-20110826
> 2. scm-fix/0001-Revert-Scm-Remove-unnecessary-pid-credential-referen.patch
> 3. scm-fix-2/scm_send.patch
> 4. scm-fix-3/0001-Fix-unix-stream-crashes.patch
>
> So the BROKEN scm-send path seems to be fixed, now!
>
[...]
> Doing now a 2nd run with:
>
> 1. linux-next/patch-v3.1-rc3-next-20110826
> 2. scm-fix-3/0001-Fix-unix-stream-crashes.patch
>

2nd run is fine, too.

- Sedat -

^ permalink raw reply

* [PATCH] usbnet: ignore get interface retval of -EINPROGRESS
From: Jim Wylder @ 2011-09-03 14:21 UTC (permalink / raw)
  To: Oliver Neukum, netdev

When calling pm_runtime_get, usb_autopm_get_interface_async
treats a return value of -EINPROGRESS as a success and
increments the usage count.  Since the interface is resuming,
it is safe for usbnet_start_xmit to submit the urb.  If instead,
usbnet_start_xmit treats this as an error the packet will be
dropped.  Additionally, a corresponding tx_complete will not
run to offset the earlier increment of the usage count from the
call to usb_autopm_get_interface_async.

Signed-off-by: James Wylder <james.wylder@motorola.com>
---
 drivers/net/usb/usbnet.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index ce395fe..90849d6 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -1105,7 +1105,7 @@ netdev_tx_t usbnet_start_xmit (struct sk_buff *skb,

 	spin_lock_irqsave(&dev->txq.lock, flags);
 	retval = usb_autopm_get_interface_async(dev->intf);
-	if (retval < 0) {
+	if (retval < 0 && retval != -EINPROGRESS) {
 		spin_unlock_irqrestore(&dev->txq.lock, flags);
 		goto drop;
 	}
-- 
1.7.6

^ permalink raw reply related

* Re: [next] unix stream crashes
From: Sedat Dilek @ 2011-09-03 13:47 UTC (permalink / raw)
  To: Yan, Zheng
  Cc: Jiri Slaby, Valdis.Kletnieks, Tim Chen, David S. Miller,
	ML netdev, LKML, Stephen Rothwell
In-Reply-To: <CAAM7YAkB3VVNMmBMVuvEZuV6oGZeyog37_sjFGUunu+15apvZA@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 4744 bytes --]

On Sat, Sep 3, 2011 at 2:30 PM, Yan, Zheng <yanzheng@21cn.com> wrote:
> The skb can be destructed before the while loop in unix_stream_sendmsg stops.
> please try below patch.
>
> ---
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index e6d9d10..f6d7ed7 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -1577,6 +1577,7 @@ static int unix_stream_sendmsg(struct kiocb
> *kiocb, struct socket *sock,
>        int sent = 0;
>        struct scm_cookie tmp_scm;
>        bool fds_sent = false;
> +       bool scm_ref = true;
>        int max_level;
>
>        if (NULL == siocb->scm)
> @@ -1637,12 +1638,19 @@ static int unix_stream_sendmsg(struct kiocb
> *kiocb, struct socket *sock,
>                 */
>                size = min_t(int, size, skb_tailroom(skb));
>
> +               /*
> +                * pass the scm reference to the skb if a single skb is large
> +                * enough to hold all data.
> +                */
> +               if (!fds_sent && sent + size >= len)
> +                       scm_ref = false;
>
> -               /* Only send the fds and no ref to pid in the first buffer */
> -               err = unix_scm_to_skb(siocb->scm, skb, !fds_sent, fds_sent);
> +               /* Only send the fds in the first buffer */
> +               err = unix_scm_to_skb(siocb->scm, skb, !fds_sent,
> +                                       fds_sent || scm_ref);
>                if (err < 0) {
>                        kfree_skb(skb);
> -                       goto out;
> +                       goto out_err;
>                }
>                max_level = err + 1;
>                fds_sent = true;
> @@ -1650,7 +1658,7 @@ static int unix_stream_sendmsg(struct kiocb
> *kiocb, struct socket *sock,
>                err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
>                if (err) {
>                        kfree_skb(skb);
> -                       goto out;
> +                       goto out_err;
>                }
>
>                unix_state_lock(other);
> @@ -1667,10 +1675,10 @@ static int unix_stream_sendmsg(struct kiocb
> *kiocb, struct socket *sock,
>                sent += size;
>        }
>
> -       if (skb)
> -               scm_release(siocb->scm);
> -       else
> +       if (scm_ref)
>                scm_destroy(siocb->scm);
> +       else
> +               scm_release(siocb->scm);
>        siocb->scm = NULL;
>
>        return sent;
> @@ -1683,9 +1691,10 @@ pipe_err:
>                send_sig(SIGPIPE, current, 0);
>        err = -EPIPE;
>  out_err:
> -       if (skb == NULL)
> +       if (scm_ref)
>                scm_destroy(siocb->scm);
> -out:
> +       else
> +               scm_release(siocb->scm);
>        siocb->scm = NULL;
>        return sent ? : err;
>  }
>
>

I have tested your patch on i386 against:

1. linux-next/patch-v3.1-rc3-next-20110826
2. scm-fix/0001-Revert-Scm-Remove-unnecessary-pid-credential-referen.patch
3. scm-fix-2/scm_send.patch
4. scm-fix-3/0001-Fix-unix-stream-crashes.patch

So the BROKEN scm-send path seems to be fixed, now!

As the patch arrived "malformed" in my mbox I git-am-ed it on top of
linux-next (next-20110826) GIT repository (patch attached).

After confirmation of Valdis (x86_64) and ACK-by Tim, I would
appreciate a proper patch with all Reported-by/Tested-by/S-o-b etc.
In my case I bisected the issue, I recall that there is sth. like
Bisected-by, so feel free to do so.

Doing now a 2nd run with:

1. linux-next/patch-v3.1-rc3-next-20110826
2. scm-fix-3/0001-Fix-unix-stream-crashes.patch

- Sedat -

> On Sat, Sep 3, 2011 at 2:23 PM, Jiri Slaby <jirislaby@gmail.com> wrote:
>> On 09/03/2011 07:54 AM, Sedat Dilek wrote:
>>>
>>> I saw similar call-traces with put_cred_rcu() - besides with
>>> kmem_cache_alloc_trace().
>>> My post-it says:
>>> Kernel panic - not syncing: CRED: put_cred_rcu sees f67ac0c0 with usage
>>> -43
>>
>> Hm, Tim, it looks like you put a pid which you did not get?
>>
>> regards,
>> --
>> js
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/
>>
>

[-- Attachment #2: 0001-Fix-unix-stream-crashes.patch --]
[-- Type: text/x-diff, Size: 2267 bytes --]

From 54d8a5c590c06f070d9adbfffba0b32246d727e2 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <yanzheng@21cn.com>
Date: Sat, 3 Sep 2011 14:30:19 +0200
Subject: [PATCH] Fix unix stream crashes

The skb can be destructed before the while loop in unix_stream_sendmsg stops.
please try below patch.
---
 net/unix/af_unix.c |   27 ++++++++++++++++++---------
 1 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e6d9d10..f6d7ed7 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1577,6 +1577,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	int sent = 0;
 	struct scm_cookie tmp_scm;
 	bool fds_sent = false;
+	bool scm_ref = true;
 	int max_level;
 
 	if (NULL == siocb->scm)
@@ -1637,12 +1638,19 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		 */
 		size = min_t(int, size, skb_tailroom(skb));
 
+		/*
+		 * pass the scm reference to the skb if a single skb is large
+		 * enough to hold all data.
+		 */
+		if (!fds_sent && sent + size >= len)
+			scm_ref = false;
 
-		/* Only send the fds and no ref to pid in the first buffer */
-		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent, fds_sent);
+		/* Only send the fds in the first buffer */
+		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent,
+					fds_sent || scm_ref);
 		if (err < 0) {
 			kfree_skb(skb);
-			goto out;
+			goto out_err;
 		}
 		max_level = err + 1;
 		fds_sent = true;
@@ -1650,7 +1658,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
 		if (err) {
 			kfree_skb(skb);
-			goto out;
+			goto out_err;
 		}
 
 		unix_state_lock(other);
@@ -1667,10 +1675,10 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		sent += size;
 	}
 
-	if (skb)
-		scm_release(siocb->scm);
-	else
+	if (scm_ref)
 		scm_destroy(siocb->scm);
+	else
+		scm_release(siocb->scm);
 	siocb->scm = NULL;
 
 	return sent;
@@ -1683,9 +1691,10 @@ pipe_err:
 		send_sig(SIGPIPE, current, 0);
 	err = -EPIPE;
 out_err:
-	if (skb == NULL)
+	if (scm_ref)
 		scm_destroy(siocb->scm);
-out:
+	else
+		scm_release(siocb->scm);
 	siocb->scm = NULL;
 	return sent ? : err;
 }
-- 
1.7.6


^ permalink raw reply related

* Re: [patch net-next-2.6 v3] net: consolidate and fix ethtool_ops->get_settings calling
From: Ben Hutchings @ 2011-09-03 13:46 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, ralf, fubar, andy, kaber, bprakash, JBottomley,
	robert.w.love, davem, shemminger, decot, mirq-linux,
	alexander.h.duyck, amit.salecha, eric.dumazet, therbert, paulmck,
	laijs, xiaosuo, greearb, loke.chetan, linux-mips, linux-scsi,
	devel, bridge
In-Reply-To: <20110903133428.GA2821@minipsycho>

On Sat, 2011-09-03 at 15:34 +0200, Jiri Pirko wrote:
> This patch does several things:
> - introduces __ethtool_get_settings which is called from ethtool code and
>   from drivers as well. Put ASSERT_RTNL there.
> - dev_ethtool_get_settings() is replaced by __ethtool_get_settings()
> - changes calling in drivers so rtnl locking is respected. In
>   iboe_get_rate was previously ->get_settings() called unlocked. This
>   fixes it. Also prb_calc_retire_blk_tmo() in af_packet.c had the same
>   problem. Also fixed by calling __dev_get_by_index() instead of
>   dev_get_by_index() and holding rtnl_lock for both calls.
> - introduces rtnl_lock in bnx2fc_vport_create() and fcoe_vport_create()
>   so bnx2fc_if_create() and fcoe_if_create() are called locked as they
>   are from other places.
> - use __ethtool_get_settings() in bonding code
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Reviewed-by: Ben Hutchings <bhutchings@solarflare.com> [except FCoE bits]

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* [patch net-next-2.6 v3] net: consolidate and fix ethtool_ops->get_settings calling
From: Jiri Pirko @ 2011-09-03 13:34 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: netdev, ralf, fubar, andy, kaber, bprakash, JBottomley,
	robert.w.love, davem, shemminger, decot, mirq-linux,
	alexander.h.duyck, amit.salecha, eric.dumazet, therbert, paulmck,
	laijs, xiaosuo, greearb, loke.chetan, linux-mips, linux-scsi,
	devel, bridge
In-Reply-To: <1314989161.3419.5.camel@bwh-desktop>

This patch does several things:
- introduces __ethtool_get_settings which is called from ethtool code and
  from drivers as well. Put ASSERT_RTNL there.
- dev_ethtool_get_settings() is replaced by __ethtool_get_settings()
- changes calling in drivers so rtnl locking is respected. In
  iboe_get_rate was previously ->get_settings() called unlocked. This
  fixes it. Also prb_calc_retire_blk_tmo() in af_packet.c had the same
  problem. Also fixed by calling __dev_get_by_index() instead of
  dev_get_by_index() and holding rtnl_lock for both calls.
- introduces rtnl_lock in bnx2fc_vport_create() and fcoe_vport_create()
  so bnx2fc_if_create() and fcoe_if_create() are called locked as they
  are from other places.
- use __ethtool_get_settings() in bonding code

Signed-off-by: Jiri Pirko <jpirko@redhat.com>

v2->v3:
	-removed dev_ethtool_get_settings()
	-added ASSERT_RTNL into __ethtool_get_settings()
	-prb_calc_retire_blk_tmo - use __dev_get_by_index() and lock
	 around it and __ethtool_get_settings() call
v1->v2:
        add missing export_symbol 
 
---
 arch/mips/txx9/generic/setup_tx4939.c |    2 +-
 drivers/net/bonding/bond_main.c       |   13 +++-----
 drivers/net/macvlan.c                 |    3 +-
 drivers/scsi/bnx2fc/bnx2fc_fcoe.c     |    4 ++-
 drivers/scsi/fcoe/fcoe.c              |    4 ++-
 include/linux/ethtool.h               |    3 ++
 include/linux/netdevice.h             |    3 --
 include/rdma/ib_addr.h                |    6 +++-
 net/8021q/vlan_dev.c                  |    3 +-
 net/bridge/br_if.c                    |    2 +-
 net/core/dev.c                        |   24 ---------------
 net/core/ethtool.c                    |   20 +++++++++---
 net/core/net-sysfs.c                  |    4 +-
 net/packet/af_packet.c                |   52 +++++++++++++++++----------------
 14 files changed, 69 insertions(+), 74 deletions(-)

diff --git a/arch/mips/txx9/generic/setup_tx4939.c b/arch/mips/txx9/generic/setup_tx4939.c
index e9f95dc..ba3cec3 100644
--- a/arch/mips/txx9/generic/setup_tx4939.c
+++ b/arch/mips/txx9/generic/setup_tx4939.c
@@ -321,7 +321,7 @@ void __init tx4939_sio_init(unsigned int sclk, unsigned int cts_mask)
 static u32 tx4939_get_eth_speed(struct net_device *dev)
 {
 	struct ethtool_cmd cmd;
-	if (dev_ethtool_get_settings(dev, &cmd))
+	if (__ethtool_get_settings(dev, &cmd))
 		return 100;	/* default 100Mbps */
 
 	return ethtool_cmd_speed(&cmd);
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 8cb75a6..1dcb07c 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -557,7 +557,7 @@ down:
 static int bond_update_speed_duplex(struct slave *slave)
 {
 	struct net_device *slave_dev = slave->dev;
-	struct ethtool_cmd etool = { .cmd = ETHTOOL_GSET };
+	struct ethtool_cmd ecmd;
 	u32 slave_speed;
 	int res;
 
@@ -565,18 +565,15 @@ static int bond_update_speed_duplex(struct slave *slave)
 	slave->speed = SPEED_100;
 	slave->duplex = DUPLEX_FULL;
 
-	if (!slave_dev->ethtool_ops || !slave_dev->ethtool_ops->get_settings)
-		return -1;
-
-	res = slave_dev->ethtool_ops->get_settings(slave_dev, &etool);
+	res = __ethtool_get_settings(slave_dev, &ecmd);
 	if (res < 0)
 		return -1;
 
-	slave_speed = ethtool_cmd_speed(&etool);
+	slave_speed = ethtool_cmd_speed(&ecmd);
 	if (slave_speed == 0 || slave_speed == ((__u32) -1))
 		return -1;
 
-	switch (etool.duplex) {
+	switch (ecmd.duplex) {
 	case DUPLEX_FULL:
 	case DUPLEX_HALF:
 		break;
@@ -585,7 +582,7 @@ static int bond_update_speed_duplex(struct slave *slave)
 	}
 
 	slave->speed = slave_speed;
-	slave->duplex = etool.duplex;
+	slave->duplex = ecmd.duplex;
 
 	return 0;
 }
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 836e13f..b100c90 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -543,7 +543,8 @@ static int macvlan_ethtool_get_settings(struct net_device *dev,
 					struct ethtool_cmd *cmd)
 {
 	const struct macvlan_dev *vlan = netdev_priv(dev);
-	return dev_ethtool_get_settings(vlan->lowerdev, cmd);
+
+	return __ethtool_get_settings(vlan->lowerdev, cmd);
 }
 
 static const struct ethtool_ops macvlan_ethtool_ops = {
diff --git a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c
index 2c780a7..820a184 100644
--- a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c
+++ b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c
@@ -673,7 +673,7 @@ static void bnx2fc_link_speed_update(struct fc_lport *lport)
 	struct net_device *netdev = interface->netdev;
 	struct ethtool_cmd ecmd;
 
-	if (!dev_ethtool_get_settings(netdev, &ecmd)) {
+	if (!__ethtool_get_settings(netdev, &ecmd)) {
 		lport->link_supported_speeds &=
 			~(FC_PORTSPEED_1GBIT | FC_PORTSPEED_10GBIT);
 		if (ecmd.supported & (SUPPORTED_1000baseT_Half |
@@ -1001,9 +1001,11 @@ static int bnx2fc_vport_create(struct fc_vport *vport, bool disabled)
 			"this interface\n");
 		return -EIO;
 	}
+	rtnl_lock();
 	mutex_lock(&bnx2fc_dev_lock);
 	vn_port = bnx2fc_if_create(interface, &vport->dev, 1);
 	mutex_unlock(&bnx2fc_dev_lock);
+	rtnl_unlock();
 
 	if (IS_ERR(vn_port)) {
 		printk(KERN_ERR PFX "bnx2fc_vport_create (%s) failed\n",
diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
index 3416ab6..83aa3ac 100644
--- a/drivers/scsi/fcoe/fcoe.c
+++ b/drivers/scsi/fcoe/fcoe.c
@@ -2043,7 +2043,7 @@ int fcoe_link_speed_update(struct fc_lport *lport)
 	struct net_device *netdev = fcoe_netdev(lport);
 	struct ethtool_cmd ecmd;
 
-	if (!dev_ethtool_get_settings(netdev, &ecmd)) {
+	if (!__ethtool_get_settings(netdev, &ecmd)) {
 		lport->link_supported_speeds &=
 			~(FC_PORTSPEED_1GBIT | FC_PORTSPEED_10GBIT);
 		if (ecmd.supported & (SUPPORTED_1000baseT_Half |
@@ -2452,7 +2452,9 @@ static int fcoe_vport_create(struct fc_vport *vport, bool disabled)
 	}
 
 	mutex_lock(&fcoe_config_mutex);
+	rtnl_lock();
 	vn_port = fcoe_if_create(fcoe, &vport->dev, 1);
+	rtnl_unlock();
 	mutex_unlock(&fcoe_config_mutex);
 
 	if (IS_ERR(vn_port)) {
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 3829712..8571f18 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -728,6 +728,9 @@ enum ethtool_sfeatures_retval_bits {
 /* needed by dev_disable_lro() */
 extern int __ethtool_set_flags(struct net_device *dev, u32 flags);
 
+extern int __ethtool_get_settings(struct net_device *dev,
+				  struct ethtool_cmd *cmd);
+
 /**
  * enum ethtool_phys_id_state - indicator state for physical identification
  * @ETHTOOL_ID_INACTIVE: Physical ID indicator should be deactivated
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index dad7e4d..8b1080b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2600,9 +2600,6 @@ static inline int netif_is_bond_slave(struct net_device *dev)
 
 extern struct pernet_operations __net_initdata loopback_net_ops;
 
-int dev_ethtool_get_settings(struct net_device *dev,
-			     struct ethtool_cmd *cmd);
-
 static inline u32 dev_ethtool_get_rx_csum(struct net_device *dev)
 {
 	if (dev->features & NETIF_F_RXCSUM)
diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h
index ae8c68f..639a449 100644
--- a/include/rdma/ib_addr.h
+++ b/include/rdma/ib_addr.h
@@ -218,8 +218,12 @@ static inline int iboe_get_rate(struct net_device *dev)
 {
 	struct ethtool_cmd cmd;
 	u32 speed;
+	int err;
 
-	if (dev_ethtool_get_settings(dev, &cmd))
+	rtnl_lock();
+	err = __ethtool_get_settings(dev, &cmd);
+	rtnl_unlock();
+	if (err)
 		return IB_RATE_PORT_CURRENT;
 
 	speed = ethtool_cmd_speed(&cmd);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index eba705b..c8cf939 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -610,7 +610,8 @@ static int vlan_ethtool_get_settings(struct net_device *dev,
 				     struct ethtool_cmd *cmd)
 {
 	const struct vlan_dev_info *vlan = vlan_dev_info(dev);
-	return dev_ethtool_get_settings(vlan->real_dev, cmd);
+
+	return __ethtool_get_settings(vlan->real_dev, cmd);
 }
 
 static void vlan_ethtool_get_drvinfo(struct net_device *dev,
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index b365bba..043a5eb 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -35,7 +35,7 @@ static int port_cost(struct net_device *dev)
 {
 	struct ethtool_cmd ecmd;
 
-	if (!dev_ethtool_get_settings(dev, &ecmd)) {
+	if (!__ethtool_get_settings(dev, &ecmd)) {
 		switch (ethtool_cmd_speed(&ecmd)) {
 		case SPEED_10000:
 			return 2;
diff --git a/net/core/dev.c b/net/core/dev.c
index 11b0fc7..94f3254 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4566,30 +4566,6 @@ void dev_set_rx_mode(struct net_device *dev)
 }
 
 /**
- *	dev_ethtool_get_settings - call device's ethtool_ops::get_settings()
- *	@dev: device
- *	@cmd: memory area for ethtool_ops::get_settings() result
- *
- *      The cmd arg is initialized properly (cleared and
- *      ethtool_cmd::cmd field set to ETHTOOL_GSET).
- *
- *	Return device's ethtool_ops::get_settings() result value or
- *	-EOPNOTSUPP when device doesn't expose
- *	ethtool_ops::get_settings() operation.
- */
-int dev_ethtool_get_settings(struct net_device *dev,
-			     struct ethtool_cmd *cmd)
-{
-	if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings)
-		return -EOPNOTSUPP;
-
-	memset(cmd, 0, sizeof(struct ethtool_cmd));
-	cmd->cmd = ETHTOOL_GSET;
-	return dev->ethtool_ops->get_settings(dev, cmd);
-}
-EXPORT_SYMBOL(dev_ethtool_get_settings);
-
-/**
  *	dev_get_flags - get flags reported to userspace
  *	@dev: device
  *
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 6cdba5f..f444817 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -569,15 +569,25 @@ int __ethtool_set_flags(struct net_device *dev, u32 data)
 	return 0;
 }
 
-static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
+int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
-	struct ethtool_cmd cmd = { .cmd = ETHTOOL_GSET };
-	int err;
+	ASSERT_RTNL();
 
-	if (!dev->ethtool_ops->get_settings)
+	if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings)
 		return -EOPNOTSUPP;
 
-	err = dev->ethtool_ops->get_settings(dev, &cmd);
+	memset(cmd, 0, sizeof(struct ethtool_cmd));
+	cmd->cmd = ETHTOOL_GSET;
+	return dev->ethtool_ops->get_settings(dev, cmd);
+}
+EXPORT_SYMBOL(__ethtool_get_settings);
+
+static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
+{
+	int err;
+	struct ethtool_cmd cmd;
+
+	err = __ethtool_get_settings(dev, &cmd);
 	if (err < 0)
 		return err;
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 90fdb46..48e6279 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -172,7 +172,7 @@ static ssize_t show_speed(struct device *dev,
 
 	if (netif_running(netdev)) {
 		struct ethtool_cmd cmd;
-		if (!dev_ethtool_get_settings(netdev, &cmd))
+		if (!__ethtool_get_settings(netdev, &cmd))
 			ret = sprintf(buf, fmt_udec, ethtool_cmd_speed(&cmd));
 	}
 	rtnl_unlock();
@@ -190,7 +190,7 @@ static ssize_t show_duplex(struct device *dev,
 
 	if (netif_running(netdev)) {
 		struct ethtool_cmd cmd;
-		if (!dev_ethtool_get_settings(netdev, &cmd))
+		if (!__ethtool_get_settings(netdev, &cmd))
 			ret = sprintf(buf, "%s\n",
 				      cmd.duplex ? "full" : "half");
 	}
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 2ea3d63..25e68f5 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -530,33 +530,35 @@ static int prb_calc_retire_blk_tmo(struct packet_sock *po,
 {
 	struct net_device *dev;
 	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
+	struct ethtool_cmd ecmd;
+	int err;
 
-	dev = dev_get_by_index(sock_net(&po->sk), po->ifindex);
-	if (unlikely(dev == NULL))
+	rtnl_lock();
+	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
+	if (unlikely(!dev)) {
+		rtnl_unlock();
 		return DEFAULT_PRB_RETIRE_TOV;
-
-	if (dev->ethtool_ops && dev->ethtool_ops->get_settings) {
-		struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET, };
-
-		if (!dev->ethtool_ops->get_settings(dev, &ecmd)) {
-			switch (ecmd.speed) {
-			case SPEED_10000:
-				msec = 1;
-				div = 10000/1000;
-				break;
-			case SPEED_1000:
-				msec = 1;
-				div = 1000/1000;
-				break;
-			/*
-			 * If the link speed is so slow you don't really
-			 * need to worry about perf anyways
-			 */
-			case SPEED_100:
-			case SPEED_10:
-			default:
-				return DEFAULT_PRB_RETIRE_TOV;
-			}
+	}
+	err = __ethtool_get_settings(dev, &ecmd);
+	rtnl_unlock();
+	if (!err) {
+		switch (ecmd.speed) {
+		case SPEED_10000:
+			msec = 1;
+			div = 10000/1000;
+			break;
+		case SPEED_1000:
+			msec = 1;
+			div = 1000/1000;
+			break;
+		/*
+		 * If the link speed is so slow you don't really
+		 * need to worry about perf anyways
+		 */
+		case SPEED_100:
+		case SPEED_10:
+		default:
+			return DEFAULT_PRB_RETIRE_TOV;
 		}
 	}
 
-- 
1.7.6

^ permalink raw reply related

* Re: [next] unix stream crashes
From: Yan, Zheng  @ 2011-09-03 12:30 UTC (permalink / raw)
  To: Jiri Slaby
  Cc: sedat.dilek, Sedat Dilek, Valdis.Kletnieks, Tim Chen,
	David S. Miller, ML netdev, LKML, Stephen Rothwell
In-Reply-To: <4E61C7F2.3090902@gmail.com>

The skb can be destructed before the while loop in unix_stream_sendmsg stops.
Please try the patch below.

---
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e6d9d10..f6d7ed7 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1577,6 +1577,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	int sent = 0;
 	struct scm_cookie tmp_scm;
 	bool fds_sent = false;
+	bool scm_ref = true;
 	int max_level;

 	if (NULL == siocb->scm)
@@ -1637,12 +1638,19 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		 */
 		size = min_t(int, size, skb_tailroom(skb));

+		/*
+		 * pass the scm reference to the skb if a single skb is large
+		 * enough to hold all data.
+		 */
+		if (!fds_sent && sent + size >= len)
+			scm_ref = false;

-		/* Only send the fds and no ref to pid in the first buffer */
-		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent, fds_sent);
+		/* Only send the fds in the first buffer */
+		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent,
+					fds_sent || scm_ref);
 		if (err < 0) {
 			kfree_skb(skb);
-			goto out;
+			goto out_err;
 		}
 		max_level = err + 1;
 		fds_sent = true;
@@ -1650,7 +1658,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
 		if (err) {
 			kfree_skb(skb);
-			goto out;
+			goto out_err;
 		}

 		unix_state_lock(other);
@@ -1667,10 +1675,10 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		sent += size;
 	}

-	if (skb)
-		scm_release(siocb->scm);
-	else
+	if (scm_ref)
 		scm_destroy(siocb->scm);
+	else
+		scm_release(siocb->scm);
 	siocb->scm = NULL;

 	return sent;
@@ -1683,9 +1691,10 @@ pipe_err:
 		send_sig(SIGPIPE, current, 0);
 	err = -EPIPE;
 out_err:
-	if (skb == NULL)
+	if (scm_ref)
 		scm_destroy(siocb->scm);
-out:
+	else
+		scm_release(siocb->scm);
 	siocb->scm = NULL;
 	return sent ? : err;
 }


On Sat, Sep 3, 2011 at 2:23 PM, Jiri Slaby <jirislaby@gmail.com> wrote:
> On 09/03/2011 07:54 AM, Sedat Dilek wrote:
>>
> >> I saw similar call-traces with put_cred_rcu() - besides with
>> kmem_cache_alloc_trace().
>> My post-it says:
>> Kernel panic - not syncing: CRED: put_cred_rcu sees f67ac0c0 with usage
>> -43
>
> Hm, Tim, it looks like you put a pid which you did not get?
>
> regards,
> --
> js
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
>

^ permalink raw reply related

* Re: 3.1.0-rc2: irq/112-eth0-Tx: page allocation failure: (w/frame pointers enabled)
From: Justin Piszcz @ 2011-09-03 10:06 UTC (permalink / raw)
  To: Maciej Rutecki; +Cc: linux-kernel, netdev, Alan Piszcz
In-Reply-To: <201109031049.00728.maciej.rutecki@gmail.com>



On Sat, 3 Sep 2011, Maciej Rutecki wrote:

> On piątek, 26 sierpnia 2011 o 14:14:12 Justin Piszcz wrote:
>> Hello,
>>
>> Why does this occur on a machine with 48GB of memory (used mainly as a
>> router) and backup server, is it a kernel bug?
>>
>
> Is it a regression? Does any older kernel work OK?

Hi,

It has been occurring since 'threadirqs' was enabled.

Justin.

^ permalink raw reply

* Re: 3.1.0-rc2: irq/112-eth0-Tx: page allocation failure: (w/frame pointers enabled)
From: Maciej Rutecki @ 2011-09-03  8:49 UTC (permalink / raw)
  To: Justin Piszcz; +Cc: linux-kernel, netdev, Alan Piszcz
In-Reply-To: <alpine.DEB.2.02.1108260802590.14539@p34.internal.lan>

On piątek, 26 sierpnia 2011 o 14:14:12 Justin Piszcz wrote:
> Hello,
> 
> Why does this occur on a machine with 48GB of memory (used mainly as a
> router) and backup server, is it a kernel bug?
> 

Is it a regression? Does any older kernel work OK?

Regards
-- 
Maciej Rutecki
http://www.maciek.unixy.pl

^ permalink raw reply

* Re: [PATCH] net: Initialize entire flowi struct
From: Julian Anastasov @ 2011-09-03  7:27 UTC (permalink / raw)
  To: Ward, David - 0663 - MITLL; +Cc: David Miller, netdev@vger.kernel.org
In-Reply-To: <4E5F89E3.5060903@ll.mit.edu>


	Hello,

On Thu, 1 Sep 2011, Ward, David - 0663 - MITLL wrote:

> > 	Not sure if adding size as parameter to flow_hash_code
> > is a better approach. Maybe flow_cache_lookup needs to
> > determine size from family that can be used for flow_hash_code,
> > flow_key_compare and the memcpy(&fle->key, key, sizeof(*key))
> > after fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC).
> 
> Makes sense to me.  However should we just replace flow_key_compare with
> memcmp then, since the assumptions about constant size and alignment will no
> longer apply?  Or should there be a separate flow_key_compare function for
> each family, and have all of the flowi* structures become
> __attribute__((__aligned__(BITS_PER_LONG/8))) ?

	I don't know this code well but I guess memcmp is
not preferred. IMHO, as the callers provide per-family
structures and we do not want to change that, these
structures must be aligned to long type as required by
flow_compare_t and jhash2 (at least u32) usage. The second
option is to use memcmp and jhash instead of jhash2 to
avoid such alignment but I guess other developers will
oppose it. More opinions are needed here.

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply

* Re: [PATCH 1/2] bridge: leave carrier on for empty bridge
From: Stephen Hemminger @ 2011-09-03  6:30 UTC (permalink / raw)
  To: Ang Way Chuang
  Cc: Nicolas de Pesloüan, David S. Miller, netdev, Achmad Basuki
In-Reply-To: <4E618DCD.5070901@sfc.wide.ad.jp>

On Sat, 03 Sep 2011 11:15:41 +0900
Ang Way Chuang <wcang@sfc.wide.ad.jp> wrote:

> A more ideal solution in this case is to add an option for enabling carrier via sysfs and modify libvirtd
> to turn on the carrier if IPv6 is enabled. But it will still break the existing configuration until everyone
> upgrades to the latest libvirtd and kernel. Since there are no other complaints from other users with this setup,
> I guess nobody actually assigns IPv6 to the libvirtd network device at this moment (partly because virt-manager
> doesn't expose that functionality yet?).

Not sure about adding an option to support a configuration that is only available
by manually editing the xml files. There can't be that many people using libvirt in
this way, and fixing libvirt seems like a better solution.

Rather than rush in a kludge to handle this, let's take a few days and
examine what libvirt is trying to do and how it is doing it.

^ permalink raw reply

* Re: [next] unix stream crashes
From: Jiri Slaby @ 2011-09-03  6:23 UTC (permalink / raw)
  To: sedat.dilek
  Cc: Sedat Dilek, Valdis.Kletnieks, Tim Chen, David S. Miller,
	ML netdev, LKML, Stephen Rothwell
In-Reply-To: <CA+icZUVuJWpLquLk4kHJzmMKtPw24oxud=u3Na0U0HhSYqwV1w@mail.gmail.com>

On 09/03/2011 07:54 AM, Sedat Dilek wrote:
> I saw similar call-traces with put_cred_rcu() - besides with
> kmem_cache_alloc_trace().
> My post-it says:
> Kernel panic - not syncing: CRED: put_cred_rcu sees f67ac0c0 with usage -43

Hm, Tim, it looks like you put a pid which you did not get?

regards,
-- 
js

^ permalink raw reply

* Re: [next] unix stream crashes
From: Sedat Dilek @ 2011-09-03  5:54 UTC (permalink / raw)
  To: Valdis.Kletnieks
  Cc: Tim Chen, Jiri Slaby, David S. Miller, ML netdev, LKML,
	Stephen Rothwell
In-Reply-To: <6043.1315028115@turing-police.cc.vt.edu>

On Sat, Sep 3, 2011 at 7:35 AM,  <Valdis.Kletnieks@vt.edu> wrote:
> On Fri, 02 Sep 2011 16:55:03 PDT, Tim Chen said:
>
> >> I'd like to isolate the problem to either the send path or receive
>> path. My suspicion is the error handling portion of the send path is not
>> quite right but I haven't yet found any issues after reviewing the
>> patch.
>
> Took a while, because it took a few tries to get netconsole working,
> and then I was seeing odd results, but here we go:
>
> next-20110831 - crashes 100% consistent.
> next-20110831 + revert 0856a30409 - OK.
> revert + scm_recv.patch - OK.
> revert + scm_send.patch - crashes 100% consistent.
>

YES, I can confirm this with next-20110826.

> Now the odd part - although I was seeing crashes 100% of the time, I saw a
> number of different tracebacks (but I never actually saw the same traceback
> that Jiri had). Also, the system died at different points - most of the time it
> would live long enough for GDM to prompt for a userid/password and then die,
> but sometimes it didn't get as far as the GDM screen. Hopefully the variety of
> crashes will tell you something useful.
>
> I'll be able to test patches for go/nogo over the weekend, but probably won't
> have a second machine to catch netconsole until I'm back in the office Monday.
>
> Example 1:
>
> [  142.316258] Kernel panic - not syncing: CRED: put_cred_rcu() sees ffff88010d1ff300 with usage -41
> [  142.316260]
> [  142.316275] Pid: 2264, comm: gdm-simple-slav Tainted: G        W   3.1.0-rc4-next-20110831-dirty #17
> [  142.316279] Call Trace:
> [  142.316283]  <IRQ>  [<ffffffff81577a6c>] panic+0x96/0x1a2
> [  142.316300]  [<ffffffff8105cb54>] put_cred_rcu+0x32/0x91
> [  142.316306]  [<ffffffff8157a44f>] rcu_do_batch+0xcb/0x1e4
> [  142.316313]  [<ffffffff81092967>] invoke_rcu_callbacks+0x6c/0xc7
> [  142.316319]  [<ffffffff810932f8>] __rcu_process_callbacks+0x118/0x124
> [  142.316325]  [<ffffffff810934f0>] rcu_process_callbacks+0x64/0x72
> [  142.316331]  [<ffffffff8103f8c4>] __do_softirq+0x110/0x278
> [  142.316338]  [<ffffffff815a23ac>] call_softirq+0x1c/0x30
> [  142.316342]  <EOI>  [<ffffffff81003647>] do_softirq+0x44/0xf1
> [  142.316352]  [<ffffffff8103f485>] _local_bh_enable_ip+0x12a/0x178
> [  142.316358]  [<ffffffff8103f4dc>] local_bh_enable_ip+0x9/0xb
> [  142.316364]  [<ffffffff8159a2f3>] _raw_write_unlock_bh+0x36/0x3a
> [  142.316372]  [<ffffffff814c1ac3>] unix_release_sock+0x86/0x1ff
> [  142.316378]  [<ffffffff8105b548>] ? up_read+0x1b/0x32
> [  142.316383]  [<ffffffff814c1c5d>] unix_release+0x21/0x23
> [  142.316390]  [<ffffffff81423d02>] sock_release+0x1a/0x6f
> [  142.316395]  [<ffffffff81424a30>] sock_close+0x22/0x26
> [  142.316401]  [<ffffffff810fcacb>] __fput+0x140/0x1fe
> [  142.316407]  [<ffffffff810f97cb>] ? sys_close+0xe6/0x158
> [  142.316412]  [<ffffffff810fcb9e>] fput+0x15/0x17
> [  142.316417]  [<ffffffff810f8ef2>] filp_close+0x87/0x93
> [  142.316422]  [<ffffffff810f97d6>] sys_close+0xf1/0x158
> [  142.316429]  [<ffffffff815a0ffb>] system_call_fastpath+0x16/0x1b
>

I saw similar call-traces with put_cred_rcu() - besides with
kmem_cache_alloc_trace().
My post-it says:
Kernel panic - not syncing: CRED: put_cred_rcu sees f67ac0c0 with usage -43

BTW, systemd (uses dbus/sockets) is more sensitive than Debian's
standard sysvinit.

- Sedat -

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox