netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Eric Dumazet <eric.dumazet@gmail.com>
To: "Tantilov, Emil S" <emil.s.tantilov@intel.com>
Cc: "David S. Miller" <davem@davemloft.net>,
	Emil S Tantilov <emils.tantilov@gmail.com>,
	NetDev <netdev@vger.kernel.org>,
	"Brandeburg, Jesse" <jesse.brandeburg@intel.com>,
	"Kirsher, Jeffrey T" <jeffrey.t.kirsher@intel.com>,
	Jiri Olsa <jolsa@redhat.com>
Subject: [PATCH] net: sk_prot_alloc() should not blindly overwrite memory
Date: Wed, 08 Jul 2009 00:33:29 +0200	[thread overview]
Message-ID: <4A53CD39.7080407@gmail.com> (raw)
In-Reply-To: <EA929A9653AAE14F841771FB1DE5A1365F8ACEBEE9@rrsmsx501.amr.corp.intel.com>

Tantilov, Emil S a écrit :
> Eric Dumazet wrote:
>> Eric Dumazet a écrit :
>>>
>>> Hold on, I found the problem and will submit a patch (after testing
>>> it) in following hours 
>>>
>> David, while looking at Emil case, I found following problem.
>>
>> I am not sure this bug is responsible for the BUG hit by Emil,
>> but it seems a candidate for stable as well...
>>
>> Thanks
>>
>> [PATCH] net: sk_alloc() should not blindly overwrite memory
>>
>> Some sockets use SLAB_DESTROY_BY_RCU, and our RCU code rely that some
>> fields should not be blindly overwritten, even with null.
>>
>> These fields are sk->sk_refcnt and sk->sk_nulls_node.next
>>
>> Current sk_alloc() implementation doesnt respect this hypothesis,
>> calling kmem_alloc with __GFP_ZERO and setting sk_refcnt to 1 instead
>> of atomically increment it.
>>
>> Reported-by: Emil S Tantilov <emils.tantilov@gmail.com>
>> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> 
> Eric,
> 
> With this patch applied, I get panic on boot:

Oops, this stuff has to be done inside sk_prot_alloc() or breaks selinux

Thanks for testing Emil, here is a second try !


[PATCH] net: sk_prot_alloc() should not blindly overwrite memory

Some sockets use SLAB_DESTROY_BY_RCU, and our RCU code rely that some
fields should not be blindly overwritten, even with null.

These fields are sk->sk_refcnt and sk->sk_nulls_node.next

Current sk_prot_alloc() implementation doesnt respect this hypothesis,
calling kmem_cache_alloc() with __GFP_ZERO and setting sk_refcnt to 1
instead of atomically increment it.

Reported-by: Emil S Tantilov <emils.tantilov@gmail.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/net/sock.h            |   24 +++++++++------
 net/core/sock.c               |   49 ++++++++++++++++++++++++++++----
 net/ipv4/inet_timewait_sock.c |    2 -
 3 files changed, 59 insertions(+), 16 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 352f06b..3f6284e 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -103,15 +103,15 @@ struct net;
 
 /**
  *	struct sock_common - minimal network layer representation of sockets
+ *	@skc_node: main hash linkage for various protocol lookup tables
+ *	@skc_nulls_node: main hash linkage for UDP/UDP-Lite protocol
+ *	@skc_refcnt: reference count
+ *	@skc_hash: hash value used with various protocol lookup tables
  *	@skc_family: network address family
  *	@skc_state: Connection state
  *	@skc_reuse: %SO_REUSEADDR setting
  *	@skc_bound_dev_if: bound device index if != 0
- *	@skc_node: main hash linkage for various protocol lookup tables
- *	@skc_nulls_node: main hash linkage for UDP/UDP-Lite protocol
  *	@skc_bind_node: bind hash linkage for various protocol lookup tables
- *	@skc_refcnt: reference count
- *	@skc_hash: hash value used with various protocol lookup tables
  *	@skc_prot: protocol handlers inside a network family
  *	@skc_net: reference to the network namespace of this socket
  *
@@ -119,17 +119,23 @@ struct net;
  *	for struct sock and struct inet_timewait_sock.
  */
 struct sock_common {
-	unsigned short		skc_family;
-	volatile unsigned char	skc_state;
-	unsigned char		skc_reuse;
-	int			skc_bound_dev_if;
+	/*
+	 * WARNING : skc_node, skc_refcnt must be first elements of this struct
+	 * (sk_prot_alloc() should not clear skc_node.next & skc_refcnt because
+	 *  of RCU lookups)
+	 */
 	union {
 		struct hlist_node	skc_node;
 		struct hlist_nulls_node skc_nulls_node;
 	};
-	struct hlist_node	skc_bind_node;
 	atomic_t		skc_refcnt;
+
 	unsigned int		skc_hash;
+	unsigned short		skc_family;
+	volatile unsigned char	skc_state;
+	unsigned char		skc_reuse;
+	int			skc_bound_dev_if;
+	struct hlist_node	skc_bind_node;
 	struct proto		*skc_prot;
 #ifdef CONFIG_NET_NS
 	struct net	 	*skc_net;
diff --git a/net/core/sock.c b/net/core/sock.c
index b0ba569..cf43ff1 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -939,8 +939,31 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
 	struct kmem_cache *slab;
 
 	slab = prot->slab;
-	if (slab != NULL)
-		sk = kmem_cache_alloc(slab, priority);
+	if (slab != NULL) {
+		/*
+		 * caches using SLAB_DESTROY_BY_RCU should let sk_node
+		 * and sk_refcnt untouched.
+		 *
+		 * Following makes sense only if sk first fields are
+		 * in following order : sk_node, refcnt, sk_hash
+		 */
+		BUILD_BUG_ON(offsetof(struct sock, sk_node) != 0);
+		BUILD_BUG_ON(offsetof(struct sock, sk_refcnt) !=
+			     sizeof(sk->sk_node));
+		BUILD_BUG_ON(offsetof(struct sock, sk_hash) !=
+			     sizeof(sk->sk_node) + sizeof(sk->sk_refcnt));
+		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
+		if (!sk)
+			return sk;
+		if (priority & __GFP_ZERO) {
+			if (prot->slab_flags & SLAB_DESTROY_BY_RCU)
+				memset(&sk->sk_hash, 0,
+				       prot->obj_size -
+				       offsetof(struct sock, sk_hash));
+			else
+				memset(sk, 0, prot->obj_size);
+		}
+	}
 	else
 		sk = kmalloc(prot->obj_size, priority);
 
@@ -1125,7 +1148,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 
 		newsk->sk_err	   = 0;
 		newsk->sk_priority = 0;
-		atomic_set(&newsk->sk_refcnt, 2);
+		atomic_add(2, &newsk->sk_refcnt);
 
 		/*
 		 * Increment the counter in the same struct proto as the master
@@ -1840,7 +1863,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 
 	sk->sk_stamp = ktime_set(-1L, 0);
 
-	atomic_set(&sk->sk_refcnt, 1);
+	atomic_inc(&sk->sk_refcnt);
 	atomic_set(&sk->sk_wmem_alloc, 1);
 	atomic_set(&sk->sk_drops, 0);
 }
@@ -2140,12 +2163,25 @@ static inline void release_proto_idx(struct proto *prot)
 }
 #endif
 
+/*
+ * We need to initialize skc->skc_refcnt to 0, and skc->skc_node.pprev to NULL
+ * but only on newly allocated objects (check sk_prot_alloc())
+ */
+static void sk_init_once(void *arg)
+{
+	struct sock_common *skc = arg;
+
+	atomic_set(&skc->skc_refcnt, 0);
+	skc->skc_node.pprev = NULL;
+}
+
 int proto_register(struct proto *prot, int alloc_slab)
 {
 	if (alloc_slab) {
 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
-					NULL);
+					prot->slab_flags & SLAB_DESTROY_BY_RCU ?
+					    sk_init_once : NULL);
 
 		if (prot->slab == NULL) {
 			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
@@ -2187,7 +2223,8 @@ int proto_register(struct proto *prot, int alloc_slab)
 						  0,
 						  SLAB_HWCACHE_ALIGN |
 							prot->slab_flags,
-						  NULL);
+						  prot->slab_flags & SLAB_DESTROY_BY_RCU ?
+							sk_init_once : NULL);
 			if (prot->twsk_prot->twsk_slab == NULL)
 				goto out_free_timewait_sock_slab_name;
 		}
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 61283f9..a2997df 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -139,7 +139,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
 		tw->tw_transparent  = inet->transparent;
 		tw->tw_prot	    = sk->sk_prot_creator;
 		twsk_net_set(tw, hold_net(sock_net(sk)));
-		atomic_set(&tw->tw_refcnt, 1);
+		atomic_inc(&tw->tw_refcnt);
 		inet_twsk_dead_node_init(tw);
 		__module_get(tw->tw_prot->owner);
 	}

  reply	other threads:[~2009-07-07 22:33 UTC|newest]

Thread overview: 29+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-07-01 18:08 WARNING: at include/net/sock.h:417 udp_lib_unhash Tantilov, Emil S
2009-07-02  6:10 ` Eric Dumazet
2009-07-07  0:54   ` Emil S Tantilov
2009-07-07  7:21     ` Eric Dumazet
2009-07-07  7:40       ` Eric Dumazet
2009-07-07 16:14         ` [PATCH] net: sk_alloc() should not blindly overwrite memory Eric Dumazet
2009-07-07 18:33           ` Tantilov, Emil S
2009-07-07 22:33             ` Eric Dumazet [this message]
2009-07-08  2:14               ` [PATCH] net: sk_prot_alloc() " David Miller
2009-07-08  6:50                 ` Eric Dumazet
2009-07-09  5:36                   ` Eric Dumazet
2009-07-09 17:13                     ` Paul E. McKenney
2009-07-09 20:50                       ` Eric Dumazet
2009-07-12  3:27                     ` David Miller
2009-07-12  7:07                       ` Eric Dumazet
2009-07-15 12:28                         ` [PATCH] net: nf_conntrack_alloc() should not use kmem_cache_zalloc() Eric Dumazet
2009-07-15 15:28                           ` Patrick McHardy
2009-07-15 19:54                             ` [PATCH] net: nf_conntrack_alloc() fixes Eric Dumazet
2009-07-16  9:13                               ` [PATCH] net: sock_copy() fixes Eric Dumazet
2009-07-17  1:09                                 ` David Miller
2009-07-16 12:05                               ` [PATCH] net: nf_conntrack_alloc() fixes Patrick McHardy
2009-07-08 17:02                 ` [PATCH] net: sk_prot_alloc() should not blindly overwrite memory Tantilov, Emil S
2009-07-08 17:45                   ` David Miller
2009-07-08 23:21                     ` Eric Dumazet
2009-07-08 23:35                       ` Tantilov, Emil S
2009-07-09  0:20                         ` [PATCH] net: ip_push_pending_frames() fix Eric Dumazet
2009-07-09 14:32                           ` Tantilov, Emil S
2009-07-09 14:38                             ` Eric Dumazet
2009-07-12  3:27                           ` David Miller

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4A53CD39.7080407@gmail.com \
    --to=eric.dumazet@gmail.com \
    --cc=davem@davemloft.net \
    --cc=emil.s.tantilov@intel.com \
    --cc=emils.tantilov@gmail.com \
    --cc=jeffrey.t.kirsher@intel.com \
    --cc=jesse.brandeburg@intel.com \
    --cc=jolsa@redhat.com \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).