All of lore.kernel.org
 help / color / mirror / Atom feed
From: Eric Dumazet <eric.dumazet@gmail.com>
To: "Tantilov, Emil S" <emil.s.tantilov@intel.com>
Cc: "David S. Miller" <davem@davemloft.net>,
	Emil S Tantilov <emils.tantilov@gmail.com>,
	NetDev <netdev@vger.kernel.org>,
	"Brandeburg, Jesse" <jesse.brandeburg@intel.com>,
	"Kirsher, Jeffrey T" <jeffrey.t.kirsher@intel.com>,
	Jiri Olsa <jolsa@redhat.com>
Subject: [PATCH] net: sk_prot_alloc() should not blindly overwrite memory
Date: Wed, 08 Jul 2009 00:33:29 +0200	[thread overview]
Message-ID: <4A53CD39.7080407@gmail.com> (raw)
In-Reply-To: <EA929A9653AAE14F841771FB1DE5A1365F8ACEBEE9@rrsmsx501.amr.corp.intel.com>

Tantilov, Emil S a écrit :
> Eric Dumazet wrote:
>> Eric Dumazet a écrit :
>>>
>>> Hold on, I found the problem and will submit a patch (after testing
>>> it) in following hours 
>>>
>> David, while looking at Emil case, I found following problem.
>>
>> I am not sure this bug is responsible for the BUG hit by Emil,
>> but it seems a candidate for stable as well...
>>
>> Thanks
>>
>> [PATCH] net: sk_alloc() should not blindly overwrite memory
>>
>> Some sockets use SLAB_DESTROY_BY_RCU, and our RCU code rely that some
>> fields should not be blindly overwritten, even with null.
>>
>> These fields are sk->sk_refcnt and sk->sk_nulls_node.next
>>
>> Current sk_alloc() implementation doesnt respect this hypothesis,
>> calling kmem_alloc with __GFP_ZERO and setting sk_refcnt to 1 instead
>> of atomically increment it.
>>
>> Reported-by: Emil S Tantilov <emils.tantilov@gmail.com>
>> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> 
> Eric,
> 
> With this patch applied, I get panic on boot:

Oops, this stuff has to be done inside sk_prot_alloc() or breaks selinux

Thanks for testing Emil, here is a second try !


[PATCH] net: sk_prot_alloc() should not blindly overwrite memory

Some sockets use SLAB_DESTROY_BY_RCU, and our RCU code rely that some
fields should not be blindly overwritten, even with null.

These fields are sk->sk_refcnt and sk->sk_nulls_node.next

Current sk_prot_alloc() implementation doesnt respect this hypothesis,
calling kmem_cache_alloc() with __GFP_ZERO and setting sk_refcnt to 1
instead of atomically increment it.

Reported-by: Emil S Tantilov <emils.tantilov@gmail.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/net/sock.h            |   24 +++++++++------
 net/core/sock.c               |   49 ++++++++++++++++++++++++++++----
 net/ipv4/inet_timewait_sock.c |    2 -
 3 files changed, 59 insertions(+), 16 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 352f06b..3f6284e 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -103,15 +103,15 @@ struct net;
 
 /**
  *	struct sock_common - minimal network layer representation of sockets
+ *	@skc_node: main hash linkage for various protocol lookup tables
+ *	@skc_nulls_node: main hash linkage for UDP/UDP-Lite protocol
+ *	@skc_refcnt: reference count
+ *	@skc_hash: hash value used with various protocol lookup tables
  *	@skc_family: network address family
  *	@skc_state: Connection state
  *	@skc_reuse: %SO_REUSEADDR setting
  *	@skc_bound_dev_if: bound device index if != 0
- *	@skc_node: main hash linkage for various protocol lookup tables
- *	@skc_nulls_node: main hash linkage for UDP/UDP-Lite protocol
  *	@skc_bind_node: bind hash linkage for various protocol lookup tables
- *	@skc_refcnt: reference count
- *	@skc_hash: hash value used with various protocol lookup tables
  *	@skc_prot: protocol handlers inside a network family
  *	@skc_net: reference to the network namespace of this socket
  *
@@ -119,17 +119,23 @@ struct net;
  *	for struct sock and struct inet_timewait_sock.
  */
 struct sock_common {
-	unsigned short		skc_family;
-	volatile unsigned char	skc_state;
-	unsigned char		skc_reuse;
-	int			skc_bound_dev_if;
+	/*
+	 * WARNING : skc_node, skc_refcnt must be first elements of this struct
+	 * (sk_prot_alloc() should not clear skc_node.next & skc_refcnt because
+	 *  of RCU lookups)
+	 */
 	union {
 		struct hlist_node	skc_node;
 		struct hlist_nulls_node skc_nulls_node;
 	};
-	struct hlist_node	skc_bind_node;
 	atomic_t		skc_refcnt;
+
 	unsigned int		skc_hash;
+	unsigned short		skc_family;
+	volatile unsigned char	skc_state;
+	unsigned char		skc_reuse;
+	int			skc_bound_dev_if;
+	struct hlist_node	skc_bind_node;
 	struct proto		*skc_prot;
 #ifdef CONFIG_NET_NS
 	struct net	 	*skc_net;
diff --git a/net/core/sock.c b/net/core/sock.c
index b0ba569..cf43ff1 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -939,8 +939,31 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
 	struct kmem_cache *slab;
 
 	slab = prot->slab;
-	if (slab != NULL)
-		sk = kmem_cache_alloc(slab, priority);
+	if (slab != NULL) {
+		/*
+		 * caches using SLAB_DESTROY_BY_RCU should let sk_node
+		 * and sk_refcnt untouched.
+		 *
+		 * Following makes sense only if sk first fields are
+		 * in following order : sk_node, refcnt, sk_hash
+		 */
+		BUILD_BUG_ON(offsetof(struct sock, sk_node) != 0);
+		BUILD_BUG_ON(offsetof(struct sock, sk_refcnt) !=
+			     sizeof(sk->sk_node));
+		BUILD_BUG_ON(offsetof(struct sock, sk_hash) !=
+			     sizeof(sk->sk_node) + sizeof(sk->sk_refcnt));
+		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
+		if (!sk)
+			return sk;
+		if (priority & __GFP_ZERO) {
+			if (prot->slab_flags & SLAB_DESTROY_BY_RCU)
+				memset(&sk->sk_hash, 0,
+				       prot->obj_size -
+				       offsetof(struct sock, sk_hash));
+			else
+				memset(sk, 0, prot->obj_size);
+		}
+	}
 	else
 		sk = kmalloc(prot->obj_size, priority);
 
@@ -1125,7 +1148,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 
 		newsk->sk_err	   = 0;
 		newsk->sk_priority = 0;
-		atomic_set(&newsk->sk_refcnt, 2);
+		atomic_add(2, &newsk->sk_refcnt);
 
 		/*
 		 * Increment the counter in the same struct proto as the master
@@ -1840,7 +1863,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 
 	sk->sk_stamp = ktime_set(-1L, 0);
 
-	atomic_set(&sk->sk_refcnt, 1);
+	atomic_inc(&sk->sk_refcnt);
 	atomic_set(&sk->sk_wmem_alloc, 1);
 	atomic_set(&sk->sk_drops, 0);
 }
@@ -2140,12 +2163,25 @@ static inline void release_proto_idx(struct proto *prot)
 }
 #endif
 
+/*
+ * We need to initialize skc->skc_refcnt to 0, and skc->skc_node.pprev to NULL
+ * but only on newly allocated objects (check sk_prot_alloc())
+ */
+static void sk_init_once(void *arg)
+{
+	struct sock_common *skc = arg;
+
+	atomic_set(&skc->skc_refcnt, 0);
+	skc->skc_node.pprev = NULL;
+}
+
 int proto_register(struct proto *prot, int alloc_slab)
 {
 	if (alloc_slab) {
 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
-					NULL);
+					prot->slab_flags & SLAB_DESTROY_BY_RCU ?
+					    sk_init_once : NULL);
 
 		if (prot->slab == NULL) {
 			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
@@ -2187,7 +2223,8 @@ int proto_register(struct proto *prot, int alloc_slab)
 						  0,
 						  SLAB_HWCACHE_ALIGN |
 							prot->slab_flags,
-						  NULL);
+						  prot->slab_flags & SLAB_DESTROY_BY_RCU ?
+							sk_init_once : NULL);
 			if (prot->twsk_prot->twsk_slab == NULL)
 				goto out_free_timewait_sock_slab_name;
 		}
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 61283f9..a2997df 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -139,7 +139,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
 		tw->tw_transparent  = inet->transparent;
 		tw->tw_prot	    = sk->sk_prot_creator;
 		twsk_net_set(tw, hold_net(sock_net(sk)));
-		atomic_set(&tw->tw_refcnt, 1);
+		atomic_inc(&tw->tw_refcnt);
 		inet_twsk_dead_node_init(tw);
 		__module_get(tw->tw_prot->owner);
 	}

  reply	other threads:[~2009-07-07 22:33 UTC|newest]

Thread overview: 29+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-07-01 18:08 WARNING: at include/net/sock.h:417 udp_lib_unhash Tantilov, Emil S
2009-07-02  6:10 ` Eric Dumazet
2009-07-07  0:54   ` Emil S Tantilov
2009-07-07  7:21     ` Eric Dumazet
2009-07-07  7:40       ` Eric Dumazet
2009-07-07 16:14         ` [PATCH] net: sk_alloc() should not blindly overwrite memory Eric Dumazet
2009-07-07 18:33           ` Tantilov, Emil S
2009-07-07 22:33             ` Eric Dumazet [this message]
2009-07-08  2:14               ` [PATCH] net: sk_prot_alloc() " David Miller
2009-07-08  6:50                 ` Eric Dumazet
2009-07-09  5:36                   ` Eric Dumazet
2009-07-09 17:13                     ` Paul E. McKenney
2009-07-09 20:50                       ` Eric Dumazet
2009-07-12  3:27                     ` David Miller
2009-07-12  7:07                       ` Eric Dumazet
2009-07-15 12:28                         ` [PATCH] net: nf_conntrack_alloc() should not use kmem_cache_zalloc() Eric Dumazet
2009-07-15 15:28                           ` Patrick McHardy
2009-07-15 19:54                             ` [PATCH] net: nf_conntrack_alloc() fixes Eric Dumazet
2009-07-16  9:13                               ` [PATCH] net: sock_copy() fixes Eric Dumazet
2009-07-17  1:09                                 ` David Miller
2009-07-16 12:05                               ` [PATCH] net: nf_conntrack_alloc() fixes Patrick McHardy
2009-07-08 17:02                 ` [PATCH] net: sk_prot_alloc() should not blindly overwrite memory Tantilov, Emil S
2009-07-08 17:45                   ` David Miller
2009-07-08 23:21                     ` Eric Dumazet
2009-07-08 23:35                       ` Tantilov, Emil S
2009-07-09  0:20                         ` [PATCH] net: ip_push_pending_frames() fix Eric Dumazet
2009-07-09 14:32                           ` Tantilov, Emil S
2009-07-09 14:38                             ` Eric Dumazet
2009-07-12  3:27                           ` David Miller

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4A53CD39.7080407@gmail.com \
    --to=eric.dumazet@gmail.com \
    --cc=davem@davemloft.net \
    --cc=emil.s.tantilov@intel.com \
    --cc=emils.tantilov@gmail.com \
    --cc=jeffrey.t.kirsher@intel.com \
    --cc=jesse.brandeburg@intel.com \
    --cc=jolsa@redhat.com \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.