stable.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	stable@vger.kernel.org, Jonathan Perry <jonperry@fb.com>,
	Alexei Starovoitov <ast@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	"David S. Miller" <davem@davemloft.net>,
	Chenbo Feng <fengc@google.com>, Sasha Levin <sashal@kernel.org>
Subject: [PATCH 4.9 03/51] bpf: convert htab map to hlist_nulls
Date: Wed, 15 May 2019 12:55:38 +0200	[thread overview]
Message-ID: <20190515090618.570752336@linuxfoundation.org> (raw)
In-Reply-To: <20190515090616.669619870@linuxfoundation.org>

commit 4fe8435909fddc97b81472026aa954e06dd192a5 upstream.

when all map elements are pre-allocated one cpu can delete and reuse htab_elem
while another cpu is still walking the hlist. In such case the lookup may
miss the element. Convert hlist to hlist_nulls to avoid such scenario.
When bucket lock is taken there is no need to take such precautions,
so only convert map_lookup and map_get_next to nulls.
The race window is extremely small and only reproducible with explicit
udelay() inside lookup_nulls_elem_raw()

Similar to hlist add hlist_nulls_for_each_entry_safe() and
hlist_nulls_entry_safe() helpers.

Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements")
Reported-by: Jonathan Perry <jonperry@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Chenbo Feng <fengc@google.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 include/linux/list_nulls.h    |  5 +++
 include/linux/rculist_nulls.h | 14 +++++++
 kernel/bpf/hashtab.c          | 71 +++++++++++++++++++++++------------
 3 files changed, 67 insertions(+), 23 deletions(-)

diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
index b01fe10090843..87ff4f58a2f01 100644
--- a/include/linux/list_nulls.h
+++ b/include/linux/list_nulls.h
@@ -29,6 +29,11 @@ struct hlist_nulls_node {
 	((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls))
 
 #define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member)
+
+#define hlist_nulls_entry_safe(ptr, type, member) \
+	({ typeof(ptr) ____ptr = (ptr); \
+	   !is_a_nulls(____ptr) ? hlist_nulls_entry(____ptr, type, member) : NULL; \
+	})
 /**
  * ptr_is_a_nulls - Test if a ptr is a nulls
  * @ptr: ptr to be tested
diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index 6224a0ab0b1e8..2720b2fbfb86d 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -118,5 +118,19 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
 		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \
 		pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)))
 
+/**
+ * hlist_nulls_for_each_entry_safe -
+ *   iterate over list of given type safe against removal of list entry
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_nulls_node to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_nulls_node within the struct.
+ */
+#define hlist_nulls_for_each_entry_safe(tpos, pos, head, member)		\
+	for (({barrier();}),							\
+	     pos = rcu_dereference_raw(hlist_nulls_first_rcu(head));		\
+		(!is_a_nulls(pos)) &&						\
+		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member);	\
+		   pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)); 1; });)
 #endif
 #endif
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index f9d53ac57f640..8648d7d297081 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -13,10 +13,11 @@
 #include <linux/bpf.h>
 #include <linux/jhash.h>
 #include <linux/filter.h>
+#include <linux/rculist_nulls.h>
 #include "percpu_freelist.h"
 
 struct bucket {
-	struct hlist_head head;
+	struct hlist_nulls_head head;
 	raw_spinlock_t lock;
 };
 
@@ -40,7 +41,7 @@ enum extra_elem_state {
 /* each htab element is struct htab_elem + key + value */
 struct htab_elem {
 	union {
-		struct hlist_node hash_node;
+		struct hlist_nulls_node hash_node;
 		struct {
 			void *padding;
 			union {
@@ -245,7 +246,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		goto free_htab;
 
 	for (i = 0; i < htab->n_buckets; i++) {
-		INIT_HLIST_HEAD(&htab->buckets[i].head);
+		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
 		raw_spin_lock_init(&htab->buckets[i].lock);
 	}
 
@@ -282,28 +283,52 @@ static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
 	return &htab->buckets[hash & (htab->n_buckets - 1)];
 }
 
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
 {
 	return &__select_bucket(htab, hash)->head;
 }
 
-static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
+/* this lookup function can only be called with bucket lock taken */
+static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
 					 void *key, u32 key_size)
 {
+	struct hlist_nulls_node *n;
 	struct htab_elem *l;
 
-	hlist_for_each_entry_rcu(l, head, hash_node)
+	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 		if (l->hash == hash && !memcmp(&l->key, key, key_size))
 			return l;
 
 	return NULL;
 }
 
+/* can be called without bucket lock. it will repeat the loop in
+ * the unlikely event when elements moved from one bucket into another
+ * while link list is being walked
+ */
+static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
+					       u32 hash, void *key,
+					       u32 key_size, u32 n_buckets)
+{
+	struct hlist_nulls_node *n;
+	struct htab_elem *l;
+
+again:
+	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
+		if (l->hash == hash && !memcmp(&l->key, key, key_size))
+			return l;
+
+	if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
+		goto again;
+
+	return NULL;
+}
+
 /* Called from syscall or from eBPF program */
 static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	struct htab_elem *l;
 	u32 hash, key_size;
 
@@ -316,7 +341,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
 
 	head = select_bucket(htab, hash);
 
-	l = lookup_elem_raw(head, hash, key, key_size);
+	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
 
 	return l;
 }
@@ -335,7 +360,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
 static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	struct htab_elem *l, *next_l;
 	u32 hash, key_size;
 	int i = 0;
@@ -352,13 +377,13 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	head = select_bucket(htab, hash);
 
 	/* lookup the key */
-	l = lookup_elem_raw(head, hash, key, key_size);
+	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
 
 	if (!l)
 		goto find_first_elem;
 
 	/* key was found, get next key in the same bucket */
-	next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+	next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
 				  struct htab_elem, hash_node);
 
 	if (next_l) {
@@ -377,7 +402,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 		head = select_bucket(htab, i);
 
 		/* pick first element in the bucket */
-		next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+		next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
 					  struct htab_elem, hash_node);
 		if (next_l) {
 			/* if it's not empty, just return it */
@@ -534,7 +559,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new = NULL, *l_old;
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	unsigned long flags;
 	struct bucket *b;
 	u32 key_size, hash;
@@ -573,9 +598,9 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	/* add new element to the head of the list, so that
 	 * concurrent search will find it before old elem
 	 */
-	hlist_add_head_rcu(&l_new->hash_node, head);
+	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
 	if (l_old) {
-		hlist_del_rcu(&l_old->hash_node);
+		hlist_nulls_del_rcu(&l_old->hash_node);
 		free_htab_elem(htab, l_old);
 	}
 	ret = 0;
@@ -590,7 +615,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new = NULL, *l_old;
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	unsigned long flags;
 	struct bucket *b;
 	u32 key_size, hash;
@@ -642,7 +667,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 			ret = PTR_ERR(l_new);
 			goto err;
 		}
-		hlist_add_head_rcu(&l_new->hash_node, head);
+		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
 	}
 	ret = 0;
 err:
@@ -660,7 +685,7 @@ static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 static int htab_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	struct bucket *b;
 	struct htab_elem *l;
 	unsigned long flags;
@@ -680,7 +705,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 	l = lookup_elem_raw(head, hash, key, key_size);
 
 	if (l) {
-		hlist_del_rcu(&l->hash_node);
+		hlist_nulls_del_rcu(&l->hash_node);
 		free_htab_elem(htab, l);
 		ret = 0;
 	}
@@ -694,12 +719,12 @@ static void delete_all_elements(struct bpf_htab *htab)
 	int i;
 
 	for (i = 0; i < htab->n_buckets; i++) {
-		struct hlist_head *head = select_bucket(htab, i);
-		struct hlist_node *n;
+		struct hlist_nulls_head *head = select_bucket(htab, i);
+		struct hlist_nulls_node *n;
 		struct htab_elem *l;
 
-		hlist_for_each_entry_safe(l, n, head, hash_node) {
-			hlist_del_rcu(&l->hash_node);
+		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
+			hlist_nulls_del_rcu(&l->hash_node);
 			if (l->state != HTAB_EXTRA_ELEM_USED)
 				htab_elem_free(htab, l);
 		}
-- 
2.20.1




  parent reply	other threads:[~2019-05-15 12:01 UTC|newest]

Thread overview: 65+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-05-15 10:55 [PATCH 4.9 00/51] 4.9.177-stable review Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 01/51] netfilter: compat: initialize all fields in xt_init Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 02/51] bpf: fix struct htab_elem layout Greg Kroah-Hartman
2019-05-15 10:55 ` Greg Kroah-Hartman [this message]
2019-05-15 10:55 ` [PATCH 4.9 04/51] platform/x86: sony-laptop: Fix unintentional fall-through Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 05/51] USB: serial: fix unthrottle races Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 06/51] iio: adc: xilinx: fix potential use-after-free on remove Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 07/51] libnvdimm/namespace: Fix a potential NULL pointer dereference Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 08/51] HID: input: add mapping for Expose/Overview key Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 09/51] HID: input: add mapping for keyboard Brightness Up/Down/Toggle keys Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 10/51] HID: input: add mapping for "Toggle Display" key Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 11/51] libnvdimm/btt: Fix a kmemdup failure check Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 12/51] s390/dasd: Fix capacity calculation for large volumes Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 13/51] mac80211: fix unaligned access in mesh table hash function Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 14/51] s390/3270: fix lockdep false positive on view->lock Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 15/51] mISDN: Check address length before reading address family Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 16/51] x86/reboot, efi: Use EFI reboot for Acer TravelMate X514-51T Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 17/51] KVM: x86: avoid misreporting level-triggered irqs as edge-triggered in tracing Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 18/51] tools lib traceevent: Fix missing equality check for strcmp Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 19/51] init: initialize jump labels before command line option parsing Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 20/51] selftests: netfilter: check icmp pkttoobig errors are set as related Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 21/51] ipvs: do not schedule icmp errors from tunnels Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 22/51] MIPS: perf: ath79: Fix perfcount IRQ assignment Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 23/51] s390: ctcm: fix ctcm_new_device error return code Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 24/51] drm/sun4i: Set device driver data at bind time for use in unbind Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 25/51] selftests/net: correct the return value for run_netsocktests Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 26/51] gpu: ipu-v3: dp: fix CSC handling Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 27/51] spi: Micrel eth switch: declare missing of table Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 28/51] spi: ST ST95HF NFC: " Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 29/51] Input: synaptics-rmi4 - fix possible double free Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 30/51] cw1200: fix missing unlock on error in cw1200_hw_scan() Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 31/51] ALSA: pcm: remove SNDRV_PCM_IOCTL1_INFO internal command Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 32/51] rtlwifi: rtl8723ae: Fix missing break in switch statement Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 33/51] Dont jump to compute_result state from check_result state Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 34/51] Revert "x86/vdso: Drop implicit common-page-size linker flag" Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 35/51] Revert "x86: vdso: Use $LD instead of $CC to link" Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 36/51] x86: vdso: Use $LD instead of $CC to link Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 37/51] x86/vdso: Drop implicit common-page-size linker flag Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 38/51] x86/vdso: Pass --eh-frame-hdr to the linker Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 39/51] powerpc/64s: Include cpu header Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 40/51] bridge: Fix error path for kobject_init_and_add() Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 41/51] fib_rules: return 0 directly if an exactly same rule exists when NLM_F_EXCL not supplied Greg Kroah-Hartman
2019-05-19 15:43   ` Nathan Chancellor
2019-05-19 20:27     ` Florian Westphal
2019-05-20  2:00       ` Hangbin Liu
2019-05-20  0:11     ` Sasha Levin
2019-05-20  0:29     ` David Ahern
2019-05-20  9:04       ` Greg Kroah-Hartman
2019-05-20  9:11         ` Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 42/51] net: ucc_geth - fix Oops when changing number of buffers in the ring Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 43/51] packet: Fix error path in packet_init Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 44/51] vlan: disable SIOCSHWTSTAMP in container Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 45/51] vrf: sit mtu should not be updated when vrf netdev is the link Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 46/51] ipv4: Fix raw socket lookup for local traffic Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 47/51] bonding: fix arp_validate toggling in active-backup mode Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 48/51] drivers/virt/fsl_hypervisor.c: dereferencing error pointers in ioctl Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 49/51] drivers/virt/fsl_hypervisor.c: prevent integer overflow " Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 50/51] powerpc/lib: fix book3s/32 boot failure due to code patching Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 51/51] powerpc/booke64: set RI in default MSR Greg Kroah-Hartman
2019-05-15 18:27 ` [PATCH 4.9 00/51] 4.9.177-stable review kernelci.org bot
2019-05-15 19:54 ` Naresh Kamboju
2019-05-16  3:34 ` Guenter Roeck
2019-05-16 11:00 ` Jon Hunter
2019-05-16 14:09 ` shuah
2019-05-17  6:33 ` Kelsey Skunberg

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190515090618.570752336@linuxfoundation.org \
    --to=gregkh@linuxfoundation.org \
    --cc=ast@kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=davem@davemloft.net \
    --cc=fengc@google.com \
    --cc=jonperry@fb.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=sashal@kernel.org \
    --cc=stable@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).