From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
stable@vger.kernel.org, Jonathan Perry <jonperry@fb.com>,
Alexei Starovoitov <ast@kernel.org>,
Daniel Borkmann <daniel@iogearbox.net>,
"David S. Miller" <davem@davemloft.net>,
Chenbo Feng <fengc@google.com>, Sasha Levin <sashal@kernel.org>
Subject: [PATCH 4.9 03/51] bpf: convert htab map to hlist_nulls
Date: Wed, 15 May 2019 12:55:38 +0200 [thread overview]
Message-ID: <20190515090618.570752336@linuxfoundation.org> (raw)
In-Reply-To: <20190515090616.669619870@linuxfoundation.org>
commit 4fe8435909fddc97b81472026aa954e06dd192a5 upstream.
when all map elements are pre-allocated one cpu can delete and reuse htab_elem
while another cpu is still walking the hlist. In such case the lookup may
miss the element. Convert hlist to hlist_nulls to avoid such scenario.
When bucket lock is taken there is no need to take such precautions,
so only convert map_lookup and map_get_next to nulls.
The race window is extremely small and only reproducible with explicit
udelay() inside lookup_nulls_elem_raw()
Similar to hlist add hlist_nulls_for_each_entry_safe() and
hlist_nulls_entry_safe() helpers.
Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements")
Reported-by: Jonathan Perry <jonperry@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Chenbo Feng <fengc@google.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
include/linux/list_nulls.h | 5 +++
include/linux/rculist_nulls.h | 14 +++++++
kernel/bpf/hashtab.c | 71 +++++++++++++++++++++++------------
3 files changed, 67 insertions(+), 23 deletions(-)
diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
index b01fe10090843..87ff4f58a2f01 100644
--- a/include/linux/list_nulls.h
+++ b/include/linux/list_nulls.h
@@ -29,6 +29,11 @@ struct hlist_nulls_node {
((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls))
#define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member)
+
+#define hlist_nulls_entry_safe(ptr, type, member) \
+ ({ typeof(ptr) ____ptr = (ptr); \
+ !is_a_nulls(____ptr) ? hlist_nulls_entry(____ptr, type, member) : NULL; \
+ })
/**
* ptr_is_a_nulls - Test if a ptr is a nulls
* @ptr: ptr to be tested
diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index 6224a0ab0b1e8..2720b2fbfb86d 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -118,5 +118,19 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \
pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)))
+/**
+ * hlist_nulls_for_each_entry_safe -
+ * iterate over list of given type safe against removal of list entry
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct hlist_nulls_node to use as a loop cursor.
+ * @head: the head for your list.
+ * @member: the name of the hlist_nulls_node within the struct.
+ */
+#define hlist_nulls_for_each_entry_safe(tpos, pos, head, member) \
+ for (({barrier();}), \
+ pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \
+ (!is_a_nulls(pos)) && \
+ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); \
+ pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)); 1; });)
#endif
#endif
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index f9d53ac57f640..8648d7d297081 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -13,10 +13,11 @@
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
+#include <linux/rculist_nulls.h>
#include "percpu_freelist.h"
struct bucket {
- struct hlist_head head;
+ struct hlist_nulls_head head;
raw_spinlock_t lock;
};
@@ -40,7 +41,7 @@ enum extra_elem_state {
/* each htab element is struct htab_elem + key + value */
struct htab_elem {
union {
- struct hlist_node hash_node;
+ struct hlist_nulls_node hash_node;
struct {
void *padding;
union {
@@ -245,7 +246,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
goto free_htab;
for (i = 0; i < htab->n_buckets; i++) {
- INIT_HLIST_HEAD(&htab->buckets[i].head);
+ INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
raw_spin_lock_init(&htab->buckets[i].lock);
}
@@ -282,28 +283,52 @@ static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
return &htab->buckets[hash & (htab->n_buckets - 1)];
}
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
return &__select_bucket(htab, hash)->head;
}
-static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
+/* this lookup function can only be called with bucket lock taken */
+static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
void *key, u32 key_size)
{
+ struct hlist_nulls_node *n;
struct htab_elem *l;
- hlist_for_each_entry_rcu(l, head, hash_node)
+ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
if (l->hash == hash && !memcmp(&l->key, key, key_size))
return l;
return NULL;
}
+/* can be called without bucket lock. it will repeat the loop in
+ * the unlikely event when elements moved from one bucket into another
+ * while link list is being walked
+ */
+static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
+ u32 hash, void *key,
+ u32 key_size, u32 n_buckets)
+{
+ struct hlist_nulls_node *n;
+ struct htab_elem *l;
+
+again:
+ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
+ if (l->hash == hash && !memcmp(&l->key, key, key_size))
+ return l;
+
+ if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
+ goto again;
+
+ return NULL;
+}
+
/* Called from syscall or from eBPF program */
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
struct htab_elem *l;
u32 hash, key_size;
@@ -316,7 +341,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
head = select_bucket(htab, hash);
- l = lookup_elem_raw(head, hash, key, key_size);
+ l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
return l;
}
@@ -335,7 +360,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
struct htab_elem *l, *next_l;
u32 hash, key_size;
int i = 0;
@@ -352,13 +377,13 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
head = select_bucket(htab, hash);
/* lookup the key */
- l = lookup_elem_raw(head, hash, key, key_size);
+ l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
if (!l)
goto find_first_elem;
/* key was found, get next key in the same bucket */
- next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+ next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
struct htab_elem, hash_node);
if (next_l) {
@@ -377,7 +402,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
head = select_bucket(htab, i);
/* pick first element in the bucket */
- next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+ next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
struct htab_elem, hash_node);
if (next_l) {
/* if it's not empty, just return it */
@@ -534,7 +559,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
unsigned long flags;
struct bucket *b;
u32 key_size, hash;
@@ -573,9 +598,9 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
/* add new element to the head of the list, so that
* concurrent search will find it before old elem
*/
- hlist_add_head_rcu(&l_new->hash_node, head);
+ hlist_nulls_add_head_rcu(&l_new->hash_node, head);
if (l_old) {
- hlist_del_rcu(&l_old->hash_node);
+ hlist_nulls_del_rcu(&l_old->hash_node);
free_htab_elem(htab, l_old);
}
ret = 0;
@@ -590,7 +615,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
unsigned long flags;
struct bucket *b;
u32 key_size, hash;
@@ -642,7 +667,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
ret = PTR_ERR(l_new);
goto err;
}
- hlist_add_head_rcu(&l_new->hash_node, head);
+ hlist_nulls_add_head_rcu(&l_new->hash_node, head);
}
ret = 0;
err:
@@ -660,7 +685,7 @@ static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
struct bucket *b;
struct htab_elem *l;
unsigned long flags;
@@ -680,7 +705,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
l = lookup_elem_raw(head, hash, key, key_size);
if (l) {
- hlist_del_rcu(&l->hash_node);
+ hlist_nulls_del_rcu(&l->hash_node);
free_htab_elem(htab, l);
ret = 0;
}
@@ -694,12 +719,12 @@ static void delete_all_elements(struct bpf_htab *htab)
int i;
for (i = 0; i < htab->n_buckets; i++) {
- struct hlist_head *head = select_bucket(htab, i);
- struct hlist_node *n;
+ struct hlist_nulls_head *head = select_bucket(htab, i);
+ struct hlist_nulls_node *n;
struct htab_elem *l;
- hlist_for_each_entry_safe(l, n, head, hash_node) {
- hlist_del_rcu(&l->hash_node);
+ hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
+ hlist_nulls_del_rcu(&l->hash_node);
if (l->state != HTAB_EXTRA_ELEM_USED)
htab_elem_free(htab, l);
}
--
2.20.1
next prev parent reply other threads:[~2019-05-15 11:14 UTC|newest]
Thread overview: 66+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-05-15 10:55 [PATCH 4.9 00/51] 4.9.177-stable review Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 01/51] netfilter: compat: initialize all fields in xt_init Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 02/51] bpf: fix struct htab_elem layout Greg Kroah-Hartman
2019-05-15 10:55 ` Greg Kroah-Hartman [this message]
2019-05-15 10:55 ` [PATCH 4.9 04/51] platform/x86: sony-laptop: Fix unintentional fall-through Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 05/51] USB: serial: fix unthrottle races Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 06/51] iio: adc: xilinx: fix potential use-after-free on remove Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 07/51] libnvdimm/namespace: Fix a potential NULL pointer dereference Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 08/51] HID: input: add mapping for Expose/Overview key Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 09/51] HID: input: add mapping for keyboard Brightness Up/Down/Toggle keys Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 10/51] HID: input: add mapping for "Toggle Display" key Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 11/51] libnvdimm/btt: Fix a kmemdup failure check Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 12/51] s390/dasd: Fix capacity calculation for large volumes Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 13/51] mac80211: fix unaligned access in mesh table hash function Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 14/51] s390/3270: fix lockdep false positive on view->lock Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 15/51] mISDN: Check address length before reading address family Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 16/51] x86/reboot, efi: Use EFI reboot for Acer TravelMate X514-51T Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 17/51] KVM: x86: avoid misreporting level-triggered irqs as edge-triggered in tracing Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 18/51] tools lib traceevent: Fix missing equality check for strcmp Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 19/51] init: initialize jump labels before command line option parsing Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 20/51] selftests: netfilter: check icmp pkttoobig errors are set as related Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 21/51] ipvs: do not schedule icmp errors from tunnels Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 22/51] MIPS: perf: ath79: Fix perfcount IRQ assignment Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 23/51] s390: ctcm: fix ctcm_new_device error return code Greg Kroah-Hartman
2019-05-15 10:55 ` [PATCH 4.9 24/51] drm/sun4i: Set device driver data at bind time for use in unbind Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 25/51] selftests/net: correct the return value for run_netsocktests Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 26/51] gpu: ipu-v3: dp: fix CSC handling Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 27/51] spi: Micrel eth switch: declare missing of table Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 28/51] spi: ST ST95HF NFC: " Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 29/51] Input: synaptics-rmi4 - fix possible double free Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 30/51] cw1200: fix missing unlock on error in cw1200_hw_scan() Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 31/51] ALSA: pcm: remove SNDRV_PCM_IOCTL1_INFO internal command Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 32/51] rtlwifi: rtl8723ae: Fix missing break in switch statement Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 33/51] Dont jump to compute_result state from check_result state Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 34/51] Revert "x86/vdso: Drop implicit common-page-size linker flag" Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 35/51] Revert "x86: vdso: Use $LD instead of $CC to link" Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 36/51] x86: vdso: Use $LD instead of $CC to link Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 37/51] x86/vdso: Drop implicit common-page-size linker flag Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 38/51] x86/vdso: Pass --eh-frame-hdr to the linker Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 39/51] powerpc/64s: Include cpu header Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 40/51] bridge: Fix error path for kobject_init_and_add() Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 41/51] fib_rules: return 0 directly if an exactly same rule exists when NLM_F_EXCL not supplied Greg Kroah-Hartman
2019-05-19 15:43 ` Nathan Chancellor
2019-05-19 20:27 ` Florian Westphal
2019-05-20 2:00 ` Hangbin Liu
2019-05-20 0:11 ` Sasha Levin
2019-05-20 0:29 ` David Ahern
2019-05-20 9:04 ` Greg Kroah-Hartman
2019-05-20 9:11 ` Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 42/51] net: ucc_geth - fix Oops when changing number of buffers in the ring Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 43/51] packet: Fix error path in packet_init Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 44/51] vlan: disable SIOCSHWTSTAMP in container Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 45/51] vrf: sit mtu should not be updated when vrf netdev is the link Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 46/51] ipv4: Fix raw socket lookup for local traffic Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 47/51] bonding: fix arp_validate toggling in active-backup mode Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 48/51] drivers/virt/fsl_hypervisor.c: dereferencing error pointers in ioctl Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 49/51] drivers/virt/fsl_hypervisor.c: prevent integer overflow " Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 50/51] powerpc/lib: fix book3s/32 boot failure due to code patching Greg Kroah-Hartman
2019-05-15 10:56 ` [PATCH 4.9 51/51] powerpc/booke64: set RI in default MSR Greg Kroah-Hartman
2019-05-15 18:27 ` [PATCH 4.9 00/51] 4.9.177-stable review kernelci.org bot
2019-05-15 19:54 ` Naresh Kamboju
2019-05-16 3:34 ` Guenter Roeck
2019-05-16 11:00 ` Jon Hunter
2019-05-16 11:00 ` Jon Hunter
2019-05-16 14:09 ` shuah
2019-05-17 6:33 ` Kelsey Skunberg
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190515090618.570752336@linuxfoundation.org \
--to=gregkh@linuxfoundation.org \
--cc=ast@kernel.org \
--cc=daniel@iogearbox.net \
--cc=davem@davemloft.net \
--cc=fengc@google.com \
--cc=jonperry@fb.com \
--cc=linux-kernel@vger.kernel.org \
--cc=sashal@kernel.org \
--cc=stable@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.