public inbox for bpf@vger.kernel.org
 help / color / mirror / Atom feed
* [BUG] bpf: use-after-free in hashtab BPF_F_LOCK in-place update path
@ 2026-03-26  8:49 Aaron Esau
  2026-03-26 13:39 ` Puranjay Mohan
  0 siblings, 1 reply; 12+ messages in thread
From: Aaron Esau @ 2026-03-26  8:49 UTC (permalink / raw)
  To: bpf; +Cc: ast, daniel, andrii

[-- Attachment #1: Type: text/plain, Size: 1885 bytes --]

Reported-by: Aaron Esau <aaron1esau@gmail.com>

htab_map_update_elem() has a use-after-free when BPF_F_LOCK is used
for in-place updates.

The BPF_F_LOCK path calls lookup_nulls_elem_raw() without holding the
bucket lock, then dereferences the element via copy_map_value_locked().
A concurrent htab_map_delete_elem() can delete and free the element
between these steps.

free_htab_elem() uses bpf_mem_cache_free(), which immediately returns
the object to the per-CPU free list (not RCU-deferred). The memory may
be reallocated before copy_map_value_locked() executes, leading to
writes into a different element.

When lookup succeeds (l_old != NULL), the in-place update path returns
early, so the “full lookup under lock” path is not taken.

Race:

  CPU 0: htab_map_update_elem (BPF_F_LOCK)
         lookup_nulls_elem_raw() → E (no bucket lock)
         ...
  CPU 1: htab_map_delete_elem()
         htab_lock_bucket → hlist_nulls_del_rcu → htab_unlock_bucket
         free_htab_elem → bpf_mem_cache_free (immediate free)
  CPU 1: htab_map_update_elem (new key)
         alloc_htab_elem → reuses E
  CPU 0: copy_map_value_locked(E, ...) → writes into reused object

Reproduction:

  1. Create BPF_MAP_TYPE_HASH with a value containing bpf_spin_lock
     (max_entries=64, 7 u64 fields + lock).
  2. Threads A: BPF_MAP_UPDATE_ELEM with BPF_F_LOCK (pattern 0xAAAA...)
  3. Threads B: DELETE + UPDATE (pattern 0xBBBB...) on same keys
  4. Threads C: same as A (pattern 0xCCCC...)
  5. Verifier threads: LOOKUP loop, detect mixed-pattern values
  6. Run 60s on >=4 CPUs

Attached a POC. On 6.19.9 (4 vCPU QEMU, CONFIG_PREEMPT=y),
I observed ~645 torn values in 2.5M checks (~0.026%).

Fixes: 96049f3afd50 ("bpf: introduce BPF_F_LOCK flag")

System:
  Linux 6.19.9, x86_64, GCC 15.2.1
  CONFIG_BPF_SYSCALL=y, CONFIG_PREEMPT=y

[-- Attachment #2: poc-bpf-hashtab-uaf.c --]
[-- Type: text/x-csrc, Size: 13434 bytes --]

/*
 * BPF hashtab UAF — torn write reproducer
 *
 * Races BPF_F_LOCK in-place updates against concurrent delete+re-add
 * on the same key. Detects torn writes (mixed patterns in a single
 * value) which prove the unlocked lookup returned a stale element.
 *
 * Requires: CONFIG_BPF_SYSCALL=y, root/CAP_BPF, SMP (>= 4 CPUs)
 */

#define _GNU_SOURCE
#include <errno.h>
#include <inttypes.h>
#include <sched.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/wait.h>

#ifndef __NR_bpf
#define __NR_bpf 321
#endif

/* bpf(2) command numbers used by this PoC; values mirror the UAPI
 * enum bpf_cmd (renamed to avoid clashing with installed headers). */
enum bpf_cmd_num {
	BPF_MAP_CREATE_CMD       = 0,
	BPF_MAP_LOOKUP_ELEM_CMD  = 1,
	BPF_MAP_UPDATE_ELEM_CMD  = 2,
	BPF_MAP_DELETE_ELEM_CMD  = 3,
	BPF_BTF_LOAD_CMD         = 18,
};

/* Map type and element flags from the BPF UAPI. */
#define BPF_MAP_TYPE_HASH    1
#define BPF_ANY     0		/* create-or-update */
#define BPF_EXIST   2		/* update only if the key already exists */
#define BPF_F_LOCK  4		/* take the value's bpf_spin_lock for the op */
#define BPF_F_NO_PREALLOC (1U << 0)

/* Minimal BTF encoding constants (see kernel BTF documentation). */
#define BTF_MAGIC   0xeB9F
#define BTF_VERSION 1
#define BTF_KIND_INT    1
#define BTF_KIND_STRUCT 4
#define BTF_INT_SIGNED (1 << 0)

/* Header of a raw BTF blob; type_off/str_off are relative to the end
 * of this header.  Mirrors struct btf_header in the kernel UAPI. */
struct btf_header {
	uint16_t magic;
	uint8_t  version;
	uint8_t  flags;
	uint32_t hdr_len;
	uint32_t type_off;	/* start of the type section */
	uint32_t type_len;
	uint32_t str_off;	/* start of the string section */
	uint32_t str_len;
};

/* One BTF type record.  'info' packs kind/vlen (see btf_info_enc());
 * the union holds a byte size for INT/STRUCT kinds, a type id otherwise. */
struct btf_type {
	uint32_t name_off;	/* offset into the string section */
	uint32_t info;
	union { uint32_t size; uint32_t type; };
};

/* Struct member record following a BTF_KIND_STRUCT entry. */
struct btf_member {
	uint32_t name_off;
	uint32_t type;		/* member's type id */
	uint32_t offset;	/* member offset, in bits */
};

/* Pack a BTF kind and member count into the btf_type.info word:
 * kind in bits 24..31, vlen in the low 16 bits. */
static inline uint32_t btf_info_enc(uint32_t kind, uint32_t vlen)
{
	uint32_t members = vlen & 0xffffu;

	return (kind << 24) | members;
}

/* Hand-rolled prefix of union bpf_attr as consumed by BPF_MAP_CREATE. */
struct bpf_attr_map_create {
	uint32_t map_type;
	uint32_t key_size;
	uint32_t value_size;
	uint32_t max_entries;
	uint32_t map_flags;
	uint32_t inner_map_fd;
	uint32_t numa_node;
	char     map_name[16];
	uint32_t map_ifindex;
	uint32_t btf_fd;		/* BTF object describing key/value types */
	uint32_t btf_key_type_id;
	uint32_t btf_value_type_id;
};

/* union bpf_attr layout for BPF_BTF_LOAD; pointers are carried as u64. */
struct bpf_attr_btf_load {
	uint64_t btf;		/* userspace pointer to the BTF blob */
	uint64_t btf_log_buf;
	uint32_t btf_size;
	uint32_t btf_log_size;
	uint32_t btf_log_level;
};

/* union bpf_attr layout for map element lookup/update/delete. */
struct bpf_attr_map_elem {
	uint32_t map_fd;
	uint32_t __pad0;	/* aligns 'key' to 8 bytes, as in the kernel */
	uint64_t key;		/* userspace pointer to the key */
	union {
		uint64_t value;		/* userspace pointer to the value */
		uint64_t next_key;
	};
	uint64_t flags;		/* BPF_ANY / BPF_EXIST / BPF_F_LOCK ... */
};

/* Thin wrapper over the raw bpf(2) syscall; returns the kernel's
 * result directly (fd or 0 on success, -1 with errno set on failure). */
static long bpf_sys(int cmd, void *attr, unsigned int size)
{
	long ret;

	ret = syscall(__NR_bpf, cmd, attr, size);
	return ret;
}

/* Value stored in the map under test: a bpf_spin_lock slot (matches
 * the BTF "lock" member at bit offset 0), a writer sequence number,
 * and seven u64 fields each writer fills with its own pattern, so a
 * mixed-pattern value indicates a torn or misdirected write. */
struct map_value {
	uint32_t lock;		/* storage for the kernel's bpf_spin_lock */
	uint32_t seq;		/* writer-local sequence counter */
	uint64_t data[7];	/* pattern-filled payload */
};

#define VALUE_SIZE sizeof(struct map_value)
#define NUM_DATA_FIELDS 7	/* number of pattern slots in map_value.data */

/* Per-writer-class fill patterns; pattern_owner() maps them back. */
#define PATTERN_A 0xAAAAAAAAAAAAAAAAULL
#define PATTERN_B 0xBBBBBBBBBBBBBBBBULL
#define PATTERN_C 0xCCCCCCCCCCCCCCCCULL

/*
 * Hand-roll a minimal BTF blob and load it via BPF_BTF_LOAD.
 *
 * Types (ids assigned in emission order, starting at 1):
 *   1: "int"            32-bit signed int
 *   2: "bpf_spin_lock"  struct { __u32 val; }, 4 bytes
 *   3: "__u32"          32-bit unsigned int
 *   4: "map_value"      struct { bpf_spin_lock lock; ... }, VALUE_SIZE bytes
 *
 * The name_off literals below index into 'strings':
 *   1="int", 5="bpf_spin_lock", 19="val", 23="__u32",
 *   29="map_value", 39="lock", 44="data".
 * btf_member.offset values are in bits (lock @ 0, data @ 32).
 *
 * Returns the BTF fd, or a negative value on failure (error printed).
 */
static int load_btf(void)
{
	static const char strings[] =
		"\0" "int\0" "bpf_spin_lock\0" "val\0"
		"__u32\0" "map_value\0" "lock\0" "data\0";

	struct btf_type t1 = { 1, btf_info_enc(BTF_KIND_INT, 0), {4} };
	uint32_t t1_extra = (BTF_INT_SIGNED << 24) | 32;	/* encoding | nr_bits */
	struct btf_type t2 = { 5, btf_info_enc(BTF_KIND_STRUCT, 1), {4} };
	struct btf_member t2_m = { 19, 3, 0 };			/* "val": __u32 @ bit 0 */
	struct btf_type t3 = { 23, btf_info_enc(BTF_KIND_INT, 0), {4} };
	uint32_t t3_extra = 32;					/* unsigned, 32 bits */
	struct btf_type t4 = { 29, btf_info_enc(BTF_KIND_STRUCT, 2), {VALUE_SIZE} };
	struct btf_member t4_m1 = { 39, 2, 0 };			/* "lock" @ bit 0 */
	struct btf_member t4_m2 = { 44, 3, 32 };		/* "data" @ bit 32 */

	uint32_t type_len =
		sizeof(t1) + sizeof(t1_extra) +
		sizeof(t2) + sizeof(t2_m) +
		sizeof(t3) + sizeof(t3_extra) +
		sizeof(t4) + sizeof(t4_m1) + sizeof(t4_m2);
	uint32_t str_len = sizeof(strings);
	uint32_t blob_sz = sizeof(struct btf_header) + type_len + str_len;

	char *blob = calloc(1, blob_sz);
	if (!blob) return -1;

	struct btf_header *hdr = (struct btf_header *)blob;
	hdr->magic = BTF_MAGIC;
	hdr->version = BTF_VERSION;
	hdr->hdr_len = sizeof(struct btf_header);
	hdr->type_off = 0;
	hdr->type_len = type_len;
	hdr->str_off = type_len;	/* strings follow the type section */
	hdr->str_len = str_len;

	/* Serialize the records back-to-back; order defines the type ids. */
	char *p = blob + sizeof(struct btf_header);
#define EMIT(x) do { memcpy(p, &(x), sizeof(x)); p += sizeof(x); } while(0)
	EMIT(t1); EMIT(t1_extra);
	EMIT(t2); EMIT(t2_m);
	EMIT(t3); EMIT(t3_extra);
	EMIT(t4); EMIT(t4_m1); EMIT(t4_m2);
#undef EMIT
	memcpy(p, strings, str_len);

	char log_buf[4096] = {0};	/* {0}: '= {}' is C23-only */
	struct bpf_attr_btf_load attr = {0};
	attr.btf = (uint64_t)(unsigned long)blob;
	attr.btf_size = blob_sz;
	attr.btf_log_buf = (uint64_t)(unsigned long)log_buf;
	attr.btf_log_size = sizeof(log_buf);
	attr.btf_log_level = 1;

	int fd = bpf_sys(BPF_BTF_LOAD_CMD, &attr, sizeof(attr));
	if (fd < 0)
		fprintf(stderr, "BTF load failed: %s\n", strerror(errno));
	free(blob);
	return fd;
}

static int create_map(int btf_fd, uint32_t max_entries)
{
	struct bpf_attr_map_create attr = {};
	attr.map_type = BPF_MAP_TYPE_HASH;
	attr.key_size = sizeof(uint32_t);
	attr.value_size = VALUE_SIZE;
	attr.max_entries = max_entries;
	attr.map_flags = BPF_F_NO_PREALLOC;
	attr.btf_fd = btf_fd;
	attr.btf_key_type_id = 1;
	attr.btf_value_type_id = 4;

	int fd = bpf_sys(BPF_MAP_CREATE_CMD, &attr, sizeof(attr));
	if (fd < 0)
		fprintf(stderr, "Map create failed: %s\n", strerror(errno));
	return fd;
}

static int map_update(int fd, uint32_t *key, struct map_value *val, uint64_t flags)
{
	struct bpf_attr_map_elem attr = {};
	attr.map_fd = fd;
	attr.key = (uint64_t)(unsigned long)key;
	attr.value = (uint64_t)(unsigned long)val;
	attr.flags = flags;
	return bpf_sys(BPF_MAP_UPDATE_ELEM_CMD, &attr, sizeof(attr));
}

static int map_delete(int fd, uint32_t *key)
{
	struct bpf_attr_map_elem attr = {};
	attr.map_fd = fd;
	attr.key = (uint64_t)(unsigned long)key;
	return bpf_sys(BPF_MAP_DELETE_ELEM_CMD, &attr, sizeof(attr));
}

static int map_lookup(int fd, uint32_t *key, struct map_value *val)
{
	struct bpf_attr_map_elem attr = {};
	attr.map_fd = fd;
	attr.key = (uint64_t)(unsigned long)key;
	attr.value = (uint64_t)(unsigned long)val;
	return bpf_sys(BPF_MAP_LOOKUP_ELEM_CMD, &attr, sizeof(attr));
}

/* Shared state: workers run with CLONE_VM and these pointers target a
 * MAP_SHARED region set up in main(), so parent and children see the
 * same flag/stats/log. */
static volatile int *g_stop;	/* set to 1 by the parent to stop workers */
static int g_map_fd;		/* the contended hash map */
static uint32_t g_key = 1;	/* the single key all workers operate on */

/* Counters updated by verifier workers.  Increments are plain volatile
 * (not atomic); small undercounts are tolerated in this diagnostic. */
struct shared_stats {
	volatile uint64_t total_checks;
	volatile uint64_t torn_writes;
	volatile uint64_t unknown_pattern;
	volatile uint64_t a_into_b, b_into_a;	/* minority-pattern-in-majority counts */
	volatile uint64_t c_into_a, a_into_c;
	volatile uint64_t b_into_c, c_into_b;
	volatile uint64_t max_torn_fields;
};
static struct shared_stats *g_stats;

#define MAX_LOG_EVENTS 20
/* Snapshot of one detected mixed-pattern value, for the final report. */
struct corruption_event {
	uint64_t check_num;	/* verifier-local check count at detection time */
	uint32_t seq;
	uint64_t data[NUM_DATA_FIELDS];
	int      field_owners[NUM_DATA_FIELDS];	/* pattern_owner() per field */
};
static volatile int *g_log_count;
static struct corruption_event *g_log;

#define STACK_SIZE (1024 * 1024)	/* per-worker clone() stack size */

/* Map a 64-bit fill value back to the writer class that owns it:
 * 0 = PATTERN_A, 1 = PATTERN_B, 2 = PATTERN_C, -1 = unrecognized. */
static int pattern_owner(uint64_t val)
{
	switch (val) {
	case 0xAAAAAAAAAAAAAAAAULL:	/* PATTERN_A */
		return 0;
	case 0xBBBBBBBBBBBBBBBBULL:	/* PATTERN_B */
		return 1;
	case 0xCCCCCCCCCCCCCCCCULL:	/* PATTERN_C */
		return 2;
	default:
		return -1;
	}
}

/* Initialize a value image: clear the lock word, stamp the sequence
 * number, and fill every data slot with the writer's pattern. */
static void fill_value(struct map_value *v, uint64_t pattern, uint32_t seq)
{
	int slot;

	v->lock = 0;
	v->seq = seq;
	for (slot = NUM_DATA_FIELDS - 1; slot >= 0; slot--)
		v->data[slot] = pattern;
}

static int worker_update(void *arg)
{
	uint64_t pattern = (uint64_t)(unsigned long)arg;
	struct map_value val;
	uint32_t seq = 0;

	while (!*g_stop) {
		fill_value(&val, pattern, seq++);
		map_update(g_map_fd, &g_key, &val, BPF_F_LOCK | BPF_EXIST);
	}
	return 0;
}

static int worker_delete_readd(void *arg)
{
	(void)arg;
	struct map_value val;
	uint32_t seq = 0;

	while (!*g_stop) {
		map_delete(g_map_fd, &g_key);
		fill_value(&val, PATTERN_B, seq++);
		map_update(g_map_fd, &g_key, &val, BPF_ANY);
	}
	return 0;
}

static int worker_pressure(void *arg)
{
	int btf_fd = (int)(long)arg;
	struct map_value val;
	memset(&val, 0, sizeof(val));

	int fd = create_map(btf_fd, 256);
	if (fd < 0)
		return 1;

	while (!*g_stop) {
		for (uint32_t i = 0; i < 200; i++) {
			uint32_t k = i;
			val.seq = i;
			map_update(fd, &k, &val, BPF_ANY);
		}
		for (uint32_t i = 0; i < 200; i++) {
			uint32_t k = i;
			map_delete(fd, &k);
		}
	}

	close(fd);
	return 0;
}

/*
 * Reader: poll the contended key and classify each data field by its
 * fill pattern.  A value containing two or more distinct patterns, or
 * an unrecognized one, is counted as a torn write and snapshotted into
 * the shared event log.  NOTE(review): as discussed in the thread
 * replies, the lookup must itself use BPF_F_LOCK for a torn value to
 * prove element reuse rather than an ordinary unlocked-read race.
 */
static int worker_verify(void *arg)
{
	(void)arg;
	struct map_value val;
	uint64_t checks = 0;

	while (!*g_stop) {
		/* Failed lookups (key momentarily deleted) still count as checks. */
		if (map_lookup(g_map_fd, &g_key, &val) != 0) {
			checks++;
			continue;
		}

		int owners[NUM_DATA_FIELDS];
		int has_a = 0, has_b = 0, has_c = 0, has_unk = 0;

		/* Classify every field and tally pattern membership. */
		for (int i = 0; i < NUM_DATA_FIELDS; i++) {
			owners[i] = pattern_owner(val.data[i]);
			switch (owners[i]) {
			case 0:  has_a++; break;
			case 1:  has_b++; break;
			case 2:  has_c++; break;
			default: has_unk++; break;
			}
		}

		int distinct = (has_a > 0) + (has_b > 0) + (has_c > 0);

		/* Mixed patterns (or an unknown value) == torn write. */
		if (distinct >= 2 || has_unk > 0) {
			g_stats->torn_writes++;

			/* Majority pattern = assumed intended writer; ties break A > B > C. */
			int majority = (has_a >= has_b && has_a >= has_c) ? 0 :
				       (has_b >= has_c) ? 1 : 2;
			uint64_t torn_fields = 0;
			for (int i = 0; i < NUM_DATA_FIELDS; i++)
				if (owners[i] != majority)
					torn_fields++;
			if (torn_fields > g_stats->max_torn_fields)
				g_stats->max_torn_fields = torn_fields;

			/* Per-direction breakdown: minority pattern o inside majority m. */
			for (int i = 0; i < NUM_DATA_FIELDS; i++) {
				int o = owners[i], m = majority;
				if (o == m) continue;
				if (o == 0 && m == 1) g_stats->a_into_b++;
				if (o == 1 && m == 0) g_stats->b_into_a++;
				if (o == 2 && m == 0) g_stats->c_into_a++;
				if (o == 0 && m == 2) g_stats->a_into_c++;
				if (o == 1 && m == 2) g_stats->b_into_c++;
				if (o == 2 && m == 1) g_stats->c_into_b++;
				if (o == -1)          g_stats->unknown_pattern++;
			}

			/* Atomically claim a log slot; drop events past the cap. */
			int idx = __sync_fetch_and_add(g_log_count, 1);
			if (idx < MAX_LOG_EVENTS) {
				g_log[idx].check_num = checks;
				g_log[idx].seq = val.seq;
				for (int i = 0; i < NUM_DATA_FIELDS; i++) {
					g_log[idx].data[i] = val.data[i];
					g_log[idx].field_owners[i] = owners[i];
				}
			}
		}

		checks++;
	}

	/* Fold this worker's local count into the shared total at exit. */
	g_stats->total_checks += checks;
	return 0;
}

/*
 * Start a worker as a CLONE_VM child on its own mmap'd stack, so the
 * g_* globals (map fd, stop flag, stats pointers) are shared with the
 * parent.  Returns the child's pid, or -1 on failure.
 */
static pid_t spawn_worker(int (*fn)(void *), void *arg)
{
	void *stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
	pid_t pid;

	if (stack == MAP_FAILED) {
		perror("mmap stack");
		return -1;
	}
	/* Stacks grow down on supported arches: pass the top of the mapping. */
	pid = clone(fn, (char *)stack + STACK_SIZE,
		    CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | SIGCHLD,
		    arg);
	if (pid < 0) {
		perror("clone");
		munmap(stack, STACK_SIZE);	/* don't leak the stack on failure */
	}
	return pid;
}

/* Workload shape: 10 processes total for RACE_SECONDS. */
#define RACE_SECONDS 60
#define NUM_UPDATE_A 2	/* BPF_F_LOCK in-place writers, PATTERN_A */
#define NUM_DELETE_B 2	/* delete + re-add writers, PATTERN_B */
#define NUM_UPDATE_C 2	/* BPF_F_LOCK in-place writers, PATTERN_C */
#define NUM_PRESSURE 2	/* allocator-churn workers on a private map */
#define NUM_VERIFY   2	/* readers checking for mixed patterns */
#define TOTAL (NUM_UPDATE_A + NUM_DELETE_B + NUM_UPDATE_C + NUM_PRESSURE + NUM_VERIFY)

/*
 * Set up the shared region, the BTF/map under test, and the worker
 * swarm; run for RACE_SECONDS; then summarize any torn values seen.
 * Workers are CLONE_VM children, so both the g_* globals and the
 * MAP_SHARED region below are visible to all of them.
 */
int main(void)
{
	/* Shared layout: [0] stop flag, [64] stats, then the log count,
	 * and the event log in its own page at offset 4096. */
	size_t shm_sz = 4096 * 4;
	void *shm = mmap(NULL, shm_sz, PROT_READ | PROT_WRITE,
			 MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (shm == MAP_FAILED) {
		perror("mmap shared");
		return 1;
	}
	memset(shm, 0, shm_sz);

	g_stop = (volatile int *)shm;
	g_stats = (struct shared_stats *)((char *)shm + 64);
	g_log_count = (volatile int *)((char *)shm + 64 + sizeof(struct shared_stats));
	g_log = (struct corruption_event *)((char *)shm + 4096);

	/* Probe bpf() availability.  An all-zero attr is expected to fail
	 * (EINVAL); only ENOSYS means the syscall is missing.  Close the
	 * fd in the unexpected-success case so it is not leaked. */
	struct bpf_attr_map_create test = {0};
	long probe = bpf_sys(BPF_MAP_CREATE_CMD, &test, sizeof(test));
	if (probe < 0 && errno == ENOSYS) {
		fprintf(stderr, "BPF syscall not available\n");
		return 1;
	}
	if (probe >= 0)
		close((int)probe);

	int btf_fd = load_btf();
	if (btf_fd < 0)
		return 1;

	g_map_fd = create_map(btf_fd, 8);
	if (g_map_fd < 0)
		return 1;

	/* Seed the contended key so BPF_EXIST | BPF_F_LOCK updates succeed. */
	struct map_value init_val;
	fill_value(&init_val, PATTERN_B, 0);
	if (map_update(g_map_fd, &g_key, &init_val, BPF_ANY) < 0) {
		fprintf(stderr, "Initial update failed: %s\n", strerror(errno));
		return 1;
	}

	printf("Running %d threads for %d seconds...\n", TOTAL, RACE_SECONDS);

	pid_t pids[TOTAL];
	int idx = 0;

	/* Patterns ride through void *: assumes 64-bit pointers (x86_64). */
	for (int i = 0; i < NUM_UPDATE_A; i++)
		pids[idx++] = spawn_worker(worker_update, (void *)PATTERN_A);
	for (int i = 0; i < NUM_DELETE_B; i++)
		pids[idx++] = spawn_worker(worker_delete_readd, NULL);
	for (int i = 0; i < NUM_UPDATE_C; i++)
		pids[idx++] = spawn_worker(worker_update, (void *)PATTERN_C);
	for (int i = 0; i < NUM_PRESSURE; i++)
		pids[idx++] = spawn_worker(worker_pressure, (void *)(long)btf_fd);
	for (int i = 0; i < NUM_VERIFY; i++)
		pids[idx++] = spawn_worker(worker_verify, NULL);

	sleep(RACE_SECONDS);
	*g_stop = 1;

	for (int i = 0; i < TOTAL; i++)
		if (pids[i] > 0)
			waitpid(pids[i], NULL, 0);

	int logged = *g_log_count;
	if (logged > MAX_LOG_EVENTS) logged = MAX_LOG_EVENTS;

	/* PRIu64: the counters are uint64_t; plain %lu is only correct on
	 * LP64 targets and is undefined behavior elsewhere. */
	printf("\nTotal checks:    %" PRIu64 "\n", g_stats->total_checks);
	printf("Torn writes:     %" PRIu64 "\n", g_stats->torn_writes);
	printf("Max torn fields: %" PRIu64 " / %d\n", g_stats->max_torn_fields,
	       NUM_DATA_FIELDS);

	if (g_stats->torn_writes > 0) {
		double rate = (double)g_stats->torn_writes /
			      g_stats->total_checks * 100.0;
		printf("Corruption rate: %.6f%%\n", rate);

		printf("\nCross-pattern breakdown:\n");
		if (g_stats->a_into_b) printf("  A in B: %" PRIu64 "\n", g_stats->a_into_b);
		if (g_stats->b_into_a) printf("  B in A: %" PRIu64 "\n", g_stats->b_into_a);
		if (g_stats->c_into_a) printf("  C in A: %" PRIu64 "\n", g_stats->c_into_a);
		if (g_stats->a_into_c) printf("  A in C: %" PRIu64 "\n", g_stats->a_into_c);
		if (g_stats->b_into_c) printf("  B in C: %" PRIu64 "\n", g_stats->b_into_c);
		if (g_stats->c_into_b) printf("  C in B: %" PRIu64 "\n", g_stats->c_into_b);
		if (g_stats->unknown_pattern)
			printf("  Unknown: %" PRIu64 "\n", g_stats->unknown_pattern);
	}

	if (logged > 0) {
		printf("\nFirst %d events:\n", logged);
		for (int e = 0; e < logged; e++) {
			printf("  [%d] check #%" PRIu64 " seq=%u ",
			       e, g_log[e].check_num, g_log[e].seq);
			for (int i = 0; i < NUM_DATA_FIELDS; i++) {
				char c = '?';
				if (g_log[e].data[i] == PATTERN_A) c = 'A';
				else if (g_log[e].data[i] == PATTERN_B) c = 'B';
				else if (g_log[e].data[i] == PATTERN_C) c = 'C';
				printf("%c", c);
			}
			printf("\n");
		}
	}

	printf("\n%s\n", g_stats->torn_writes > 0 ?
	       "CORRUPTION DETECTED" :
	       "No corruption detected (try more CPUs or longer run)");

	close(g_map_fd);
	close(btf_fd);
	return 0;
}

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [BUG] bpf: use-after-free in hashtab BPF_F_LOCK in-place update path
  2026-03-26  8:49 [BUG] bpf: use-after-free in hashtab BPF_F_LOCK in-place update path Aaron Esau
@ 2026-03-26 13:39 ` Puranjay Mohan
  2026-03-26 14:58   ` Kumar Kartikeya Dwivedi
                     ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Puranjay Mohan @ 2026-03-26 13:39 UTC (permalink / raw)
  To: Aaron Esau, bpf; +Cc: ast, daniel, andrii

Aaron Esau <aaron1esau@gmail.com> writes:

> Reported-by: Aaron Esau <aaron1esau@gmail.com>
>
> htab_map_update_elem() has a use-after-free when BPF_F_LOCK is used
> for in-place updates.
>
> The BPF_F_LOCK path calls lookup_nulls_elem_raw() without holding the
> bucket lock, then dereferences the element via copy_map_value_locked().
> A concurrent htab_map_delete_elem() can delete and free the element
> between these steps.
>
> free_htab_elem() uses bpf_mem_cache_free(), which immediately returns
> the object to the per-CPU free list (not RCU-deferred). The memory may
> be reallocated before copy_map_value_locked() executes, leading to
> writes into a different element.
>
> When lookup succeeds (l_old != NULL), the in-place update path returns
> early, so the “full lookup under lock” path is not taken.
>
> Race:
>
>   CPU 0: htab_map_update_elem (BPF_F_LOCK)
>          lookup_nulls_elem_raw() → E (no bucket lock)
>          ...
>   CPU 1: htab_map_delete_elem()
>          htab_lock_bucket → hlist_nulls_del_rcu → htab_unlock_bucket
>          free_htab_elem → bpf_mem_cache_free (immediate free)
>   CPU 1: htab_map_update_elem (new key)
>          alloc_htab_elem → reuses E
>   CPU 0: copy_map_value_locked(E, ...) → writes into reused object
>
> Reproduction:
>
>   1. Create BPF_MAP_TYPE_HASH with a value containing bpf_spin_lock
>      (max_entries=64, 7 u64 fields + lock).
>   2. Threads A: BPF_MAP_UPDATE_ELEM with BPF_F_LOCK (pattern 0xAAAA...)
>   3. Threads B: DELETE + UPDATE (pattern 0xBBBB...) on same keys
>   4. Threads C: same as A (pattern 0xCCCC...)
>   5. Verifier threads: LOOKUP loop, detect mixed-pattern values
>   6. Run 60s on >=4 CPUs
>
> Attached a POC. On 6.19.9 (4 vCPU QEMU, CONFIG_PREEMPT=y),
> I observed ~645 torn values in 2.5M checks (~0.026%).
>
> Fixes: 96049f3afd50 ("bpf: introduce BPF_F_LOCK flag")

Although this is a real issue, your reproducer is not accurate, it will
see torn writes even without the UAF issue, because the verifier thread
is not taking the lock:

So the torn write pattern CCCAAAA can mean:
  1. Thread A finished writing AAAAAAA (while holding the lock)
  2. Thread C acquired the lock and started writing: field[0]=C, field[1]=C, field[2]=C...
  3. The verifier thread reads (no lock): sees field[0]=C, field[1]=C, field[2]=C, field[3]=A, field[4]=A, field[5]=A, field[6]=A
  4. Thread C finishes: field[3]=C, field[4]=C, field[5]=C, field[6]=C, releases lock

This race happens regardless of whether the element is freed/reused.  It
would happen even without thread B (the delete+readd thread). The
corruption source is the non-atomic read, not the UAF.

If you change the reproducer like:

-- >8 --

--- repro.c     2026-03-26 05:22:49.012503218 -0700
+++ repro2.c    2026-03-26 06:24:40.951044279 -0700
@@ -227,6 +227,7 @@
        attr.map_fd = fd;
        attr.key = (uint64_t)(unsigned long)key;
        attr.value = (uint64_t)(unsigned long)val;
+       attr.flags = BPF_F_LOCK;
        return bpf_sys(BPF_MAP_LOOKUP_ELEM_CMD, &attr, sizeof(attr));
 }

-- 8< --

Now it will detect the correct UAF problem.

I verified that this updated reproducer shows the problem, the following
kernel diff fixes it:

-- >8 --

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index bc6bc8bb871d..af33f62069f0 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -953,7 +953,7 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)

        if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
                bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
-       bpf_mem_cache_free(&htab->ma, l);
+       bpf_mem_cache_free_rcu(&htab->ma, l);
 }

 static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)

-- 8< --

Before:

[root@alarm host0]# ./repro2
Running 10 threads for 60 seconds...

Total checks:    49228421
Torn writes:     5470
Max torn fields: 3 / 7
Corruption rate: 0.011111%

Cross-pattern breakdown:
  A in B: 8595
  C in B: 7826
  Unknown: 1

First 20 events:
  [0] check #42061 seq=39070 CCCBBBB
  [1] check #65714 seq=60575 CCCBBBB
  [2] check #65287 seq=60575 CCCBBBB
  [3] check #70474 seq=65793 AAABBBB
  [4] check #70907 seq=65793 AAABBBB
  [5] check #103389 seq=95745 AAABBBB
  [6] check #107208 seq=98672 CCCBBBB
  [7] check #108218 seq=100387 CCCBBBB
  [8] check #111490 seq=103388 CCCBBBB
  [9] check #140942 seq=128894 CCCBBBB
  [10] check #164845 seq=151828 CCCBBBB
  [11] check #163993 seq=151828 CCCBBBB
  [12] check #169184 seq=155453 CCCBBBB
  [13] check #171383 seq=158572 AAABBBB
  [14] check #179943 seq=165425 CCCBBBB
  [15] check #189218 seq=173926 CCCBBBB
  [16] check #192119 seq=177892 CCCBBBB
  [17] check #194253 seq=180562 AAABBBB
  [18] check #202169 seq=187253 CCCBBBB
  [19] check #205452 seq=189021 CCCBBBB

CORRUPTION DETECTED

After:

[root@alarm host0]# ./repro2
Running 10 threads for 60 seconds...

Total checks:    108666576
Torn writes:     0
Max torn fields: 0 / 7

No corruption detected (try more CPUs or longer run)
[root@alarm host0]# nproc
16

I will send a patch to fix this soon after validating the above kernel
diff and figuring out how we got to this state in htab_elem_free() by
analyzing the git history.

Thanks for the report.
Puranjay

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [BUG] bpf: use-after-free in hashtab BPF_F_LOCK in-place update path
  2026-03-26 13:39 ` Puranjay Mohan
@ 2026-03-26 14:58   ` Kumar Kartikeya Dwivedi
  2026-03-26 15:02   ` Puranjay Mohan
  2026-03-26 15:26   ` Mykyta Yatsenko
  2 siblings, 0 replies; 12+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2026-03-26 14:58 UTC (permalink / raw)
  To: Puranjay Mohan; +Cc: Aaron Esau, bpf, ast, daniel, andrii

On Thu, 26 Mar 2026 at 14:46, Puranjay Mohan <puranjay@kernel.org> wrote:
>
> Aaron Esau <aaron1esau@gmail.com> writes:
>
> > Reported-by: Aaron Esau <aaron1esau@gmail.com>
> >
> > htab_map_update_elem() has a use-after-free when BPF_F_LOCK is used
> > for in-place updates.
> >
> > The BPF_F_LOCK path calls lookup_nulls_elem_raw() without holding the
> > bucket lock, then dereferences the element via copy_map_value_locked().
> > A concurrent htab_map_delete_elem() can delete and free the element
> > between these steps.
> >
> > free_htab_elem() uses bpf_mem_cache_free(), which immediately returns
> > the object to the per-CPU free list (not RCU-deferred). The memory may
> > be reallocated before copy_map_value_locked() executes, leading to
> > writes into a different element.
> >
> > When lookup succeeds (l_old != NULL), the in-place update path returns
> > early, so the “full lookup under lock” path is not taken.
> >
> > Race:
> >
> >   CPU 0: htab_map_update_elem (BPF_F_LOCK)
> >          lookup_nulls_elem_raw() → E (no bucket lock)
> >          ...
> >   CPU 1: htab_map_delete_elem()
> >          htab_lock_bucket → hlist_nulls_del_rcu → htab_unlock_bucket
> >          free_htab_elem → bpf_mem_cache_free (immediate free)
> >   CPU 1: htab_map_update_elem (new key)
> >          alloc_htab_elem → reuses E
> >   CPU 0: copy_map_value_locked(E, ...) → writes into reused object
> >
> > Reproduction:
> >
> >   1. Create BPF_MAP_TYPE_HASH with a value containing bpf_spin_lock
> >      (max_entries=64, 7 u64 fields + lock).
> >   2. Threads A: BPF_MAP_UPDATE_ELEM with BPF_F_LOCK (pattern 0xAAAA...)
> >   3. Threads B: DELETE + UPDATE (pattern 0xBBBB...) on same keys
> >   4. Threads C: same as A (pattern 0xCCCC...)
> >   5. Verifier threads: LOOKUP loop, detect mixed-pattern values
> >   6. Run 60s on >=4 CPUs
> >
> > Attached a POC. On 6.19.9 (4 vCPU QEMU, CONFIG_PREEMPT=y),
> > I observed ~645 torn values in 2.5M checks (~0.026%).
> >
> > Fixes: 96049f3afd50 ("bpf: introduce BPF_F_LOCK flag")
>
> Although this is a real issue, your reproducer is not accurate, it will
> see torn writes even without the UAF issue, because the verifier thread
> is not taking the lock:
>
> So the torn write pattern CCCAAAA can mean:
>   1. Thread A finished writing AAAAAAA (while holding the lock)
>   2. Thread C acquired the lock and started writing: field[0]=C, field[1]=C, field[2]=C...
>   3. The verifier thread reads (no lock): sees field[0]=C, field[1]=C, field[2]=C, field[3]=A, field[4]=A, field[5]=A, field[6]=A
>   4. Thread C finishes: field[3]=C, field[4]=C, field[5]=C, field[6]=C, releases lock
>
> This race happens regardless of whether the element is freed/reused.  It
> would happen even without thread B (the delete+readd thread). The
> corruption source is the non-atomic read, not the UAF.
>
> If you change the reproducer like:
>
> -- >8 --
>
> --- repro.c     2026-03-26 05:22:49.012503218 -0700
> +++ repro2.c    2026-03-26 06:24:40.951044279 -0700
> @@ -227,6 +227,7 @@
>         attr.map_fd = fd;
>         attr.key = (uint64_t)(unsigned long)key;
>         attr.value = (uint64_t)(unsigned long)val;
> +       attr.flags = BPF_F_LOCK;
>         return bpf_sys(BPF_MAP_LOOKUP_ELEM_CMD, &attr, sizeof(attr));
>  }
>
> -- 8< --
>
> Now it will detect the correct UAF problem.
>
> I verified that this updated reproducer shows the problem, the following
> kernel diff fixes it:
>
> -- >8 --
>
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index bc6bc8bb871d..af33f62069f0 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -953,7 +953,7 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
>
>         if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
>                 bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
> -       bpf_mem_cache_free(&htab->ma, l);
> +       bpf_mem_cache_free_rcu(&htab->ma, l);
>  }
>

Immediate value reuse is expected behavior. BPF_F_LOCK only serializes
writes to the map value from the syscall path, it doesn't guarantee
that a concurrent delete won't unlink the element (such that it gets
reused).

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [BUG] bpf: use-after-free in hashtab BPF_F_LOCK in-place update path
  2026-03-26 13:39 ` Puranjay Mohan
  2026-03-26 14:58   ` Kumar Kartikeya Dwivedi
@ 2026-03-26 15:02   ` Puranjay Mohan
  2026-03-26 15:26   ` Mykyta Yatsenko
  2 siblings, 0 replies; 12+ messages in thread
From: Puranjay Mohan @ 2026-03-26 15:02 UTC (permalink / raw)
  To: Aaron Esau, bpf; +Cc: ast, daniel, andrii, Kumar Kartikeya Dwivedi

Puranjay Mohan <puranjay@kernel.org> writes:

> Aaron Esau <aaron1esau@gmail.com> writes:
>
>> Reported-by: Aaron Esau <aaron1esau@gmail.com>
>>
>> htab_map_update_elem() has a use-after-free when BPF_F_LOCK is used
>> for in-place updates.
>>
>> The BPF_F_LOCK path calls lookup_nulls_elem_raw() without holding the
>> bucket lock, then dereferences the element via copy_map_value_locked().
>> A concurrent htab_map_delete_elem() can delete and free the element
>> between these steps.
>>
>> free_htab_elem() uses bpf_mem_cache_free(), which immediately returns
>> the object to the per-CPU free list (not RCU-deferred). The memory may
>> be reallocated before copy_map_value_locked() executes, leading to
>> writes into a different element.
>>
>> When lookup succeeds (l_old != NULL), the in-place update path returns
>> early, so the “full lookup under lock” path is not taken.
>>
>> Race:
>>
>>   CPU 0: htab_map_update_elem (BPF_F_LOCK)
>>          lookup_nulls_elem_raw() → E (no bucket lock)
>>          ...
>>   CPU 1: htab_map_delete_elem()
>>          htab_lock_bucket → hlist_nulls_del_rcu → htab_unlock_bucket
>>          free_htab_elem → bpf_mem_cache_free (immediate free)
>>   CPU 1: htab_map_update_elem (new key)
>>          alloc_htab_elem → reuses E
>>   CPU 0: copy_map_value_locked(E, ...) → writes into reused object
>>
>> Reproduction:
>>
>>   1. Create BPF_MAP_TYPE_HASH with a value containing bpf_spin_lock
>>      (max_entries=64, 7 u64 fields + lock).
>>   2. Threads A: BPF_MAP_UPDATE_ELEM with BPF_F_LOCK (pattern 0xAAAA...)
>>   3. Threads B: DELETE + UPDATE (pattern 0xBBBB...) on same keys
>>   4. Threads C: same as A (pattern 0xCCCC...)
>>   5. Verifier threads: LOOKUP loop, detect mixed-pattern values
>>   6. Run 60s on >=4 CPUs
>>
>> Attached a POC. On 6.19.9 (4 vCPU QEMU, CONFIG_PREEMPT=y),
>> I observed ~645 torn values in 2.5M checks (~0.026%).
>>
>> Fixes: 96049f3afd50 ("bpf: introduce BPF_F_LOCK flag")
>
> Although this is a real issue, your reproducer is not accurate, it will
> see torn writes even without the UAF issue, because the verifier thread

After reading the code and discussing with Kumar, this is expected
behaviour. BPF_F_LOCK performs a lockless lookup and takes only the
element's embedded spin_lock for in-place value updates. It does not
synchronize against concurrent deletes, preallocated hash maps have had
the same semantics since BPF_F_LOCK was introduced. Programs that need
both parallel deletes and updates must handle that coordination
themselves.

Thanks,
Puranjay

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [BUG] bpf: use-after-free in hashtab BPF_F_LOCK in-place update path
  2026-03-26 13:39 ` Puranjay Mohan
  2026-03-26 14:58   ` Kumar Kartikeya Dwivedi
  2026-03-26 15:02   ` Puranjay Mohan
@ 2026-03-26 15:26   ` Mykyta Yatsenko
  2026-03-26 15:33     ` Puranjay Mohan
  2 siblings, 1 reply; 12+ messages in thread
From: Mykyta Yatsenko @ 2026-03-26 15:26 UTC (permalink / raw)
  To: Puranjay Mohan, Aaron Esau, bpf; +Cc: ast, daniel, andrii

Puranjay Mohan <puranjay@kernel.org> writes:

> Aaron Esau <aaron1esau@gmail.com> writes:
>
>> Reported-by: Aaron Esau <aaron1esau@gmail.com>
>>
>> htab_map_update_elem() has a use-after-free when BPF_F_LOCK is used
>> for in-place updates.
>>
>> The BPF_F_LOCK path calls lookup_nulls_elem_raw() without holding the
>> bucket lock, then dereferences the element via copy_map_value_locked().
>> A concurrent htab_map_delete_elem() can delete and free the element
>> between these steps.
>>
>> free_htab_elem() uses bpf_mem_cache_free(), which immediately returns
>> the object to the per-CPU free list (not RCU-deferred). The memory may
>> be reallocated before copy_map_value_locked() executes, leading to
>> writes into a different element.
>>
>> When lookup succeeds (l_old != NULL), the in-place update path returns
>> early, so the “full lookup under lock” path is not taken.
>>
>> Race:
>>
>>   CPU 0: htab_map_update_elem (BPF_F_LOCK)
>>          lookup_nulls_elem_raw() → E (no bucket lock)
>>          ...
>>   CPU 1: htab_map_delete_elem()
>>          htab_lock_bucket → hlist_nulls_del_rcu → htab_unlock_bucket
>>          free_htab_elem → bpf_mem_cache_free (immediate free)
>>   CPU 1: htab_map_update_elem (new key)
>>          alloc_htab_elem → reuses E
>>   CPU 0: copy_map_value_locked(E, ...) → writes into reused object
>>
>> Reproduction:
>>
>>   1. Create BPF_MAP_TYPE_HASH with a value containing bpf_spin_lock
>>      (max_entries=64, 7 u64 fields + lock).
>>   2. Threads A: BPF_MAP_UPDATE_ELEM with BPF_F_LOCK (pattern 0xAAAA...)
>>   3. Threads B: DELETE + UPDATE (pattern 0xBBBB...) on same keys
>>   4. Threads C: same as A (pattern 0xCCCC...)
>>   5. Verifier threads: LOOKUP loop, detect mixed-pattern values
>>   6. Run 60s on >=4 CPUs
>>
>> Attached a POC. On 6.19.9 (4 vCPU QEMU, CONFIG_PREEMPT=y),
>> I observed ~645 torn values in 2.5M checks (~0.026%).
>>
>> Fixes: 96049f3afd50 ("bpf: introduce BPF_F_LOCK flag")
>
> Although this is a real issue, your reproducer is not accurate, it will
> see torn writes even without the UAF issue, because the verifier thread
> is not taking the lock:
>
> So the torn write pattern CCCAAAA can mean:
>   1. Thread A finished writing AAAAAAA (while holding the lock)
>   2. Thread C acquired the lock and started writing: field[0]=C, field[1]=C, field[2]=C...
>   3. The verifier thread reads (no lock): sees field[0]=C, field[1]=C, field[2]=C, field[3]=A, field[4]=A, field[5]=A, field[6]=A
>   4. Thread C finishes: field[3]=C, field[4]=C, field[5]=C, field[6]=C, releases lock
>
> This race happens regardless of whether the element is freed/reused.  It
> would happen even without thread B (the delete+readd thread). The
> corruption source is the non-atomic read, not the UAF.
Have you confirmed torn reads even with the BPF_F_LOCK flag on the
BPF_MAP_LOOKUP_ELEM_CMD? I understand there must not be any torn reads
when the spinlock is taken on the lookup path.

The reproducer looks like a good selftest to have, but it needs to be
ported to use libbpf, currently it looks too complex.
>
> If you change the preproducer like:
>
> -- >8 --
>
> --- repro.c     2026-03-26 05:22:49.012503218 -0700
> +++ repro2.c    2026-03-26 06:24:40.951044279 -0700
> @@ -227,6 +227,7 @@
>         attr.map_fd = fd;
>         attr.key = (uint64_t)(unsigned long)key;
>         attr.value = (uint64_t)(unsigned long)val;
> +       attr.flags = BPF_F_LOCK;
>         return bpf_sys(BPF_MAP_LOOKUP_ELEM_CMD, &attr, sizeof(attr));
>  }
>
> -- 8< --
>
> Now it will detect the correct UAF problem.
>
> I verified that this updated reproducer shows the problem, the following
> kernel diff fixes it:
>
> -- >8 --
>
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index bc6bc8bb871d..af33f62069f0 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -953,7 +953,7 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
>
>         if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
>                 bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
> -       bpf_mem_cache_free(&htab->ma, l);
> +       bpf_mem_cache_free_rcu(&htab->ma, l);
>  }
>
>  static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
>
> -- 8< --
>
> Before:
>
> [root@alarm host0]# ./repro2
> Running 10 threads for 60 seconds...
>
> Total checks:    49228421
> Torn writes:     5470
> Max torn fields: 3 / 7
> Corruption rate: 0.011111%
>
> Cross-pattern breakdown:
>   A in B: 8595
>   C in B: 7826
>   Unknown: 1
>
> First 20 events:
>   [0] check #42061 seq=39070 CCCBBBB
>   [1] check #65714 seq=60575 CCCBBBB
>   [2] check #65287 seq=60575 CCCBBBB
>   [3] check #70474 seq=65793 AAABBBB
>   [4] check #70907 seq=65793 AAABBBB
>   [5] check #103389 seq=95745 AAABBBB
>   [6] check #107208 seq=98672 CCCBBBB
>   [7] check #108218 seq=100387 CCCBBBB
>   [8] check #111490 seq=103388 CCCBBBB
>   [9] check #140942 seq=128894 CCCBBBB
>   [10] check #164845 seq=151828 CCCBBBB
>   [11] check #163993 seq=151828 CCCBBBB
>   [12] check #169184 seq=155453 CCCBBBB
>   [13] check #171383 seq=158572 AAABBBB
>   [14] check #179943 seq=165425 CCCBBBB
>   [15] check #189218 seq=173926 CCCBBBB
>   [16] check #192119 seq=177892 CCCBBBB
>   [17] check #194253 seq=180562 AAABBBB
>   [18] check #202169 seq=187253 CCCBBBB
>   [19] check #205452 seq=189021 CCCBBBB
>
> CORRUPTION DETECTED
>
> After:
>
> [root@alarm host0]# ./repro2
> Running 10 threads for 60 seconds...
>
> Total checks:    108666576
> Torn writes:     0
> Max torn fields: 0 / 7
>
> No corruption detected (try more CPUs or longer run)
> [root@alarm host0]# nproc
> 16
>
> I will send a patch to fix this soon after validating the above kernel
> diff and figuring out how we got to this state in htab_elem_free() by
> analyzing the git history.
>
> Thanks for the report.
> Puranjay

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [BUG] bpf: use-after-free in hashtab BPF_F_LOCK in-place update path
  2026-03-26 15:26   ` Mykyta Yatsenko
@ 2026-03-26 15:33     ` Puranjay Mohan
  2026-03-26 15:43       ` Mykyta Yatsenko
  0 siblings, 1 reply; 12+ messages in thread
From: Puranjay Mohan @ 2026-03-26 15:33 UTC (permalink / raw)
  To: Mykyta Yatsenko; +Cc: Aaron Esau, bpf, ast, daniel, andrii

On Thu, Mar 26, 2026 at 3:26 PM Mykyta Yatsenko
<mykyta.yatsenko5@gmail.com> wrote:
>
> Puranjay Mohan <puranjay@kernel.org> writes:
>
> > Aaron Esau <aaron1esau@gmail.com> writes:
> >
> >> Reported-by: Aaron Esau <aaron1esau@gmail.com>
> >>
> >> htab_map_update_elem() has a use-after-free when BPF_F_LOCK is used
> >> for in-place updates.
> >>
> >> The BPF_F_LOCK path calls lookup_nulls_elem_raw() without holding the
> >> bucket lock, then dereferences the element via copy_map_value_locked().
> >> A concurrent htab_map_delete_elem() can delete and free the element
> >> between these steps.
> >>
> >> free_htab_elem() uses bpf_mem_cache_free(), which immediately returns
> >> the object to the per-CPU free list (not RCU-deferred). The memory may
> >> be reallocated before copy_map_value_locked() executes, leading to
> >> writes into a different element.
> >>
> >> When lookup succeeds (l_old != NULL), the in-place update path returns
> >> early, so the “full lookup under lock” path is not taken.
> >>
> >> Race:
> >>
> >>   CPU 0: htab_map_update_elem (BPF_F_LOCK)
> >>          lookup_nulls_elem_raw() → E (no bucket lock)
> >>          ...
> >>   CPU 1: htab_map_delete_elem()
> >>          htab_lock_bucket → hlist_nulls_del_rcu → htab_unlock_bucket
> >>          free_htab_elem → bpf_mem_cache_free (immediate free)
> >>   CPU 1: htab_map_update_elem (new key)
> >>          alloc_htab_elem → reuses E
> >>   CPU 0: copy_map_value_locked(E, ...) → writes into reused object
> >>
> >> Reproduction:
> >>
> >>   1. Create BPF_MAP_TYPE_HASH with a value containing bpf_spin_lock
> >>      (max_entries=64, 7 u64 fields + lock).
> >>   2. Threads A: BPF_MAP_UPDATE_ELEM with BPF_F_LOCK (pattern 0xAAAA...)
> >>   3. Threads B: DELETE + UPDATE (pattern 0xBBBB...) on same keys
> >>   4. Threads C: same as A (pattern 0xCCCC...)
> >>   5. Verifier threads: LOOKUP loop, detect mixed-pattern values
> >>   6. Run 60s on >=4 CPUs
> >>
> >> Attached a POC. On 6.19.9 (4 vCPU QEMU, CONFIG_PREEMPT=y),
> >> I observed ~645 torn values in 2.5M checks (~0.026%).
> >>
> >> Fixes: 96049f3afd50 ("bpf: introduce BPF_F_LOCK flag")
> >
> > Although this is a real issue, your reproducer is not accurate, it will
> > see torn writes even without the UAF issue, because the verifier thread
> > is not taking the lock:
> >
> > So the torn write pattern CCCAAAA can mean:
> >   1. Thread A finished writing AAAAAAA (while holding the lock)
> >   2. Thread C acquired the lock and started writing: field[0]=C, field[1]=C, field[2]=C...
> >   3. The verifier thread reads (no lock): sees field[0]=C, field[1]=C, field[2]=C, field[3]=A, field[4]=A, field[5]=A, field[6]=A
> >   4. Thread C finishes: field[3]=C, field[4]=C, field[5]=C, field[6]=C, releases lock
> >
> > This race happens regardless of whether the element is freed/reused.  It
> > would happen even without thread B (the delete+readd thread). The
> > corruption source is the non-atomic read, not the UAF.
> Have you confirmed torn reads even with BPF_F_LOCK flag on the
> BPF_MAP_LOOKUP_ELEM_CMD? I understand there must not be any torn reads
> with spinlock taken on the lookup path.
>
> The reproducer looks like a good selftest to have, but it needs to be
> ported to use libbpf, currently it looks too complex.

Yes, I have confirmed torn reads even with BPF_F_LOCK on
BPF_MAP_LOOKUP_ELEM_CMD; the results given below are with BPF_F_LOCK.
But this is expected behaviour: BPF_F_LOCK performs a lockless lookup
and takes only the element's embedded spin_lock for in-place value
updates. It does not synchronize against concurrent deletes.


> >
> > If you change the preproducer like:
> >
> > -- >8 --
> >
> > --- repro.c     2026-03-26 05:22:49.012503218 -0700
> > +++ repro2.c    2026-03-26 06:24:40.951044279 -0700
> > @@ -227,6 +227,7 @@
> >         attr.map_fd = fd;
> >         attr.key = (uint64_t)(unsigned long)key;
> >         attr.value = (uint64_t)(unsigned long)val;
> > +       attr.flags = BPF_F_LOCK;
> >         return bpf_sys(BPF_MAP_LOOKUP_ELEM_CMD, &attr, sizeof(attr));
> >  }
> >
> > -- 8< --
> >
> > Now it will detect the correct UAF problem.
> >
> > I verified that this updated reproducer shows the problem, the following
> > kernel diff fixes it:
> >
> > -- >8 --
> >
> > diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> > index bc6bc8bb871d..af33f62069f0 100644
> > --- a/kernel/bpf/hashtab.c
> > +++ b/kernel/bpf/hashtab.c
> > @@ -953,7 +953,7 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
> >
> >         if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
> >                 bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
> > -       bpf_mem_cache_free(&htab->ma, l);
> > +       bpf_mem_cache_free_rcu(&htab->ma, l);
> >  }
> >
> >  static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
> >
> > -- 8< --
> >
> > Before:
> >
> > [root@alarm host0]# ./repro2
> > Running 10 threads for 60 seconds...
> >
> > Total checks:    49228421
> > Torn writes:     5470
> > Max torn fields: 3 / 7
> > Corruption rate: 0.011111%
> >
> > Cross-pattern breakdown:
> >   A in B: 8595
> >   C in B: 7826
> >   Unknown: 1
> >
> > First 20 events:
> >   [0] check #42061 seq=39070 CCCBBBB
> >   [1] check #65714 seq=60575 CCCBBBB
> >   [2] check #65287 seq=60575 CCCBBBB
> >   [3] check #70474 seq=65793 AAABBBB
> >   [4] check #70907 seq=65793 AAABBBB
> >   [5] check #103389 seq=95745 AAABBBB
> >   [6] check #107208 seq=98672 CCCBBBB
> >   [7] check #108218 seq=100387 CCCBBBB
> >   [8] check #111490 seq=103388 CCCBBBB
> >   [9] check #140942 seq=128894 CCCBBBB
> >   [10] check #164845 seq=151828 CCCBBBB
> >   [11] check #163993 seq=151828 CCCBBBB
> >   [12] check #169184 seq=155453 CCCBBBB
> >   [13] check #171383 seq=158572 AAABBBB
> >   [14] check #179943 seq=165425 CCCBBBB
> >   [15] check #189218 seq=173926 CCCBBBB
> >   [16] check #192119 seq=177892 CCCBBBB
> >   [17] check #194253 seq=180562 AAABBBB
> >   [18] check #202169 seq=187253 CCCBBBB
> >   [19] check #205452 seq=189021 CCCBBBB
> >
> > CORRUPTION DETECTED
> >
> > After:
> >
> > [root@alarm host0]# ./repro2
> > Running 10 threads for 60 seconds...
> >
> > Total checks:    108666576
> > Torn writes:     0
> > Max torn fields: 0 / 7
> >
> > No corruption detected (try more CPUs or longer run)
> > [root@alarm host0]# nproc
> > 16
> >
> > I will send a patch to fix this soon after validating the above kernel
> > diff and figuring out how we got to this state in htab_elem_free() by
> > analyzing the git history.
> >
> > Thanks for the report.
> > Puranjay

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [BUG] bpf: use-after-free in hashtab BPF_F_LOCK in-place update path
  2026-03-26 15:33     ` Puranjay Mohan
@ 2026-03-26 15:43       ` Mykyta Yatsenko
  2026-03-26 15:47         ` Mykyta Yatsenko
  0 siblings, 1 reply; 12+ messages in thread
From: Mykyta Yatsenko @ 2026-03-26 15:43 UTC (permalink / raw)
  To: Puranjay Mohan; +Cc: Aaron Esau, bpf, ast, daniel, andrii

Puranjay Mohan <puranjay12@gmail.com> writes:

> On Thu, Mar 26, 2026 at 3:26 PM Mykyta Yatsenko
> <mykyta.yatsenko5@gmail.com> wrote:
>>
>> Puranjay Mohan <puranjay@kernel.org> writes:
>>
>> > Aaron Esau <aaron1esau@gmail.com> writes:
>> >
>> >> Reported-by: Aaron Esau <aaron1esau@gmail.com>
>> >>
>> >> htab_map_update_elem() has a use-after-free when BPF_F_LOCK is used
>> >> for in-place updates.
>> >>
>> >> The BPF_F_LOCK path calls lookup_nulls_elem_raw() without holding the
>> >> bucket lock, then dereferences the element via copy_map_value_locked().
>> >> A concurrent htab_map_delete_elem() can delete and free the element
>> >> between these steps.
>> >>
>> >> free_htab_elem() uses bpf_mem_cache_free(), which immediately returns
>> >> the object to the per-CPU free list (not RCU-deferred). The memory may
>> >> be reallocated before copy_map_value_locked() executes, leading to
>> >> writes into a different element.
>> >>
>> >> When lookup succeeds (l_old != NULL), the in-place update path returns
>> >> early, so the “full lookup under lock” path is not taken.
>> >>
>> >> Race:
>> >>
>> >>   CPU 0: htab_map_update_elem (BPF_F_LOCK)
>> >>          lookup_nulls_elem_raw() → E (no bucket lock)
>> >>          ...
>> >>   CPU 1: htab_map_delete_elem()
>> >>          htab_lock_bucket → hlist_nulls_del_rcu → htab_unlock_bucket
>> >>          free_htab_elem → bpf_mem_cache_free (immediate free)
>> >>   CPU 1: htab_map_update_elem (new key)
>> >>          alloc_htab_elem → reuses E
>> >>   CPU 0: copy_map_value_locked(E, ...) → writes into reused object
>> >>
>> >> Reproduction:
>> >>
>> >>   1. Create BPF_MAP_TYPE_HASH with a value containing bpf_spin_lock
>> >>      (max_entries=64, 7 u64 fields + lock).
>> >>   2. Threads A: BPF_MAP_UPDATE_ELEM with BPF_F_LOCK (pattern 0xAAAA...)
>> >>   3. Threads B: DELETE + UPDATE (pattern 0xBBBB...) on same keys
>> >>   4. Threads C: same as A (pattern 0xCCCC...)
>> >>   5. Verifier threads: LOOKUP loop, detect mixed-pattern values
>> >>   6. Run 60s on >=4 CPUs
>> >>
>> >> Attached a POC. On 6.19.9 (4 vCPU QEMU, CONFIG_PREEMPT=y),
>> >> I observed ~645 torn values in 2.5M checks (~0.026%).
>> >>
>> >> Fixes: 96049f3afd50 ("bpf: introduce BPF_F_LOCK flag")
>> >
>> > Although this is a real issue, your reproducer is not accurate, it will
>> > see torn writes even without the UAF issue, because the verifier thread
>> > is not taking the lock:
>> >
>> > So the torn write pattern CCCAAAA can mean:
>> >   1. Thread A finished writing AAAAAAA (while holding the lock)
>> >   2. Thread C acquired the lock and started writing: field[0]=C, field[1]=C, field[2]=C...
>> >   3. The verifier thread reads (no lock): sees field[0]=C, field[1]=C, field[2]=C, field[3]=A, field[4]=A, field[5]=A, field[6]=A
>> >   4. Thread C finishes: field[3]=C, field[4]=C, field[5]=C, field[6]=C, releases lock
>> >
>> > This race happens regardless of whether the element is freed/reused.  It
>> > would happen even without thread B (the delete+readd thread). The
>> > corruption source is the non-atomic read, not the UAF.
>> Have you confirmed torn reads even with BPF_F_LOCK flag on the
>> BPF_MAP_LOOKUP_ELEM_CMD? I understand there must not be any torn reads
>> with spinlock taken on the lookup path.
>>
>> The reproducer looks like a good selftest to have, but it needs to be
>> ported to use libbpf, currently it looks too complex.
>
> Yes, I have confirmed torn reads even with BPF_F_LOCK on
> BPF_MAP_LOOKUP_ELEM_CMD, the results given below are with BPF_F_LOCK.
> But this is expected behaviour, BPF_F_LOCK performs a lockless lookup
> and takes only the element's embedded spin_lock for in-place value
> updates. It does not synchronize against concurrent deletes.
>
It does not synchronize against concurrent deletes, yes, but with
BPF_F_LOCK on BPF_MAP_LOOKUP_ELEM_CMD, we still take the spinlock before
copying the value from the map (syscall.c:354). Deletion does not mutate
the element, so when the value is reused, it should still take the same
lock, shouldn't it?
>
>> >
>> > If you change the preproducer like:
>> >
>> > -- >8 --
>> >
>> > --- repro.c     2026-03-26 05:22:49.012503218 -0700
>> > +++ repro2.c    2026-03-26 06:24:40.951044279 -0700
>> > @@ -227,6 +227,7 @@
>> >         attr.map_fd = fd;
>> >         attr.key = (uint64_t)(unsigned long)key;
>> >         attr.value = (uint64_t)(unsigned long)val;
>> > +       attr.flags = BPF_F_LOCK;
>> >         return bpf_sys(BPF_MAP_LOOKUP_ELEM_CMD, &attr, sizeof(attr));
>> >  }
>> >
>> > -- 8< --
>> >
>> > Now it will detect the correct UAF problem.
>> >
>> > I verified that this updated reproducer shows the problem, the following
>> > kernel diff fixes it:
>> >
>> > -- >8 --
>> >
>> > diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
>> > index bc6bc8bb871d..af33f62069f0 100644
>> > --- a/kernel/bpf/hashtab.c
>> > +++ b/kernel/bpf/hashtab.c
>> > @@ -953,7 +953,7 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
>> >
>> >         if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
>> >                 bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
>> > -       bpf_mem_cache_free(&htab->ma, l);
>> > +       bpf_mem_cache_free_rcu(&htab->ma, l);
>> >  }
>> >
>> >  static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
>> >
>> > -- 8< --
>> >
>> > Before:
>> >
>> > [root@alarm host0]# ./repro2
>> > Running 10 threads for 60 seconds...
>> >
>> > Total checks:    49228421
>> > Torn writes:     5470
>> > Max torn fields: 3 / 7
>> > Corruption rate: 0.011111%
>> >
>> > Cross-pattern breakdown:
>> >   A in B: 8595
>> >   C in B: 7826
>> >   Unknown: 1
>> >
>> > First 20 events:
>> >   [0] check #42061 seq=39070 CCCBBBB
>> >   [1] check #65714 seq=60575 CCCBBBB
>> >   [2] check #65287 seq=60575 CCCBBBB
>> >   [3] check #70474 seq=65793 AAABBBB
>> >   [4] check #70907 seq=65793 AAABBBB
>> >   [5] check #103389 seq=95745 AAABBBB
>> >   [6] check #107208 seq=98672 CCCBBBB
>> >   [7] check #108218 seq=100387 CCCBBBB
>> >   [8] check #111490 seq=103388 CCCBBBB
>> >   [9] check #140942 seq=128894 CCCBBBB
>> >   [10] check #164845 seq=151828 CCCBBBB
>> >   [11] check #163993 seq=151828 CCCBBBB
>> >   [12] check #169184 seq=155453 CCCBBBB
>> >   [13] check #171383 seq=158572 AAABBBB
>> >   [14] check #179943 seq=165425 CCCBBBB
>> >   [15] check #189218 seq=173926 CCCBBBB
>> >   [16] check #192119 seq=177892 CCCBBBB
>> >   [17] check #194253 seq=180562 AAABBBB
>> >   [18] check #202169 seq=187253 CCCBBBB
>> >   [19] check #205452 seq=189021 CCCBBBB
>> >
>> > CORRUPTION DETECTED
>> >
>> > After:
>> >
>> > [root@alarm host0]# ./repro2
>> > Running 10 threads for 60 seconds...
>> >
>> > Total checks:    108666576
>> > Torn writes:     0
>> > Max torn fields: 0 / 7
>> >
>> > No corruption detected (try more CPUs or longer run)
>> > [root@alarm host0]# nproc
>> > 16
>> >
>> > I will send a patch to fix this soon after validating the above kernel
>> > diff and figuring out how we got to this state in htab_elem_free() by
>> > analyzing the git history.
>> >
>> > Thanks for the report.
>> > Puranjay

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [BUG] bpf: use-after-free in hashtab BPF_F_LOCK in-place update path
  2026-03-26 15:43       ` Mykyta Yatsenko
@ 2026-03-26 15:47         ` Mykyta Yatsenko
  2026-03-26 15:57           ` Puranjay Mohan
  0 siblings, 1 reply; 12+ messages in thread
From: Mykyta Yatsenko @ 2026-03-26 15:47 UTC (permalink / raw)
  To: Puranjay Mohan; +Cc: Aaron Esau, bpf, ast, daniel, andrii

Mykyta Yatsenko <mykyta.yatsenko5@gmail.com> writes:

> Puranjay Mohan <puranjay12@gmail.com> writes:
>
>> On Thu, Mar 26, 2026 at 3:26 PM Mykyta Yatsenko
>> <mykyta.yatsenko5@gmail.com> wrote:
>>>
>>> Puranjay Mohan <puranjay@kernel.org> writes:
>>>
>>> > Aaron Esau <aaron1esau@gmail.com> writes:
>>> >
>>> >> Reported-by: Aaron Esau <aaron1esau@gmail.com>
>>> >>
>>> >> htab_map_update_elem() has a use-after-free when BPF_F_LOCK is used
>>> >> for in-place updates.
>>> >>
>>> >> The BPF_F_LOCK path calls lookup_nulls_elem_raw() without holding the
>>> >> bucket lock, then dereferences the element via copy_map_value_locked().
>>> >> A concurrent htab_map_delete_elem() can delete and free the element
>>> >> between these steps.
>>> >>
>>> >> free_htab_elem() uses bpf_mem_cache_free(), which immediately returns
>>> >> the object to the per-CPU free list (not RCU-deferred). The memory may
>>> >> be reallocated before copy_map_value_locked() executes, leading to
>>> >> writes into a different element.
>>> >>
>>> >> When lookup succeeds (l_old != NULL), the in-place update path returns
>>> >> early, so the “full lookup under lock” path is not taken.
>>> >>
>>> >> Race:
>>> >>
>>> >>   CPU 0: htab_map_update_elem (BPF_F_LOCK)
>>> >>          lookup_nulls_elem_raw() → E (no bucket lock)
>>> >>          ...
>>> >>   CPU 1: htab_map_delete_elem()
>>> >>          htab_lock_bucket → hlist_nulls_del_rcu → htab_unlock_bucket
>>> >>          free_htab_elem → bpf_mem_cache_free (immediate free)
>>> >>   CPU 1: htab_map_update_elem (new key)
>>> >>          alloc_htab_elem → reuses E
>>> >>   CPU 0: copy_map_value_locked(E, ...) → writes into reused object
>>> >>
>>> >> Reproduction:
>>> >>
>>> >>   1. Create BPF_MAP_TYPE_HASH with a value containing bpf_spin_lock
>>> >>      (max_entries=64, 7 u64 fields + lock).
>>> >>   2. Threads A: BPF_MAP_UPDATE_ELEM with BPF_F_LOCK (pattern 0xAAAA...)
>>> >>   3. Threads B: DELETE + UPDATE (pattern 0xBBBB...) on same keys
>>> >>   4. Threads C: same as A (pattern 0xCCCC...)
>>> >>   5. Verifier threads: LOOKUP loop, detect mixed-pattern values
>>> >>   6. Run 60s on >=4 CPUs
>>> >>
>>> >> Attached a POC. On 6.19.9 (4 vCPU QEMU, CONFIG_PREEMPT=y),
>>> >> I observed ~645 torn values in 2.5M checks (~0.026%).
>>> >>
>>> >> Fixes: 96049f3afd50 ("bpf: introduce BPF_F_LOCK flag")
>>> >
>>> > Although this is a real issue, your reproducer is not accurate, it will
>>> > see torn writes even without the UAF issue, because the verifier thread
>>> > is not taking the lock:
>>> >
>>> > So the torn write pattern CCCAAAA can mean:
>>> >   1. Thread A finished writing AAAAAAA (while holding the lock)
>>> >   2. Thread C acquired the lock and started writing: field[0]=C, field[1]=C, field[2]=C...
>>> >   3. The verifier thread reads (no lock): sees field[0]=C, field[1]=C, field[2]=C, field[3]=A, field[4]=A, field[5]=A, field[6]=A
>>> >   4. Thread C finishes: field[3]=C, field[4]=C, field[5]=C, field[6]=C, releases lock
>>> >
>>> > This race happens regardless of whether the element is freed/reused.  It
>>> > would happen even without thread B (the delete+readd thread). The
>>> > corruption source is the non-atomic read, not the UAF.
>>> Have you confirmed torn reads even with BPF_F_LOCK flag on the
>>> BPF_MAP_LOOKUP_ELEM_CMD? I understand there must not be any torn reads
>>> with spinlock taken on the lookup path.
>>>
>>> The reproducer looks like a good selftest to have, but it needs to be
>>> ported to use libbpf, currently it looks too complex.
>>
>> Yes, I have confirmed torn reads even with BPF_F_LOCK on
>> BPF_MAP_LOOKUP_ELEM_CMD, the results given below are with BPF_F_LOCK.
>> But this is expected behaviour, BPF_F_LOCK performs a lockless lookup
>> and takes only the element's embedded spin_lock for in-place value
>> updates. It does not synchronize against concurrent deletes.
>>
> It does not synchronize against concurrent deletes, yes, but with
> BPF_F_LOCK on BPF_MAP_LOOKUP_ELEM_CMD, we still take the spinlock before
> copying value from the map (syscall.c:354). Deletion does not mutate the
> element, so when the value is reused, it should still take the same lock,
> shouldn't it?

Ok, I think this is just a wonky reproducer:
fill_value(&val, PATTERN_B, seq++);
map_update(g_map_fd, &g_key, &val, BPF_ANY);

should have been BPF_ANY | BPF_F_LOCK
>>
>>> >
>>> > If you change the preproducer like:
>>> >
>>> > -- >8 --
>>> >
>>> > --- repro.c     2026-03-26 05:22:49.012503218 -0700
>>> > +++ repro2.c    2026-03-26 06:24:40.951044279 -0700
>>> > @@ -227,6 +227,7 @@
>>> >         attr.map_fd = fd;
>>> >         attr.key = (uint64_t)(unsigned long)key;
>>> >         attr.value = (uint64_t)(unsigned long)val;
>>> > +       attr.flags = BPF_F_LOCK;
>>> >         return bpf_sys(BPF_MAP_LOOKUP_ELEM_CMD, &attr, sizeof(attr));
>>> >  }
>>> >
>>> > -- 8< --
>>> >
>>> > Now it will detect the correct UAF problem.
>>> >
>>> > I verified that this updated reproducer shows the problem, the following
>>> > kernel diff fixes it:
>>> >
>>> > -- >8 --
>>> >
>>> > diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
>>> > index bc6bc8bb871d..af33f62069f0 100644
>>> > --- a/kernel/bpf/hashtab.c
>>> > +++ b/kernel/bpf/hashtab.c
>>> > @@ -953,7 +953,7 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
>>> >
>>> >         if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
>>> >                 bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
>>> > -       bpf_mem_cache_free(&htab->ma, l);
>>> > +       bpf_mem_cache_free_rcu(&htab->ma, l);
>>> >  }
>>> >
>>> >  static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
>>> >
>>> > -- 8< --
>>> >
>>> > Before:
>>> >
>>> > [root@alarm host0]# ./repro2
>>> > Running 10 threads for 60 seconds...
>>> >
>>> > Total checks:    49228421
>>> > Torn writes:     5470
>>> > Max torn fields: 3 / 7
>>> > Corruption rate: 0.011111%
>>> >
>>> > Cross-pattern breakdown:
>>> >   A in B: 8595
>>> >   C in B: 7826
>>> >   Unknown: 1
>>> >
>>> > First 20 events:
>>> >   [0] check #42061 seq=39070 CCCBBBB
>>> >   [1] check #65714 seq=60575 CCCBBBB
>>> >   [2] check #65287 seq=60575 CCCBBBB
>>> >   [3] check #70474 seq=65793 AAABBBB
>>> >   [4] check #70907 seq=65793 AAABBBB
>>> >   [5] check #103389 seq=95745 AAABBBB
>>> >   [6] check #107208 seq=98672 CCCBBBB
>>> >   [7] check #108218 seq=100387 CCCBBBB
>>> >   [8] check #111490 seq=103388 CCCBBBB
>>> >   [9] check #140942 seq=128894 CCCBBBB
>>> >   [10] check #164845 seq=151828 CCCBBBB
>>> >   [11] check #163993 seq=151828 CCCBBBB
>>> >   [12] check #169184 seq=155453 CCCBBBB
>>> >   [13] check #171383 seq=158572 AAABBBB
>>> >   [14] check #179943 seq=165425 CCCBBBB
>>> >   [15] check #189218 seq=173926 CCCBBBB
>>> >   [16] check #192119 seq=177892 CCCBBBB
>>> >   [17] check #194253 seq=180562 AAABBBB
>>> >   [18] check #202169 seq=187253 CCCBBBB
>>> >   [19] check #205452 seq=189021 CCCBBBB
>>> >
>>> > CORRUPTION DETECTED
>>> >
>>> > After:
>>> >
>>> > [root@alarm host0]# ./repro2
>>> > Running 10 threads for 60 seconds...
>>> >
>>> > Total checks:    108666576
>>> > Torn writes:     0
>>> > Max torn fields: 0 / 7
>>> >
>>> > No corruption detected (try more CPUs or longer run)
>>> > [root@alarm host0]# nproc
>>> > 16
>>> >
>>> > I will send a patch to fix this soon after validating the above kernel
>>> > diff and figuring out how we got to this state in htab_elem_free() by
>>> > analyzing the git history.
>>> >
>>> > Thanks for the report.
>>> > Puranjay

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [BUG] bpf: use-after-free in hashtab BPF_F_LOCK in-place update path
  2026-03-26 15:47         ` Mykyta Yatsenko
@ 2026-03-26 15:57           ` Puranjay Mohan
  2026-03-27  2:44             ` Aaron Esau
  0 siblings, 1 reply; 12+ messages in thread
From: Puranjay Mohan @ 2026-03-26 15:57 UTC (permalink / raw)
  To: Mykyta Yatsenko; +Cc: Aaron Esau, bpf, ast, daniel, andrii

On Thu, Mar 26, 2026 at 3:47 PM Mykyta Yatsenko
<mykyta.yatsenko5@gmail.com> wrote:
>
> Mykyta Yatsenko <mykyta.yatsenko5@gmail.com> writes:
>
> > Puranjay Mohan <puranjay12@gmail.com> writes:
> >
> >> On Thu, Mar 26, 2026 at 3:26 PM Mykyta Yatsenko
> >> <mykyta.yatsenko5@gmail.com> wrote:
> >>>
> >>> Puranjay Mohan <puranjay@kernel.org> writes:
> >>>
> >>> > Aaron Esau <aaron1esau@gmail.com> writes:
> >>> >
> >>> >> Reported-by: Aaron Esau <aaron1esau@gmail.com>
> >>> >>
> >>> >> htab_map_update_elem() has a use-after-free when BPF_F_LOCK is used
> >>> >> for in-place updates.
> >>> >>
> >>> >> The BPF_F_LOCK path calls lookup_nulls_elem_raw() without holding the
> >>> >> bucket lock, then dereferences the element via copy_map_value_locked().
> >>> >> A concurrent htab_map_delete_elem() can delete and free the element
> >>> >> between these steps.
> >>> >>
> >>> >> free_htab_elem() uses bpf_mem_cache_free(), which immediately returns
> >>> >> the object to the per-CPU free list (not RCU-deferred). The memory may
> >>> >> be reallocated before copy_map_value_locked() executes, leading to
> >>> >> writes into a different element.
> >>> >>
> >>> >> When lookup succeeds (l_old != NULL), the in-place update path returns
> >>> >> early, so the “full lookup under lock” path is not taken.
> >>> >>
> >>> >> Race:
> >>> >>
> >>> >>   CPU 0: htab_map_update_elem (BPF_F_LOCK)
> >>> >>          lookup_nulls_elem_raw() → E (no bucket lock)
> >>> >>          ...
> >>> >>   CPU 1: htab_map_delete_elem()
> >>> >>          htab_lock_bucket → hlist_nulls_del_rcu → htab_unlock_bucket
> >>> >>          free_htab_elem → bpf_mem_cache_free (immediate free)
> >>> >>   CPU 1: htab_map_update_elem (new key)
> >>> >>          alloc_htab_elem → reuses E
> >>> >>   CPU 0: copy_map_value_locked(E, ...) → writes into reused object
> >>> >>
> >>> >> Reproduction:
> >>> >>
> >>> >>   1. Create BPF_MAP_TYPE_HASH with a value containing bpf_spin_lock
> >>> >>      (max_entries=64, 7 u64 fields + lock).
> >>> >>   2. Threads A: BPF_MAP_UPDATE_ELEM with BPF_F_LOCK (pattern 0xAAAA...)
> >>> >>   3. Threads B: DELETE + UPDATE (pattern 0xBBBB...) on same keys
> >>> >>   4. Threads C: same as A (pattern 0xCCCC...)
> >>> >>   5. Verifier threads: LOOKUP loop, detect mixed-pattern values
> >>> >>   6. Run 60s on >=4 CPUs
> >>> >>
> >>> >> Attached a POC. On 6.19.9 (4 vCPU QEMU, CONFIG_PREEMPT=y),
> >>> >> I observed ~645 torn values in 2.5M checks (~0.026%).
> >>> >>
> >>> >> Fixes: 96049f3afd50 ("bpf: introduce BPF_F_LOCK flag")
> >>> >
> >>> > Although this is a real issue, your reproducer is not accurate, it will
> >>> > see torn writes even without the UAF issue, because the verifier thread
> >>> > is not taking the lock:
> >>> >
> >>> > So the torn write pattern CCCAAAA can mean:
> >>> >   1. Thread A finished writing AAAAAAA (while holding the lock)
> >>> >   2. Thread C acquired the lock and started writing: field[0]=C, field[1]=C, field[2]=C...
> >>> >   3. The verifier thread reads (no lock): sees field[0]=C, field[1]=C, field[2]=C, field[3]=A, field[4]=A, field[5]=A, field[6]=A
> >>> >   4. Thread C finishes: field[3]=C, field[4]=C, field[5]=C, field[6]=C, releases lock
> >>> >
> >>> > This race happens regardless of whether the element is freed/reused.  It
> >>> > would happen even without thread B (the delete+readd thread). The
> >>> > corruption source is the non-atomic read, not the UAF.
> >>> Have you confirmed torn reads even with BPF_F_LOCK flag on the
> >>> BPF_MAP_LOOKUP_ELEM_CMD? I understand there must not be any torn reads
> >>> with spinlock taken on the lookup path.
> >>>
> >>> The reproducer looks like a good selftest to have, but it needs to be
> >>> ported to use libbpf, currently it looks too complex.
> >>
> >> Yes, I have confirmed torn reads even with BPF_F_LOCK on
> >> BPF_MAP_LOOKUP_ELEM_CMD, the results given below are with BPF_F_LOCK.
> >> But this is expected behaviour, BPF_F_LOCK performs a lockless lookup
> >> and takes only the element's embedded spin_lock for in-place value
> >> updates. It does not synchronize against concurrent deletes.
> >>
> > It does not synchronize against concurrent deletes, yes, but with
> > BPF_F_LOCK on BPF_MAP_LOOKUP_ELEM_CMD, we still take the spinlock before
> > copying value from the map (syscall.c:354). Deletion does not mutate the
> > element, so when the value is reused, it should still take the same lock,
> > shouldn't it?
>
> Ok, I think this is just a wonky reproducer:
> fill_value(&val, PATTERN_B, seq++);
> map_update(g_map_fd, &g_key, &val, BPF_ANY);
>
> should have been BPF_ANY | BPF_F_LOCK

Yes, if all the users have BPF_F_LOCK then it will work because the
re-used element will have the same lock and everything is serialized.
But this reproducer is trying to show that the elements are being
re-used, which they are.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [BUG] bpf: use-after-free in hashtab BPF_F_LOCK in-place update path
  2026-03-26 15:57           ` Puranjay Mohan
@ 2026-03-27  2:44             ` Aaron Esau
  2026-03-27  3:21               ` Kumar Kartikeya Dwivedi
  0 siblings, 1 reply; 12+ messages in thread
From: Aaron Esau @ 2026-03-27  2:44 UTC (permalink / raw)
  To: Puranjay Mohan; +Cc: Mykyta Yatsenko, bpf, ast, daniel, andrii

[-- Attachment #1: Type: text/plain, Size: 1336 bytes --]

Puranjay Mohan <puranjay12@gmail.com> writes:

> Yes, if all the users have BPF_F_LOCK then it will work because the
> re-used element will have the same lock and everything is serialized.
> But this reproducer is trying to show that the elements are being
> re-used, which they are.

Updated reproducer attached. All operations now use BPF_F_LOCK (updates,
re-add after delete, and lookups), so value access is serialized via the
element's embedded spin_lock. The value size is ~4000 bytes. Torn writes
are still observed (3 in 2.7M).

With full spin_lock coverage, the only remaining unlocked write path is
alloc_htab_elem() -> copy_map_value(), which copies into a newly
allocated element prior to insertion. For this to race with a locked
reader or writer, the reader must be dereferencing a stale pointer to
memory that has been freed and reallocated -- a use-after-free.

The root cause is bpf_mem_cache_free() in htab_elem_free(), which
immediately returns memory to the per-CPU freelist rather than deferring
via RCU. This allows reuse while the syscall path is still within an RCU
read-side critical section. Preallocated maps do not have this problem
because freed elements remain within a pool of valid htab_elems.

Switching to bpf_mem_cache_free_rcu(), as Puranjay showed earlier in this
thread, eliminates the corruption.

[-- Attachment #2: poc-bpf-hashtab-uaf_v2.c --]
[-- Type: text/x-csrc, Size: 12622 bytes --]

/*
 * BPF hashtab UAF — torn write reproducer (v2)
 *
 * Races BPF_F_LOCK in-place updates against concurrent delete+re-add
 * on the same key. All operations use BPF_F_LOCK so every value access
 * goes through the element's embedded spin_lock. Torn writes therefore
 * prove the lockless lookup returned a stale (freed/reallocated) element,
 * since alloc_htab_elem()->copy_map_value() is the only path that writes
 * without the spin_lock.
 *
 * v2 changes from original:
 *   - BPF_F_LOCK on lookup (serializes reads via spin_lock)
 *   - BPF_F_LOCK on re-add (falls through to normal create path)
 *   - ~4000 byte value to widen the copy_map_value race window
 *
 * Requires: CONFIG_BPF_SYSCALL=y, root/CAP_BPF, SMP (>= 4 CPUs)
 */

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sched.h>
#include <signal.h>

#ifndef __NR_bpf
#define __NR_bpf 321
#endif

/* Raw bpf(2) command numbers (mirror enum bpf_cmd in
 * include/uapi/linux/bpf.h), suffixed _CMD so they cannot clash with
 * system headers. Values are kernel ABI and must not change. */
enum bpf_cmd_num {
	BPF_MAP_CREATE_CMD       = 0,
	BPF_MAP_LOOKUP_ELEM_CMD  = 1,
	BPF_MAP_UPDATE_ELEM_CMD  = 2,
	BPF_MAP_DELETE_ELEM_CMD  = 3,
	BPF_BTF_LOAD_CMD         = 18,
};

/* UAPI constants redefined locally (include/uapi/linux/bpf.h and btf.h)
 * so the POC builds without kernel headers or libbpf. */
#define BPF_MAP_TYPE_HASH    1
#define BPF_ANY     0	/* update flag: create or overwrite */
#define BPF_EXIST   2	/* update flag: element must already exist */
#define BPF_F_LOCK  4	/* lookup/update under the value's bpf_spin_lock */
#define BPF_F_NO_PREALLOC (1U << 0)	/* map flag: allocate elements on demand */

#define BTF_MAGIC   0xeB9F
#define BTF_VERSION 1
#define BTF_KIND_INT    1
#define BTF_KIND_STRUCT 4
#define BTF_INT_SIGNED (1 << 0)	/* BTF int encoding bit */

/* BTF blob header (mirrors struct btf_header in include/uapi/linux/btf.h).
 * type_off/str_off are relative to the end of this header; the kernel
 * validates these against btf_size at BPF_BTF_LOAD time. */
struct btf_header {
	uint16_t magic;		/* BTF_MAGIC (0xeB9F) */
	uint8_t  version;	/* BTF_VERSION */
	uint8_t  flags;
	uint32_t hdr_len;	/* sizeof(struct btf_header) */
	uint32_t type_off;	/* start of type section */
	uint32_t type_len;	/* type section size in bytes */
	uint32_t str_off;	/* start of string section */
	uint32_t str_len;	/* string section size in bytes */
};

/* One BTF type record. 'info' packs the kind and member count (see
 * btf_info_enc below). The union holds a byte size for sized kinds
 * (INT, STRUCT) or a referenced type id for reference kinds. */
struct btf_type {
	uint32_t name_off;	/* offset of the type name in the string section */
	uint32_t info;		/* kind << 24 | vlen */
	union { uint32_t size; uint32_t type; };
};

/* Struct/union member record; 'vlen' of these immediately follow the
 * owning btf_type in the serialized type section. */
struct btf_member {
	uint32_t name_off;	/* member name in the string section */
	uint32_t type;		/* member's BTF type id */
	uint32_t offset;	/* bit offset of the member within the struct */
};

/* Pack a btf_type.info word: the kind goes in the top byte, the member
 * count (vlen) in the low 16 bits. Matches the kernel's BTF_INFO_ENC()
 * with kind_flag left as zero. */
static inline uint32_t btf_info_enc(uint32_t kind, uint32_t vlen)
{
	uint32_t kind_bits = kind << 24;
	uint32_t vlen_bits = vlen & 0xffff;

	return kind_bits | vlen_bits;
}

/* Hand-rolled mirrors of the kernel's union bpf_attr sub-structs
 * (include/uapi/linux/bpf.h). Field order and sizes must match the UAPI
 * layout exactly: the kernel copies 'size' bytes of attr verbatim. */
struct bpf_attr_map_create {
	uint32_t map_type;	/* BPF_MAP_TYPE_* */
	uint32_t key_size;	/* key bytes */
	uint32_t value_size;	/* value bytes */
	uint32_t max_entries;
	uint32_t map_flags;	/* e.g. BPF_F_NO_PREALLOC */
	uint32_t inner_map_fd;
	uint32_t numa_node;
	char     map_name[16];
	uint32_t map_ifindex;
	uint32_t btf_fd;		/* fd from BPF_BTF_LOAD */
	uint32_t btf_key_type_id;	/* BTF type id of the key */
	uint32_t btf_value_type_id;	/* BTF type id of the value */
};

/* Attr for BPF_BTF_LOAD. */
struct bpf_attr_btf_load {
	uint64_t btf;		/* user pointer to the BTF blob */
	uint64_t btf_log_buf;	/* user pointer to the verifier log buffer */
	uint32_t btf_size;
	uint32_t btf_log_size;
	uint32_t btf_log_level;	/* nonzero enables the log */
};

/* Attr for BPF_MAP_{LOOKUP,UPDATE,DELETE}_ELEM. */
struct bpf_attr_map_elem {
	uint32_t map_fd;
	uint32_t __pad0;	/* keeps 'key' u64-aligned, as in the UAPI union */
	uint64_t key;		/* user pointer to the key */
	union {
		uint64_t value;		/* user pointer to the value */
		uint64_t next_key;
	};
	uint64_t flags;		/* BPF_ANY / BPF_EXIST / BPF_F_LOCK ... */
};

/* Thin wrapper over the raw bpf(2) syscall: returns an fd or 0 on
 * success, -1 with errno set on failure. */
static long bpf_sys(int cmd, void *attr, unsigned int size)
{
	long ret;

	ret = syscall(__NR_bpf, cmd, attr, size);
	return ret;
}

/* ~4000 byte value to widen the copy_map_value race window.
 * Fits in the BPF allocator's 4096-byte cache bucket. */
#define NUM_DATA_FIELDS 498
/* Mirrors BTF value type id 4 built in load_btf(): 'lock' is the storage
 * for the embedded bpf_spin_lock (a single u32), 'seq' a per-writer
 * generation counter, 'data' the pattern payload the verifier checks. */
struct map_value {
	uint32_t lock;	/* bpf_spin_lock storage; always written as 0 from userspace */
	uint32_t seq;	/* writer-local sequence number */
	uint64_t data[NUM_DATA_FIELDS];	/* filled with one PATTERN_* per write */
};

#define VALUE_SIZE sizeof(struct map_value)

/* One distinct fill pattern per writer class (threads A, B, C). */
#define PATTERN_A 0xAAAAAAAAAAAAAAAAULL
#define PATTERN_B 0xBBBBBBBBBBBBBBBBULL
#define PATTERN_C 0xCCCCCCCCCCCCCCCCULL

/*
 * Build and load a minimal hand-rolled BTF blob describing:
 *   id 1: "int"   signed 32-bit            (used as the map key type)
 *   id 2: struct bpf_spin_lock { __u32 val; }
 *   id 3: "__u32" unsigned 32-bit
 *   id 4: struct map_value { lock: id 2 @ bit 0; data: id 3 @ bit 32; }
 * BPF_F_LOCK requires the map's BTF value type to embed a bpf_spin_lock;
 * the kernel locates its offset through this BTF.
 * Returns the BTF object fd, or -1 on failure (error printed to stderr).
 */
static int load_btf(void)
{
	/* String table; the name_off values below index into it:
	 * 1="int" 5="bpf_spin_lock" 19="val" 23="__u32" 29="map_value"
	 * 39="lock" 44="data" */
	static const char strings[] =
		"\0" "int\0" "bpf_spin_lock\0" "val\0"
		"__u32\0" "map_value\0" "lock\0" "data\0";

	/* id 1: INT, size 4; extra word encodes signedness and 32 bits. */
	struct btf_type t1 = { 1, btf_info_enc(BTF_KIND_INT, 0), {4} };
	uint32_t t1_extra = (BTF_INT_SIGNED << 24) | 32;
	/* id 2: struct bpf_spin_lock, size 4, member "val" (id 3) @ bit 0. */
	struct btf_type t2 = { 5, btf_info_enc(BTF_KIND_STRUCT, 1), {4} };
	struct btf_member t2_m = { 19, 3, 0 };
	/* id 3: INT "__u32", size 4, unsigned (no sign bit in extra). */
	struct btf_type t3 = { 23, btf_info_enc(BTF_KIND_INT, 0), {4} };
	uint32_t t3_extra = 32;
	/* id 4: struct map_value, full VALUE_SIZE, members "lock" @ bit 0
	 * and "data" @ bit 32.
	 * NOTE(review): bit 32 is where the C struct's 'seq' field sits and
	 * id 3 only covers 4 of the ~4000 bytes; the kernel needs just the
	 * spin_lock offset — confirm BTF validation accepts this. */
	struct btf_type t4 = { 29, btf_info_enc(BTF_KIND_STRUCT, 2), {VALUE_SIZE} };
	struct btf_member t4_m1 = { 39, 2, 0 };
	struct btf_member t4_m2 = { 44, 3, 32 };

	uint32_t type_len =
		sizeof(t1) + sizeof(t1_extra) +
		sizeof(t2) + sizeof(t2_m) +
		sizeof(t3) + sizeof(t3_extra) +
		sizeof(t4) + sizeof(t4_m1) + sizeof(t4_m2);
	uint32_t str_len = sizeof(strings);
	uint32_t blob_sz = sizeof(struct btf_header) + type_len + str_len;

	char *blob = calloc(1, blob_sz);
	if (!blob) return -1;

	/* Header: types immediately follow the header, strings follow the
	 * types (type_off/str_off are relative to the end of the header). */
	struct btf_header *hdr = (struct btf_header *)blob;
	hdr->magic = BTF_MAGIC;
	hdr->version = BTF_VERSION;
	hdr->hdr_len = sizeof(struct btf_header);
	hdr->type_off = 0;
	hdr->type_len = type_len;
	hdr->str_off = type_len;
	hdr->str_len = str_len;

	/* Serialize in id order; the kernel assigns ids 1..4 by position. */
	char *p = blob + sizeof(struct btf_header);
#define EMIT(x) do { memcpy(p, &(x), sizeof(x)); p += sizeof(x); } while(0)
	EMIT(t1); EMIT(t1_extra);
	EMIT(t2); EMIT(t2_m);
	EMIT(t3); EMIT(t3_extra);
	EMIT(t4); EMIT(t4_m1); EMIT(t4_m2);
#undef EMIT
	memcpy(p, strings, str_len);

	/* Request a verifier log so a rejected blob is diagnosable. */
	char log_buf[4096] = {};
	struct bpf_attr_btf_load attr = {};
	attr.btf = (uint64_t)(unsigned long)blob;
	attr.btf_size = blob_sz;
	attr.btf_log_buf = (uint64_t)(unsigned long)log_buf;
	attr.btf_log_size = sizeof(log_buf);
	attr.btf_log_level = 1;

	int fd = bpf_sys(BPF_BTF_LOAD_CMD, &attr, sizeof(attr));
	if (fd < 0)
		fprintf(stderr, "BTF load failed: %s\n", strerror(errno));
	free(blob);
	return fd;
}

static int create_map(int btf_fd, uint32_t max_entries, uint32_t map_flags)
{
	struct bpf_attr_map_create attr = {};
	attr.map_type = BPF_MAP_TYPE_HASH;
	attr.key_size = sizeof(uint32_t);
	attr.value_size = VALUE_SIZE;
	attr.max_entries = max_entries;
	attr.map_flags = map_flags;
	attr.btf_fd = btf_fd;
	attr.btf_key_type_id = 1;
	attr.btf_value_type_id = 4;

	int fd = bpf_sys(BPF_MAP_CREATE_CMD, &attr, sizeof(attr));
	if (fd < 0)
		fprintf(stderr, "Map create failed: %s\n", strerror(errno));
	return fd;
}

static int map_update(int fd, uint32_t *key, struct map_value *val, uint64_t flags)
{
	struct bpf_attr_map_elem attr = {};
	attr.map_fd = fd;
	attr.key = (uint64_t)(unsigned long)key;
	attr.value = (uint64_t)(unsigned long)val;
	attr.flags = flags;
	return bpf_sys(BPF_MAP_UPDATE_ELEM_CMD, &attr, sizeof(attr));
}

static int map_delete(int fd, uint32_t *key)
{
	struct bpf_attr_map_elem attr = {};
	attr.map_fd = fd;
	attr.key = (uint64_t)(unsigned long)key;
	return bpf_sys(BPF_MAP_DELETE_ELEM_CMD, &attr, sizeof(attr));
}

static int map_lookup(int fd, uint32_t *key, struct map_value *val, uint64_t flags)
{
	struct bpf_attr_map_elem attr = {};
	attr.map_fd = fd;
	attr.key = (uint64_t)(unsigned long)key;
	attr.value = (uint64_t)(unsigned long)val;
	attr.flags = flags;
	return bpf_sys(BPF_MAP_LOOKUP_ELEM_CMD, &attr, sizeof(attr));
}

/* Workers are clone(CLONE_VM) children, so these globals are shared; the
 * pointers below are fixed up by main() to point into a MAP_SHARED region
 * (stop flag at +0, stats at +64, log counter after stats, log at +4096). */
static volatile int *g_stop;	/* set to 1 by main() to stop all workers */
static int g_map_fd;		/* the contended hash map */
static uint32_t g_key = 1;	/* single key all racing workers target */

struct shared_stats {
	volatile uint64_t total_checks;		/* lookups attempted by verifiers */
	volatile uint64_t torn_writes;		/* values observed with mixed patterns */
	volatile uint64_t max_torn_fields;	/* worst minority-field count seen */
};
static struct shared_stats *g_stats;

#define MAX_LOG_EVENTS 20
/* One record per detected corruption (first MAX_LOG_EVENTS kept). */
struct corruption_event {
	uint64_t check_num;	/* verifier-local check count at detection */
	uint32_t seq;		/* seq field of the corrupted value */
	int      a, b, c, other;	/* per-pattern field counts */
};
static volatile int *g_log_count;	/* atomically incremented slot counter */
static struct corruption_event *g_log;

#define STACK_SIZE (1024 * 1024)

/* mmap per-thread value buffer (can't use stack — 4K value with CLONE_VM).
 * Fix vs. original: mmap failure was unchecked, and every caller
 * dereferences the result unconditionally, so a MAP_FAILED ((void *)-1)
 * return would be written through. A POC must fail loudly rather than
 * produce bogus results, so abort on allocation failure. */
static struct map_value *alloc_val(void)
{
	void *buf = mmap(NULL, sizeof(struct map_value), PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap value buffer");
		exit(1);
	}
	return buf;
}

/* Fill a value buffer: lock storage zeroed (kernel owns the real lock
 * state), sequence recorded, and every data slot set to one pattern so
 * the verifier can detect mixed-pattern (torn) values. */
static void fill_value(struct map_value *v, uint64_t pattern, uint32_t seq)
{
	uint64_t *slot = v->data;
	uint64_t *end = v->data + NUM_DATA_FIELDS;

	v->lock = 0;
	v->seq = seq;
	while (slot < end)
		*slot++ = pattern;
}

/* Updater worker: hammers the contended key with BPF_F_LOCK | BPF_EXIST
 * in-place updates, filling the whole value with its assigned pattern
 * (passed via 'arg') and an incrementing sequence number. */
static int worker_update(void *arg)
{
	uint64_t pat = (uint64_t)(unsigned long)arg;
	struct map_value *buf = alloc_val();
	uint32_t gen = 0;

	for (;;) {
		if (*g_stop)
			break;
		fill_value(buf, pat, gen);
		gen++;
		map_update(g_map_fd, &g_key, buf, BPF_F_LOCK | BPF_EXIST);
	}
	return 0;
}

/* Delete/re-add worker: forces element free + reallocation on the
 * contended key. The re-add also uses BPF_F_LOCK; since the element does
 * not exist after the delete, the kernel falls through to the normal
 * alloc_htab_elem() creation path (hashtab.c:1119). */
static int worker_delete_readd(void *arg)
{
	struct map_value *buf = alloc_val();
	uint32_t gen = 0;

	(void)arg;

	for (;;) {
		if (*g_stop)
			break;
		map_delete(g_map_fd, &g_key);
		fill_value(buf, PATTERN_B, gen);
		gen++;
		map_update(g_map_fd, &g_key, buf, BPF_F_LOCK);
	}
	return 0;
}

/* Allocator-pressure worker: churns a private 256-entry NO_PREALLOC map
 * (create/delete 200 keys per round) to keep the per-CPU free lists busy
 * and encourage reuse of freed elements from the contended map. */
static int worker_pressure(void *arg)
{
	int btf_fd = (int)(long)arg;
	struct map_value *zero_val = alloc_val();
	int fd;

	memset(zero_val, 0, sizeof(*zero_val));

	fd = create_map(btf_fd, 256, BPF_F_NO_PREALLOC);
	if (fd < 0)
		return 1;

	while (!*g_stop) {
		uint32_t k;

		for (k = 0; k < 200; k++)
			map_update(fd, &k, zero_val, BPF_ANY);
		for (k = 0; k < 200; k++)
			map_delete(fd, &k);
	}

	close(fd);
	return 0;
}

/*
 * Verifier worker: repeatedly reads the contended key with BPF_F_LOCK and
 * classifies each data field by writer pattern. Every writer fills the
 * entire array with a single pattern while holding the value's spin_lock,
 * so one locked read observing two distinct patterns (or a non-pattern
 * word) proves an unserialized write — the freed-element-reuse race.
 *
 * Fix vs. original: NUM_VERIFY (2) workers update g_stats concurrently in
 * MAP_SHARED memory, but total_checks/torn_writes/max_torn_fields used
 * plain volatile ++/+= which is not atomic, so counts could be lost or
 * the max corrupted. Use the same __sync builtins the log-slot counter
 * already uses.
 */
static int worker_verify(void *arg)
{
	(void)arg;
	struct map_value *val = alloc_val();
	uint64_t checks = 0;

	while (!*g_stop) {
		/* A failed lookup (key momentarily deleted) still counts. */
		if (map_lookup(g_map_fd, &g_key, val, BPF_F_LOCK) != 0) {
			checks++;
			continue;
		}

		int a = 0, b = 0, c = 0, other = 0;
		for (int i = 0; i < NUM_DATA_FIELDS; i++) {
			if (val->data[i] == PATTERN_A) a++;
			else if (val->data[i] == PATTERN_B) b++;
			else if (val->data[i] == PATTERN_C) c++;
			else other++;
		}

		int distinct = (a > 0) + (b > 0) + (c > 0);
		if (distinct >= 2 || other > 0) {
			__sync_fetch_and_add(&g_stats->torn_writes, 1);

			/* Fields not matching the majority pattern. */
			uint64_t torn = NUM_DATA_FIELDS -
				(a >= b && a >= c ? a : b >= c ? b : c);
			/* Atomic max via a CAS loop. */
			uint64_t cur = g_stats->max_torn_fields;
			while (torn > cur) {
				uint64_t prev = __sync_val_compare_and_swap(
					&g_stats->max_torn_fields, cur, torn);
				if (prev == cur)
					break;
				cur = prev;
			}

			/* Reserve a slot; only the first MAX_LOG_EVENTS kept. */
			int idx = __sync_fetch_and_add(g_log_count, 1);
			if (idx < MAX_LOG_EVENTS) {
				g_log[idx].check_num = checks;
				g_log[idx].seq = val->seq;
				g_log[idx].a = a;
				g_log[idx].b = b;
				g_log[idx].c = c;
				g_log[idx].other = other;
			}
		}

		checks++;
	}

	__sync_fetch_and_add(&g_stats->total_checks, checks);
	return 0;
}

/* Launch a worker via raw clone(): shares address space, fds, and fs so
 * the globals and map fd are common, but each worker gets its own 1 MiB
 * mmap'd stack (passed as its top — stacks grow down here).
 * Returns the child pid, or -1 with the error reported on stderr. */
static pid_t spawn_worker(int (*fn)(void *), void *arg)
{
	char *stack;
	pid_t child;

	stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE,
		     MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
	if (stack == MAP_FAILED) {
		perror("mmap stack");
		return -1;
	}

	child = clone(fn, stack + STACK_SIZE,
		      CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | SIGCHLD,
		      arg);
	if (child < 0)
		perror("clone");
	return child;
}

/* Run configuration: worker count per role and total runtime. */
#define RACE_SECONDS 60
#define NUM_UPDATE_A 2	/* BPF_F_LOCK in-place updaters, pattern A */
#define NUM_DELETE_B 2	/* delete + locked re-add, pattern B */
#define NUM_UPDATE_C 2	/* BPF_F_LOCK in-place updaters, pattern C */
#define NUM_PRESSURE 2	/* allocator churn on a separate map */
#define NUM_VERIFY   2	/* locked readers detecting torn values */
#define TOTAL (NUM_UPDATE_A + NUM_DELETE_B + NUM_UPDATE_C + NUM_PRESSURE + NUM_VERIFY)

/*
 * Entry point: sets up shared state, primes the contended map, races the
 * workers for RACE_SECONDS, then reports torn-write statistics.
 */
int main(void)
{
	/* Shared region (visible to all clone()d workers): page 0 holds the
	 * stop flag (+0), stats (+64) and the log counter after them;
	 * page 1 (+4096) holds the corruption event log. */
	size_t shm_sz = 4096 * 4;
	void *shm = mmap(NULL, shm_sz, PROT_READ | PROT_WRITE,
			 MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (shm == MAP_FAILED) {
		perror("mmap shared");
		return 1;
	}
	memset(shm, 0, shm_sz);

	g_stop = (volatile int *)shm;
	g_stats = (struct shared_stats *)((char *)shm + 64);
	g_log_count = (volatile int *)((char *)shm + 64 + sizeof(struct shared_stats));
	g_log = (struct corruption_event *)((char *)shm + 4096);

	/* Probe for bpf(2): an all-zero attr is expected to fail, but only
	 * ENOSYS means the syscall itself is unavailable. */
	struct bpf_attr_map_create test = {};
	if (bpf_sys(BPF_MAP_CREATE_CMD, &test, sizeof(test)) < 0 && errno == ENOSYS) {
		fprintf(stderr, "BPF syscall not available\n");
		return 1;
	}

	int btf_fd = load_btf();
	if (btf_fd < 0)
		return 1;

	/* Tiny map (8 entries) with NO_PREALLOC: frees go through
	 * bpf_mem_cache_free(), the immediate-reuse path under test. */
	g_map_fd = create_map(btf_fd, 8, BPF_F_NO_PREALLOC);
	if (g_map_fd < 0)
		return 1;

	/* Seed the contended key so BPF_EXIST in-place updates succeed. */
	struct map_value *init_val = alloc_val();
	fill_value(init_val, PATTERN_B, 0);
	if (map_update(g_map_fd, &g_key, init_val, BPF_ANY) < 0) {
		fprintf(stderr, "Initial update failed: %s\n", strerror(errno));
		return 1;
	}

	printf("Running %d threads for %d seconds (value_size=%lu)...\n",
	       TOTAL, RACE_SECONDS, (unsigned long)VALUE_SIZE);

	pid_t pids[TOTAL];
	int idx = 0;

	/* A/C: locked in-place updaters; B: delete + locked re-add;
	 * pressure: allocator churn; verify: locked readers. */
	for (int i = 0; i < NUM_UPDATE_A; i++)
		pids[idx++] = spawn_worker(worker_update, (void *)PATTERN_A);
	for (int i = 0; i < NUM_DELETE_B; i++)
		pids[idx++] = spawn_worker(worker_delete_readd, NULL);
	for (int i = 0; i < NUM_UPDATE_C; i++)
		pids[idx++] = spawn_worker(worker_update, (void *)PATTERN_C);
	for (int i = 0; i < NUM_PRESSURE; i++)
		pids[idx++] = spawn_worker(worker_pressure, (void *)(long)btf_fd);
	for (int i = 0; i < NUM_VERIFY; i++)
		pids[idx++] = spawn_worker(worker_verify, NULL);

	sleep(RACE_SECONDS);
	*g_stop = 1;

	/* Reap every successfully spawned worker before reading stats. */
	for (int i = 0; i < TOTAL; i++)
		if (pids[i] > 0)
			waitpid(pids[i], NULL, 0);

	int logged = *g_log_count;
	if (logged > MAX_LOG_EVENTS) logged = MAX_LOG_EVENTS;

	/* NOTE(review): %lu for uint64_t assumes LP64; PRIu64 from
	 * <inttypes.h> would be portable — confirm target platforms. */
	printf("\nTotal checks:    %lu\n", g_stats->total_checks);
	printf("Torn writes:     %lu\n", g_stats->torn_writes);
	printf("Max torn fields: %lu / %d\n", g_stats->max_torn_fields,
	       NUM_DATA_FIELDS);

	if (g_stats->torn_writes > 0) {
		printf("Corruption rate: %.6f%%\n",
		       (double)g_stats->torn_writes /
		       g_stats->total_checks * 100.0);
	}

	for (int e = 0; e < logged; e++)
		printf("  [%d] check #%lu seq=%u A=%d B=%d C=%d ?=%d\n",
		       e, g_log[e].check_num, g_log[e].seq,
		       g_log[e].a, g_log[e].b, g_log[e].c, g_log[e].other);

	printf("\n%s\n", g_stats->torn_writes > 0 ?
	       "CORRUPTION DETECTED" :
	       "No corruption detected (try more CPUs or longer run)");

	close(g_map_fd);
	close(btf_fd);
	return 0;
}

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [BUG] bpf: use-after-free in hashtab BPF_F_LOCK in-place update path
  2026-03-27  2:44             ` Aaron Esau
@ 2026-03-27  3:21               ` Kumar Kartikeya Dwivedi
  2026-03-27 16:09                 ` Mykyta Yatsenko
  0 siblings, 1 reply; 12+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2026-03-27  3:21 UTC (permalink / raw)
  To: Aaron Esau; +Cc: Puranjay Mohan, Mykyta Yatsenko, bpf, ast, daniel, andrii

On Fri, 27 Mar 2026 at 03:54, Aaron Esau <aaron1esau@gmail.com> wrote:
>
> Puranjay Mohan <puranjay12@gmail.com> writes:
>
> > Yes, if all the users have BPF_F_LOCK then it will work because the
> > re-used element will have the same lock and everything is serialized.
> > But this reproducer is trying to show that the elements are being
> > re-used, which they are.
>
> Updated reproducer attached. All operations now use BPF_F_LOCK (updates,
> re-add after delete, and lookups), so value access is serialized via the
> element's embedded spin_lock. The value size is ~4000 bytes. Torn writes
> are still observed (3 in 2.7M).
>
> With full spin_lock coverage, the only remaining unlocked write path is
> alloc_htab_elem() -> copy_map_value(), which copies into a newly
> allocated element prior to insertion. For this to race with a locked
> reader or writer, the reader must be dereferencing a stale pointer to
> memory that has been freed and reallocated -- a use-after-free.
>
> The root cause is bpf_mem_cache_free() in htab_elem_free(), which
> immediately returns memory to the per-CPU freelist rather than deferring
> via RCU. This allows reuse while the syscall path is still within an RCU
> read-side critical section. Preallocated maps do not have this problem
> because freed elements remain within a pool of valid htab_elems.
>
> Switching to bpf_mem_cache_free_rcu(), as Puranjay showed earlier in this
> thread, eliminates the corruption.

As I said, the reuse is intended behavior. We won't be switching to
bpf_mem_cache_free_rcu().
This is mostly for performance reasons, in case you're perplexed as to why.
That said, we could do copy_map_value_locked() if map_flags &
BPF_F_LOCK to fix this particular case.
Such that if the value is being reused we will still have serialization.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [BUG] bpf: use-after-free in hashtab BPF_F_LOCK in-place update path
  2026-03-27  3:21               ` Kumar Kartikeya Dwivedi
@ 2026-03-27 16:09                 ` Mykyta Yatsenko
  0 siblings, 0 replies; 12+ messages in thread
From: Mykyta Yatsenko @ 2026-03-27 16:09 UTC (permalink / raw)
  To: Kumar Kartikeya Dwivedi, Aaron Esau
  Cc: Puranjay Mohan, bpf, ast, daniel, andrii

Kumar Kartikeya Dwivedi <memxor@gmail.com> writes:

> On Fri, 27 Mar 2026 at 03:54, Aaron Esau <aaron1esau@gmail.com> wrote:
>>
>> Puranjay Mohan <puranjay12@gmail.com> writes:
>>
>> > Yes, if all the users have BPF_F_LOCK then it will work because the
>> > re-used element will have the same lock and everything is serialized.
>> > But this reproducer is trying to show that the elements are being
>> > re-used, which they are.
>>
>> Updated reproducer attached. All operations now use BPF_F_LOCK (updates,
>> re-add after delete, and lookups), so value access is serialized via the
>> element's embedded spin_lock. The value size is ~4000 bytes. Torn writes
>> are still observed (3 in 2.7M).
>>
>> With full spin_lock coverage, the only remaining unlocked write path is
>> alloc_htab_elem() -> copy_map_value(), which copies into a newly
>> allocated element prior to insertion. For this to race with a locked
>> reader or writer, the reader must be dereferencing a stale pointer to
>> memory that has been freed and reallocated -- a use-after-free.
>>
>> The root cause is bpf_mem_cache_free() in htab_elem_free(), which
>> immediately returns memory to the per-CPU freelist rather than deferring
>> via RCU. This allows reuse while the syscall path is still within an RCU
>> read-side critical section. Preallocated maps do not have this problem
>> because freed elements remain within a pool of valid htab_elems.
>>
>> Switching to bpf_mem_cache_free_rcu(), as Puranjay showed earlier in this
>> thread, eliminates the corruption.
>
> As I said, the reuse is intended behavior. We won't be switching to
> bpf_mem_cache_free_rcu().
> This is mostly for performance reasons, in case you're perplexed as to why.
> That said, we could do copy_map_value_locked() if map_flags &
> BPF_F_LOCK to fix this particular case.
> Such that if the value is being reused we will still have serialization.
I think we should do copy_map_value_locked() on the reuse path.

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2026-03-27 16:09 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-03-26  8:49 [BUG] bpf: use-after-free in hashtab BPF_F_LOCK in-place update path Aaron Esau
2026-03-26 13:39 ` Puranjay Mohan
2026-03-26 14:58   ` Kumar Kartikeya Dwivedi
2026-03-26 15:02   ` Puranjay Mohan
2026-03-26 15:26   ` Mykyta Yatsenko
2026-03-26 15:33     ` Puranjay Mohan
2026-03-26 15:43       ` Mykyta Yatsenko
2026-03-26 15:47         ` Mykyta Yatsenko
2026-03-26 15:57           ` Puranjay Mohan
2026-03-27  2:44             ` Aaron Esau
2026-03-27  3:21               ` Kumar Kartikeya Dwivedi
2026-03-27 16:09                 ` Mykyta Yatsenko

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox