* [PATCH blktests] create a test for direct io offsets
@ 2025-10-14 20:54 Keith Busch
2025-10-14 21:21 ` Bart Van Assche
` (3 more replies)
0 siblings, 4 replies; 14+ messages in thread
From: Keith Busch @ 2025-10-14 20:54 UTC (permalink / raw)
To: linux-block, shinichiro.kawasaki; +Cc: Keith Busch
From: Keith Busch <kbusch@kernel.org>
Tests direct IO using various memory and length alignments against the
device's queue limits.
This should work on raw block devices, their partitions, or through
files on mounted filesystems. Much of the code is dedicated to
automatically finding the underlying block device's queue limits so that
it can work in a variety of environments.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
src/.gitignore | 1 +
src/Makefile | 1 +
src/dio-offsets.c | 952 ++++++++++++++++++++++++++++++++++++++++++++
tests/block/042 | 20 +
tests/block/042.out | 2 +
5 files changed, 976 insertions(+)
create mode 100644 src/dio-offsets.c
create mode 100644 tests/block/042
create mode 100644 tests/block/042.out
diff --git a/src/.gitignore b/src/.gitignore
index 2ece754..865675c 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -1,3 +1,4 @@
+/dio-offsets
/discontiguous-io
/loblksize
/loop_change_fd
diff --git a/src/Makefile b/src/Makefile
index ba0d9b7..179a673 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -9,6 +9,7 @@ HAVE_C_MACRO = $(shell if echo "$(H)include <$(1)>" | \
then echo 1;else echo 0; fi)
C_TARGETS := \
+ dio-offsets \
loblksize \
loop_change_fd \
loop_get_status_null \
diff --git a/src/dio-offsets.c b/src/dio-offsets.c
new file mode 100644
index 0000000..5961232
--- /dev/null
+++ b/src/dio-offsets.c
@@ -0,0 +1,952 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025 Meta Platforms, Inc. All Rights Reserved.
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <limits.h>
+#include <mntent.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdbool.h>
+
+#include <sys/stat.h>
+#include <sys/statfs.h>
+#include <sys/sysmacros.h>
+#include <sys/uio.h>
+#include <sys/utsname.h>
+
+#define power_of_2(x) ((x) && !((x) & ((x) - 1)))
+
+static unsigned long logical_block_size;
+static unsigned long dma_alignment;
+static unsigned long virt_boundary;
+static unsigned long max_segments;
+static unsigned long max_bytes;
+static int kernel_major;
+static int kernel_minor;
+static size_t buf_size;
+static long pagesize;
+static void *out_buf;
+static void *in_buf;
+static int test_fd;
+
+static void init_kernel_version()
+{
+ struct utsname buffer;
+ char *major_version_str;
+ char *minor_version_str;
+ if (uname(&buffer) != 0)
+ err(errno, "uname");
+
+ major_version_str = strtok(buffer.release, ".");
+ minor_version_str = strtok(NULL, ".");
+
+ kernel_major = strtol(major_version_str, NULL, 0);
+ kernel_minor = strtol(minor_version_str, NULL, 0);
+}
+
+static void find_mount_point(const char *filepath, char *mount_point,
+ size_t mp_len)
+{
+ char path[PATH_MAX - 1], abs_path[PATH_MAX], *pos;
+ struct stat file_stat, mp_stat, parent_stat;
+
+ if (stat(filepath, &file_stat) != 0)
+ err(errno, "stat");
+
+ strncpy(path, filepath, sizeof(path) - 1);
+ path[sizeof(path) - 1] = '\0';
+ if (realpath(path, abs_path) == NULL)
+ err(ENOENT, "realpath");
+
+ strncpy(path, abs_path, sizeof(path) - 1);
+ path[sizeof(path) - 1] = '\0';
+ while (1) {
+ if (stat(path, &mp_stat))
+ err(errno, "stat on path");
+
+ pos = strrchr(path, '/');
+ if (pos == path) {
+ strcpy(path, "/");
+ break;
+ }
+
+ if (pos == NULL)
+ break;
+
+ *pos = '\0';
+ if (path[0] == '\0')
+ strcpy(path, "/");
+
+ if (stat(path, &parent_stat))
+ err(errno, "stat on parent");
+
+ if (parent_stat.st_dev != mp_stat.st_dev) {
+ *pos = '/';
+ break;
+ }
+ }
+
+ strncpy(mount_point, path, mp_len - 1);
+ mount_point[mp_len - 1] = '\0';
+}
+
+static void find_block_device_from_mount(const char *mount_point, char *block_dev,
+ size_t bd_len)
+{
+ struct mntent *ent;
+ FILE *mounts;
+
+ mounts = setmntent("/proc/mounts", "r");
+ if (!mounts)
+ err(ENOENT, "setmntent");
+
+ while ((ent = getmntent(mounts)) != NULL) {
+ if (strcmp(ent->mnt_dir, mount_point) == 0) {
+ strncpy(block_dev, ent->mnt_fsname, bd_len - 1);
+ block_dev[bd_len - 1] = '\0';
+ endmntent(mounts);
+ return;
+ }
+ }
+
+ endmntent(mounts);
+ err(ENOENT, "%s: not found", __func__);
+}
+
+static void resolve_to_sysblock(const char *dev_path, char *path, size_t path_len)
+{
+ char resolved[PATH_MAX], class[PATH_MAX], target[PATH_MAX], base[256];
+ char *dev_name, *p, *slash, *block_pos;
+ struct stat st;
+ size_t len;
+
+ if (realpath(dev_path, resolved) != NULL)
+ dev_path = resolved;
+
+ dev_name = strrchr(dev_path, '/');
+ if (dev_name)
+ dev_name++;
+ else
+ dev_name = (char *)dev_path;
+
+ strncpy(base, dev_name, sizeof(base) - 1);
+ base[sizeof(base) - 1] = '\0';
+
+ len = strlen(base);
+ p = base + len - 1;
+ while (p > base && *p >= '0' && *p <= '9')
+ p--;
+
+ if (p > base && *p == 'p' && *(p + 1) != '\0')
+ *p = '\0';
+ else if (len >= 4 && p > base && p < base + len - 1) {
+ if ((strncmp(base, "sd", 2) == 0 ||
+ strncmp(base, "hd", 2) == 0 ||
+ strncmp(base, "vd", 2) == 0) &&
+ (*p >= 'a' && *p <= 'z'))
+ *(p + 1) = '\0';
+ }
+
+ if (snprintf(path, path_len, "/sys/block/%s/queue", base) < 0)
+ err(errno, "base too long");
+ if (stat(path, &st) == 0 && S_ISDIR(st.st_mode))
+ return;
+
+ if (snprintf(class, sizeof(class), "/sys/class/block/%s", dev_name) < 0)
+ err(errno, "class too long");
+ if (stat(class, &st) == 0) {
+ len = readlink(class, target, sizeof(target) - 1);
+ if (len != -1) {
+ target[len] = '\0';
+ block_pos = strstr(target, "/block/");
+ if (block_pos) {
+ block_pos += 7;
+ slash = strchr(block_pos, '/');
+ if (slash) {
+ len = slash - block_pos;
+ if (len < sizeof(base)) {
+ strncpy(base, block_pos, len);
+ base[len] = '\0';
+ snprintf(path, path_len,
+ "/sys/block/%s/queue", base);
+ if (stat(path, &st) == 0 &&
+ S_ISDIR(st.st_mode))
+ return;
+ }
+ } else {
+ snprintf(path, path_len,
+ "/sys/block/%s/queue", block_pos);
+ if (stat(path, &st) == 0 &&
+ S_ISDIR(st.st_mode)) {
+ return;
+ }
+ }
+ }
+ }
+ }
+
+ if (strncmp(dev_name, "dm-", 3) == 0) {
+ if (snprintf(path, path_len, "/sys/block/%s/queue", dev_name) < 0)
+ err(errno, "%s", dev_name);
+ if (stat(path, &st) == 0 && S_ISDIR(st.st_mode))
+ return;
+ }
+
+ err(ENOENT, "%s: not found", dev_name);
+}
+
+static void find_block_device(const char *filepath, char *path, size_t path_len)
+{
+ char mount_point[PATH_MAX], block_dev[PATH_MAX];
+ struct stat st;
+
+ if (stat(filepath, &st) == 0 && S_ISBLK(st.st_mode)) {
+ resolve_to_sysblock(filepath, path, path_len);
+ return;
+ }
+
+ find_mount_point(filepath, mount_point, sizeof(mount_point));
+ find_block_device_from_mount(mount_point, block_dev, sizeof(block_dev));
+ resolve_to_sysblock(block_dev, path, path_len);
+}
+
+static void read_sysfs_attr(char *path, unsigned long *value)
+{
+ FILE *f;
+ int ret;
+
+ f = fopen(path, "r");
+ if (!f)
+ err(ENOENT, "%s", path);
+
+ ret = fscanf(f, "%lu", value);
+ fclose(f);
+ if (ret != 1)
+ err(ENOENT, "%s", basename(path));
+}
+
+static void read_queue_attrs(const char *path)
+{
+ char attr[PATH_MAX];
+
+ if (snprintf(attr, sizeof(attr), "%s/max_segments", path) < 0)
+ err(errno, "max_segments");
+ read_sysfs_attr(attr, &max_segments);
+
+ if (snprintf(attr, sizeof(attr), "%s/dma_alignment", path) < 0)
+ err(errno, "dma_alignment");
+ read_sysfs_attr(attr, &dma_alignment);
+
+ if (snprintf(attr, sizeof(attr), "%s/virt_boundary_mask", path) < 0)
+ err(errno, "virt_boundary_mask");
+ read_sysfs_attr(attr, &virt_boundary);
+
+ if (snprintf(attr, sizeof(attr), "%s/logical_block_size", path) < 0)
+ err(errno, "logical_block_size");
+ read_sysfs_attr(attr, &logical_block_size);
+
+ if (snprintf(attr, sizeof(attr), "%s/max_sectors_kb", path) < 0)
+ err(errno, "max_sectors_kb");
+ read_sysfs_attr(attr, &max_bytes);
+
+ max_bytes *= 1024;
+ dma_alignment++;
+ virt_boundary++;
+
+ /*
+ printf("logical_block_size:%lu dma_alignment:%lu virt_boundary:%lu max_segments:%lu max_bytes:%lu\n",
+ logical_block_size, dma_alignment, virt_boundary, max_segments, max_bytes);
+ */
+}
+
+static void init_args(char **argv)
+{
+ char sys_path[PATH_MAX];
+
+ test_fd = open(argv[1], O_RDWR | O_CREAT | O_TRUNC | O_DIRECT);
+ if (test_fd < 0)
+ err(errno, "%s: failed to open %s", __func__, argv[1]);
+
+ init_kernel_version();
+ find_block_device(argv[1], sys_path, sizeof(sys_path));
+ read_queue_attrs(sys_path);
+
+ if (!power_of_2(virt_boundary) ||
+ !power_of_2(dma_alignment) ||
+ !power_of_2(logical_block_size))
+ err(EINVAL, "%s: bad parameters", __func__);
+
+ if (virt_boundary > 1 && virt_boundary < logical_block_size)
+ err(EINVAL, "%s: virt_boundary:%lu logical_block_size:%lu", __func__,
+ virt_boundary, logical_block_size);
+
+ if (dma_alignment > logical_block_size)
+ err(EINVAL, "%s: dma_alignment:%lu logical_block_size:%lu", __func__,
+ dma_alignment, logical_block_size);
+
+ if (max_segments > 4096)
+ max_segments = 4096;
+ if (max_bytes > 16384 * 1024)
+ max_bytes = 16384 * 1024;
+ if (max_bytes & (logical_block_size - 1))
+ max_bytes -= max_bytes & (logical_block_size - 1);
+ pagesize = sysconf(_SC_PAGE_SIZE);
+}
+
+static void init_buffers()
+{
+ unsigned long lb_mask = logical_block_size - 1;
+ int fd, ret;
+
+ buf_size = max_bytes * max_segments / 2;
+ if (buf_size < logical_block_size * max_segments)
+ err(EINVAL, "%s: logical block size is too big", __func__);
+
+ if (buf_size < logical_block_size * 1024 * 4)
+ buf_size = logical_block_size * 1024 * 4;
+
+ if (buf_size & lb_mask)
+ buf_size = (buf_size + lb_mask) & ~(lb_mask);
+
+ ret = posix_memalign((void **)&in_buf, pagesize, buf_size);
+ if (ret)
+ err(EINVAL, "%s: failed to allocate in-buf", __func__);
+
+ ret = posix_memalign((void **)&out_buf, pagesize, buf_size);
+ if (ret)
+ err(EINVAL, "%s: failed to allocate out-buf", __func__);
+
+ fd = open("/dev/urandom", O_RDONLY);
+ if (fd < 0)
+ err(EINVAL, "%s: failed to open urandom", __func__);
+
+ ret = read(fd, out_buf, buf_size);
+ if (ret < 0)
+ err(EINVAL, "%s: failed to randomize output buffer", __func__);
+
+ close(fd);
+}
+
+static void __compare(void *a, void *b, size_t size, const char *test)
+{
+ if (!memcmp(a, b, size))
+ return;
+ err(EIO, "%s: data corruption", test);
+}
+#define compare(a, b, size) __compare(a, b, size, __func__)
+
+/*
+ * Test using page aligned buffers, single source
+ *
+ * Total size is aligned to a logical block size and exceeds the max transfer
+ * size as well as the max segments. This should test the kernel's split bio
+ * construction and bio splitting for exceeding these limits.
+ */
+static void test_full_size_aligned()
+{
+ int ret;
+
+ memset(in_buf, 0, buf_size);
+ ret = pwrite(test_fd, out_buf, buf_size, 0);
+ if (ret < 0)
+ err(errno, "%s: failed to write buf", __func__);
+
+ ret = pread(test_fd, in_buf, buf_size, 0);
+ if (ret < 0)
+ err(errno, "%s: failed to read buf", __func__);
+
+ compare(out_buf, in_buf, buf_size);
+}
+
+/*
+ * Test using dma aligned buffers, single source
+ *
+ * This tests the kernel's dio memory alignment
+ */
+static void test_dma_aligned()
+{
+ int ret;
+
+ memset(in_buf, 0, buf_size);
+ ret = pwrite(test_fd, out_buf + dma_alignment, max_bytes, 0);
+ if (ret < 0)
+ err(errno, "%s: failed to write buf", __func__);
+
+ ret = pread(test_fd, in_buf + dma_alignment, max_bytes, 0);
+ if (ret < 0)
+ err(errno, "%s: failed to read buf", __func__);
+
+ compare(out_buf + dma_alignment, in_buf + dma_alignment, max_bytes);
+}
+
+/*
+ * Test using page aligned buffers + logically block sized vectored source
+ *
+ * This tests discontiguous vectored sources
+ */
+static void test_page_aligned_vectors()
+{
+ const int vecs = 4;
+
+ int i, ret, offset;
+ struct iovec iov[vecs];
+
+ memset(in_buf, 0, buf_size);
+ for (i = 0; i < vecs; i++) {
+ offset = logical_block_size * i * 4;
+ iov[i].iov_base = out_buf + offset;
+ iov[i].iov_len = logical_block_size * 2;
+ }
+
+ ret = pwritev(test_fd, iov, vecs, 0);
+ if (ret < 0)
+ err(errno, "%s: failed to write buf", __func__);
+
+ for (i = 0; i < vecs; i++) {
+ offset = logical_block_size * i * 4;
+ iov[i].iov_base = in_buf + offset;
+ iov[i].iov_len = logical_block_size * 2;
+ }
+
+ ret = preadv(test_fd, iov, vecs, 0);
+ if (ret < 0)
+ err(errno, "%s: failed to read buf", __func__);
+
+ for (i = 0; i < vecs; i++) {
+ offset = logical_block_size * i * 4;
+ compare(in_buf + offset, out_buf + offset, logical_block_size * 2);
+ }
+}
+
+/*
+ * Test using dma aligned buffers, vectored source
+ *
+ * This tests discontiguous vectored sources with incrementing dma aligned
+ * offsets
+ */
+static void test_dma_aligned_vectors()
+{
+ const int vecs = 4;
+
+ int i, ret, offset;
+ struct iovec iov[vecs];
+
+ memset(in_buf, 0, buf_size);
+ for (i = 0; i < vecs; i++) {
+ offset = logical_block_size * i * 8 + dma_alignment * (i + 1);
+ iov[i].iov_base = out_buf + offset;
+ iov[i].iov_len = logical_block_size * 2;
+ }
+
+ ret = pwritev(test_fd, iov, vecs, 0);
+ if (ret < 0)
+ err(errno, "%s: failed to write buf", __func__);
+
+ for (i = 0; i < vecs; i++) {
+ offset = logical_block_size * i * 8 + dma_alignment * (i + 1);
+ iov[i].iov_base = in_buf + offset;
+ iov[i].iov_len = logical_block_size * 2;
+ }
+
+ ret = preadv(test_fd, iov, vecs, 0);
+ if (ret < 0)
+ err(errno, "%s: failed to read buf", __func__);
+
+ for (i = 0; i < vecs; i++) {
+ offset = logical_block_size * i * 8 + dma_alignment * (i + 1);
+ compare(in_buf + offset, out_buf + offset, logical_block_size * 2);
+ }
+}
+
+/*
+ * Test vectored read with a total size aligned to a block, but some individual
+ * vectors will not be aligned to the block size.
+ *
+ * All the middle vectors start and end on page boundaries which should
+ * satisfy any virt_boundary condition. This test will fail prior to kernel
+ * 6.18.
+ */
+static void test_unaligned_page_vectors()
+{
+ const int vecs = 4;
+
+ int i, ret, offset, mult;
+ struct iovec iov[vecs];
+ bool should_fail = true;
+
+ if (kernel_major > 6 || (kernel_major == 6 && kernel_minor >= 18))
+ should_fail = false;
+
+ i = 0;
+ memset(in_buf, 0, buf_size);
+ mult = pagesize / logical_block_size;
+ if (mult < 2)
+ mult = 2;
+
+ offset = pagesize - (logical_block_size / 4);
+ if (offset & (dma_alignment - 1))
+ offset = pagesize - dma_alignment;
+
+ iov[i].iov_base = out_buf + offset;
+ iov[i].iov_len = pagesize - offset;
+
+ for (i = 1; i < vecs - 1; i++) {
+ offset = logical_block_size * i * 8 * mult;
+ iov[i].iov_base = out_buf + offset;
+ iov[i].iov_len = logical_block_size * mult;
+ }
+
+ offset = logical_block_size * i * 8 * mult;
+ iov[i].iov_base = out_buf + offset;
+ iov[i].iov_len = logical_block_size * mult - iov[0].iov_len;
+
+ ret = pwritev(test_fd, iov, vecs, 0);
+ if (ret < 0) {
+ if (should_fail)
+ return;
+ err(errno, "%s: failed to write buf", __func__);
+ }
+
+ i = 0;
+ offset = pagesize - (logical_block_size / 4);
+ if (offset & (dma_alignment - 1))
+ offset = pagesize - dma_alignment;
+
+ iov[i].iov_base = in_buf + offset;
+ iov[i].iov_len = pagesize - offset;
+
+ for (i = 1; i < vecs - 1; i++) {
+ offset = logical_block_size * i * 8 * mult;
+ iov[i].iov_base = in_buf + offset;
+ iov[i].iov_len = logical_block_size * mult;
+ }
+
+ offset = logical_block_size * i * 8 * mult;
+ iov[i].iov_base = in_buf + offset;
+ iov[i].iov_len = logical_block_size * mult - iov[0].iov_len;
+
+ ret = preadv(test_fd, iov, vecs, 0);
+ if (ret < 0)
+ err(errno, "%s: failed to read buf", __func__);
+
+ i = 0;
+ offset = pagesize - (logical_block_size / 4);
+ if (offset & (dma_alignment - 1))
+ offset = pagesize - dma_alignment;
+
+ compare(in_buf + offset, out_buf + offset, iov[i].iov_len);
+ for (i = 1; i < vecs - 1; i++) {
+ offset = logical_block_size * i * 8 * mult;
+ compare(in_buf + offset, out_buf + offset, iov[i].iov_len);
+ }
+ offset = logical_block_size * i * 8 * mult;
+ compare(in_buf + offset, out_buf + offset, iov[i].iov_len);
+}
+
+/*
+ * Total size is a logical block size multiple, but none of the vectors are.
+ *
+ * Total vectors will be less than the max. The vectors will be dma aligned. If
+ * a virtual boundary exists, this should fail, otherwise it should succeed on
+ * kernels 6.18 and newer.
+ */
+static void test_unaligned_vectors()
+{
+ const int vecs = 4;
+
+ bool should_fail = true;
+ struct iovec iov[vecs];
+ int i, ret, offset;
+
+ if ((kernel_major > 6 || (kernel_major == 6 && kernel_minor >= 18)) &&
+ virt_boundary <= 1)
+ should_fail = false;
+
+ memset(in_buf, 0, buf_size);
+ for (i = 0; i < vecs; i++) {
+ offset = logical_block_size * i * 8;
+ iov[i].iov_base = out_buf + offset;
+ iov[i].iov_len = logical_block_size / 2;
+ }
+
+ ret = pwritev(test_fd, iov, vecs, 0);
+ if (ret < 0) {
+ if (should_fail)
+ return;
+ err(errno, "%s: failed to write buf", __func__);
+ }
+
+ if (should_fail)
+ err(ENOTSUP, "%s: write buf unexpectedly succeeded ret:%d",
+ __func__, ret);
+
+ for (i = 0; i < vecs; i++) {
+ offset = logical_block_size * i * 8;
+ iov[i].iov_base = in_buf + offset;
+ iov[i].iov_len = logical_block_size / 2;
+ }
+
+ ret = preadv(test_fd, iov, vecs, 0);
+ if (ret < 0)
+ err(errno, "%s: failed to read buf", __func__);
+
+ for (i = 0; i < vecs; i++) {
+ offset = logical_block_size * i * 8;
+ compare(in_buf + offset, out_buf + offset, logical_block_size / 2);
+ }
+}
+
+/*
+ * Provide an invalid iov_base at the beginning to test the kernel catching it
+ * while building a bio.
+ */
+static void test_invalid_starting_addr()
+{
+ const int vecs = 4;
+
+ int i, ret, offset;
+ struct iovec iov[vecs];
+
+ i = 0;
+ iov[i].iov_base = 0;
+ iov[i].iov_len = logical_block_size;
+
+ for (i = 1; i < vecs; i++) {
+ offset = logical_block_size * i * 8;
+ iov[i].iov_base = out_buf + offset;
+ iov[i].iov_len = logical_block_size;
+ }
+
+ ret = pwritev(test_fd, iov, vecs, 0);
+ if (ret < 0)
+ return;
+
+ err(ENOTSUP, "%s: write buf unexpectedly succeeded with NULL address ret:%d",
+ __func__, ret);
+}
+
+/*
+ * Provide an invalid iov_base in the middle to test the kernel catching it
+ * while building split bios. Ensure it is split by sending enough vectors to
+ * exceed bio's MAX_VEC; this should cause part of the io to dispatch.
+ */
+static void test_invalid_middle_addr()
+{
+ const int vecs = 1024;
+
+ int i, ret, offset;
+ struct iovec iov[vecs];
+
+ for (i = 0; i < vecs / 2 + 1; i++) {
+ offset = logical_block_size * i * 2;
+ iov[i].iov_base = out_buf + offset;
+ iov[i].iov_len = logical_block_size;
+ }
+
+ offset = logical_block_size * i * 2;
+ iov[i].iov_base = 0;
+ iov[i].iov_len = logical_block_size;
+
+ for (++i; i < vecs; i++) {
+ offset = logical_block_size * i * 2;
+ iov[i].iov_base = out_buf + offset;
+ iov[i].iov_len = logical_block_size;
+ }
+
+ ret = pwritev(test_fd, iov, vecs, 0);
+ if (ret < 0)
+ return;
+
+ err(ENOTSUP, "%s: write buf unexpectedly succeeded with NULL address ret:%d",
+ __func__, ret);
+}
+
+/*
+ * Test with an invalid DMA address. Should get caught early when splitting. If
+ * the device supports byte aligned memory (which is unusual), then this should
+ * be successful.
+ */
+static void test_invalid_dma_alignment()
+{
+ int ret, offset;
+ size_t size;
+ bool should_fail = dma_alignment > 1;
+
+ memset(in_buf, 0, buf_size);
+ offset = 2 * dma_alignment - 1;
+ size = logical_block_size * 256;
+ ret = pwrite(test_fd, out_buf + offset, size, 0);
+ if (ret < 0) {
+ if (should_fail)
+ return;
+ err(errno, "%s: failed to write buf", __func__);
+ }
+
+ if (should_fail)
+ err(ENOTSUP, "%s: write buf unexpectedly succeeded with invalid DMA offset address, ret:%d",
+ __func__, ret);
+
+ ret = pread(test_fd, in_buf + offset, size, 0);
+ if (ret < 0)
+ err(errno, "%s: failed to read buf", __func__);
+
+ compare(out_buf + offset, in_buf + offset, size);
+}
+
+/*
+ * Test with invalid DMA alignment in the middle. This should get split with
+ * the first part being dispatched, and the 2nd one failing without dispatch.
+ */
+static void test_invalid_dma_vector_alignment()
+{
+ const int vecs = 5;
+
+ bool should_fail = dma_alignment > 1;
+ struct iovec iov[vecs];
+ int ret, offset;
+
+ offset = dma_alignment * 2 - 1;
+ memset(in_buf, 0, buf_size);
+
+ iov[0].iov_base = out_buf;
+ iov[0].iov_len = max_bytes;
+
+ iov[1].iov_base = out_buf + max_bytes * 2;
+ iov[1].iov_len = max_bytes;
+
+ iov[2].iov_base = out_buf + max_bytes * 4 + offset;
+ iov[2].iov_len = max_bytes;
+
+ iov[3].iov_base = out_buf + max_bytes * 6;
+ iov[3].iov_len = max_bytes;
+
+ iov[4].iov_base = out_buf + max_bytes * 8;
+ iov[4].iov_len = max_bytes;
+
+ ret = pwritev(test_fd, iov, vecs, 0);
+ if (ret < 0) {
+ if (should_fail)
+ return;
+ err(errno, "%s: failed to write buf", __func__);
+ }
+ if (should_fail)
+ err(ENOTSUP, "%s: write buf unexpectedly succeeded with invalid DMA offset address ret:%d",
+ __func__, ret);
+
+ iov[0].iov_base = in_buf;
+ iov[0].iov_len = max_bytes;
+
+ iov[1].iov_base = in_buf + max_bytes * 2;
+ iov[1].iov_len = max_bytes;
+
+ iov[2].iov_base = in_buf + max_bytes * 4 + offset;
+ iov[2].iov_len = max_bytes;
+
+ iov[3].iov_base = in_buf + max_bytes * 6;
+ iov[3].iov_len = max_bytes;
+
+ iov[4].iov_base = in_buf + max_bytes * 8;
+ iov[4].iov_len = max_bytes;
+
+ ret = preadv(test_fd, iov, vecs, 0);
+ if (ret < 0)
+ err(errno, "%s: failed to read buf", __func__);
+
+ compare(out_buf, in_buf, max_bytes);
+ compare(out_buf + max_bytes * 2, in_buf + max_bytes * 2, max_bytes);
+ compare(out_buf + max_bytes * 4 + offset, in_buf + max_bytes * 4 + offset, max_bytes);
+ compare(out_buf + max_bytes * 6, in_buf + max_bytes * 6, max_bytes);
+ compare(out_buf + max_bytes * 8, in_buf + max_bytes * 8, max_bytes);
+}
+
+/*
+ * Test a bunch of small vectors if the device dma alignment allows it. We'll
+ * try to force a MAX_IOV split that can't form a valid IO so expect a failure.
+ */
+static void test_max_vector_limits()
+{
+ const int vecs = 320;
+
+ int ret, i, offset, iovpb, iov_size;
+ bool should_fail = true;
+ struct iovec iov[vecs];
+
+ memset(in_buf, 0, buf_size);
+ iovpb = logical_block_size / dma_alignment;
+ iov_size = logical_block_size / iovpb;
+
+ if ((pagesize / iov_size) < 256 &&
+ iov_size >= virt_boundary)
+ should_fail = false;
+
+ for (i = 0; i < vecs; i++) {
+ offset = i * iov_size * 2;
+ iov[i].iov_base = out_buf + offset;
+ iov[i].iov_len = iov_size;
+ }
+
+ ret = pwritev(test_fd, iov, vecs, 0);
+ if (ret < 0) {
+ if (should_fail)
+ return;
+ err(errno, "%s: failed to write buf", __func__);
+ }
+
+ if (should_fail)
+ err(ENOTSUP, "%s: write buf unexpectedly succeeded with excess vectors ret:%d",
+ __func__, ret);
+
+ for (i = 0; i < vecs; i++) {
+ offset = i * iov_size * 2;
+ iov[i].iov_base = in_buf + offset;
+ iov[i].iov_len = iov_size;
+ }
+
+ ret = preadv(test_fd, iov, vecs, 0);
+ if (ret < 0)
+ err(errno, "%s: failed to read buf", __func__);
+
+ for (i = 0; i < vecs; i++) {
+ offset = i * iov_size * 2;
+ compare(in_buf + offset, out_buf + offset, logical_block_size / 2);
+ }
+}
+
+/*
+ * Start with a valid vector that can be split into a dispatched IO, but poison
+ * the rest with an invalid DMA offset testing the kernel's late catch.
+ */
+static void test_invalid_dma_vector_alignment_large()
+{
+ const int vecs = 4;
+
+ struct iovec iov[vecs];
+ int i, ret;
+
+ i = 0;
+ iov[i].iov_base = out_buf;
+ iov[i].iov_len = max_bytes - logical_block_size;
+
+ i++;
+ iov[i].iov_base = out_buf + max_bytes + logical_block_size;
+ iov[i].iov_len = logical_block_size;
+
+ i++;
+ iov[i].iov_base = iov[1].iov_base + pagesize * 2 + (dma_alignment - 1);
+ iov[i].iov_len = logical_block_size;
+
+ i++;
+ iov[i].iov_base = out_buf + max_bytes * 8;
+ iov[i].iov_len = logical_block_size;
+
+ ret = pwritev(test_fd, iov, vecs, 0);
+ if (ret < 0)
+ return;
+
+ err(ENOTSUP, "%s: write buf unexpectedly succeeded with NULL address ret:%d",
+ __func__, ret);
+}
+
+/*
+ * Total size is block aligned, addresses are dma aligned, but individual vector
+ * sizes may not be dma aligned. If device has byte sized dma alignment, this
+ * should succeed. If not, part of this should get dispatched, and the other
+ * part should fail.
+ */
+static void test_invalid_dma_vector_length()
+{
+ const int vecs = 4;
+
+ bool should_fail = dma_alignment > 1;
+ struct iovec iov[vecs];
+ int ret;
+
+ iov[0].iov_base = out_buf;
+ iov[0].iov_len = max_bytes * 2 - max_bytes / 2;
+
+ iov[1].iov_base = out_buf + max_bytes * 4;
+ iov[1].iov_len = logical_block_size * 2 - (dma_alignment + 1);
+
+ iov[2].iov_base = out_buf + max_bytes * 8;
+ iov[2].iov_len = logical_block_size * 2 + (dma_alignment + 1);
+
+ iov[3].iov_base = out_buf + max_bytes * 12;
+ iov[3].iov_len = max_bytes - max_bytes / 2;
+
+ ret = pwritev(test_fd, iov, vecs, 0);
+ if (ret < 0) {
+ if (should_fail)
+ return;
+ err(errno, "%s: failed to write buf", __func__);
+ }
+
+ if (should_fail)
+ err(ENOTSUP, "%s: write buf unexpectedly succeeded with invalid DMA offset address ret:%d",
+ __func__, ret);
+
+ iov[0].iov_base = in_buf;
+ iov[0].iov_len = max_bytes * 2 - max_bytes / 2;
+
+ iov[1].iov_base = in_buf + max_bytes * 4;
+ iov[1].iov_len = logical_block_size * 2 - (dma_alignment + 1);
+
+ iov[2].iov_base = in_buf + max_bytes * 8;
+ iov[2].iov_len = logical_block_size * 2 + (dma_alignment + 1);
+
+ iov[3].iov_base = in_buf + max_bytes * 12;
+ iov[3].iov_len = max_bytes - max_bytes / 2;
+
+ ret = pwritev(test_fd, iov, vecs, 0);
+ if (ret < 0)
+ err(errno, "%s: failed to read buf", __func__);
+
+ compare(out_buf, in_buf, iov[0].iov_len);
+ compare(out_buf + max_bytes * 4, in_buf + max_bytes * 4, iov[1].iov_len);
+ compare(out_buf + max_bytes * 8, in_buf + max_bytes * 8, iov[2].iov_len);
+ compare(out_buf + max_bytes * 12, in_buf + max_bytes * 12, iov[3].iov_len);
+}
+
+static void run_tests()
+{
+ test_full_size_aligned();
+ test_dma_aligned();
+ test_page_aligned_vectors();
+ test_dma_aligned_vectors();
+ test_unaligned_page_vectors();
+ test_unaligned_vectors();
+ test_invalid_starting_addr();
+ test_invalid_middle_addr();
+ test_invalid_dma_alignment();
+ test_invalid_dma_vector_alignment();
+ test_max_vector_limits();
+ test_invalid_dma_vector_alignment_large();
+ test_invalid_dma_vector_length();
+}
+
+/* ./$prog-name file */
+int main(int argc, char **argv)
+{
+ if (argc < 2)
+ errx(EINVAL, "expect arguments: file");
+
+ init_args(argv);
+ init_buffers();
+ run_tests();
+ close(test_fd);
+ free(out_buf);
+ free(in_buf);
+
+ return 0;
+}
+
diff --git a/tests/block/042 b/tests/block/042
new file mode 100644
index 0000000..9c05643
--- /dev/null
+++ b/tests/block/042
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+. tests/block/rc
+
+DESCRIPTION="Test unusual direct-io offsets"
+QUICK=1
+
+device_requires() {
+ _require_test_dev_sysfs
+}
+
+test_device() {
+ echo "Running ${TEST_NAME}"
+
+ if ! src/dio-offsets ${TEST_DEV}; then
+ echo "src/dio-offsets failed"
+ fi
+
+ echo "Test complete"
+}
diff --git a/tests/block/042.out b/tests/block/042.out
new file mode 100644
index 0000000..b35c7ce
--- /dev/null
+++ b/tests/block/042.out
@@ -0,0 +1,2 @@
+Running block/042
+Test complete
--
2.47.3
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH blktests] create a test for direct io offsets
2025-10-14 20:54 [PATCH blktests] create a test for direct io offsets Keith Busch
@ 2025-10-14 21:21 ` Bart Van Assche
2025-10-14 21:59 ` Keith Busch
2025-10-17 11:13 ` Christoph Hellwig
` (2 subsequent siblings)
3 siblings, 1 reply; 14+ messages in thread
From: Bart Van Assche @ 2025-10-14 21:21 UTC (permalink / raw)
To: Keith Busch, linux-block, shinichiro.kawasaki; +Cc: Keith Busch
On 10/14/25 1:54 PM, Keith Busch wrote:
> +static void find_mount_point(const char *filepath, char *mount_point,
> + size_t mp_len)
> +{
> + char path[PATH_MAX - 1], abs_path[PATH_MAX], *pos;
> + struct stat file_stat, mp_stat, parent_stat;
Why C instead of C++? Any code that manipulates strings is usually
significantly easier to write in C++ rather than C.
> + strncpy(mount_point, path, mp_len - 1);
> + mount_point[mp_len - 1] = '\0';
Why strncpy() instead of strdup()?
> + len = strlen(base);
> + p = base + len - 1;
> + while (p > base && *p >= '0' && *p <= '9')
> + p--;
> +
> + if (p > base && *p == 'p' && *(p + 1) != '\0')
> + *p = '\0';
> + else if (len >= 4 && p > base && p < base + len - 1) {
> + if ((strncmp(base, "sd", 2) == 0 ||
> + strncmp(base, "hd", 2) == 0 ||
> + strncmp(base, "vd", 2) == 0) &&
> + (*p >= 'a' && *p <= 'z'))
> + *(p + 1) = '\0';
> + }
Deriving the disk name from a partition name by stripping a suffix is
fragile. A better way is to iterate over /sys/class/block/*/*. See also
the PartitionParent() function in
https://cs.android.com/android/platform/superproject/main/+/main:system/core/fs_mgr/blockdev.cpp.
> +static void read_sysfs_attr(char *path, unsigned long *value)
> +{
> + FILE *f;
> + int ret;
> +
> + f = fopen(path, "r");
> + if (!f)
> + err(ENOENT, "%s", path);
> +
> + ret = fscanf(f, "%lu", value);
> + fclose(f);
> + if (ret != 1)
> + err(ENOENT, "%s", basename(path));
> +}
Why is the result stored in a pointer argument instead of returning it
as return value?
> +static void read_queue_attrs(const char *path)
> +{
> + char attr[PATH_MAX];
> +
> + if (snprintf(attr, sizeof(attr), "%s/max_segments", path) < 0)
> + err(errno, "max_segments");
> + read_sysfs_attr(attr, &max_segments);
Has it been considered to make read_sysfs_attr() accept a format string
+ arguments? I think that would make the code in this function more
compact.
Thanks,
Bart.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH blktests] create a test for direct io offsets
2025-10-14 21:21 ` Bart Van Assche
@ 2025-10-14 21:59 ` Keith Busch
0 siblings, 0 replies; 14+ messages in thread
From: Keith Busch @ 2025-10-14 21:59 UTC (permalink / raw)
To: Bart Van Assche; +Cc: Keith Busch, linux-block, shinichiro.kawasaki
On Tue, Oct 14, 2025 at 02:21:20PM -0700, Bart Van Assche wrote:
> On 10/14/25 1:54 PM, Keith Busch wrote:
> > +static void find_mount_point(const char *filepath, char *mount_point,
> > + size_t mp_len)
> > +{
> > + char path[PATH_MAX - 1], abs_path[PATH_MAX], *pos;
> > + struct stat file_stat, mp_stat, parent_stat;
>
> Why C instead of C++? Any code that manipulates strings is usually
> significantly easier to write in C++ rather than C.
The only reason I have is that I'm not a very good C++ coder.
> > + strncpy(mount_point, path, mp_len - 1);
> > + mount_point[mp_len - 1] = '\0';
>
> Why strncpy() instead of strdup()?
I'm just using stack strings here, avoiding any heap allocations.
> > + else if (len >= 4 && p > base && p < base + len - 1) {
> > + if ((strncmp(base, "sd", 2) == 0 ||
> > + strncmp(base, "hd", 2) == 0 ||
> > + strncmp(base, "vd", 2) == 0) &&
> > + (*p >= 'a' && *p <= 'z'))
> > + *(p + 1) = '\0';
> > + }
>
> Deriving the disk name from a partition name by stripping a suffix is
> fragile.
Completely agree.
> A better way is to iterate over /sys/class/block/*/*. See also
> the PartitionParent() function in https://cs.android.com/android/platform/superproject/main/+/main:system/core/fs_mgr/blockdev.cpp.
Thanks for the pointer.
> > +static void read_sysfs_attr(char *path, unsigned long *value)
> > +{
> > + FILE *f;
> > + int ret;
> > +
> > + f = fopen(path, "r");
> > + if (!f)
> > + err(ENOENT, "%s", path);
> > +
> > + ret = fscanf(f, "%lu", value);
> > + fclose(f);
> > + if (ret != 1)
> > + err(ENOENT, "%s", basename(path));
> > +}
>
> Why is the result stored in a pointer argument instead of returning it
> as return value?
No particular reason.
> > +static void read_queue_attrs(const char *path)
> > +{
> > + char attr[PATH_MAX];
> > +
> > + if (snprintf(attr, sizeof(attr), "%s/max_segments", path) < 0)
> > + err(errno, "max_segments");
> > + read_sysfs_attr(attr, &max_segments);
>
> Has it been considered to make read_sysfs_attr() accept a format string
> + arguments? I think that would make the code in this function more
> compact.
That sounds good to me.
Though all the comments are about the boiler plate parameter extraction.
That was supposed to be the least interesting part about this patch :)
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH blktests] create a test for direct io offsets
2025-10-14 20:54 [PATCH blktests] create a test for direct io offsets Keith Busch
2025-10-14 21:21 ` Bart Van Assche
@ 2025-10-17 11:13 ` Christoph Hellwig
2025-10-20 16:20 ` Keith Busch
2025-10-20 12:40 ` Shinichiro Kawasaki
2025-10-20 23:25 ` Chaitanya Kulkarni
3 siblings, 1 reply; 14+ messages in thread
From: Christoph Hellwig @ 2025-10-17 11:13 UTC (permalink / raw)
To: Keith Busch; +Cc: linux-block, shinichiro.kawasaki, Keith Busch
> +static void init_kernel_version()
> +{
> + struct utsname buffer;
> + char *major_version_str;
> + char *minor_version_str;
> + if (uname(&buffer) != 0)
> + err(errno, "uname");
> +
> + major_version_str = strtok(buffer.release, ".");
> + minor_version_str = strtok(NULL, ".");
> +
> + kernel_major = strtol(major_version_str, NULL, 0);
> + kernel_minor = strtol(minor_version_str, NULL, 0);
> +}
Testing for specific kernel versions is probably going to fall
flat when this stuff gets backported. I just realized that maybe
we just need a statx / fsxattr flag to report that this is supported
to make everyones life easier? statx probably won't make Christian
happy, but now that the fsxattr stuff is in common code that seems
like an easy enough option to rush into 6.18 still.
I.e., we should find a way to remove all the need to find the kernel
version, mount point and bdev from mountpoint. Especially as the
latter won't work for multi-device configurations like btrfs or
the XFS rtdev. My rule of thumb for new I/O path features is that
the application should be able to discover everything it needs just
using the FD used for I/O as that is a huge differentiator for
usability.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH blktests] create a test for direct io offsets
2025-10-14 20:54 [PATCH blktests] create a test for direct io offsets Keith Busch
2025-10-14 21:21 ` Bart Van Assche
2025-10-17 11:13 ` Christoph Hellwig
@ 2025-10-20 12:40 ` Shinichiro Kawasaki
2025-10-20 21:03 ` Keith Busch
2025-10-20 23:25 ` Chaitanya Kulkarni
3 siblings, 1 reply; 14+ messages in thread
From: Shinichiro Kawasaki @ 2025-10-20 12:40 UTC (permalink / raw)
To: Keith Busch; +Cc: linux-block@vger.kernel.org, Keith Busch
On Oct 14, 2025 / 13:54, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
Thank you for the patch. This test case looks valuable to me. Please find my
review comments in line and see if they make sense.
Did you run the new test case with SCSI/SATA devices? I ran the test on
v6.18-rc2 kernel for QEMU SATA and SCSI devices and observed failures.
The devices have virt_boundary=1, and it looks to be causing a failure. The SCSI
device has a large buffer size (192MiB) which looks larger than the limit
pwrite() and pread() can handle. This caused another failure, probably.
Nit: I suggest to add the prefix "block:" to the commit message title to clarify
which group the new test case will belong to.
>
> Tests direct IO using various memory and length alignments against the
> device's queue limits.
>
> This should work on raw block devices, their partitions, or through
> files on mounted filesystems. Much of the code is dedicated to
> automatically finding the underlying block device's queue limits so that
> it can work in a variety of environments.
I wish the corresponding kernel side change were noted here. I guess the change
is this one [1], isn't it? If so, I suggest to add the link in the commit
message.
[1] https://lore.kernel.org/linux-block/20250827141258.63501-1-kbusch@meta.com/
[...]
> diff --git a/src/dio-offsets.c b/src/dio-offsets.c
> new file mode 100644
> index 0000000..5961232
> --- /dev/null
> +++ b/src/dio-offsets.c
> @@ -0,0 +1,952 @@
> +// SPDX-License-Identifier: GPL-2.0
Nit: GPL-2.0 is fine, but many of the blktests files have GPL-3.0+. If there is
no strong reasoning, I suggest GPL-3.0+.
> +/*
> + * Copyright (c) 2025 Meta Platforms, Inc. All Rights Reserved.
> + */
...
> +static void init_buffers()
> +{
> + unsigned long lb_mask = logical_block_size - 1;
> + int fd, ret;
> +
> + buf_size = max_bytes * max_segments / 2;
> + if (buf_size < logical_block_size * max_segments)
> + err(EINVAL, "%s: logical block size is too big", __func__);
> +
> + if (buf_size < logical_block_size * 1024 * 4)
> + buf_size = logical_block_size * 1024 * 4;
> +
> + if (buf_size & lb_mask)
> + buf_size = (buf_size + lb_mask) & ~(lb_mask);
> +
> + ret = posix_memalign((void **)&in_buf, pagesize, buf_size);
> + if (ret)
Nit: Spaces are used for indent in the two lines above. There are about a
dozen of other places where spaces are used for indent.
> + err(EINVAL, "%s: failed to allocate in-buf", __func__);
> +
> + ret = posix_memalign((void **)&out_buf, pagesize, buf_size);
> + if (ret)
> + err(EINVAL, "%s: failed to allocate out-buf", __func__);
> +
> + fd = open("/dev/urandom", O_RDONLY);
> + if (fd < 0)
> + err(EINVAL, "%s: failed to open urandom", __func__);
> +
> + ret = read(fd, out_buf, buf_size);
> + if (ret < 0)
> + err(EINVAL, "%s: failed to randomize output buffer", __func__);
> +
> + close(fd);
> +}
> +
> +static void __compare(void *a, void *b, size_t size, const char *test)
> +{
> + if (!memcmp(a, b, size))
> + return;
> + err(EIO, "%s: data corruption", test);
> +}
> +#define compare(a, b, size) __compare(a, b, size, __func__)
> +
> +/*
> + * Test using page aligned buffers, single source
> + *
> + * Total size is aligned to a logical block size and exceeds the max transfer
> + * size as well as the max segments. This should test the kernel's split bio
> + * construction and bio splitting for exceeding these limits.
> + */
> +static void test_full_size_aligned()
> +{
> + int ret;
> +
> + memset(in_buf, 0, buf_size);
> + ret = pwrite(test_fd, out_buf, buf_size, 0);
As I noted before, when I tried with QEMU SAS drive, the buf_size is large
(192MiB) and pwrite returned the value smaller than that (128MiB). This
caused the compare() failure below. How about to check the return values
from pwrite() and pread(), and pass it to compare()?
> + if (ret < 0)
> + err(errno, "%s: failed to write buf", __func__);
> +
> + ret = pread(test_fd, in_buf, buf_size, 0);
> + if (ret < 0)
> + err(errno, "%s: failed to read buf", __func__);
> +
> + compare(out_buf, in_buf, buf_size);
> +}
> +
...
> +
> +/*
> + * Total size is a logical block size multiple, but none of the vectors are.
> + *
> + * Total vectors will be less than the max. The vectors will be dma aligned. If
> + * a virtual boundary exists, this should fail, otherwise it should succceed on
> + * kernels 6.18 and newer.
> + */
> +static void test_unaligned_vectors()
> +{
> + const int vecs = 4;
> +
> + bool should_fail = true;
> + struct iovec iov[vecs];
> + int i, ret, offset;
> +
> + if ((kernel_major > 6 || (kernel_major == 6 && kernel_minor >= 18)) &&
> + virt_boundary <= 1)
> + should_fail = false;
> +
> + memset(in_buf, 0, buf_size);
> + for (i = 0; i < vecs; i++) {
> + offset = logical_block_size * i * 8;
> + iov[i].iov_base = out_buf + offset;
> + iov[i].iov_len = logical_block_size / 2;
> + }
> +
> + ret = pwritev(test_fd, iov, vecs, 0);
> + if (ret < 0) {
> + if (should_fail)
> + return;
> + err(errno, "%s: failed to write buf", __func__);
When I ran the test with QEMU SATA drive, it failed here. The drive
had virt_boundary=1. I'm not sure if it means test side bug or kernel side bug.
> + }
> +
> + if (should_fail)
> + err(ENOTSUP, "%s: write buf unexpectedly succeeded ret:%d",
> + __func__, ret);
> +
> + for (i = 0; i < vecs; i++) {
> + offset = logical_block_size * i * 8;
> + iov[i].iov_base = in_buf + offset;
> + iov[i].iov_len = logical_block_size / 2;
> + }
> +
> + ret = preadv(test_fd, iov, vecs, 0);
> + if (ret < 0)
> + err(errno, "%s: failed to read buf", __func__);
> +
> + for (i = 0; i < vecs; i++) {
> + offset = logical_block_size * i * 8;
> + compare(in_buf + offset, out_buf + offset, logical_block_size / 2);
> + }
> +}
...
> +/* ./$prog-name file */
> +int main(int argc, char **argv)
> +{
> + if (argc < 2)
> + errx(EINVAL, "expect argments: file");
> +
> + init_args(argv);
> + init_buffers();
> + run_tests();
> + close(test_fd);
> + free(out_buf);
> + free(in_buf);
> +
> + return 0;
> +}
> +
Nit: When I applied the patch, the empty line above at the end of the new file
was reported as a warning.
> diff --git a/tests/block/042 b/tests/block/042
> new file mode 100644
Nit: I suggest the file mode 755 for the test scripts.
> index 0000000..9c05643
> --- /dev/null
> +++ b/tests/block/042
> @@ -0,0 +1,20 @@
> +#!/bin/bash
Even though this is a tiny script, I suggest to add a SPDX License Identifier,
and your copyright here. Also, I suggest to add a short description here,
copying from the commit message, like,
# Tests direct IO using various memory and length alignments against the
# device's queue limits.
> +
> +. tests/block/rc
> +
> +DESCRIPTION="Test unusual direct-io offsets"
> +QUICK=1
> +
> +device_requires() {
> + _require_test_dev_sysfs
_require_test_dev_sysfs() should have an argument to specify a sysfs
attribute file. Did you miss it? Otherwise, device_requires() can be dropped.
Also, spaces are used for indent in the line above.
> +}
> +
> +test_device() {
> + echo "Running ${TEST_NAME}"
> +
> + if ! src/dio-offsets ${TEST_DEV}; then
The line above caused a shellcheck warning. Please double quote the reference to
TEST_DEV.
Thanks!
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH blktests] create a test for direct io offsets
2025-10-17 11:13 ` Christoph Hellwig
@ 2025-10-20 16:20 ` Keith Busch
2025-10-21 5:28 ` Christoph Hellwig
0 siblings, 1 reply; 14+ messages in thread
From: Keith Busch @ 2025-10-20 16:20 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Keith Busch, linux-block, shinichiro.kawasaki
On Fri, Oct 17, 2025 at 04:13:32AM -0700, Christoph Hellwig wrote:
> > +static void init_kernel_version()
> > +{
> > + struct utsname buffer;
> > + char *major_version_str;
> > + char *minor_version_str;
> > + if (uname(&buffer) != 0)
> > + err(errno, "uname");
> > +
> > + major_version_str = strtok(buffer.release, ".");
> > + minor_version_str = strtok(NULL, ".");
> > +
> > + kernel_major = strtol(major_version_str, NULL, 0);
> > + kernel_minor = strtol(minor_version_str, NULL, 0);
> > +}
>
> Testing for specific kernel versions is probably going to fall
> flat when this stuff gets backported.. I just realize that maybe
> we just need a statx / fsxattr flag to report that this is supported
> to make everyones life easier? statx probably won't make Christian
> happy, but now that the fsxattr stuff is in common code that seems
> like an easy enough option to rush into 6.18 still.
In addition to a flag to say direct-io can use block data split among
multiple vectors, there's two currently unreported attributes you need
to know in order to successfully use direct-io like this: max sectors and
virtual boundary mask. The max sectors attribute is needed to know
exactly how many vectors you can insert before the total length needs to
add up to a block size.
Adding these to statx or fsxattr will help remove a ton of crap from
this test since I could remove all the stuff extracting it out of sysfs
from any given file.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH blktests] create a test for direct io offsets
2025-10-20 12:40 ` Shinichiro Kawasaki
@ 2025-10-20 21:03 ` Keith Busch
2025-10-21 1:32 ` Shinichiro Kawasaki
0 siblings, 1 reply; 14+ messages in thread
From: Keith Busch @ 2025-10-20 21:03 UTC (permalink / raw)
To: Shinichiro Kawasaki; +Cc: Keith Busch, linux-block@vger.kernel.org
On Mon, Oct 20, 2025 at 12:40:07PM +0000, Shinichiro Kawasaki wrote:
> > +static void test_full_size_aligned()
> > +{
> > + int ret;
> > +
> > + memset(in_buf, 0, buf_size);
> > + ret = pwrite(test_fd, out_buf, buf_size, 0);
>
> As I noted before, when I tried with QEMU SAS drive, the buf_size is large
> (192MiB) and pwrite returned the value smaller than that (128MiB). This
> caused the compare() failure below. How about to check the return values
> from pwrite() and pread(), and pass it to compare()?
Oh, short writes are not expected. I should check that the actual
written matched the requested total. I wonder why it's bailing out part
way through, though.
> > + if (ret < 0) {
> > + if (should_fail)
> > + return;
> > + err(errno, "%s: failed to write buf", __func__);
>
> When I ran the test with QEMU SATA drive, it failed here. The drive
> had virt_boundary=1. I'm not sure if it means test side bug or kernel side bug.
Hm, I tested QEMU's AHCI and that seems okay. Is that what you're using
to attach the "SATA" drive? I'd like to recreate this and figure out
what's going on here.
If this test says virt_boundary is 1 for your device, that should mean
there is no virtual boundary to consider; this test should succeed with
those reported queue limits.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH blktests] create a test for direct io offsets
2025-10-14 20:54 [PATCH blktests] create a test for direct io offsets Keith Busch
` (2 preceding siblings ...)
2025-10-20 12:40 ` Shinichiro Kawasaki
@ 2025-10-20 23:25 ` Chaitanya Kulkarni
3 siblings, 0 replies; 14+ messages in thread
From: Chaitanya Kulkarni @ 2025-10-20 23:25 UTC (permalink / raw)
To: Keith Busch
Cc: Keith Busch, linux-block@vger.kernel.org,
shinichiro.kawasaki@wdc.com
> +static long pagesize;
> +static void *out_buf;
> +static void *in_buf;
> +static int test_fd;
> +
for the sake of completeness see if it makes sense, totally untested :-
diff --git a/src/dio-offsets.c b/src/dio-offsets.c
index 5961232..a8c4d3e 100644
--- a/src/dio-offsets.c
+++ b/src/dio-offsets.c
@@ -157,7 +157,7 @@ static size_t buf_size;
static long pagesize;
static void *out_buf;
static void *in_buf;
-static int test_fd;
+static int test_fd = -1;
static void init_kernel_version()
@@ -881,6 +881,20 @@ static void run_tests()
test_unaligned_vectors();
}
+/*
+ * Cleanup handler for atexit()
+ *
+ * Ensures resources are freed on all exit paths, including when err()/errx()
+ * cause early termination. This prevents file descriptor and memory leaks.
+ */
+static void cleanup(void)
+{
+ if (test_fd >= 0)
+ close(test_fd);
+ free(in_buf);
+ free(out_buf);
+}
+
int main(int argc, char **argv)
{
if (argc < 2)
errx(EINVAL, "expect argments: file");
+
+ atexit(cleanup);
init_args(argv);
init_buffers();
run_tests();
close(test_fd);
free(out_buf);
> +
> +static void read_queue_attrs(const char *path)
> +{
> + char attr[PATH_MAX];
> +
> + if (snprintf(attr, sizeof(attr), "%s/max_segments", path) < 0)
> + err(errno, "max_segments");
> + read_sysfs_attr(attr, &max_segments);
> +
> + if (snprintf(attr, sizeof(attr), "%s/dma_alignment", path) < 0)
> + err(errno, "dma_alignment");
> + read_sysfs_attr(attr, &dma_alignment);
> +
> + if (snprintf(attr, sizeof(attr), "%s/virt_boundary_mask", path) < 0)
> + err(errno, "virt_boundary_mask");
> + read_sysfs_attr(attr, &virt_boundary);
> +
> + if (snprintf(attr, sizeof(attr), "%s/logical_block_size", path) < 0)
> + err(errno, "logical_block_size");
> + read_sysfs_attr(attr, &logical_block_size);
> +
> + if (snprintf(attr, sizeof(attr), "%s/max_sectors_kb", path) < 0)
> + err(errno, "max_sectors_kb");
> + read_sysfs_attr(attr, &max_bytes);
> +
> + max_bytes *= 1024;
> + dma_alignment++;
> + virt_boundary++;
> +
> + /*
> + printf("logical_block_size:%lu dma_alignment:%lu virt_boundary:%lu max_segments:%lu max_bytes:%lu\n",
> + logical_block_size, dma_alignment, virt_boundary, max_segments, max_bytes);
> + */
debug comment? either remove or guard with g_debug?
> +}
> +
> +static void init_args(char **argv)
> +{
> + char sys_path[PATH_MAX];
> +
> + test_fd = open(argv[1], O_RDWR | O_CREAT | O_TRUNC | O_DIRECT);
> + if (test_fd < 0)
> + err(errno, "%s: failed to open %s", __func__, argv[1]);
> +
> + init_kernel_version();
> + find_block_device(argv[1], sys_path, sizeof(sys_path));
> + read_queue_attrs(sys_path);
> +
> + if (!power_of_2(virt_boundary) ||
> + !power_of_2(dma_alignment) ||
> + !power_of_2(logical_block_size))
> + err(EINVAL, "%s: bad parameters", __func__);
> +
> + if (virt_boundary > 1 && virt_boundary < logical_block_size)
> + err(EINVAL, "%s: virt_boundary:%lu logical_block_size:%lu", __func__,
> + virt_boundary, logical_block_size);
> +
> + if (dma_alignment > logical_block_size)
> + err(EINVAL, "%s: dma_alignment:%lu logical_block_size:%lu", __func__,
> + dma_alignment, logical_block_size);
> +
> + if (max_segments > 4096)
> + max_segments = 4096;
> + if (max_bytes > 16384 * 1024)
> + max_bytes = 16384 * 1024;
> + if (max_bytes & (logical_block_size - 1))
> + max_bytes -= max_bytes & (logical_block_size - 1);
> + pagesize = sysconf(_SC_PAGE_SIZE);
> +}
> +
> +static void init_buffers()
> +{
> + unsigned long lb_mask = logical_block_size - 1;
> + int fd, ret;
> +
> + buf_size = max_bytes * max_segments / 2;
> + if (buf_size < logical_block_size * max_segments)
> + err(EINVAL, "%s: logical block size is too big", __func__);
> +
> + if (buf_size < logical_block_size * 1024 * 4)
> + buf_size = logical_block_size * 1024 * 4;
> +
> + if (buf_size & lb_mask)
> + buf_size = (buf_size + lb_mask) & ~(lb_mask);
> +
> + ret = posix_memalign((void **)&in_buf, pagesize, buf_size);
> + if (ret)
> + err(EINVAL, "%s: failed to allocate in-buf", __func__);
> +
> + ret = posix_memalign((void **)&out_buf, pagesize, buf_size);
> + if (ret)
> + err(EINVAL, "%s: failed to allocate out-buf", __func__);
> +
> + fd = open("/dev/urandom", O_RDONLY);
> + if (fd < 0)
> + err(EINVAL, "%s: failed to open urandom", __func__);
> +
> + ret = read(fd, out_buf, buf_size);
> + if (ret < 0)
> + err(EINVAL, "%s: failed to randomize output buffer", __func__);
> +
> + close(fd);
> +}
> +
> +static void __compare(void *a, void *b, size_t size, const char *test)
> +{
> + if (!memcmp(a, b, size))
> + return;
> + err(EIO, "%s: data corruption", test);
> +}
> +#define compare(a, b, size) __compare(a, b, size, __func__)
> +
> +/*
> + * Test using page aligned buffers, single source
> + *
> + * Total size is aligned to a logical block size and exceeds the max transfer
> + * size as well as the max segments. This should test the kernel's split bio
> + * construction and bio splitting for exceeding these limits.
> + */
> +static void test_full_size_aligned()
> +{
> + int ret;
> +
> + memset(in_buf, 0, buf_size);
> + ret = pwrite(test_fd, out_buf, buf_size, 0);
> + if (ret < 0)
> + err(errno, "%s: failed to write buf", __func__);
> +
> + ret = pread(test_fd, in_buf, buf_size, 0);
> + if (ret < 0)
> + err(errno, "%s: failed to read buf", __func__);
> +
> + compare(out_buf, in_buf, buf_size);
> +}
> +
do we need to check for partial complete I/O ? something like this totally untested:-
iff --git a/src/dio-offsets.c b/src/dio-offsets.c
index a8c4d3e..b2f3e4a 100644
--- a/src/dio-offsets.c
+++ b/src/dio-offsets.c
@@ -280,6 +280,42 @@ static void init_args(char **argv)
err(errno, "%s: failed to open %s", __func__, argv[1]);
}
+/*
+ * Verify that pread/pwrite transferred the expected number of bytes.
+ *
+ * POSIX allows short transfers even on success (ret >= 0). For direct I/O
+ * testing, we need exact transfers - any short transfer indicates a problem.
+ */
+static void check_io_result(ssize_t ret, size_t expected, const char *op,
+ const char *func)
+{
+ if (ret < 0)
+ err(errno, "%s: %s failed", func, op);
+ if (ret != (ssize_t)expected)
+ errx(EIO, "%s: short %s: expected %zu bytes, got %zd bytes",
+ func, op, expected, ret);
+}
+
+/*
+ * Calculate total size of iovec array and verify transfer completed fully.
+ *
+ * For preadv/pwritev, we need to sum all iov_len fields to get the expected
+ * total transfer size, then verify the actual return matches.
+ */
+static void check_iov_result(ssize_t ret, const struct iovec *iov, int iovcnt,
+ const char *op, const char *func)
+{
+ size_t expected = 0;
+ int i;
+
+ for (i = 0; i < iovcnt; i++)
+ expected += iov[i].iov_len;
+
+ if (ret < 0)
+ err(errno, "%s: %s failed", func, op);
+ if (ret != (ssize_t)expected)
+ errx(EIO, "%s: short %s: expected %zu bytes, got %zd bytes",
+ func, op, expected, ret);
+}
static void init_buffers()
{
@@ -468,10 +504,8 @@ static void test_full_size_aligned()
memset(in_buf, 0, buf_size);
ret = pwrite(test_fd, out_buf, buf_size, 0);
- if (ret < 0)
- err(errno, "%s: failed to write buf", __func__);
+ check_io_result(ret, buf_size, "write", __func__);
ret = pread(test_fd, in_buf, buf_size, 0);
- if (ret < 0)
- err(errno, "%s: failed to read buf", __func__);
+ check_io_result(ret, buf_size, "read", __func__);
compare(out_buf, in_buf, buf_size);
@@ -489,10 +523,8 @@ static void test_dma_aligned()
memset(in_buf, 0, buf_size);
ret = pwrite(test_fd, out_buf + dma_alignment, max_bytes, 0);
- if (ret < 0)
- err(errno, "%s: failed to write buf", __func__);
+ check_io_result(ret, max_bytes, "write", __func__);
ret = pread(test_fd, in_buf + dma_alignment, max_bytes, 0);
- if (ret < 0)
- err(errno, "%s: failed to read buf", __func__);
+ check_io_result(ret, max_bytes, "read", __func__);
compare(out_buf + dma_alignment, in_buf + dma_alignment, max_bytes);
@@ -515,8 +547,7 @@ static void test_page_aligned_vectors()
}
ret = pwritev(test_fd, iov, vecs, 0);
- if (ret < 0)
- err(errno, "%s: failed to write buf", __func__);
+ check_iov_result(ret, iov, vecs, "writev", __func__);
for (i = 0; i < vecs; i++) {
offset = logical_block_size * i * 4;
@@ -525,8 +556,7 @@ static void test_page_aligned_vectors()
}
ret = preadv(test_fd, iov, vecs, 0);
- if (ret < 0)
- err(errno, "%s: failed to read buf", __func__);
+ check_iov_result(ret, iov, vecs, "readv", __func__);
for (i = 0; i < vecs; i++) {
offset = logical_block_size * i * 4;
@@ -551,8 +581,7 @@ static void test_dma_aligned_vectors()
}
ret = pwritev(test_fd, iov, vecs, 0);
- if (ret < 0)
- err(errno, "%s: failed to write buf", __func__);
+ check_iov_result(ret, iov, vecs, "writev", __func__);
for (i = 0; i < vecs; i++) {
offset = logical_block_size * i * 8 + dma_alignment * (i + 1);
@@ -561,8 +590,7 @@ static void test_dma_aligned_vectors()
}
ret = preadv(test_fd, iov, vecs, 0);
- if (ret < 0)
- err(errno, "%s: failed to read buf", __func__);
+ check_iov_result(ret, iov, vecs, "readv", __func__);
for (i = 0; i < vecs; i++) {
offset = logical_block_size * i * 8 + dma_alignment * (i + 1);
@@ -599,11 +627,13 @@ static void test_unaligned_page_vectors()
}
ret = pwritev(test_fd, iov, vecs, 0);
- if (ret < 0) {
- if (!should_fail)
- err(errno, "%s: unexpected failure", __func__);
+ if (ret < 0 && should_fail)
return;
- }
+ check_iov_result(ret, iov, vecs, "writev", __func__);
if (should_fail)
errx(EINVAL, "%s: expected failure, but succeeded", __func__);
@@ -615,8 +639,7 @@ static void test_unaligned_page_vectors()
}
ret = preadv(test_fd, iov, vecs, 0);
- if (ret < 0)
- err(errno, "%s: failed to read buf", __func__);
+ check_iov_result(ret, iov, vecs, "readv", __func__);
for (i = 0; i < vecs; i++) {
offset = logical_block_size * i * 8;
@@ -653,11 +676,13 @@ static void test_unaligned_vectors()
}
ret = pwritev(test_fd, iov, vecs, 0);
- if (ret < 0) {
- if (!should_fail)
- err(errno, "%s: unexpected failure", __func__);
+ if (ret < 0 && should_fail)
return;
- }
+ check_iov_result(ret, iov, vecs, "writev", __func__);
if (should_fail)
errx(EINVAL, "%s: expected failure, but succeeded", __func__);
@@ -669,8 +688,7 @@ static void test_unaligned_vectors()
}
ret = preadv(test_fd, iov, vecs, 0);
- if (ret < 0)
- err(errno, "%s: failed to read buf", __func__);
+ check_iov_result(ret, iov, vecs, "readv", __func__);
for (i = 0; i < vecs; i++) {
offset = logical_block_size * i * 4 + logical_block_size / 2;
@@ -693,8 +711,7 @@ static void test_virt_boundary_vectors()
}
ret = pwritev(test_fd, iov, vecs, 0);
- if (ret < 0)
- err(errno, "%s: failed to write buf", __func__);
+ check_iov_result(ret, iov, vecs, "writev", __func__);
for (i = 0; i < vecs; i++)
iov[i].iov_base = in_buf + (virt_boundary + 1) * i;
@@ -719,8 +736,7 @@ static void test_max_segments()
}
ret = pwritev(test_fd, iov, max_segments, 0);
- if (ret < 0)
- err(errno, "%s: failed to write buf", __func__);
+ check_iov_result(ret, iov, max_segments, "writev", __func__);
}
/*
@@ -737,10 +753,8 @@ static void test_single_block(unsigned long offset, unsigned long size)
memset(in_buf, 0, buf_size);
ret = pwrite(test_fd, out_buf + offset, size, 0);
- if (ret < 0)
- err(errno, "%s: failed to write buf", __func__);
+ check_io_result(ret, size, "write", __func__);
ret = pread(test_fd, in_buf + offset, size, 0);
- if (ret < 0)
- err(errno, "%s: failed to read buf", __func__);
+ check_io_result(ret, size, "read", __func__);
compare(in_buf + offset, out_buf + offset, size);
@@ -776,11 +790,13 @@ static void test_unaligned_single_vectors()
}
ret = pwritev(test_fd, iov, vecs, 0);
- if (ret < 0) {
- if (!should_fail)
- err(errno, "%s: unexpected failure", __func__);
+ if (ret < 0 && should_fail)
continue;
- }
+ check_iov_result(ret, iov, vecs, "writev", __func__);
if (should_fail)
errx(EINVAL, "%s: expected failure, but succeeded", __func__);
@@ -793,8 +803,7 @@ static void test_unaligned_single_vectors()
}
ret = preadv(test_fd, iov, vecs, 0);
- if (ret < 0)
- err(errno, "%s: failed to read buf", __func__);
+ check_iov_result(ret, iov, vecs, "readv", __func__);
for (j = 0; j < vecs; j++) {
offset = i + j;
@@ -833,11 +842,13 @@ static void test_aligned_mixed_vectors()
}
ret = pwritev(test_fd, iov, vecs, 0);
- if (ret < 0) {
- if (!should_fail)
- err(errno, "%s: unexpected failure", __func__);
+ if (ret < 0 && should_fail)
continue;
- }
+ check_iov_result(ret, iov, vecs, "writev", __func__);
if (should_fail)
errx(EINVAL, "%s: expected failure, but succeeded", __func__);
@@ -850,8 +855,7 @@ static void test_aligned_mixed_vectors()
}
ret = preadv(test_fd, iov, vecs, 0);
- if (ret < 0)
- err(errno, "%s: failed to read buf", __func__);
+ check_iov_result(ret, iov, vecs, "readv", __func__);
for (j = 0; j < vecs; j++) {
offset = (i + j) * pagesize;
@@ -875,8 +879,7 @@ static void test_max_bytes()
}
ret = pwritev(test_fd, iov, vecs, 0);
- if (ret < 0)
- err(errno, "%s: failed to write buf", __func__);
+ check_iov_result(ret, iov, vecs, "writev", __func__);
}
static void test_max_bytes_plus_one()
@@ -893,8 +896,7 @@ static void test_max_bytes_plus_one()
}
ret = pwritev(test_fd, iov, vecs, 0);
- if (ret < 0)
- err(errno, "%s: failed to write buf", __func__);
+ check_iov_result(ret, iov, vecs, "writev", __func__);
}
static void test_max_bytes_minus_one()
@@ -909,8 +911,7 @@ static void test_max_bytes_minus_one()
}
ret = pwritev(test_fd, iov, vecs, 0);
- if (ret < 0)
- err(errno, "%s: failed to write buf", __func__);
+ check_iov_result(ret, iov, vecs, "writev", __func__);
}
static void run_tests()
-ck
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH blktests] create a test for direct io offsets
2025-10-20 21:03 ` Keith Busch
@ 2025-10-21 1:32 ` Shinichiro Kawasaki
0 siblings, 0 replies; 14+ messages in thread
From: Shinichiro Kawasaki @ 2025-10-21 1:32 UTC (permalink / raw)
To: Keith Busch; +Cc: Keith Busch, linux-block@vger.kernel.org
On Oct 20, 2025 / 15:03, Keith Busch wrote:
> On Mon, Oct 20, 2025 at 12:40:07PM +0000, Shinichiro Kawasaki wrote:
[...]
> > > + if (ret < 0) {
> > > + if (should_fail)
> > > + return;
> > > + err(errno, "%s: failed to write buf", __func__);
> >
> > When I ran the test with QEMU SATA drive, it failed here. The drive
> > had virt_boundary=1. I'm not sure if it means test side bug or kernel side bug.
>
> Hm, I tested QEMU's AHCI and that seems okay. Is that what you're using
> to attach the "SATA" drive? I'd like to recreate this and figure out
> what's going on here.
I used the option below to prepare the QEMU SATA emulation drive:
-drive id=sata_disk,file=/home/shin/Blktests/imgs/sata_testnode2.img,if=none -device ahci,id=ahci -device ide-hd,drive=sata_disk,bus=ahci.0
I also tried the simpler option below and it also triggered the failure:
-drive file=/home/shin/Blktests/imgs/hdd_testnode1_2.img,index=1
I also ran the test case with a baremetal machine and a real SATA HDD,
then I observed the failure. FYI, the blktests output looks like this:
block/042 => sda (Test unusual direct-io offsets) [failed]
runtime 7.281s ... 7.484s
--- tests/block/042.out 2025-10-20 19:57:01.314867376 +0900
+++ /home/shin/kts/kernel-test-suite/src/blktests/results/sda/block/042.out.bad 2025-10-21 10:22:22.226811120 +0900
@@ -1,2 +1,4 @@
Running block/042
+dio-offsets: test_unaligned_vectors: failed to write buf: Invalid argument
+src/dio-offsets failed
Test complete
I enabled the commented out debug print in dio-offsets.c, and it added the
output below:
+logical_block_size:512 dma_alignment:512 virt_boundary:1 max_segments:168 max_bytes:4194304
>
> If this test says virt_boundary is 1 for your device, that should mean
> there is no virtual bounday to consider; this test should succeed with
> those reported queue limits.
I see, then there should be some other cause...
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH blktests] create a test for direct io offsets
2025-10-20 16:20 ` Keith Busch
@ 2025-10-21 5:28 ` Christoph Hellwig
2025-10-21 21:22 ` Keith Busch
0 siblings, 1 reply; 14+ messages in thread
From: Christoph Hellwig @ 2025-10-21 5:28 UTC (permalink / raw)
To: Keith Busch
Cc: Christoph Hellwig, Keith Busch, linux-block, shinichiro.kawasaki
On Mon, Oct 20, 2025 at 10:20:40AM -0600, Keith Busch wrote:
> In addition to a flag to say direct-io can use block data split among
> multiple vectors, there's two currently unreported attributes you need
> to know in order to successfuly use direct-io like this: max sectors and
> virtual boundary mask. The max sectors attribute is needed to know
> exactly how many vectors you can insert before the total length needs to
> add up to a block size.
>
> Adding these to statx or fsxattr will help remove a ton of crap from
> this test since I could remove all the stuff extracting it out of sysfs
> from any given file.
Yes. And all the applications would have to do the same things, so this
seems like a huge win. Any chance you could try to get this done ASAP
so that we could make the interface fully discoverable before 6.18 is
released?
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH blktests] create a test for direct io offsets
2025-10-21 5:28 ` Christoph Hellwig
@ 2025-10-21 21:22 ` Keith Busch
2025-10-22 4:46 ` Christoph Hellwig
0 siblings, 1 reply; 14+ messages in thread
From: Keith Busch @ 2025-10-21 21:22 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Keith Busch, linux-block, shinichiro.kawasaki
On Mon, Oct 20, 2025 at 10:28:09PM -0700, Christoph Hellwig wrote:
> seems like a huge win. Any chance you could try to get this done ASAP
> so that we could make the interface fully discoverable before 6.18 is
> released?
I just want to make sure I am aligned to what you have in mind. Is this
something like this what you're looking for? This reports the kernel's
ability to handle a dio with memory that is discontiguous for a single
device "sector", and reports the virtual gap requirements.
---
diff --git a/block/bdev.c b/block/bdev.c
index 810707cca9703..5881edc3bf928 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -1330,6 +1330,12 @@ void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask)
stat->result_mask |= STATX_DIOALIGN;
}
+ if (request_mask & STATX_DIO_VIRT_SPLIT) {
+ stat->dio_virt_boundary_mask = queue_virt_boundary(
+ bdev->bd_queue);
+ stat->result_mask |= STATX_DIO_VIRT_SPLIT;
+ }
+
if (request_mask & STATX_WRITE_ATOMIC && bdev_can_atomic_write(bdev)) {
struct request_queue *bd_queue = bdev->bd_queue;
@@ -1339,6 +1345,10 @@ void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask)
0);
}
+ if (bdev_max_segments(bdev) > 1)
+ stat->attributes |= STATX_ATTR_DIO_VEC_SPLIT;
+ stat->attributes_mask |= STATX_ATTR_DIO_VEC_SPLIT;
+
stat->blksize = bdev_io_min(bdev);
blkdev_put_no_open(bdev);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e99306a8f47ce..aec3ed6356aa8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -6105,6 +6105,13 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path,
}
}
+ if (request_mask & STATX_DIO_VIRT_SPLIT) {
+ struct block_device *bdev = inode->i_sb->s_bdev;
+
+ stat->dio_virt_boundary_mask = queue_virt_boundary(bdev->bd_queue);
+ stat->result_mask |= STATX_DIO_VIRT_SPLIT;
+ }
+
if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned int awu_min = 0, awu_max = 0;
diff --git a/fs/stat.c b/fs/stat.c
index 6c79661e1b961..7f0e12b3b82e8 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -738,6 +738,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
tmp.stx_dev_minor = MINOR(stat->dev);
tmp.stx_mnt_id = stat->mnt_id;
tmp.stx_dio_mem_align = stat->dio_mem_align;
+ tmp.stx_dio_virtual_boundary_mask = stat->dio_virt_boundary_mask;
tmp.stx_dio_offset_align = stat->dio_offset_align;
tmp.stx_dio_read_offset_align = stat->dio_read_offset_align;
tmp.stx_subvol = stat->subvol;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index caff0125faeac..ce3234d1f9377 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -599,6 +599,18 @@ xfs_report_dioalign(
stat->dio_offset_align = stat->dio_read_offset_align;
}
+static void
+xfs_report_dio_split(
+ struct xfs_inode *ip,
+ struct kstat *stat)
+{
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ struct block_device *bdev = target->bt_bdev;
+
+ stat->dio_virt_boundary_mask = queue_virt_boundary(bdev->bd_queue);
+ stat->result_mask |= STATX_DIO_VIRT_SPLIT;
+}
+
unsigned int
xfs_get_atomic_write_min(
struct xfs_inode *ip)
@@ -743,6 +755,8 @@ xfs_vn_getattr(
xfs_report_dioalign(ip, stat);
if (request_mask & STATX_WRITE_ATOMIC)
xfs_report_atomic_write(ip, stat);
+ if (request_mask & STATX_DIO_VIRT_SPLIT)
+ xfs_report_dio_split(ip, stat);
fallthrough;
default:
stat->blksize = xfs_stat_blksize(ip);
diff --git a/include/linux/stat.h b/include/linux/stat.h
index e3d00e7bb26d9..57088eedb6f92 100644
--- a/include/linux/stat.h
+++ b/include/linux/stat.h
@@ -55,6 +55,7 @@ struct kstat {
u32 dio_mem_align;
u32 dio_offset_align;
u32 dio_read_offset_align;
+ u32 dio_virt_boundary_mask;
u32 atomic_write_unit_min;
u32 atomic_write_unit_max;
u32 atomic_write_unit_max_opt;
diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h
index 1686861aae20a..3f79359cf7154 100644
--- a/include/uapi/linux/stat.h
+++ b/include/uapi/linux/stat.h
@@ -184,7 +184,9 @@ struct statx {
/* Optimised max atomic write unit in bytes */
__u32 stx_atomic_write_unit_max_opt;
- __u32 __spare2[1];
+
+ /* Virtual boundary mask between io vectors that require a split */
+ __u32 stx_dio_virtual_boundary_mask;
/* 0xc0 */
__u64 __spare3[8]; /* Spare space for future expansion */
@@ -219,6 +221,7 @@ struct statx {
#define STATX_SUBVOL 0x00008000U /* Want/got stx_subvol */
#define STATX_WRITE_ATOMIC 0x00010000U /* Want/got atomic_write_* fields */
#define STATX_DIO_READ_ALIGN 0x00020000U /* Want/got dio read alignment info */
+#define STATX_DIO_VIRT_SPLIT 0x00040000U /* Want/got dio virtual boundary split */
#define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */
@@ -255,6 +258,7 @@ struct statx {
#define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */
#define STATX_ATTR_DAX 0x00200000 /* File is currently in DAX state */
#define STATX_ATTR_WRITE_ATOMIC 0x00400000 /* File supports atomic write operations */
+#define STATX_ATTR_DIO_VEC_SPLIT 0x00800000 /* File supports vector crossing dio payloads */
#endif /* _UAPI_LINUX_STAT_H */
--
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH blktests] create a test for direct io offsets
2025-10-21 21:22 ` Keith Busch
@ 2025-10-22 4:46 ` Christoph Hellwig
2025-11-17 21:53 ` Keith Busch
0 siblings, 1 reply; 14+ messages in thread
From: Christoph Hellwig @ 2025-10-22 4:46 UTC (permalink / raw)
To: Keith Busch
Cc: Christoph Hellwig, Keith Busch, linux-block, shinichiro.kawasaki
On Tue, Oct 21, 2025 at 03:22:08PM -0600, Keith Busch wrote:
> On Mon, Oct 20, 2025 at 10:28:09PM -0700, Christoph Hellwig wrote:
> > seems like a huge win. Any chance you could try to get this done ASAP
> > so that we could make the interface fully discoverable before 6.18 is
> > released?
>
> I just want to make sure I am aligned to what you have in mind. Is this
> something like this what you're looking for? This reports the kernel's
> ability to handle a dio with memory that is discontiguous for a single
> device "sector", and reports the virtual gap requirements.
So, I think Christian really did not want more random stuff in statx,
which would lead to using fsxattr instead.
Question: Should we even advertise virt_boundary based unaligned
segment support? It is almost impossible to use correctly unless I'm
missing something, so the better idea might be to just not offer the
value when using PRPs?
> + if (request_mask & STATX_DIO_VIRT_SPLIT) {
> + stat->dio_virt_boundary_mask = queue_virt_boundary(
> + bdev->bd_queue);
Consumer code like file systems, or the bdev fake file system should
never see the queue (and yes, I realize the atomic code just below got
this wrong as well). Please either add a bdev_virt_boundary wrapper,
or use bdev_limit and just access the limits directly.
> + if (bdev_max_segments(bdev) > 1)
> + stat->attributes |= STATX_ATTR_DIO_VEC_SPLIT;
> + stat->attributes_mask |= STATX_ATTR_DIO_VEC_SPLIT;
> + if (request_mask & STATX_DIO_VIRT_SPLIT) {
> + struct block_device *bdev = inode->i_sb->s_bdev;
> +
> + stat->dio_virt_boundary_mask = queue_virt_boundary(bdev->bd_queue);
> + stat->result_mask |= STATX_DIO_VIRT_SPLIT;
> + }
> +static void
> +xfs_report_dio_split(
> + struct xfs_inode *ip,
> + struct kstat *stat)
> +{
> + struct xfs_buftarg *target = xfs_inode_buftarg(ip);
> + struct block_device *bdev = target->bt_bdev;
> +
> + stat->dio_virt_boundary_mask = queue_virt_boundary(bdev->bd_queue);
> + stat->result_mask |= STATX_DIO_VIRT_SPLIT;
> +}
> +
Why does the bdev set different flags from the file system? Shouldn't
the capabilities be the same? And we'll probably want a little helper
to set these based on the bdev instead of opencoding the logic in
multiple places.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH blktests] create a test for direct io offsets
2025-10-22 4:46 ` Christoph Hellwig
@ 2025-11-17 21:53 ` Keith Busch
2025-11-18 5:54 ` Christoph Hellwig
0 siblings, 1 reply; 14+ messages in thread
From: Keith Busch @ 2025-11-17 21:53 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Keith Busch, linux-block, shinichiro.kawasaki
On Tue, Oct 21, 2025 at 09:46:30PM -0700, Christoph Hellwig wrote:
> On Tue, Oct 21, 2025 at 03:22:08PM -0600, Keith Busch wrote:
> > On Mon, Oct 20, 2025 at 10:28:09PM -0700, Christoph Hellwig wrote:
> > > seems like a huge win. Any chance you could try to get this done ASAP
> > > so that we could make the interface fully discoverable before 6.18 is
> > > released?
> >
> > I just want to make sure I am aligned to what you have in mind. Is this
> > something like this what you're looking for? This reports the kernel's
> > ability to handle a dio with memory that is discontiguous for a single
> > device "sector", and reports the virtual gap requirements.
>
> So, I think Christian really did not want more random stuff in statx,
> which would lead to using fsxattr instead.
I haven't forgotten about this. I was hoping I would make sense of the
request. It looks like only xfs makes use of fsxattr (it's the only one
that calls copy_fsxattr_to_user()). Is the intention that every
filesystem needs to implement support for fsxattr then?
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH blktests] create a test for direct io offsets
2025-11-17 21:53 ` Keith Busch
@ 2025-11-18 5:54 ` Christoph Hellwig
0 siblings, 0 replies; 14+ messages in thread
From: Christoph Hellwig @ 2025-11-18 5:54 UTC (permalink / raw)
To: Keith Busch
Cc: Christoph Hellwig, Keith Busch, linux-block, shinichiro.kawasaki
On Mon, Nov 17, 2025 at 02:53:49PM -0700, Keith Busch wrote:
> On Tue, Oct 21, 2025 at 09:46:30PM -0700, Christoph Hellwig wrote:
> > On Tue, Oct 21, 2025 at 03:22:08PM -0600, Keith Busch wrote:
> > > On Mon, Oct 20, 2025 at 10:28:09PM -0700, Christoph Hellwig wrote:
> > > > seems like a huge win. Any chance you could try to get this done ASAP
> > > > so that we could make the interface fully discoverable before 6.18 is
> > > > released?
> > >
> > > I just want to make sure I am aligned to what you have in mind. Is this
> > > something like this what you're looking for? This reports the kernel's
> > > ability to handle a dio with memory that is discontiguous for a single
> > > device "sector", and reports the virtual gap requirements.
> >
> > So, I think Christian really did not want more random stuff in statx,
> > which would lead to using fsxattr instead.
>
> I haven't forgotten about this. I was hoping I would make sense of the
> request. It looks like only xfs makes use of fsxattr (it's the only one
> that calls copy_fsxattr_to_user()). Is the intention that every
> filesystem needs to implement support for fsxattr then?
In addition to XFS, the generic ioctl_fsgetxattr, which is directly
wired up to FS_IOC_FSGETXATTR processing, uses it; XFS only has its own
version because it has a special variant of that for xattrs.
->fileattr_get is already implemented by most file systems, so
FS_IOC_FSGETXATTR works widely.
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2025-11-18 5:54 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-10-14 20:54 [PATCH blktests] create a test for direct io offsets Keith Busch
2025-10-14 21:21 ` Bart Van Assche
2025-10-14 21:59 ` Keith Busch
2025-10-17 11:13 ` Christoph Hellwig
2025-10-20 16:20 ` Keith Busch
2025-10-21 5:28 ` Christoph Hellwig
2025-10-21 21:22 ` Keith Busch
2025-10-22 4:46 ` Christoph Hellwig
2025-11-17 21:53 ` Keith Busch
2025-11-18 5:54 ` Christoph Hellwig
2025-10-20 12:40 ` Shinichiro Kawasaki
2025-10-20 21:03 ` Keith Busch
2025-10-21 1:32 ` Shinichiro Kawasaki
2025-10-20 23:25 ` Chaitanya Kulkarni
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox