public inbox for linux-xfs@vger.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH] generic: test reads racing with slow reflink operations
@ 2023-10-19 20:09 Darrick J. Wong
  2023-10-20  6:09 ` Christoph Hellwig
  2023-10-20 13:37 ` Catherine Hoang
  0 siblings, 2 replies; 5+ messages in thread
From: Darrick J. Wong @ 2023-10-19 20:09 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Catherine Hoang, linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

XFS has a rather slow reflink operation.  While a reflink operation is
running, other programs cannot read the contents of the source file,
which is causing latency spikes.  Catherine Hoang wrote a patch to
permit reads, since the source file contents do  not change.  This is a
functionality test for that patch.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 src/Makefile              |    2 
 src/t_reflink_read_race.c |  343 +++++++++++++++++++++++++++++++++++++++++++++
 tests/generic/1953        |   74 ++++++++++
 tests/generic/1953.out    |    6 +
 4 files changed, 424 insertions(+), 1 deletion(-)
 create mode 100644 src/t_reflink_read_race.c
 create mode 100755 tests/generic/1953
 create mode 100644 tests/generic/1953.out

diff --git a/src/Makefile b/src/Makefile
index 72c8a13007..b5e2b84dae 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -33,7 +33,7 @@ LINUX_TARGETS = xfsctl bstat t_mtab getdevicesize preallo_rw_pattern_reader \
 	attr_replace_test swapon mkswap t_attr_corruption t_open_tmpfiles \
 	fscrypt-crypt-util bulkstat_null_ocount splice-test chprojid_fail \
 	detached_mounts_propagation ext4_resize t_readdir_3 splice2pipe \
-	uuid_ioctl usemem_and_swapoff
+	uuid_ioctl usemem_and_swapoff t_reflink_read_race
 
 EXTRA_EXECS = dmerror fill2attr fill2fs fill2fs_check scaleread.sh \
 	      btrfs_crc32c_forged_name.py popdir.pl popattr.py \
diff --git a/src/t_reflink_read_race.c b/src/t_reflink_read_race.c
new file mode 100644
index 0000000000..acf8f8f73c
--- /dev/null
+++ b/src/t_reflink_read_race.c
@@ -0,0 +1,343 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Race reads with reflink to see if reads continue while we're cloning.
+ * Copyright 2023 Oracle.  All rights reserved.
+ */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <time.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+
+#ifndef FICLONE
+# define FICLONE	_IOW(0x94, 9, int)
+#endif
+
+static pid_t mypid;
+
+static FILE *outcome_fp;
+
+/* Significant timestamps.  Initialized to zero */
+static double program_start, clone_start, clone_end;
+
+/* Coordinates access to timestamps */
+static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+
+struct readinfo {
+	int fd;
+	unsigned int blocksize;
+	char *descr;
+};
+
+struct cloneinfo {
+	int src_fd, dst_fd;
+};
+
+#define MSEC_PER_SEC	1000
+#define NSEC_PER_MSEC	1000000
+
+/*
+ * Assume that it shouldn't take longer than 100ms for the FICLONE ioctl to
+ * enter the kernel and take locks on an uncontended file.  This is also the
+ * required CLOCK_MONOTONIC granularity.
+ */
+#define EARLIEST_READ_MS	(MSEC_PER_SEC / 10)
+
+/*
+ * We want to be reasonably sure that the FICLONE takes long enough that any
+ * read initiated after clone operation locked the source file could have
+ * completed a disk IO before the clone finishes.  Therefore, we require that
+ * the clone operation must take at least 4x the maximum setup time.
+ */
+#define MINIMUM_CLONE_MS	(EARLIEST_READ_MS * 5)
+
+static double timespec_to_msec(const struct timespec *ts)
+{
+	return (ts->tv_sec * (double)MSEC_PER_SEC) +
+	       (ts->tv_nsec / (double)NSEC_PER_MSEC);
+}
+
+static void sleep_ms(unsigned int len)
+{
+	struct timespec time = {
+		.tv_nsec = len * NSEC_PER_MSEC,
+	};
+
+	nanosleep(&time, NULL);
+}
+
+static void outcome(const char *str)
+{
+	fprintf(outcome_fp, "%s\n", str);
+	fflush(outcome_fp);
+}
+
+static void *reader_fn(void *arg)
+{
+	struct timespec now;
+	struct readinfo *ri = arg;
+	char *buf = malloc(ri->blocksize);
+	loff_t pos = 0;
+	ssize_t copied;
+	int ret;
+
+	if (!buf) {
+		perror("alloc buffer");
+		goto kill_error;
+	}
+
+	/* Wait for the FICLONE to start */
+	pthread_mutex_lock(&lock);
+	while (clone_start < program_start) {
+		pthread_mutex_unlock(&lock);
+#ifdef DEBUG
+		printf("%s read waiting for clone to start; cs=%.2f ps=%.2f\n",
+				ri->descr, clone_start, program_start);
+		fflush(stdout);
+#endif
+		sleep_ms(1);
+		pthread_mutex_lock(&lock);
+	}
+	pthread_mutex_unlock(&lock);
+	sleep_ms(EARLIEST_READ_MS);
+
+	/* Read from the file... */
+	while ((copied = read(ri->fd, buf, ri->blocksize)) > 0) {
+		double read_completion;
+
+		ret = clock_gettime(CLOCK_MONOTONIC, &now);
+		if (ret) {
+			perror("clock_gettime");
+			goto kill_error;
+		}
+		read_completion = timespec_to_msec(&now);
+
+		/*
+		 * If clone_end is still zero, the FICLONE is still running.
+		 * If the read completion occurred a safe duration after the
+		 * start of the ioctl, then report that as an early read
+		 * completion.
+		 */
+		pthread_mutex_lock(&lock);
+		if (clone_end < program_start &&
+		    read_completion > clone_start + EARLIEST_READ_MS) {
+			pthread_mutex_unlock(&lock);
+
+			/* clone still running... */
+			printf("finished %s read early at %.2fms\n",
+					ri->descr,
+					read_completion - program_start);
+			fflush(stdout);
+			outcome("finished read early");
+			goto kill_done;
+		}
+		pthread_mutex_unlock(&lock);
+
+		sleep_ms(1);
+		pos += copied;
+	}
+	if (copied < 0) {
+		perror("read");
+		goto kill_error;
+	}
+
+	return NULL;
+kill_error:
+	outcome("reader error");
+kill_done:
+	kill(mypid, SIGTERM);
+	return NULL;
+}
+
+static void *clone_fn(void *arg)
+{
+	struct timespec now;
+	struct cloneinfo *ci = arg;
+	int ret;
+
+	/* Record the approximate start time of this thread */
+	ret = clock_gettime(CLOCK_MONOTONIC, &now);
+	if (ret) {
+		perror("clock_gettime clone start");
+		goto kill_error;
+	}
+	pthread_mutex_lock(&lock);
+	clone_start = timespec_to_msec(&now);
+	pthread_mutex_unlock(&lock);
+
+	printf("started clone at %.2fms\n", clone_start - program_start);
+	fflush(stdout);
+
+	/* Kernel call, only killable with a fatal signal */
+	ret = ioctl(ci->dst_fd, FICLONE, ci->src_fd);
+	if (ret) {
+		perror("FICLONE");
+		goto kill_error;
+	}
+
+	/* If the ioctl completes, note the completion time */
+	ret = clock_gettime(CLOCK_MONOTONIC, &now);
+	if (ret) {
+		perror("clock_gettime clone end");
+		goto kill_error;
+	}
+
+	pthread_mutex_lock(&lock);
+	clone_end = timespec_to_msec(&now);
+	pthread_mutex_unlock(&lock);
+
+	printf("finished clone at %.2fms\n",
+			clone_end - program_start);
+	fflush(stdout);
+
+	/* Complain if we didn't take long enough to clone. */
+	if (clone_end < clone_start + MINIMUM_CLONE_MS) {
+		printf("clone did not take enough time (%.2fms)\n",
+				clone_end - clone_start);
+		fflush(stdout);
+		outcome("clone too fast");
+		goto kill_done;
+	}
+
+	outcome("clone finished before any reads");
+	goto kill_done;
+
+kill_error:
+	outcome("clone error");
+kill_done:
+	kill(mypid, SIGTERM);
+	return NULL;
+}
+
+int main(int argc, char *argv[])
+{
+	struct cloneinfo ci;
+	struct readinfo bufio = { .descr = "buffered" };
+	struct readinfo directio = { .descr = "directio" };
+	struct timespec now;
+	pthread_t clone_thread, bufio_thread, directio_thread;
+	double clockres;
+	int ret;
+
+	if (argc != 4) {
+		printf("Usage: %s src_file dst_file outcome_file\n", argv[0]);
+		return 1;
+	}
+
+	mypid = getpid();
+
+	/* Check for sufficient clock precision. */
+	ret = clock_getres(CLOCK_MONOTONIC, &now);
+	if (ret) {
+		perror("clock_getres MONOTONIC");
+		return 2;
+	}
+	printf("CLOCK_MONOTONIC resolution: %llus %luns\n",
+			(unsigned long long)now.tv_sec,
+			(unsigned long)now.tv_nsec);
+	fflush(stdout);
+
+	clockres = timespec_to_msec(&now);
+	if (clockres > EARLIEST_READ_MS) {
+		fprintf(stderr, "insufficient CLOCK_MONOTONIC resolution\n");
+		return 2;
+	}
+
+	/*
+	 * Measure program start time since the MONOTONIC time base is not
+	 * all that well defined.
+	 */
+	ret = clock_gettime(CLOCK_MONOTONIC, &now);
+	if (ret) {
+		perror("clock_gettime MONOTONIC");
+		return 2;
+	}
+	if (now.tv_sec == 0 && now.tv_nsec == 0) {
+		fprintf(stderr, "Uhoh, start time is zero?!\n");
+		return 2;
+	}
+	program_start = timespec_to_msec(&now);
+
+	outcome_fp = fopen(argv[3], "w");
+	if (!outcome_fp) {
+		perror(argv[3]);
+		return 2;
+	}
+
+	/* Open separate fds for each thread */
+	ci.src_fd = open(argv[1], O_RDONLY);
+	if (ci.src_fd < 0) {
+		perror(argv[1]);
+		return 2;
+	}
+
+	ci.dst_fd = open(argv[2], O_RDWR | O_CREAT, 0600);
+	if (ci.dst_fd < 0) {
+		perror(argv[2]);
+		return 2;
+	}
+
+	bufio.fd = open(argv[1], O_RDONLY);
+	if (bufio.fd < 0) {
+		perror(argv[1]);
+		return 2;
+	}
+	bufio.blocksize = 37;
+
+	directio.fd = open(argv[1], O_RDONLY | O_DIRECT);
+	if (directio.fd < 0) {
+		perror(argv[1]);
+		return 2;
+	}
+	directio.blocksize = 512;
+
+	/* Start threads */
+	ret = pthread_create(&clone_thread, NULL, clone_fn, &ci);
+	if (ret) {
+		fprintf(stderr, "create clone thread: %s\n", strerror(ret));
+		return 2;
+	}
+
+	ret = pthread_create(&bufio_thread, NULL, reader_fn, &bufio);
+	if (ret) {
+		fprintf(stderr, "create bufio thread: %s\n", strerror(ret));
+		return 2;
+	}
+
+	ret = pthread_create(&directio_thread, NULL, reader_fn, &directio);
+	if (ret) {
+		fprintf(stderr, "create directio thread: %s\n", strerror(ret));
+		return 2;
+	}
+
+	/* Wait for threads */
+	ret = pthread_join(clone_thread, NULL);
+	if (ret) {
+		fprintf(stderr, "join clone thread: %s\n", strerror(ret));
+		return 2;
+	}
+
+	ret = pthread_join(bufio_thread, NULL);
+	if (ret) {
+		fprintf(stderr, "join bufio thread: %s\n", strerror(ret));
+		return 2;
+	}
+
+	ret = pthread_join(directio_thread, NULL);
+	if (ret) {
+		fprintf(stderr, "join directio thread: %s\n", strerror(ret));
+		return 2;
+	}
+
+	printf("Program should have killed itself?\n");
+	outcome("program failed to end early");
+	return 0;
+}
diff --git a/tests/generic/1953 b/tests/generic/1953
new file mode 100755
index 0000000000..058538e6fe
--- /dev/null
+++ b/tests/generic/1953
@@ -0,0 +1,74 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2023, Oracle and/or its affiliates.  All Rights Reserved.
+#
+# FS QA Test No. 1953
+#
+# Race file reads with a very slow reflink operation to see if the reads
+# actually complete while the reflink is ongoing.  This is a functionality
+# test for XFS commit f3ba4762fa56 "xfs: allow read IO and FICLONE to run
+# concurrently".
+#
+. ./common/preamble
+_begin_fstest auto clone punch
+
+# Import common functions.
+. ./common/filter
+. ./common/attr
+. ./common/reflink
+
+# real QA test starts here
+_require_scratch_reflink
+_require_cp_reflink
+_require_xfs_io_command "fpunch"
+_require_test_program "punch-alternating"
+_require_test_program "t_reflink_read_race"
+
+rm -f "$seqres.full"
+
+echo "Format and mount"
+_scratch_mkfs > "$seqres.full" 2>&1
+_scratch_mount >> "$seqres.full" 2>&1
+
+testdir="$SCRATCH_MNT/test-$seq"
+mkdir "$testdir"
+
+calc_space() {
+	blocks_needed=$(( 2 ** (fnr + 1) ))
+	space_needed=$((blocks_needed * blksz * 5 / 4))
+}
+
+# Figure out the number of blocks that we need to get the reflink runtime above
+# 1 seconds
+echo "Create a many-block file"
+for ((fnr = 1; fnr < 40; fnr++)); do
+	free_blocks=$(stat -f -c '%a' "$testdir")
+	blksz=$(_get_file_block_size "$testdir")
+	space_avail=$((free_blocks * blksz))
+	calc_space
+	test $space_needed -gt $space_avail && \
+		_notrun "Insufficient space for stress test; would only create $blocks_needed extents."
+
+	off=$(( (2 ** fnr) * blksz))
+	$XFS_IO_PROG -f -c "pwrite -S 0x61 -b 4194304 $off $off" "$testdir/file1" >> "$seqres.full"
+	"$here/src/punch-alternating" "$testdir/file1" >> "$seqres.full"
+
+	timeout 1s cp --reflink=always "$testdir/file1" "$testdir/garbage" || break
+done
+echo "fnr=$fnr" >> $seqres.full
+
+echo "Reflink the big file"
+$here/src/t_reflink_read_race "$testdir/file1" "$testdir/file2" \
+	"$testdir/outcome" &>> $seqres.full
+
+if [ ! -e "$testdir/outcome" ]; then
+	echo "Could not set up program"
+elif grep -q "finished read early" "$testdir/outcome"; then
+	echo "test completed successfully"
+else
+	cat "$testdir/outcome"
+fi
+
+# success, all done
+status=0
+exit
diff --git a/tests/generic/1953.out b/tests/generic/1953.out
new file mode 100644
index 0000000000..8eaacb7ff0
--- /dev/null
+++ b/tests/generic/1953.out
@@ -0,0 +1,6 @@
+QA output created by 1953
+Format and mount
+Create a many-block file
+Reflink the big file
+Terminated
+test completed successfully

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [RFC PATCH] generic: test reads racing with slow reflink operations
  2023-10-19 20:09 [RFC PATCH] generic: test reads racing with slow reflink operations Darrick J. Wong
@ 2023-10-20  6:09 ` Christoph Hellwig
  2023-10-20 15:22   ` Darrick J. Wong
  2023-10-20 13:37 ` Catherine Hoang
  1 sibling, 1 reply; 5+ messages in thread
From: Christoph Hellwig @ 2023-10-20  6:09 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Christoph Hellwig, Catherine Hoang, linux-xfs

On Thu, Oct 19, 2023 at 01:09:13PM -0700, Darrick J. Wong wrote:
> permit reads, since the source file contents do  not change.  This is a

duplicate white space before "not"

> +#ifndef FICLONE
> +# define FICLONE	_IOW(0x94, 9, int)
> +#endif

Can't we assume system headers with FICLONE by now?

> +# real QA test starts here
> +_require_scratch_reflink

This needs a _require_scratch before the _require_scratch_reflink.

Otherwise this looks good to me:

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RFC PATCH] generic: test reads racing with slow reflink operations
  2023-10-19 20:09 [RFC PATCH] generic: test reads racing with slow reflink operations Darrick J. Wong
  2023-10-20  6:09 ` Christoph Hellwig
@ 2023-10-20 13:37 ` Catherine Hoang
  1 sibling, 0 replies; 5+ messages in thread
From: Catherine Hoang @ 2023-10-20 13:37 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Christoph Hellwig, linux-xfs@vger.kernel.org

> On Oct 19, 2023, at 1:09 PM, Darrick J. Wong <djwong@kernel.org> wrote:
> 
> From: Darrick J. Wong <djwong@kernel.org>
> 
> XFS has a rather slow reflink operation.  While a reflink operation is
> running, other programs cannot read the contents of the source file,
> which is causing latency spikes.  Catherine Hoang wrote a patch to
> permit reads, since the source file contents do  not change.  This is a
> functionality test for that patch.
> 
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>

Looks good
Reviewed-by: Catherine Hoang <catherine.hoang@oracle.com>
> ---
> src/Makefile              |    2 
> src/t_reflink_read_race.c |  343 +++++++++++++++++++++++++++++++++++++++++++++
> tests/generic/1953        |   74 ++++++++++
> tests/generic/1953.out    |    6 +
> 4 files changed, 424 insertions(+), 1 deletion(-)
> create mode 100644 src/t_reflink_read_race.c
> create mode 100755 tests/generic/1953
> create mode 100644 tests/generic/1953.out
> 
> diff --git a/src/Makefile b/src/Makefile
> index 72c8a13007..b5e2b84dae 100644
> --- a/src/Makefile
> +++ b/src/Makefile
> @@ -33,7 +33,7 @@ LINUX_TARGETS = xfsctl bstat t_mtab getdevicesize preallo_rw_pattern_reader \
> attr_replace_test swapon mkswap t_attr_corruption t_open_tmpfiles \
> fscrypt-crypt-util bulkstat_null_ocount splice-test chprojid_fail \
> detached_mounts_propagation ext4_resize t_readdir_3 splice2pipe \
> - uuid_ioctl usemem_and_swapoff
> + uuid_ioctl usemem_and_swapoff t_reflink_read_race
> 
> EXTRA_EXECS = dmerror fill2attr fill2fs fill2fs_check scaleread.sh \
>      btrfs_crc32c_forged_name.py popdir.pl popattr.py \
> diff --git a/src/t_reflink_read_race.c b/src/t_reflink_read_race.c
> new file mode 100644
> index 0000000000..acf8f8f73c
> --- /dev/null
> +++ b/src/t_reflink_read_race.c
> @@ -0,0 +1,343 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Race reads with reflink to see if reads continue while we're cloning.
> + * Copyright 2023 Oracle.  All rights reserved.
> + */
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <sys/ioctl.h>
> +#include <linux/fs.h>
> +#include <time.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <signal.h>
> +#include <fcntl.h>
> +#include <pthread.h>
> +#include <stdio.h>
> +#include <errno.h>
> +#include <string.h>
> +
> +#ifndef FICLONE
> +# define FICLONE _IOW(0x94, 9, int)
> +#endif
> +
> +static pid_t mypid;
> +
> +static FILE *outcome_fp;
> +
> +/* Significant timestamps.  Initialized to zero */
> +static double program_start, clone_start, clone_end;
> +
> +/* Coordinates access to timestamps */
> +static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
> +
> +struct readinfo {
> + int fd;
> + unsigned int blocksize;
> + char *descr;
> +};
> +
> +struct cloneinfo {
> + int src_fd, dst_fd;
> +};
> +
> +#define MSEC_PER_SEC 1000
> +#define NSEC_PER_MSEC 1000000
> +
> +/*
> + * Assume that it shouldn't take longer than 100ms for the FICLONE ioctl to
> + * enter the kernel and take locks on an uncontended file.  This is also the
> + * required CLOCK_MONOTONIC granularity.
> + */
> +#define EARLIEST_READ_MS (MSEC_PER_SEC / 10)
> +
> +/*
> + * We want to be reasonably sure that the FICLONE takes long enough that any
> + * read initiated after clone operation locked the source file could have
> + * completed a disk IO before the clone finishes.  Therefore, we require that
> + * the clone operation must take at least 4x the maximum setup time.
> + */
> +#define MINIMUM_CLONE_MS (EARLIEST_READ_MS * 5)
> +
> +static double timespec_to_msec(const struct timespec *ts)
> +{
> + return (ts->tv_sec * (double)MSEC_PER_SEC) +
> +       (ts->tv_nsec / (double)NSEC_PER_MSEC);
> +}
> +
> +static void sleep_ms(unsigned int len)
> +{
> + struct timespec time = {
> + .tv_nsec = len * NSEC_PER_MSEC,
> + };
> +
> + nanosleep(&time, NULL);
> +}
> +
> +static void outcome(const char *str)
> +{
> + fprintf(outcome_fp, "%s\n", str);
> + fflush(outcome_fp);
> +}
> +
> +static void *reader_fn(void *arg)
> +{
> + struct timespec now;
> + struct readinfo *ri = arg;
> + char *buf = malloc(ri->blocksize);
> + loff_t pos = 0;
> + ssize_t copied;
> + int ret;
> +
> + if (!buf) {
> + perror("alloc buffer");
> + goto kill_error;
> + }
> +
> + /* Wait for the FICLONE to start */
> + pthread_mutex_lock(&lock);
> + while (clone_start < program_start) {
> + pthread_mutex_unlock(&lock);
> +#ifdef DEBUG
> + printf("%s read waiting for clone to start; cs=%.2f ps=%.2f\n",
> + ri->descr, clone_start, program_start);
> + fflush(stdout);
> +#endif
> + sleep_ms(1);
> + pthread_mutex_lock(&lock);
> + }
> + pthread_mutex_unlock(&lock);
> + sleep_ms(EARLIEST_READ_MS);
> +
> + /* Read from the file... */
> + while ((copied = read(ri->fd, buf, ri->blocksize)) > 0) {
> + double read_completion;
> +
> + ret = clock_gettime(CLOCK_MONOTONIC, &now);
> + if (ret) {
> + perror("clock_gettime");
> + goto kill_error;
> + }
> + read_completion = timespec_to_msec(&now);
> +
> + /*
> + * If clone_end is still zero, the FICLONE is still running.
> + * If the read completion occurred a safe duration after the
> + * start of the ioctl, then report that as an early read
> + * completion.
> + */
> + pthread_mutex_lock(&lock);
> + if (clone_end < program_start &&
> +    read_completion > clone_start + EARLIEST_READ_MS) {
> + pthread_mutex_unlock(&lock);
> +
> + /* clone still running... */
> + printf("finished %s read early at %.2fms\n",
> + ri->descr,
> + read_completion - program_start);
> + fflush(stdout);
> + outcome("finished read early");
> + goto kill_done;
> + }
> + pthread_mutex_unlock(&lock);
> +
> + sleep_ms(1);
> + pos += copied;
> + }
> + if (copied < 0) {
> + perror("read");
> + goto kill_error;
> + }
> +
> + return NULL;
> +kill_error:
> + outcome("reader error");
> +kill_done:
> + kill(mypid, SIGTERM);
> + return NULL;
> +}
> +
> +static void *clone_fn(void *arg)
> +{
> + struct timespec now;
> + struct cloneinfo *ci = arg;
> + int ret;
> +
> + /* Record the approximate start time of this thread */
> + ret = clock_gettime(CLOCK_MONOTONIC, &now);
> + if (ret) {
> + perror("clock_gettime clone start");
> + goto kill_error;
> + }
> + pthread_mutex_lock(&lock);
> + clone_start = timespec_to_msec(&now);
> + pthread_mutex_unlock(&lock);
> +
> + printf("started clone at %.2fms\n", clone_start - program_start);
> + fflush(stdout);
> +
> + /* Kernel call, only killable with a fatal signal */
> + ret = ioctl(ci->dst_fd, FICLONE, ci->src_fd);
> + if (ret) {
> + perror("FICLONE");
> + goto kill_error;
> + }
> +
> + /* If the ioctl completes, note the completion time */
> + ret = clock_gettime(CLOCK_MONOTONIC, &now);
> + if (ret) {
> + perror("clock_gettime clone end");
> + goto kill_error;
> + }
> +
> + pthread_mutex_lock(&lock);
> + clone_end = timespec_to_msec(&now);
> + pthread_mutex_unlock(&lock);
> +
> + printf("finished clone at %.2fms\n",
> + clone_end - program_start);
> + fflush(stdout);
> +
> + /* Complain if we didn't take long enough to clone. */
> + if (clone_end < clone_start + MINIMUM_CLONE_MS) {
> + printf("clone did not take enough time (%.2fms)\n",
> + clone_end - clone_start);
> + fflush(stdout);
> + outcome("clone too fast");
> + goto kill_done;
> + }
> +
> + outcome("clone finished before any reads");
> + goto kill_done;
> +
> +kill_error:
> + outcome("clone error");
> +kill_done:
> + kill(mypid, SIGTERM);
> + return NULL;
> +}
> +
> +int main(int argc, char *argv[])
> +{
> + struct cloneinfo ci;
> + struct readinfo bufio = { .descr = "buffered" };
> + struct readinfo directio = { .descr = "directio" };
> + struct timespec now;
> + pthread_t clone_thread, bufio_thread, directio_thread;
> + double clockres;
> + int ret;
> +
> + if (argc != 4) {
> + printf("Usage: %s src_file dst_file outcome_file\n", argv[0]);
> + return 1;
> + }
> +
> + mypid = getpid();
> +
> + /* Check for sufficient clock precision. */
> + ret = clock_getres(CLOCK_MONOTONIC, &now);
> + if (ret) {
> + perror("clock_getres MONOTONIC");
> + return 2;
> + }
> + printf("CLOCK_MONOTONIC resolution: %llus %luns\n",
> + (unsigned long long)now.tv_sec,
> + (unsigned long)now.tv_nsec);
> + fflush(stdout);
> +
> + clockres = timespec_to_msec(&now);
> + if (clockres > EARLIEST_READ_MS) {
> + fprintf(stderr, "insufficient CLOCK_MONOTONIC resolution\n");
> + return 2;
> + }
> +
> + /*
> + * Measure program start time since the MONOTONIC time base is not
> + * all that well defined.
> + */
> + ret = clock_gettime(CLOCK_MONOTONIC, &now);
> + if (ret) {
> + perror("clock_gettime MONOTONIC");
> + return 2;
> + }
> + if (now.tv_sec == 0 && now.tv_nsec == 0) {
> + fprintf(stderr, "Uhoh, start time is zero?!\n");
> + return 2;
> + }
> + program_start = timespec_to_msec(&now);
> +
> + outcome_fp = fopen(argv[3], "w");
> + if (!outcome_fp) {
> + perror(argv[3]);
> + return 2;
> + }
> +
> + /* Open separate fds for each thread */
> + ci.src_fd = open(argv[1], O_RDONLY);
> + if (ci.src_fd < 0) {
> + perror(argv[1]);
> + return 2;
> + }
> +
> + ci.dst_fd = open(argv[2], O_RDWR | O_CREAT, 0600);
> + if (ci.dst_fd < 0) {
> + perror(argv[2]);
> + return 2;
> + }
> +
> + bufio.fd = open(argv[1], O_RDONLY);
> + if (bufio.fd < 0) {
> + perror(argv[1]);
> + return 2;
> + }
> + bufio.blocksize = 37;
> +
> + directio.fd = open(argv[1], O_RDONLY | O_DIRECT);
> + if (directio.fd < 0) {
> + perror(argv[1]);
> + return 2;
> + }
> + directio.blocksize = 512;
> +
> + /* Start threads */
> + ret = pthread_create(&clone_thread, NULL, clone_fn, &ci);
> + if (ret) {
> + fprintf(stderr, "create clone thread: %s\n", strerror(ret));
> + return 2;
> + }
> +
> + ret = pthread_create(&bufio_thread, NULL, reader_fn, &bufio);
> + if (ret) {
> + fprintf(stderr, "create bufio thread: %s\n", strerror(ret));
> + return 2;
> + }
> +
> + ret = pthread_create(&directio_thread, NULL, reader_fn, &directio);
> + if (ret) {
> + fprintf(stderr, "create directio thread: %s\n", strerror(ret));
> + return 2;
> + }
> +
> + /* Wait for threads */
> + ret = pthread_join(clone_thread, NULL);
> + if (ret) {
> + fprintf(stderr, "join clone thread: %s\n", strerror(ret));
> + return 2;
> + }
> +
> + ret = pthread_join(bufio_thread, NULL);
> + if (ret) {
> + fprintf(stderr, "join bufio thread: %s\n", strerror(ret));
> + return 2;
> + }
> +
> + ret = pthread_join(directio_thread, NULL);
> + if (ret) {
> + fprintf(stderr, "join directio thread: %s\n", strerror(ret));
> + return 2;
> + }
> +
> + printf("Program should have killed itself?\n");
> + outcome("program failed to end early");
> + return 0;
> +}
> diff --git a/tests/generic/1953 b/tests/generic/1953
> new file mode 100755
> index 0000000000..058538e6fe
> --- /dev/null
> +++ b/tests/generic/1953
> @@ -0,0 +1,74 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2023, Oracle and/or its affiliates.  All Rights Reserved.
> +#
> +# FS QA Test No. 1953
> +#
> +# Race file reads with a very slow reflink operation to see if the reads
> +# actually complete while the reflink is ongoing.  This is a functionality
> +# test for XFS commit f3ba4762fa56 "xfs: allow read IO and FICLONE to run
> +# concurrently".
> +#
> +. ./common/preamble
> +_begin_fstest auto clone punch
> +
> +# Import common functions.
> +. ./common/filter
> +. ./common/attr
> +. ./common/reflink
> +
> +# real QA test starts here
> +_require_scratch_reflink
> +_require_cp_reflink
> +_require_xfs_io_command "fpunch"
> +_require_test_program "punch-alternating"
> +_require_test_program "t_reflink_read_race"
> +
> +rm -f "$seqres.full"
> +
> +echo "Format and mount"
> +_scratch_mkfs > "$seqres.full" 2>&1
> +_scratch_mount >> "$seqres.full" 2>&1
> +
> +testdir="$SCRATCH_MNT/test-$seq"
> +mkdir "$testdir"
> +
> +calc_space() {
> + blocks_needed=$(( 2 ** (fnr + 1) ))
> + space_needed=$((blocks_needed * blksz * 5 / 4))
> +}
> +
> +# Figure out the number of blocks that we need to get the reflink runtime above
> +# 1 seconds
> +echo "Create a many-block file"
> +for ((fnr = 1; fnr < 40; fnr++)); do
> + free_blocks=$(stat -f -c '%a' "$testdir")
> + blksz=$(_get_file_block_size "$testdir")
> + space_avail=$((free_blocks * blksz))
> + calc_space
> + test $space_needed -gt $space_avail && \
> + _notrun "Insufficient space for stress test; would only create $blocks_needed extents."
> +
> + off=$(( (2 ** fnr) * blksz))
> + $XFS_IO_PROG -f -c "pwrite -S 0x61 -b 4194304 $off $off" "$testdir/file1" >> "$seqres.full"
> + "$here/src/punch-alternating" "$testdir/file1" >> "$seqres.full"
> +
> + timeout 1s cp --reflink=always "$testdir/file1" "$testdir/garbage" || break
> +done
> +echo "fnr=$fnr" >> $seqres.full
> +
> +echo "Reflink the big file"
> +$here/src/t_reflink_read_race "$testdir/file1" "$testdir/file2" \
> + "$testdir/outcome" &>> $seqres.full
> +
> +if [ ! -e "$testdir/outcome" ]; then
> + echo "Could not set up program"
> +elif grep -q "finished read early" "$testdir/outcome"; then
> + echo "test completed successfully"
> +else
> + cat "$testdir/outcome"
> +fi
> +
> +# success, all done
> +status=0
> +exit
> diff --git a/tests/generic/1953.out b/tests/generic/1953.out
> new file mode 100644
> index 0000000000..8eaacb7ff0
> --- /dev/null
> +++ b/tests/generic/1953.out
> @@ -0,0 +1,6 @@
> +QA output created by 1953
> +Format and mount
> +Create a many-block file
> +Reflink the big file
> +Terminated
> +test completed successfully


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RFC PATCH] generic: test reads racing with slow reflink operations
  2023-10-20  6:09 ` Christoph Hellwig
@ 2023-10-20 15:22   ` Darrick J. Wong
  2023-10-22 22:20     ` Dave Chinner
  0 siblings, 1 reply; 5+ messages in thread
From: Darrick J. Wong @ 2023-10-20 15:22 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Catherine Hoang, linux-xfs

On Thu, Oct 19, 2023 at 11:09:49PM -0700, Christoph Hellwig wrote:
> On Thu, Oct 19, 2023 at 01:09:13PM -0700, Darrick J. Wong wrote:
> > permit reads, since the source file contents do  not change.  This is a
> 
> duplicate white space before "not"

Fixed.

> > +#ifndef FICLONE
> > +# define FICLONE	_IOW(0x94, 9, int)
> > +#endif
> 
> Can't we assume system headers with FICLONE by now?

That depends on the long tail of RHEL7.  But you're right, it has been
years now, I'll just kill this hunk and see who complains. :)

> > +# real QA test starts here
> > +_require_scratch_reflink
> 
> This needs a _require_scratch before the _require_scratch_reflink.

Ok, will do.

> Otherwise this looks good to me:
> 
> Reviewed-by: Christoph Hellwig <hch@lst.de>

Thanks!

--D

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RFC PATCH] generic: test reads racing with slow reflink operations
  2023-10-20 15:22   ` Darrick J. Wong
@ 2023-10-22 22:20     ` Dave Chinner
  0 siblings, 0 replies; 5+ messages in thread
From: Dave Chinner @ 2023-10-22 22:20 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Christoph Hellwig, Catherine Hoang, linux-xfs

On Fri, Oct 20, 2023 at 08:22:48AM -0700, Darrick J. Wong wrote:
> On Thu, Oct 19, 2023 at 11:09:49PM -0700, Christoph Hellwig wrote:
> > On Thu, Oct 19, 2023 at 01:09:13PM -0700, Darrick J. Wong wrote:
> > > permit reads, since the source file contents do  not change.  This is a
> > 
> > duplicate white space before "not"
> 
> Fixed.
> 
> > > +#ifndef FICLONE
> > > +# define FICLONE	_IOW(0x94, 9, int)
> > > +#endif
> > 
> > Can't we assume system headers with FICLONE by now?
> 
> That depends on the long tail of RHEL7.  But you're right, it has been
> years now, I'll just kill this hunk and see who complains. :)

*cough*

Just add an autoconf rule, yes?

-Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2023-10-22 22:20 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-10-19 20:09 [RFC PATCH] generic: test reads racing with slow reflink operations Darrick J. Wong
2023-10-20  6:09 ` Christoph Hellwig
2023-10-20 15:22   ` Darrick J. Wong
2023-10-22 22:20     ` Dave Chinner
2023-10-20 13:37 ` Catherine Hoang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox