* [PATCH][cr-test]: filelock1: Test restore of advisory locks during restart
@ 2010-01-20 23:06 Sukadev Bhattiprolu
[not found] ` <20100120230621.GA30288-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
0 siblings, 1 reply; 2+ messages in thread
From: Sukadev Bhattiprolu @ 2010-01-20 23:06 UTC (permalink / raw)
To: serue-r/Jw6+rmf7HQT0dZR+AlfA; +Cc: Containers
This test currently fails during restart on ckpt-v19-rc2.
On Serge's cr-next it fails cleanly during checkpoint due to:
commit 5d1f1227384876dd13a66cad1f286d98f9b1891b
Author: Dave Hansen <dave-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Date: Thu Dec 17 09:35:13 2009 -0800
ckpt-files: error out on file locks and leases
---
From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Date: Fri, 15 Jan 2010 15:33:55 -0800
Subject: [PATCH] filelock1: Test restore of advisory locks during restart
Test that any byte-range locks held by a process at the time of
checkpoint are restored correctly after restart.
Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
fileio/Makefile | 9 +-
fileio/filelock1.c | 383 +++++++++++++++++++++++++++++++++++++++++++++++
fileio/run-filelock1.sh | 218 +++++++++++++++++++++++++++
3 files changed, 608 insertions(+), 2 deletions(-)
create mode 100644 fileio/filelock1.c
create mode 100755 fileio/run-filelock1.sh
diff --git a/fileio/Makefile b/fileio/Makefile
index 071a9eb..40d19da 100644
--- a/fileio/Makefile
+++ b/fileio/Makefile
@@ -1,6 +1,11 @@
-targets = fileio1
+targets = fileio1 filelock1
-all: $(targets)
+INCLUDE = ../libcrtest
+LIBCRTEST = ../libcrtest/common.o
+CFLAGS = -I $(INCLUDE)
+LDFLAGS = $(LIBCRTEST)
+
+all: $(LIBCRTEST) $(targets)
clean:
rm -f $(targets)
diff --git a/fileio/filelock1.c b/fileio/filelock1.c
new file mode 100644
index 0000000..305cbeb
--- /dev/null
+++ b/fileio/filelock1.c
@@ -0,0 +1,383 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <signal.h>
+#include <errno.h>
+#include "libcrtest.h"
+
+#define TEST_FILE "data.d/data.filelock1"
+#define LOG_FILE "logs.d/log.filelock1"
+
+typedef unsigned long long u64;
+
+extern FILE *logfp;
+int test_fd;
+int event_fd1;
+int event_fd2;
+
+/*
+ * Description:
+ * Ensure that F_RDLCK and F_WRLCK byte-range locks held by a process at
+ * the time of checkpoint are properly restored when the process is
+ * restarted from the checkpoint.
+ *
+ * Implementation:
+ * Two processes, P0 and P1 acquire the set of locks described by
+ * locks_list[] below. Then, they notify the parent that they are ready for
+ * checkpoint and wait for checkpoint to be done. When they are restarted
+ * (i.e., when test_done() is TRUE), each process verifies that it has the
+ * locks it had at the time of checkpoint and that it cannot grab a lock
+ * held by the other process.
+ */
+
+/*
+ * Create one eventfd (initial counter 0, no flags) for child-to-parent
+ * notifications.
+ *
+ * Returns the new file descriptor.  If eventfd() fails the error is logged
+ * to logfp and do_exit(1) is called.
+ */
+int setup_notification(void)
+{
+	int efd = eventfd(0, 0);
+
+	if (efd < 0) {
+		fprintf(logfp, "ERROR: eventfd(): %s\n", strerror(errno));
+		do_exit(1);
+	}
+	return efd;
+}
+
+/*
+ * Block until at least @total events have been read from eventfd @efd.
+ *
+ * Every successful read() delivers a u64 count, which is added to a running
+ * sum; the loop ends once that sum reaches @total.  A failed or short read
+ * is logged to logfp and ends the process through do_exit(1).
+ */
+void wait_for_events(int efd, u64 total)
+{
+	u64 count = 0;
+
+	do {
+		ssize_t n;
+		u64 events = 0;
+
+		fprintf(logfp, "%d: wait_for_events: fd %d, reading for %llu\n",
+				getpid(), efd, total);
+		fflush(logfp);
+
+		/* read() returns ssize_t; keep the full width for the check. */
+		n = read(efd, &events, sizeof(events));
+		if (n != (ssize_t)sizeof(events)) {
+			fprintf(logfp, "ERROR: read(event_fd) %s\n",
+					strerror(errno));
+			do_exit(1);
+		}
+		fprintf(logfp, "%d: wait_for_events: fd %d read %llu\n",
+				getpid(), efd, events);
+
+		count += events;
+	} while (count < total);
+}
+
+/*
+ * Post a single event (the u64 value 1) on eventfd @efd.
+ *
+ * The attempt is logged first.  A failed or short write is logged to logfp
+ * and ends the process through do_exit(1).
+ */
+void notify_one_event(int efd)
+{
+	ssize_t n;
+	u64 event = 1;
+
+	fprintf(logfp, "%d: Notifying one event on fd %d\n", getpid(), efd);
+	fflush(logfp);
+
+	/* write() returns ssize_t; keep the full width for the check. */
+	n = write(efd, &event, sizeof(event));
+	if (n != (ssize_t)sizeof(event)) {
+		fprintf(logfp, "ERROR: write(event_fd) %s\n", strerror(errno));
+		do_exit(1);
+	}
+}
+
+/*
+ * Description of one byte-range lock used by the test.
+ */
+struct test_arg {
+ int child_idx; /* do_child1() index of the child that takes this lock */
+ int type; /* fcntl lock type; locks_list[] uses F_WRLCK and F_RDLCK */
+ int start; /* first byte of the range, used as l_start with SEEK_SET */
+ int len; /* number of bytes in the range, used as l_len */
+};
+
+/*
+ * Locks taken before the checkpoint: { child_idx, type, start, len }.
+ * Children 0 and 1 take write locks on alternating, non-overlapping
+ * ranges; the last two entries are read locks that both children take on
+ * the same range.
+ */
+struct test_arg locks_list[] = {
+ { 0, F_WRLCK, 0, 17 },
+ { 1, F_WRLCK, 18, 16 },
+ { 0, F_WRLCK, 35, 27 },
+ { 1, F_WRLCK, 63, 17 },
+ { 0, F_RDLCK, 81, 25 },
+ { 1, F_RDLCK, 81, 25 },
+};
+
+/*
+ * Try to take the byte-range lock described by @tlock on @fd without
+ * blocking (F_SETLK).
+ *
+ * A request refused because of a conflicting lock (EAGAIN or EACCES) is
+ * only logged.  Any other failure is logged, reported to the parent with
+ * SIGUSR1, and ends this process through do_exit(1).
+ */
+void set_lock(int fd, struct test_arg *tlock)
+{
+	int rc;
+	int err = 0;
+	struct flock lock;
+
+	/* Start from all-zero so l_pid and any padding are well defined. */
+	memset(&lock, 0, sizeof(lock));
+	lock.l_type = tlock->type;
+	lock.l_whence = SEEK_SET;
+	lock.l_start = (off_t)tlock->start;
+	lock.l_len = (off_t)tlock->len;
+
+	rc = fcntl(fd, F_SETLK, &lock);
+	if (rc < 0)
+		err = errno;	/* keep it safe from later library calls */
+
+	/*
+	 * POSIX lets F_SETLK report a conflicting lock as either EACCES or
+	 * EAGAIN, so accept both here, as test_lock() does.
+	 */
+	if (rc < 0 && err != EAGAIN && err != EACCES) {
+		fprintf(logfp, "%d: set_lock(): ERROR [%d, %llu, %llu]: %s\n",
+				getpid(), tlock->type, (u64)tlock->start,
+				(u64)tlock->len, strerror(err));
+		fflush(logfp);
+		kill(getppid(), SIGUSR1);
+		do_exit(1);
+	}
+
+	fprintf(logfp, "%d: set_lock(): [%d, %llu, %llu] %s\n", getpid(),
+			tlock->type, (u64)tlock->start, (u64)tlock->len,
+			rc < 0 ? strerror(err) : "done");
+}
+/*
+ * Probe the lock described by @tlock on @fd and compare the outcome with
+ * @locked_by_me.
+ *
+ * The probe is a non-blocking F_SETLK with the same type and range.  When
+ * it is refused with EAGAIN/EACCES, F_GETLK must also report a blocking
+ * lock with a non-zero owner pid; that is counted as a conflict.
+ *
+ * If @locked_by_me is TRUE, the probe must NOT run into a conflict.
+ * If @locked_by_me is FALSE, the probe MUST run into a conflict.
+ *
+ * Any error or mismatch is logged, reported to the parent with SIGUSR1,
+ * and ends this process through do_exit(1).
+ */
+void test_lock(int fd, int locked_by_me, struct test_arg *tlock)
+{
+	struct flock probe;
+	char desc[512];
+	int busy = 0;
+
+	probe.l_type = tlock->type;
+	probe.l_whence = SEEK_SET;
+	probe.l_start = (off_t)tlock->start;
+	probe.l_len = (off_t)tlock->len;
+	probe.l_pid = 0;
+
+	sprintf(desc, "lock [%d, %llu, %llu] ", tlock->type,
+			(u64)tlock->start, (u64)tlock->len);
+
+	if (fcntl(fd, F_SETLK, &probe) < 0) {
+		/* Anything but "somebody else holds it" is a hard error. */
+		if (errno != EAGAIN && errno != EACCES) {
+			fprintf(logfp, "ERROR: fcntl(F_SETLK): %s, error %s\n",
+					desc, strerror(errno));
+			goto error;
+		}
+
+		/* Cross-check the refusal: F_GETLK must name a holder. */
+		if (fcntl(fd, F_GETLK, &probe) < 0) {
+			fprintf(logfp, "ERROR: fcntl(F_GETLK): %s, error %s\n",
+					desc, strerror(errno));
+			goto error;
+		}
+
+		if (probe.l_type == F_UNLCK || probe.l_pid == 0) {
+			fprintf(logfp,
+				"%d: ERROR: %s F_SETLK / F_GETLK mismatch !!!\n",
+				getpid(), desc);
+			goto error;
+		}
+		busy = 1;
+	}
+
+	fprintf(logfp, "%d: %s, locked_by_me: %d, conflict %d\n", getpid(),
+			desc, locked_by_me, busy);
+
+	if (locked_by_me && busy) {
+		fprintf(logfp, "%d: FAIL: %s is NOT set by me !!!\n", getpid(),
+				desc);
+		goto error;
+	}
+
+	if (!locked_by_me && !busy) {
+		fprintf(logfp, "%d: FAIL: %s is NOT set by peer !!!\n",
+				getpid(), desc);
+		goto error;
+	}
+
+	fprintf(logfp, "%d: PASS: %s is %sset by me\n",
+			getpid(), desc, busy ? "not " : "");
+	return;
+
+error:
+	fflush(logfp);
+	kill(getppid(), SIGUSR1);
+	do_exit(1);
+}
+
+/*
+ * SIGINT handler installed by do_child1(): log a message and exit with
+ * status 0.  @sig is not used.
+ * NOTE(review): fprintf()/fflush() are not async-signal-safe; presumably
+ * acceptable because the child is idle when the parent sends SIGINT --
+ * confirm.
+ */
+void handler(int sig)
+{
+ /*
+ * We completed the test and siblings have completed their test.
+ * So, safe to drop our locks and exit.
+ */
+ fprintf(logfp, "%d: Ok to exit...\n", getpid());
+ fflush(logfp);
+ do_exit(0);
+}
+
+/*
+ * Body of test child @idx (0 or 1).
+ *
+ *  1. Take every lock in locks_list[] whose child_idx equals @idx.
+ *  2. Post one event on event_fd1 so the parent knows the locks are held.
+ *  3. Poll test_done() once a second; it becoming true marks the end of the
+ *     checkpoint/restart phase.
+ *  4. Check every entry of locks_list[] with test_lock().
+ *  5. Post one event on event_fd2, then pause() while still holding the
+ *     locks until the parent sends SIGINT (see handler()).
+ *
+ * The function leaves through do_exit(); the trailing return only gives
+ * the int function a defined value should do_exit() ever return.
+ */
+int do_child1(int idx)
+{
+	int i;
+	int locked_by_me;
+	int num_locks = sizeof(locks_list) / sizeof(locks_list[0]);
+
+	signal(SIGINT, handler);
+
+	for (i = 0; i < num_locks; i++) {
+		if (idx != locks_list[i].child_idx)
+			continue;
+
+		set_lock(test_fd, &locks_list[i]);
+	}
+
+	/*
+	 * Tell parent we are ready for checkpoint...
+	 */
+	notify_one_event(event_fd1);
+
+	/*
+	 * Wait for checkpoint/restart
+	 */
+	fprintf(logfp, "%d: waiting for test-done\n", idx);
+	fflush(logfp);
+	while (!test_done()) {
+		sleep(1);
+	}
+	fprintf(logfp, "%d: Found test-done\n", idx);
+	fflush(logfp);
+
+	for (i = 0; i < num_locks; i++) {
+		/*
+		 * If we had (not) set the lock earlier, ensure we still have
+		 * it (not) set.  Read locks are expected to be obtainable by
+		 * both children, so they are always tested as "ours".
+		 */
+		locked_by_me = 0;
+		if (idx == locks_list[i].child_idx ||
+				locks_list[i].type == F_RDLCK)
+			locked_by_me = 1;
+
+		test_lock(test_fd, locked_by_me, &locks_list[i]);
+	}
+
+	/*
+	 * Notify parent that we are done testing the locks.
+	 */
+	notify_one_event(event_fd2);
+
+	/*
+	 * Hold onto our locks and wait for siblings to complete their
+	 * test on our locks. Parent will SIGINT us when it is safe to
+	 * exit.
+	 */
+	pause();
+
+	do_exit(0);
+	return 0;
+}
+
+/*
+ * Populate the test file so the children can lock some portions of
+ * the file.
+ *
+ * Creates (or truncates) TEST_FILE, stores its descriptor in test_fd and
+ * writes 256 zero bytes to it.  A failed open() or an incomplete write()
+ * is logged to logfp and ends the process through do_exit(1).
+ */
+void setup_test_file(void)
+{
+	char buf[256];
+	ssize_t n;
+
+	test_fd = open(TEST_FILE, O_RDWR|O_CREAT|O_TRUNC, 0666);
+	if (test_fd < 0) {
+		fprintf(logfp, "ERROR: open(%s): %s\n", TEST_FILE,
+				strerror(errno));
+		do_exit(1);
+	}
+
+	memset(buf, 0, sizeof(buf));
+	n = write(test_fd, buf, sizeof(buf));
+	if (n != (ssize_t)sizeof(buf)) {
+		/* errno is only meaningful when write() itself failed. */
+		fprintf(logfp, "ERROR: write(%s): %s\n", TEST_FILE,
+				n < 0 ? strerror(errno) : "short write");
+		do_exit(1);
+	}
+}
+
+/*
+ * Pids of the two test children as returned by fork() in main().  They are
+ * 0 until the corresponding fork() has been done.
+ */
+int pid1, pid2;
+
+/*
+ * Parent-side handler for SIGUSR1 (sent by a child that hit an error) and
+ * SIGCHLD.  Either one means the test failed: log it, reap the child on
+ * SIGCHLD, kill whatever children exist and exit with status -1.
+ */
+void child_handler(int sig)
+{
+	/*
+	 * Wait for the child that exited prematurely
+	 */
+	fprintf(logfp, "%d: Got signal %d\n", getpid(), sig);
+	fflush(logfp);
+
+	if (sig == SIGCHLD)
+		do_wait(1);
+	fprintf(logfp, "%d: Test case FAILED\n", getpid());
+	fflush(logfp);
+
+	/*
+	 * Kill (remaining) children and exit.  Only signal pids that name a
+	 * real child: kill(0, ...) would hit our whole process group and
+	 * kill(-1, ...) every process we are allowed to signal.
+	 */
+	if (pid1 > 0)
+		kill(pid1, SIGKILL);
+	if (pid2 > 0)
+		kill(pid2, SIGKILL);
+
+	do_exit(-1);
+}
+
+/*
+ * Test driver.
+ *
+ * Opens the log file, closes every other descriptor below 100, creates the
+ * test file and the two eventfds, then forks the two children one after
+ * the other, waiting for each to report that its locks are held.  After
+ * announcing checkpoint-ready it waits for both children to report that
+ * they verified the locks, tells them to exit with SIGINT and reaps them.
+ *
+ * Leaves through do_exit(); status 0 on success.
+ */
+int main(int argc, char *argv[])
+{
+	int i;
+
+	(void)argc;
+
+	if (test_done()) {
+		printf("Remove %s before running test\n", TEST_DONE);
+		do_exit(1);
+	}
+
+	logfp = fopen(LOG_FILE, "w");
+	if (!logfp) {
+		perror("open() logfile");
+		do_exit(1);
+	}
+
+	printf("%s: Closing stdio fds and writing messages to %s\n",
+			argv[0], LOG_FILE);
+	/* fd 1 is closed just below; push the message out while we can. */
+	fflush(stdout);
+
+	for (i = 0; i < 100; i++) {
+		if (fileno(logfp) != i)
+			close(i);
+	}
+
+	setup_test_file();
+	event_fd1 = setup_notification();
+	event_fd2 = setup_notification();
+
+	/*
+	 * Before waiting for events below, ensure we will be notified
+	 * if a child encounters an error and/or exits prematurely.
+	 */
+	signal(SIGUSR1, child_handler);
+	signal(SIGCHLD, child_handler);
+
+	/*
+	 * Create the first child and wait for it to take its record locks
+	 */
+	pid1 = fork();
+	if (pid1 < 0) {
+		fprintf(logfp, "ERROR: fork(): %s\n", strerror(errno));
+		fflush(logfp);
+		do_exit(1);
+	}
+	if (pid1 == 0)
+		do_child1(0);
+	wait_for_events(event_fd1, 1);
+
+	/*
+	 * Create the second child and wait for it to take its locks.
+	 */
+	pid2 = fork();
+	if (pid2 < 0) {
+		fprintf(logfp, "ERROR: fork(): %s\n", strerror(errno));
+		fflush(logfp);
+		/*
+		 * pid2 is -1 here and must never reach kill(): stop the
+		 * handlers from running before getting rid of child 1.
+		 */
+		signal(SIGCHLD, SIG_IGN);
+		signal(SIGUSR1, SIG_IGN);
+		kill(pid1, SIGKILL);
+		do_exit(1);
+	}
+	if (pid2 == 0)
+		do_child1(1);
+	wait_for_events(event_fd1, 1);
+
+	/*
+	 * Now that the test processes are ready, tell any wrapper scripts,
+	 * we are ready for checkpoint
+	 */
+	set_checkpoint_ready();
+
+	fprintf(logfp, "***** %d: Ready for checkpoint\n", getpid());
+	fflush(logfp);
+
+	/*
+	 * Wait for all children to test the locks. Since a process's locks
+	 * are dropped on exit, if process P1 exits before process P2 has
+	 * completed testing a conflicting lock, P2 may acquire the lock
+	 * supposed to be held by P1 and wrongly assume that test failed.
+	 */
+	wait_for_events(event_fd2, 2);
+
+	/* The children are about to exit on purpose; that is not a failure. */
+	signal(SIGCHLD, SIG_IGN);
+
+	/*
+	 * Tell children it is safe to exit
+	 */
+	kill(pid1, SIGINT);
+	kill(pid2, SIGINT);
+
+	do_wait(2);
+
+	do_exit(0);
+	return 0;
+}
diff --git a/fileio/run-filelock1.sh b/fileio/run-filelock1.sh
new file mode 100755
index 0000000..0ba2d18
--- /dev/null
+++ b/fileio/run-filelock1.sh
@@ -0,0 +1,218 @@
+#!/bin/bash
+
+source ../common.sh
+
+dir=`mktemp -p . -d -t cr_filelock1_XXXXXXX` || (echo "mktemp failed"; exit 1)
+
+# NOTE: As of ckpt-v15-dev, the --container option to 'ckpt' causes this
+# test to fail with "container not isolated" message due to the
+# log-file being shared between the application threads.
+#
+CHECKPOINT="`which checkpoint` --container"
+RESTART=`which restart`
+ECHO="/bin/echo -e"
+
+TEST_CMD="../filelock1"
+TEST_ARGS=""
+TEST_LOG="logs.d/log.filelock1"
+SCRIPT_LOG="logs.d/log.run-filelock1"
+TEST_PID_FILE="pid.filelock1";
+
+SNAPSHOT_DIR="snap1.d"
+
+TEST_DONE="test-done"
+CHECKPOINT_FILE="checkpoint-filelock1";
+CHECKPOINT_READY="checkpoint-ready"
+CHECKPOINT_DONE="checkpoint-done"
+
+LOGS_DIR="logs.d"
+DATA_DIR="data.d"
+
+NS_EXEC="../../ns_exec"
+NS_EXEC_ARGS="-cgpuimP $TEST_PID_FILE"
+
+# checkpoint PID
+# Run "$CHECKPOINT PID" and store its stdout in $CHECKPOINT_FILE.
+# The exit status is kept in the global $ret.  On failure, log the matching
+# processes to $SCRIPT_LOG and abort the script with status 1.
+checkpoint()
+{
+ local pid=$1
+
+ $ECHO "\t- Checkpoint: $CHECKPOINT $pid \> $CHECKPOINT_FILE"
+ $CHECKPOINT $pid > $CHECKPOINT_FILE
+ ret=$?
+ if [ $ret -ne 0 ]; then
+ $ECHO "***** FAIL: Checkpoint of $pid failed"
+ ps -efL |grep $TEST_CMD >> $SCRIPT_LOG
+ exit 1;
+ fi
+}
+
+# Poll once a second until the file $CHECKPOINT_READY exists.
+# There is no timeout: this loops for as long as the file is missing.
+function wait_for_checkpoint_ready()
+{
+ # Wait for test to finish setup
+ while [ ! -f $CHECKPOINT_READY ]; do
+ $ECHO "\t- Waiting for $CHECKPOINT_READY"
+ sleep 1;
+ done;
+}
+
+# Start the test program under $NS_EXEC in the background, wait until
+# $CHECKPOINT_READY exists, then read the pid from $TEST_PID_FILE.
+# An empty pid aborts the script with status 1.  "pid" is local here, so
+# callers that need it must read $TEST_PID_FILE themselves; "cmdline" is
+# left behind as a global.
+function create_container()
+{
+ local pid;
+
+ cmdline="$NS_EXEC $NS_EXEC_ARGS -- $TEST_CMD $TEST_ARGS"
+
+ $ECHO "\t- Creating container:"
+ $ECHO "\t- $cmdline"
+
+ $cmdline &
+
+ wait_for_checkpoint_ready;
+
+ # Find global pid of container-init
+ pid=`cat $TEST_PID_FILE`;
+ if [ "x$pid" == "x" ]; then
+ $ECHO "***** FAIL: Invalid container-init pid $pid"
+ ps -efL |grep $TEST_CMD >> $SCRIPT_LOG
+ exit 1
+ fi
+ $ECHO "Created container with pid $pid" >> $SCRIPT_LOG
+}
+
+function restart_container
+{
+ local ret;
+
+ cmdline="$RESTART --pids --pidns --wait"
+ $ECHO "\t- $cmdline"
+
+ sleep 1
+
+ $cmdline < $CHECKPOINT_FILE >> $SCRIPT_LOG 2>&1 &
+ ret=$?
+
+ if [ $ret -ne 0 ]; then
+ $ECHO "***** FAIL: Restart of $pid failed"
+ ps -efL |grep $TEST_CMD >> $SCRIPT_LOG
+ exit 1;
+ fi
+
+}
+
+# Copy the current contents of $LOGS_DIR into $SNAPSHOT_DIR.
+# An existing $SNAPSHOT_DIR is first rotated to ${SNAPSHOT_DIR}.prev
+# (replacing any older .prev) and recreated empty.  If $SNAPSHOT_DIR does
+# not exist it is not created here.
+function create_fs_snapshot()
+{
+ # Prepare for snapshot
+ if [ -d $SNAPSHOT_DIR ]; then
+ rm -rf ${SNAPSHOT_DIR}.prev
+ mv $SNAPSHOT_DIR ${SNAPSHOT_DIR}.prev
+ mkdir $SNAPSHOT_DIR
+ fi
+
+ # Snapshot the log files
+ cp ${LOGS_DIR}/* $SNAPSHOT_DIR
+}
+
+# Copy every file in $SNAPSHOT_DIR back into $LOGS_DIR, overwriting the
+# versions written since the snapshot was taken.
+function restore_fs_snapshot()
+{
+ # Restore the snapshot after the main process has been killed
+ /bin/cp ${SNAPSHOT_DIR}/* $LOGS_DIR
+}
+
+# Everything below runs inside the scratch directory; the relative paths in
+# the variables above (../filelock1, logs.d, ...) are resolved from there.
+cd $dir
+echo "Current directory: `pwd`"
+
+if [ ! -d $LOGS_DIR ]; then
+ mkdir $LOGS_DIR
+fi
+
+if [ ! -d $DATA_DIR ]; then
+ mkdir $DATA_DIR
+fi
+
+if [ ! -d $SNAPSHOT_DIR ]; then
+ mkdir $SNAPSHOT_DIR
+fi
+
+# NOTE(review): INPUT_DATA and FILEIO are not set anywhere in this script;
+# presumably they come from ../common.sh or are left over from another
+# test -- confirm.
+if [ ! -f $INPUT_DATA ]; then
+ $FILEIO -C $INPUT_DATA
+fi
+
+# Make sure no stray filelock1 process from another run is still going
+killall $TEST_CMD > $SCRIPT_LOG 2>&1
+
+# Start with an empty script log (this also discards the killall output).
+> $SCRIPT_LOG;
+# Main loop: cnt runs from 1 to 19.  Each iteration starts the test in a
+# new container, freezes and checkpoints it, kills it, restarts it from
+# the checkpoint image and then checks the test log for "FAIL".
+cnt=1
+while [ $cnt -lt 20 ]; do
+ $ECHO "===== Iteration $cnt"
+
+ # Remove any 'state' files, start the app and let it tell us
+ # when it is ready
+ rm -f $CHECKPOINT_READY $TEST_DONE $TEST_PID_FILE
+
+ # create_container already waits for $CHECKPOINT_READY, so the
+ # explicit wait that follows returns at once.
+ create_container
+ wait_for_checkpoint_ready
+
+ pid=`cat $TEST_PID_FILE`
+
+ $ECHO "\t- Done creating container, cinit-pid $pid"
+
+ ps -efL |grep $TEST_CMD >> $SCRIPT_LOG
+
+ # NOTE(review): freezerdir, freezermountpoint, freeze_pid and thaw are
+ # not defined in this script; presumably they come from ../common.sh.
+ # override default freezerdir
+ if [ -d $freezerdir ]; then
+ rmdir $freezerdir
+ fi
+ freezerdir=$freezermountpoint/$pid
+ freeze_pid $pid
+
+ num_pids1=`ps -efL |grep $TEST_CMD | wc -l`
+
+ create_fs_snapshot
+
+ checkpoint $pid
+
+ touch $CHECKPOINT_DONE
+
+ killall -9 `basename $TEST_CMD`
+
+ thaw
+
+ sleep 3
+
+ restore_fs_snapshot
+
+ restart_container
+
+ sleep 3;
+
+ num_pids2=`ps -efL |grep $TEST_CMD | wc -l`
+ ps -efL |grep $TEST_CMD >> $SCRIPT_LOG
+ $ECHO "\t- num_pids1 $num_pids1, num_pids2 $num_pids2";
+
+ # NOTE(review): pidof reports every process named "restart", not
+ # only the one started by restart_container.
+ # ns_exec pid is parent-pid of restarted-container-init
+ nspid=`pidof restart`
+
+ if [ "x$nspid" == "x" ]; then
+ $ECHO "***** FAIL: Can't find pid of $RESTART"
+ exit 1;
+ fi
+
+ # End test gracefully
+ touch $TEST_DONE
+
+ $ECHO "\t- Waiting for restarted container to exit (gloabl-pid $nspid)"
+ wait $nspid;
+ ret=$?
+
+ # Any "FAIL" line in the test log fails the whole run.
+ grep --binary-files=text FAIL $PWD/$TEST_LOG > /dev/null 2>&1
+ if [ $? -eq 0 ]; then
+ $ECHO "\t***** Application FAILED after restart" >> $SCRIPT_LOG
+ $ECHO "\t***** See $TEST_LOG for details" >> $SCRIPT_LOG
+
+ $ECHO "\t***** Application FAILED after restart"
+ $ECHO "\tSee $PWD/$TEST_LOG for details"
+ exit 1;
+ fi
+
+ $ECHO "\t- Container exited, status $ret"
+
+ cnt=$((cnt+1))
+done
--
1.6.0.4
^ permalink raw reply related [flat|nested] 2+ messages in thread[parent not found: <20100120230621.GA30288-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>]
* Re: [PATCH][cr-test]: filelock1: Test restore of adivsory locks during restart [not found] ` <20100120230621.GA30288-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> @ 2010-01-21 21:37 ` Serge E. Hallyn 0 siblings, 0 replies; 2+ messages in thread From: Serge E. Hallyn @ 2010-01-21 21:37 UTC (permalink / raw) To: Sukadev Bhattiprolu; +Cc: Containers thanks, both applied. seems to do fine on s390 on v19-5. Quoting Sukadev Bhattiprolu (sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org): > > This test currently fails during restart on ckpt-v19-rc2. > > On Serge's cr-next it fails cleanly during checkpoint due to: > > commit 5d1f1227384876dd13a66cad1f286d98f9b1891b > Author: Dave Hansen <dave-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org> > Date: Thu Dec 17 09:35:13 2009 -0800 > > ckpt-files: error out on file locks and leases > > --- > From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org> > Date: Fri, 15 Jan 2010 15:33:55 -0800 > Subject: [PATCH] filelock1: Test restore of adivsory locks during restart > > Test that any byte-range locks held by a process at the time of > checkpoint are restored correctly after restart. 
> > Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org> > --- > fileio/Makefile | 9 +- > fileio/filelock1.c | 383 +++++++++++++++++++++++++++++++++++++++++++++++ > fileio/run-filelock1.sh | 218 +++++++++++++++++++++++++++ > 3 files changed, 608 insertions(+), 2 deletions(-) > create mode 100644 fileio/filelock1.c > create mode 100755 fileio/run-filelock1.sh > > diff --git a/fileio/Makefile b/fileio/Makefile > index 071a9eb..40d19da 100644 > --- a/fileio/Makefile > +++ b/fileio/Makefile > @@ -1,6 +1,11 @@ > -targets = fileio1 > +targets = fileio1 filelock1 > > -all: $(targets) > +INCLUDE = ../libcrtest > +LIBCRTEST = ../libcrtest/common.o > +CFLAGS = -I $(INCLUDE) > +LDFLAGS = $(LIBCRTEST) > + > +all: $(LIBCRTEST) $(targets) > > clean: > rm -f $(targets) > diff --git a/fileio/filelock1.c b/fileio/filelock1.c > new file mode 100644 > index 0000000..305cbeb > --- /dev/null > +++ b/fileio/filelock1.c > @@ -0,0 +1,383 @@ > +#include <stdio.h> > +#include <unistd.h> > +#include <fcntl.h> > +#include <string.h> > +#include <signal.h> > +#include <errno.h> > +#include "libcrtest.h" > + > +#define TEST_FILE "data.d/data.filelock1" > +#define LOG_FILE "logs.d/log.filelock1" > + > +typedef unsigned long long u64; > + > +extern FILE *logfp; > +int test_fd; > +int event_fd1; > +int event_fd2; > + > +/* > + * Description: > + * Ensure that F_RDLCK and F_WRLCK byte-range locks held by a process at > + * the time of checkpoint are properly restored when the process is > + * restarted from the checkpoint. > + * > + * Implementation: > + * Two processes, P0 and P1 acquire the set of locks described by > + * locks_list[] below. Then, they notify the parent that they are ready for > + * checkpoint and wait for checkpoint to be done. When they are restarted > + * (i.e when test_done() is TRUE), each process verifies that it has the > + * locks it had at the time of checkpoint and that it cannot grab a lock > + * held by the other process. 
> + */ > + > +setup_notification() > +{ > + int efd; > + > + efd = eventfd(0, 0); > + if (efd < 0) { > + fprintf(logfp, "ERROR: eventfd(): %s\n", strerror(errno)); > + do_exit(1); > + } > + return efd; > +} > + > +wait_for_events(int efd, u64 total) > +{ > + int n; > + u64 events; > + u64 count = (u64)0; > + > + do { > + fprintf(logfp, "%d: wait_for_events: fd %d, reading for %llu\n", > + getpid(), efd, total); > + fflush(logfp); > + > + n = read(efd, &events, sizeof(events)); > + if (n != sizeof(events)) { > + fprintf(logfp, "ERROR: read(event_fd) %s\n", > + strerror(errno)); > + do_exit(1); > + } > + fprintf(logfp, "%d: wait_for_events: fd %d read %llu\n", > + getpid(), efd, events); > + > + count += events; > + } while (count < total); > +} > + > +notify_one_event(int efd) > +{ > + int n; > + u64 event = (u64)1; > + > + fprintf(logfp, "%d: Notifying one event on fd %d\n", getpid(), efd); > + fflush(logfp); > + > + n = write(efd, &event, sizeof(event)); > + if (n != sizeof(event)) { > + fprintf(logfp, "ERROR: write(event_fd) %s\n", strerror(errno)); > + do_exit(1); > + } > +} > + > +struct test_arg { > + int child_idx; > + int type; > + int start; > + int len; > +}; > + > +struct test_arg locks_list[] = { > + { 0, F_WRLCK, 0, 17 }, > + { 1, F_WRLCK, 18, 16 }, > + { 0, F_WRLCK, 35, 27 }, > + { 1, F_WRLCK, 63, 17 }, > + { 0, F_RDLCK, 81, 25 }, > + { 1, F_RDLCK, 81, 25 }, > +}; > + > +void set_lock(int fd, struct test_arg *tlock) > +{ > + int rc; > + struct flock lock; > + > + lock.l_type = tlock->type; > + lock.l_whence = SEEK_SET; > + lock.l_start = (off_t)tlock->start; > + lock.l_len = (off_t)tlock->len; > + > + rc = fcntl(fd, F_SETLK, &lock); > + if (rc < 0 && errno != EAGAIN) { > + fprintf(logfp, "%d: set_lock(): ERROR [%d, %llu, %llu]: %s\n", > + getpid(), tlock->type, (u64)tlock->start, > + (u64)tlock->len, strerror(errno)); > + fflush(logfp); > + kill(getppid(), SIGUSR1); > + do_exit(1); > + } > + > + fprintf(logfp, "%d: set_lock(): [%d, %llu, %llu] %s\n", 
getpid(), > + tlock->type, (u64)tlock->start, (u64)tlock->len, > + rc < 0 ? strerror(errno) : "done"); > +} > +/* > + * If @set is TRUE, ensure that the given lock is set. > + * If @set is FALSE, ensure that the given lock is NOT set. > + */ > +void test_lock(int fd, int locked_by_me, struct test_arg *tlock) > +{ > + int rc; > + int conflict; > + struct flock lock; > + char lock_info[512]; > + > + lock.l_type = tlock->type; > + lock.l_whence = SEEK_SET; > + lock.l_start = (off_t)tlock->start; > + lock.l_len = (off_t)tlock->len; > + lock.l_pid = 0; > + > + sprintf(lock_info, "lock [%d, %llu, %llu] ", tlock->type, > + (u64)tlock->start, (u64)tlock->len); > + > + conflict = 0; > + rc = fcntl(fd, F_SETLK, &lock); > + if (rc < 0 && (errno == EAGAIN || errno == EACCES)) { > + rc = fcntl(fd, F_GETLK, &lock); > + if (rc < 0) { > + fprintf(logfp, "ERROR: fcntl(F_GETLK): %s, error %s\n", > + lock_info, strerror(errno)); > + goto error; > + } > + > + if (lock.l_type == F_UNLCK || lock.l_pid == 0) { > + fprintf(logfp, "%d: ERROR: %s F_SETLK / F_GETLK " > + "mismatch !!!\n", getpid(), lock_info); > + goto error; > + } > + conflict = 1; > + } else if (rc < 0) { > + fprintf(logfp, "ERROR: fcntl(F_SETLK): %s, error %s\n", > + lock_info, strerror(errno)); > + goto error; > + } > + > + fprintf(logfp, "%d: %s, locked_by_me: %d, conflict %d\n", getpid(), > + lock_info, locked_by_me, conflict); > + > + if (locked_by_me && conflict) { > + fprintf(logfp, "%d: FAIL: %s is NOT set by me !!!\n", getpid(), > + lock_info); > + goto error; > + } else if (!locked_by_me && !conflict) { > + fprintf(logfp, "%d: FAIL: %s is NOT set by peer !!!\n", > + getpid(), lock_info); > + goto error; > + } else { > + fprintf(logfp, "%d: PASS: %s is %sset by me\n", > + getpid(), lock_info, conflict ? 
"not " : ""); > + return; > + } > + > +error: > + fflush(logfp); > + kill(getppid(), SIGUSR1); > + do_exit(1); > +} > + > +void handler(int sig) > +{ > + /* > + * We completed the test and siblings have completed their test. > + * So, safe to drop our locks and exit. > + */ > + fprintf(logfp, "%d: Ok to exit...\n", getpid()); > + fflush(logfp); > + do_exit(0); > +} > + > +int do_child1(int idx) > +{ > + int rc; > + int locked_by_me; > + int i; > + int num_locks; > + int failed; > + > + signal(SIGINT, handler); > + > + num_locks = sizeof(locks_list) / sizeof(struct test_arg); > + > + for (i = 0; i < num_locks; i++) { > + if (idx != locks_list[i].child_idx) > + continue; > + > + set_lock(test_fd, &locks_list[i]); > + } > + > + /* > + * Tell parent we are ready for checkpoint... > + */ > + notify_one_event(event_fd1); > + > + /* > + * Wait for checkpoint/restart > + */ > + fprintf(logfp, "%d: waiting for test-done\n", idx); > + fflush(logfp); > + while(!test_done()) { > + sleep(1); > + } > + fprintf(logfp, "%d: Found test-done\n", idx); > + fflush(logfp); > + > + for (i = 0; i < num_locks; i++) { > + /* > + * If we had (not) set the lock earlier, ensure we still have > + * it (not) set. > + */ > + locked_by_me = 0; > + if (idx == locks_list[i].child_idx || > + locks_list[i].type == F_RDLCK) > + locked_by_me = 1; > + > + test_lock(test_fd, locked_by_me, &locks_list[i]); > + } > + > + /* > + * Notify parent that we are done testing the locks. > + */ > + notify_one_event(event_fd2); > + > + /* > + * Hold onto our locks and wait for siblings to complete their > + * test on our locks. Parent will SIGINT us when it is safe to > + * exit. 
> + */ > + pause(); > + > + do_exit(0); > +} > + > +/* > + * Populate the test file so the children can lock some portions of > + * the file > + */ > +void setup_test_file() > +{ > + char buf[256]; > + > + test_fd = open(TEST_FILE, O_RDWR|O_CREAT|O_TRUNC, 0666); > + if (test_fd < 0) { > + fprintf(logfp, "ERROR: open(%s): %s\n", TEST_FILE, > + strerror(errno)); > + do_exit(1); > + } > + > + memset(buf, 0, sizeof(buf)); > + write(test_fd, buf, sizeof(buf)); > +} > + > +int pid1, pid2; > +void child_handler(int sig) > +{ > + /* > + * Wait for the child that exited prematurely > + */ > + fprintf(logfp, "%d: Got signal %d\n", getpid(), sig); > + fflush(logfp); > + > + if (sig == SIGCHLD) > + do_wait(1); > + fprintf(logfp, "%d: Test case FAILED\n", getpid()); > + fflush(logfp); > + /* > + * Kill (remaining) children and exit. > + */ > + kill(pid1, SIGKILL); > + kill(pid2, SIGKILL); > + > + do_exit(-1); > +} > + > +main(int argc, char *argv[]) > +{ > + int i, status, rc; > + > + if (test_done()) { > + printf("Remove %s before running test\n", TEST_DONE); > + do_exit(1); > + } > + > + logfp = fopen(LOG_FILE, "w"); > + if (!logfp) { > + perror("open() logfile"); > + do_exit(1); > + } > + > + printf("%s: Closing stdio fds and writing messages to %s\n", > + argv[0], LOG_FILE); > + > + for (i=0; i<100; i++) { > + if (fileno(logfp) != i) > + close(i); > + } > + > + setup_test_file(); > + event_fd1 = setup_notification(); > + event_fd2 = setup_notification(); > + > + /* > + * Before waiting for events below, ensure we will be notified > + * if a child encounters an error and/or exits prematurely. > + */ > + signal(SIGUSR1, child_handler); > + signal(SIGCHLD, child_handler); > + > + /* > + * Create the first child and wait for it take its record locks > + */ > + pid1 = fork(); > + if (pid1 == 0) > + do_child1(0); > + wait_for_events(event_fd1, 1); > + > + /* > + * Create the second child and wait for it take its locks. 
> + */ > + pid2 = fork(); > + if (pid2 == 0) > + do_child1(1); > + wait_for_events(event_fd1, 1); > + > + /* > + * Now that the test processes are ready, tell any wrapper scripts, > + * we are ready for checkpoint > + */ > + set_checkpoint_ready(); > + > + fprintf(logfp, "***** %d: Ready for checkpoint\n", getpid()); > + fflush(logfp); > + > + /* > + * Wait for all children to test the locks. Since a processes locks > + * are dropped on exit, if process P1 exits before process P2 has > + * completed testing a conflicting lock, P2 may acquire the lock > + * supposed to be held by P1 and wrongly assume that test failed. > + */ > + wait_for_events(event_fd2, 2); > + > + signal(SIGCHLD, SIG_IGN); > + > + /* > + * Tell children it is safe to exit > + */ > + kill(pid1, SIGINT); > + kill(pid2, SIGINT); > + > + do_wait(2); > + > + do_exit(0); > +} > diff --git a/fileio/run-filelock1.sh b/fileio/run-filelock1.sh > new file mode 100755 > index 0000000..0ba2d18 > --- /dev/null > +++ b/fileio/run-filelock1.sh > @@ -0,0 +1,218 @@ > +#!/bin/bash > + > +source ../common.sh > + > +dir=`mktemp -p . -d -t cr_filelock1_XXXXXXX` || (echo "mktemp failed"; exit 1) > + > +# NOTE: As of ckpt-v15-dev, the --container option to 'ckpt' causes this > +# test to fail with "container not isolated" message due to the > +# log-file being shared between the application threads. 
> +# > +CHECKPOINT="`which checkpoint` --container" > +RESTART=`which restart` > +ECHO="/bin/echo -e" > + > +TEST_CMD="../filelock1" > +TEST_ARGS="" > +TEST_LOG="logs.d/log.filelock1" > +SCRIPT_LOG="logs.d/log.run-filelock1" > +TEST_PID_FILE="pid.filelock1"; > + > +SNAPSHOT_DIR="snap1.d" > + > +TEST_DONE="test-done" > +CHECKPOINT_FILE="checkpoint-filelock1"; > +CHECKPOINT_READY="checkpoint-ready" > +CHECKPOINT_DONE="checkpoint-done" > + > +LOGS_DIR="logs.d" > +DATA_DIR="data.d" > + > +NS_EXEC="../../ns_exec" > +NS_EXEC_ARGS="-cgpuimP $TEST_PID_FILE" > + > +checkpoint() > +{ > + local pid=$1 > + > + $ECHO "\t- Checkpoint: $CHECKPOINT $pid \> $CHECKPOINT_FILE" > + $CHECKPOINT $pid > $CHECKPOINT_FILE > + ret=$? > + if [ $ret -ne 0 ]; then > + $ECHO "***** FAIL: Checkpoint of $pid failed" > + ps -efL |grep $TEST_CMD >> $SCRIPT_LOG > + exit 1; > + fi > +} > + > +function wait_for_checkpoint_ready() > +{ > + # Wait for test to finish setup > + while [ ! -f $CHECKPOINT_READY ]; do > + $ECHO "\t- Waiting for $CHECKPOINT_READY" > + sleep 1; > + done; > +} > + > +function create_container() > +{ > + local pid; > + > + cmdline="$NS_EXEC $NS_EXEC_ARGS -- $TEST_CMD $TEST_ARGS" > + > + $ECHO "\t- Creating container:" > + $ECHO "\t- $cmdline" > + > + $cmdline & > + > + wait_for_checkpoint_ready; > + > + # Find global pid of container-init > + pid=`cat $TEST_PID_FILE`; > + if [ "x$pid" == "x" ]; then > + $ECHO "***** FAIL: Invalid container-init pid $pid" > + ps -efL |grep $TEST_CMD >> $SCRIPT_LOG > + exit 1 > + fi > + $ECHO "Created container with pid $pid" >> $SCRIPT_LOG > +} > + > +function restart_container > +{ > + local ret; > + > + cmdline="$RESTART --pids --pidns --wait" > + $ECHO "\t- $cmdline" > + > + sleep 1 > + > + $cmdline < $CHECKPOINT_FILE >> $SCRIPT_LOG 2>&1 & > + ret=$? 
> + > + if [ $ret -ne 0 ]; then > + $ECHO "***** FAIL: Restart of $pid failed" > + ps -efL |grep $TEST_CMD >> $SCRIPT_LOG > + exit 1; > + fi > + > +} > + > +function create_fs_snapshot() > +{ > + # Prepare for snapshot > + if [ -d $SNAPSHOT_DIR ]; then > + rm -rf ${SNAPSHOT_DIR}.prev > + mv $SNAPSHOT_DIR ${SNAPSHOT_DIR}.prev > + mkdir $SNAPSHOT_DIR > + fi > + > + # Snapshot the log files > + cp ${LOGS_DIR}/* $SNAPSHOT_DIR > +} > + > +function restore_fs_snapshot() > +{ > + # Restore the snapshot after the main process has been killed > + /bin/cp ${SNAPSHOT_DIR}/* $LOGS_DIR > +} > + > +cd $dir > +echo "Current directory: `pwd`" > + > +if [ ! -d $LOGS_DIR ]; then > + mkdir $LOGS_DIR > +fi > + > +if [ ! -d $DATA_DIR ]; then > + mkdir $DATA_DIR > +fi > + > +if [ ! -d $SNAPSHOT_DIR ]; then > + mkdir $SNAPSHOT_DIR > +fi > + > +if [ ! -f $INPUT_DATA ]; then > + $FILEIO -C $INPUT_DATA > +fi > + > +# Make sure no stray filelock1 process from another run is still going > +killall $TEST_CMD > $SCRIPT_LOG 2>&1 > + > +> $SCRIPT_LOG; > +cnt=1 > +while [ $cnt -lt 20 ]; do > + $ECHO "===== Iteration $cnt" > + > + # Remove any 'state' files, start the app and let it tell us > + # when it is ready > + rm -f $CHECKPOINT_READY $TEST_DONE $TEST_PID_FILE > + > + create_container > + wait_for_checkpoint_ready > + > + pid=`cat $TEST_PID_FILE` > + > + $ECHO "\t- Done creating container, cinit-pid $pid" > + > + ps -efL |grep $TEST_CMD >> $SCRIPT_LOG > + > + # override default freezerdir > + if [ -d $freezerdir ]; then > + rmdir $freezerdir > + fi > + freezerdir=$freezermountpoint/$pid > + freeze_pid $pid > + > + num_pids1=`ps -efL |grep $TEST_CMD | wc -l` > + > + create_fs_snapshot > + > + checkpoint $pid > + > + touch $CHECKPOINT_DONE > + > + killall -9 `basename $TEST_CMD` > + > + thaw > + > + sleep 3 > + > + restore_fs_snapshot > + > + restart_container > + > + sleep 3; > + > + num_pids2=`ps -efL |grep $TEST_CMD | wc -l` > + ps -efL |grep $TEST_CMD >> $SCRIPT_LOG > + $ECHO "\t- num_pids1 
$num_pids1, num_pids2 $num_pids2"; > + > + # ns_exec pid is parent-pid of restarted-container-init > + nspid=`pidof restart` > + > + if [ "x$nspid" == "x" ]; then > + $ECHO "***** FAIL: Can't find pid of $RESTART" > + exit 1; > + fi > + > + # End test gracefully > + touch $TEST_DONE > + > + $ECHO "\t- Waiting for restarted container to exit (gloabl-pid $nspid)" > + wait $nspid; > + ret=$? > + > + grep --binary-files=text FAIL $PWD/$TEST_LOG > /dev/null 2>&1 > + if [ $? -eq 0 ]; then > + $ECHO "\t***** Application FAILED after restart" >> $SCRIPT_LOG > + $ECHO "\t***** See $TEST_LOG for details" >> $SCRIPT_LOG > + > + $ECHO "\t***** Application FAILED after restart" > + $ECHO "\tSee $PWD/$TEST_LOG for details" > + exit 1; > + fi > + > + $ECHO "\t- Container exited, status $ret" > + > + cnt=$((cnt+1)) > +done > -- > 1.6.0.4 ^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2010-01-21 21:37 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-01-20 23:06 [PATCH][cr-test]: filelock1: Test restore of advisory locks during restart Sukadev Bhattiprolu
[not found] ` <20100120230621.GA30288-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2010-01-21 21:37 ` Serge E. Hallyn
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.