* [PATCH 1/5][v5][cr]: Move file_lock macros into linux/fs.h
[not found] ` <1288333001-28838-1-git-send-email-sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
@ 2010-10-29 6:16 ` Sukadev Bhattiprolu
2010-10-29 6:16 ` [PATCH 2/5][v5][cr]: Define flock_set() Sukadev Bhattiprolu
` (7 subsequent siblings)
8 siblings, 0 replies; 12+ messages in thread
From: Sukadev Bhattiprolu @ 2010-10-29 6:16 UTC (permalink / raw)
To: Oren Laadan
Cc: Matthew Wilcox, Containers, Jamie Lokier,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, Dan Smith,
Steven Whitehouse
Move IS_POSIX(), IS_FLOCK(), IS_LEASE() and 'for_each_lock()' into
include/linux/fs.h since these are also needed to checkpoint/restart
file-locks.
Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
fs/locks.c | 7 -------
include/linux/fs.h | 7 +++++++
2 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/fs/locks.c b/fs/locks.c
index 9cd859e..da53795 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -130,16 +130,9 @@
#include <asm/uaccess.h>
-#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
-#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
-#define IS_LEASE(fl) (fl->fl_flags & FL_LEASE)
-
int leases_enable = 1;
int lease_break_time = 45;
-#define for_each_lock(inode, lockp) \
- for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next)
-
static LIST_HEAD(file_lock_list);
static LIST_HEAD(blocked_list);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ee725ff..909a535 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1088,6 +1088,13 @@ struct file_lock {
} fl_u;
};
+#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
+#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
+#define IS_LEASE(fl) (fl->fl_flags & FL_LEASE)
+
+#define for_each_lock(inode, lockp) \
+ for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next)
+
/* The following constant reflects the upper bound of the file/locking space */
#ifndef OFFSET_MAX
#define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1)))
--
1.6.0.4
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 2/5][v5][cr]: Define flock_set()
[not found] ` <1288333001-28838-1-git-send-email-sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2010-10-29 6:16 ` [PATCH 1/5][v5][cr]: Move file_lock macros into linux/fs.h Sukadev Bhattiprolu
@ 2010-10-29 6:16 ` Sukadev Bhattiprolu
2010-10-29 6:16 ` [PATCH 3/5][v5][cr]: Define flock64_set() Sukadev Bhattiprolu
` (6 subsequent siblings)
8 siblings, 0 replies; 12+ messages in thread
From: Sukadev Bhattiprolu @ 2010-10-29 6:16 UTC (permalink / raw)
To: Oren Laadan
Cc: Matthew Wilcox, Containers, Jamie Lokier,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, Dan Smith,
Steven Whitehouse
Extract core functionality of fcntl_setlk() into a separate function,
flock_set(). flock_set() can be also used when restarting a checkpointed
application and restoring its file-locks.
Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
fs/locks.c | 44 +++++++++++++++++++++++++++-----------------
include/linux/fs.h | 1 +
2 files changed, 28 insertions(+), 17 deletions(-)
diff --git a/fs/locks.c b/fs/locks.c
index da53795..6c6ced4 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1758,14 +1758,10 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
return error;
}
-/* Apply the lock described by l to an open file descriptor.
- * This implements both the F_SETLK and F_SETLKW commands of fcntl().
- */
-int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
- struct flock __user *l)
+int flock_set(unsigned int fd, struct file *filp, unsigned int cmd,
+ struct flock *flock)
{
struct file_lock *file_lock = locks_alloc_lock();
- struct flock flock;
struct inode *inode;
struct file *f;
int error;
@@ -1773,13 +1769,6 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
if (file_lock == NULL)
return -ENOLCK;
- /*
- * This might block, so we do it before checking the inode.
- */
- error = -EFAULT;
- if (copy_from_user(&flock, l, sizeof(flock)))
- goto out;
-
inode = filp->f_path.dentry->d_inode;
/* Don't allow mandatory locks on files that may be memory mapped
@@ -1791,7 +1780,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
}
again:
- error = flock_to_posix_lock(filp, file_lock, &flock);
+ error = flock_to_posix_lock(filp, file_lock, flock);
if (error)
goto out;
if (cmd == F_SETLKW) {
@@ -1799,7 +1788,7 @@ again:
}
error = -EBADF;
- switch (flock.l_type) {
+ switch (flock->l_type) {
case F_RDLCK:
if (!(filp->f_mode & FMODE_READ))
goto out;
@@ -1829,8 +1818,8 @@ again:
spin_lock(¤t->files->file_lock);
f = fcheck(fd);
spin_unlock(¤t->files->file_lock);
- if (!error && f != filp && flock.l_type != F_UNLCK) {
- flock.l_type = F_UNLCK;
+ if (!error && f != filp && flock->l_type != F_UNLCK) {
+ flock->l_type = F_UNLCK;
goto again;
}
@@ -1839,6 +1828,27 @@ out:
return error;
}
+/* Apply the lock described by l to an open file descriptor.
+ * This implements both the F_SETLK and F_SETLKW commands of fcntl().
+ */
+int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
+ struct flock __user *l)
+{
+ int error;
+ struct flock flock;
+
+ /*
+ * This might block, so we do it before checking the inode
+ * in flock_set().
+ */
+ error = -EFAULT;
+ if (copy_from_user(&flock, l, sizeof(flock)))
+ return error;
+
+ return flock_set(fd, filp, cmd, &flock);
+}
+
+
#if BITS_PER_LONG == 32
/* Report the first existing lock that would conflict with l.
* This implements the F_GETLK command of fcntl().
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 909a535..5e9ea17 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1112,6 +1112,7 @@ extern void send_sigio(struct fown_struct *fown, int fd, int band);
extern int fcntl_getlk(struct file *, struct flock __user *);
extern int fcntl_setlk(unsigned int, struct file *, unsigned int,
struct flock __user *);
+extern int flock_set(unsigned int, struct file *, unsigned int, struct flock *);
#if BITS_PER_LONG == 32
extern int fcntl_getlk64(struct file *, struct flock64 __user *);
--
1.6.0.4
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 3/5][v5][cr]: Define flock64_set()
[not found] ` <1288333001-28838-1-git-send-email-sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2010-10-29 6:16 ` [PATCH 1/5][v5][cr]: Move file_lock macros into linux/fs.h Sukadev Bhattiprolu
2010-10-29 6:16 ` [PATCH 2/5][v5][cr]: Define flock_set() Sukadev Bhattiprolu
@ 2010-10-29 6:16 ` Sukadev Bhattiprolu
2010-10-29 6:16 ` [PATCH 4/5][v5][cr]: Checkpoint/restore file-locks Sukadev Bhattiprolu
` (5 subsequent siblings)
8 siblings, 0 replies; 12+ messages in thread
From: Sukadev Bhattiprolu @ 2010-10-29 6:16 UTC (permalink / raw)
To: Oren Laadan
Cc: Matthew Wilcox, Containers, Jamie Lokier,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, Dan Smith,
Steven Whitehouse
Extract core functionality of fcntl_setlk64() into a separate function,
flock64_set(). flock64_set() can be also used when restarting a checkpointed
application and restoring its file-locks.
Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
fs/locks.c | 38 ++++++++++++++++++++++++--------------
include/linux/fs.h | 2 ++
2 files changed, 26 insertions(+), 14 deletions(-)
diff --git a/fs/locks.c b/fs/locks.c
index 6c6ced4..34b0e14 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1889,11 +1889,10 @@ out:
/* Apply the lock described by l to an open file descriptor.
* This implements both the F_SETLK and F_SETLKW commands of fcntl().
*/
-int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
- struct flock64 __user *l)
+int flock64_set(unsigned int fd, struct file *filp, unsigned int cmd,
+ struct flock64 *flock)
{
struct file_lock *file_lock = locks_alloc_lock();
- struct flock64 flock;
struct inode *inode;
struct file *f;
int error;
@@ -1901,13 +1900,6 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
if (file_lock == NULL)
return -ENOLCK;
- /*
- * This might block, so we do it before checking the inode.
- */
- error = -EFAULT;
- if (copy_from_user(&flock, l, sizeof(flock)))
- goto out;
-
inode = filp->f_path.dentry->d_inode;
/* Don't allow mandatory locks on files that may be memory mapped
@@ -1919,7 +1911,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
}
again:
- error = flock64_to_posix_lock(filp, file_lock, &flock);
+ error = flock64_to_posix_lock(filp, file_lock, flock);
if (error)
goto out;
if (cmd == F_SETLKW64) {
@@ -1927,7 +1919,7 @@ again:
}
error = -EBADF;
- switch (flock.l_type) {
+ switch (flock->l_type) {
case F_RDLCK:
if (!(filp->f_mode & FMODE_READ))
goto out;
@@ -1952,8 +1944,8 @@ again:
spin_lock(¤t->files->file_lock);
f = fcheck(fd);
spin_unlock(¤t->files->file_lock);
- if (!error && f != filp && flock.l_type != F_UNLCK) {
- flock.l_type = F_UNLCK;
+ if (!error && f != filp && flock->l_type != F_UNLCK) {
+ flock->l_type = F_UNLCK;
goto again;
}
@@ -1961,6 +1953,24 @@ out:
locks_free_lock(file_lock);
return error;
}
+
+/* Apply the lock described by l to an open file descriptor.
+ * This implements both the F_SETLK and F_SETLKW commands of fcntl().
+ */
+int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
+ struct flock64 __user *l)
+{
+ struct flock64 flock;
+
+ /*
+ * This might block, so we do it before checking the inode in
+ * flock64_set().
+ */
+ if (copy_from_user(&flock, l, sizeof(flock)))
+ return -EFAULT;
+
+ return flock64_set(fd, filp, cmd, &flock);
+}
#endif /* BITS_PER_LONG == 32 */
/*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5e9ea17..3f72462 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1118,6 +1118,8 @@ extern int flock_set(unsigned int, struct file *, unsigned int, struct flock *);
extern int fcntl_getlk64(struct file *, struct flock64 __user *);
extern int fcntl_setlk64(unsigned int, struct file *, unsigned int,
struct flock64 __user *);
+extern int flock64_set(unsigned int, struct file *, unsigned int,
+ struct flock64 *);
#endif
extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg);
--
1.6.0.4
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 4/5][v5][cr]: Checkpoint/restore file-locks
[not found] ` <1288333001-28838-1-git-send-email-sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
` (2 preceding siblings ...)
2010-10-29 6:16 ` [PATCH 3/5][v5][cr]: Define flock64_set() Sukadev Bhattiprolu
@ 2010-10-29 6:16 ` Sukadev Bhattiprolu
2010-10-29 6:16 ` [PATCH 5/5][v5][cr]: Document design of C/R of file-locks Sukadev Bhattiprolu
` (4 subsequent siblings)
8 siblings, 0 replies; 12+ messages in thread
From: Sukadev Bhattiprolu @ 2010-10-29 6:16 UTC (permalink / raw)
To: Oren Laadan
Cc: Matthew Wilcox, Containers, Jamie Lokier,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, Dan Smith,
Steven Whitehouse
While checkpointing each file-descriptor, find all the locks on the
file and save information about the lock in the checkpoint-image.
During restart of the application, read the saved file-locks from the
checkpoint image and for each POSIX lock, call flock_set() to set the
lock on the file.
As pointed out by Matt Helsley, no special handling is necessary for a
process P2 in the checkpointed container that is blocked on a lock, L1
held by another process P1. Processes in the restarted container begin
execution only after all processes have restored. If the blocked process
P2 is restored first, it will prepare to return an -ERESTARTSYS from the
fcntl() system call, but wait for P1 to be restored. When P1 is restored,
it will re-acquire the lock L1 before P1 and P2 begin actual execution.
This ensures that even if P2 is scheduled to run before P1, P2 will go
back to waiting for the lock L1.
Changelog[v5]:
[Oren Laadan]: Combine checkpoint and restart patches into one
for easier review
Changelog[v4]:
[Oren Laadan]: For consistency with other such objects, replace
the "marker lock" checkpoint with a checkpoint of a count of the
file-locks before the first file-lock of each file.
Changelog[v3]:
[Oren Laadan] Add a missing (loff_t) type cast and use a macro
to set the marker/dummy file lock
Changelog[v2]:
[Matt Helsley]: Use fixed sizes (__s64) instead of 'loff_t' in
'struct ckpt_hdr_file_lock'.
[Matt Helsley, Serge Hallyn]: Highlight new use of BKL (using
lock_flocks() macros as suggested by Serge).
[Matt Helsley]: Reorg code a bit to simplify error handling.
[Matt Helsley]: Reorg code to initialize marker-lock (Pass a
NULL lock to checkpoint_one_lock() to indicate marker).
Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
fs/checkpoint.c | 318 ++++++++++++++++++++++++++++++++++++++--
include/linux/checkpoint_hdr.h | 17 ++
2 files changed, 320 insertions(+), 15 deletions(-)
diff --git a/fs/checkpoint.c b/fs/checkpoint.c
index 87d7c6e..898a016 100644
--- a/fs/checkpoint.c
+++ b/fs/checkpoint.c
@@ -26,8 +26,19 @@
#include <linux/checkpoint.h>
#include <linux/eventpoll.h>
#include <linux/eventfd.h>
+#include <linux/smp_lock.h>
#include <net/sock.h>
+/*
+ * TODO: This code uses the BKL for consistency with other uses of
+ * 'for_each_lock()'. But since the BKL may be replaced with another
+ * lock in the future, use lock_flocks() macros instead. lock_flocks()
+ * are currently used in BKL-fix sand boxes and when those changes
+ * are merged, the following macros can be removed
+ */
+#define lock_flocks() lock_kernel()
+#define unlock_flocks() unlock_kernel()
+
/**************************************************************************
* Checkpoint
*/
@@ -249,8 +260,120 @@ static int checkpoint_file(struct ckpt_ctx *ctx, void *ptr)
return ret;
}
+static int checkpoint_one_file_lock(struct ckpt_ctx *ctx,
+ struct file_lock *lock)
+{
+ int rc;
+ struct ckpt_hdr_file_lock *h;
+
+ if (!IS_POSIX(lock)) {
+ /* Hmm, we should have caught this while counting locks */
+ return -EBADF;
+ }
+
+ h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE_LOCK);
+ if (!h)
+ return -ENOMEM;
+
+ h->fl_start = lock->fl_start;
+ h->fl_end = lock->fl_end;
+ h->fl_type = lock->fl_type;
+ h->fl_flags = lock->fl_flags;
+
+ rc = ckpt_write_obj(ctx, &h->h);
+
+ ckpt_hdr_put(ctx, h);
+
+ return rc;
+}
+
+static int checkpoint_file_lock_count(struct ckpt_ctx *ctx, int num_locks)
+{
+ int rc;
+ struct ckpt_hdr_file_lock_count *h;
+
+ h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE_LOCK_COUNT);
+ if (!h)
+ return -ENOMEM;
+
+ h->nr_locks = num_locks;
+
+ rc = ckpt_write_obj(ctx, &h->h);
+
+ ckpt_hdr_put(ctx, h);
+
+ return rc;
+}
+
+int
+checkpoint_file_locks(struct ckpt_ctx *ctx, struct files_struct *files,
+ struct file *file)
+{
+ int n;
+ int rc;
+ struct inode *inode;
+ struct file_lock **lockpp;
+ struct file_lock *lockp;
+
+ lock_flocks();
+
+ /*
+ * First count the number of file-locks on this file
+ */
+ n = 0;
+ rc = -EBADF;
+ inode = file->f_path.dentry->d_inode;
+ for_each_lock(inode, lockpp) {
+ lockp = *lockpp;
+ if (lockp->fl_owner != files)
+ continue;
+
+ ckpt_debug("Lock [%lld, %lld, %d, 0x%x]\n", lockp->fl_start,
+ lockp->fl_end, lockp->fl_type, lockp->fl_flags);
+
+ if (lockp->fl_owner != files)
+ continue;
+
+ if (IS_POSIX(lockp))
+ n++;
+ else {
+ ckpt_err(ctx, rc, "%(T), checkpoint of lock "
+ "[%lld, %lld, %d, 0x%x] failed\n",
+ lockp->fl_start, lockp->fl_end,
+ lockp->fl_type, lockp->fl_flags);
+ goto out;
+ }
+ }
+
+ /*
+ * Checkpoint the count of file-locks
+ */
+ rc = checkpoint_file_lock_count(ctx, n);
+ if (rc < 0) {
+ ckpt_err(ctx, rc, "%(T), checkpoint file-lock count failed\n");
+ goto out;
+ }
+
+ /*
+ * Make a second pass and checkpoint file-locks themselves.
+ */
+ for_each_lock(inode, lockpp) {
+ lockp = *lockpp;
+ if (lockp->fl_owner != files)
+ continue;
+
+ rc = checkpoint_one_file_lock(ctx, lockp);
+ if (rc < 0)
+ goto out;
+ }
+
+out:
+ unlock_flocks();
+ return rc;
+}
+
/**
- * ckpt_write_file_desc - dump the state of a given file descriptor
+ * checkpoint_file_desc - dump the state of a given file descriptor
* @ctx: checkpoint context
* @files: files_struct pointer
* @fd: file descriptor
@@ -282,18 +405,6 @@ static int checkpoint_file_desc(struct ckpt_ctx *ctx,
}
rcu_read_unlock();
- ret = find_locks_with_owner(file, files);
- /*
- * find_locks_with_owner() returns an error when there
- * are no locks found, so we *want* it to return an error
- * code. Its success means we have to fail the checkpoint.
- */
- if (!ret) {
- ret = -EBADF;
- ckpt_err(ctx, ret, "%(T)fd %d has file lock or lease\n", fd);
- goto out;
- }
-
/* sanity check (although this shouldn't happen) */
ret = -EBADF;
if (!file) {
@@ -328,6 +439,11 @@ static int checkpoint_file_desc(struct ckpt_ctx *ctx,
h->fd_close_on_exec = coe;
ret = ckpt_write_obj(ctx, &h->h);
+ if (ret < 0)
+ goto out;
+
+ ret = checkpoint_file_locks(ctx, files, file);
+
out:
ckpt_hdr_put(ctx, h);
if (file)
@@ -792,8 +908,176 @@ static void *restore_file(struct ckpt_ctx *ctx)
return (void *)file;
}
+#if BITS_PER_LONG == 32
+
+/*
+ * NOTE: Even if we checkpointed a lock that was set with 'struct flock'
+ * restore the lock using 'struct flock64'. Note that both these lock
+ * types are first converted to a posix_file_lock before processing so
+ * converting to 'struct flock64' is (hopefully) not a problem.
+ * NFS for instance uses IS_SETLK() instead of cmd == F_SETLK.
+ *
+ * TODO: Are there filesystems that implement F_SETLK but not F_SETLK64 ?
+ * If there are, restore_one_posix_lock() will fail.
+ */
+static int
+ckpt_hdr_file_lock_to_flock64(struct ckpt_hdr_file_lock *h, struct flock64 *fl)
+{
+ /*
+ * We checkpoint the 'raw' fl_type which in case of leases includes
+ * the F_INPROGRESS flag. But for posix-locks, the fl_type should
+ * be simple.
+ */
+ switch(h->fl_type) {
+ case F_RDLCK:
+ case F_WRLCK:
+ case F_UNLCK:
+ break;
+ default:
+ ckpt_debug("Bad posix lock type 0x%x ?\n", h->fl_type);
+ return -EINVAL;
+ }
+
+ memset(fl, 0, sizeof(*fl));
+ fl->l_type = h->fl_type;
+ fl->l_start = h->fl_start;
+ fl->l_len = h->fl_end == OFFSET_MAX ? 0 : h->fl_end - h->fl_start + 1;
+ fl->l_whence = SEEK_SET;
+
+ /* TODO: Init ->l_sysid, l_pid fields */
+ ckpt_debug("Restoring filelock [%lld, %lld, %d]\n", fl->l_start,
+ fl->l_len, fl->l_type);
+
+ return 0;
+}
+
+static int restore_one_posix_lock(struct ckpt_ctx *ctx, struct file *file,
+ int fd, struct ckpt_hdr_file_lock *h)
+{
+ struct flock64 fl;
+ int ret;
+
+ ret = ckpt_hdr_file_lock_to_flock64(h, &fl);
+ if (ret < 0) {
+ ckpt_err(ctx, ret, "%(T) Unexpected flock\n");
+ return ret;
+ }
+
+ /*
+ * Use F_SETLK because we should not have to wait for the lock. If
+ * another process holds the lock, it indicates that filesystem-state
+ * is not consistent with what it was at checkpoint. In which case we
+ * better fail.
+ */
+ ret = flock64_set(fd, file, F_SETLK64, &fl);
+ if (ret)
+ ckpt_err(ctx, ret, "flock64_set(): %d\n", (int)h->fl_type);
+
+ return ret;
+}
+
+#else
+
+static int
+ckpt_hdr_file_lock_to_flock(struct ckpt_hdr_file_lock *h, struct flock *fl)
+{
+ /*
+ * We checkpoint the 'raw' fl_type which in case of leases includes
+ * the F_INPROGRESS flag. But for posix-locks, the fl_type should
+ * be simple.
+ */
+ switch(h->fl_type) {
+ case F_RDLCK:
+ case F_WRLCK:
+ case F_UNLCK:
+ break;
+ default:
+ ckpt_debug("Bad posix lock type 0x%x ?\n", h->fl_type);
+ return -EINVAL;
+ }
+
+ memset(fl, 0, sizeof(*fl));
+
+ fl->l_type = h->fl_type;
+ fl->l_start = h->fl_start;
+ fl->l_len = fl->fl_end == OFFSET_MAX ? 0 : h->fl_end - h->fl_start + 1;
+ fl->l_whence = SEEK_SET;
+
+ ckpt_debug("Restoring filelock [%lld, %lld, %d]\n", fl->l_start,
+ fl->l_len, fl->l_type);
+
+ /* TODO: Init ->l_sysid, l_pid fields */
+
+ return 0;
+}
+
+static int restore_one_posix_lock(struct ckpt_ctx *ctx, struct file *file,
+ int fd, struct ckpt_hdr_file_lock *h)
+{
+ struct flock fl;
+ int ret;
+
+ ret = ckpt_hdr_file_lock_to_flock(h, &fl);
+ if (ret < 0) {
+ ckpt_err(ctx, ret, "%(T) Unexpected flock\n");
+ break;
+ }
+
+ /*
+ * Use F_SETLK because we should not have to wait for the lock. If
+ * another process holds the lock, it indicates that filesystem-state
+ * is not consistent with what it was at checkpoint. In which case we
+ * better fail.
+ */
+ ret = flock_set(fd, file, F_SETLK, &fl);
+ if (ret)
+ ckpt_err(ctx, ret, "flock_set(): %d\n", (int)h->fl_type);
+
+ return ret;
+}
+#endif
+
+static int restore_file_locks(struct ckpt_ctx *ctx, struct file *file, int fd)
+{
+ int i, ret;
+ struct ckpt_hdr_file_lock *h;
+ struct ckpt_hdr_file_lock_count *hfc;
+
+ hfc = ckpt_read_obj_type(ctx, sizeof(*hfc), CKPT_HDR_FILE_LOCK_COUNT);
+ if (IS_ERR(hfc))
+ return PTR_ERR(hfc);
+
+ ret = 0;
+ for (i = 0; i < hfc->nr_locks; i++) {
+
+ h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_FILE_LOCK);
+ if (IS_ERR(h)) {
+ ret = PTR_ERR(h);
+ goto out;
+ }
+
+ ckpt_debug("Lock [%lld, %lld, %d, 0x%x]\n", h->fl_start,
+ h->fl_end, (int)h->fl_type, h->fl_flags);
+
+ ret = -EBADF;
+ if (h->fl_flags & FL_POSIX)
+ ret = restore_one_posix_lock(ctx, file, fd, h);
+
+ ckpt_hdr_put(ctx, h);
+
+ if (ret < 0) {
+ ckpt_err(ctx, ret, "%(T)\n");
+ goto out;
+ }
+
+ }
+out:
+ ckpt_hdr_put(ctx, hfc);
+ return ret;
+}
+
/**
- * ckpt_read_file_desc - restore the state of a given file descriptor
+ * restore_file_desc - restore the state of a given file descriptor
* @ctx: checkpoint context
*
* Restores the state of a file descriptor; looks up the objref (in the
@@ -839,7 +1123,11 @@ static int restore_file_desc(struct ckpt_ctx *ctx)
}
set_close_on_exec(h->fd_descriptor, h->fd_close_on_exec);
- ret = 0;
+
+ ret = restore_file_locks(ctx, file, h->fd_descriptor);
+ if (ret < 0)
+ ckpt_err(ctx, ret, "Error on fd %d\n", h->fd_descriptor);
+
out:
ckpt_hdr_put(ctx, h);
return ret;
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 049bb82..7b3267c 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -159,6 +159,10 @@ enum {
#define CKPT_HDR_TTY_LDISC CKPT_HDR_TTY_LDISC
CKPT_HDR_EPOLL_ITEMS, /* must be after file-table */
#define CKPT_HDR_EPOLL_ITEMS CKPT_HDR_EPOLL_ITEMS
+ CKPT_HDR_FILE_LOCK_COUNT,
+#define CKPT_HDR_FILE_LOCK_COUNT CKPT_HDR_FILE_LOCK_COUNT
+ CKPT_HDR_FILE_LOCK,
+#define CKPT_HDR_FILE_LOCK CKPT_HDR_FILE_LOCK
CKPT_HDR_MM = 401,
#define CKPT_HDR_MM CKPT_HDR_MM
@@ -614,6 +618,19 @@ struct ckpt_hdr_file_generic {
struct ckpt_hdr_file common;
} __attribute__((aligned(8)));
+struct ckpt_hdr_file_lock_count {
+ struct ckpt_hdr h;
+ __u32 nr_locks;
+};
+
+struct ckpt_hdr_file_lock {
+ struct ckpt_hdr h;
+ __s64 fl_start;
+ __s64 fl_end;
+ __u8 fl_type;
+ __u8 fl_flags;
+};
+
struct ckpt_hdr_file_pipe {
struct ckpt_hdr_file common;
__s32 pipe_objref;
--
1.6.0.4
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 5/5][v5][cr]: Document design of C/R of file-locks
[not found] ` <1288333001-28838-1-git-send-email-sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
` (3 preceding siblings ...)
2010-10-29 6:16 ` [PATCH 4/5][v5][cr]: Checkpoint/restore file-locks Sukadev Bhattiprolu
@ 2010-10-29 6:16 ` Sukadev Bhattiprolu
2010-10-29 14:31 ` [PATCH 0/5][v5][cr] Checkpoint/restart file locks Lin Ming
` (3 subsequent siblings)
8 siblings, 0 replies; 12+ messages in thread
From: Sukadev Bhattiprolu @ 2010-10-29 6:16 UTC (permalink / raw)
To: Oren Laadan
Cc: Matthew Wilcox, Containers, Jamie Lokier,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, Dan Smith,
Steven Whitehouse
Summarize the file-system consistency requirements and the design of
the C/R of file-locks and leases.
Changelog[v5]:
- This version of the patchset only checkpoints/restores file-locks.
C/R of file-owner information requires additional work with struct
pids and will be addressed in a follow-on patch. C/R of file-leases,
depends on C/R of file-owner info Removed the design information of
C/R of file leases from the Documenation for now.
Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
Documentation/checkpoint/file-locks | 52 +++++++++++++++++++++++++++++++++++
1 files changed, 52 insertions(+), 0 deletions(-)
create mode 100644 Documentation/checkpoint/file-locks
diff --git a/Documentation/checkpoint/file-locks b/Documentation/checkpoint/file-locks
new file mode 100644
index 0000000..ccffdef
--- /dev/null
+++ b/Documentation/checkpoint/file-locks
@@ -0,0 +1,52 @@
+
+Filesystem consistency across C/R.
+==================================
+
+To checkpoint/restart a process that is using any filesystem resource, the
+kernel assumes that the file system state at the time of restart is consistent
+with its state at the time of checkpoint. In general, this consistency can be
+achieved by:
+
+ a. running the application inside a container (to ensure no process
+ outside the container modifies the filesystem/IPC or other states)
+
+ b. freezing the application before checkpoint
+ c. taking a snapshot of the file system while application is frozen
+ d. checkpointing the application while it is frozen
+
+ e. restoring the file system state to its snapshot
+ f. restart the application inside a container
+
+i.e the kernel assumes that file system state is consistent but it does/can
+NOT verify that it is. The administrator must provide this consistency taking
+into account the file system type including whether it is local or remote,
+and the tools available in the file system (snapshot tools in btrfs or rsync
+etc).
+
+For distributed applications operating on distributed filesystems, it is
+expected that an external mechanism will coordinate the freeze/checkpoint/
+snapshot/restart across the nodes. IOW, the current semantics in the kernel
+provide for C/R on a single node.
+
+Checkpoint/restart of file-locks.
+================================
+
+To checkpoint file-locks in an application, we start with each file-descriptor
+and count the number of file-locks on that file-descriptor. We save this count
+in the checkpoint image, and then information about each file-lock on the
+file-descriptor.
+
+When restarting the application from the checkpoint, we read the file-lock
+count for each file-descriptor and then read the information about each
+file-lock. For each file-lock, we call flock_set() to set a new file-lock.
+
+No special handling is necessary for a process P2 in the checkpointed container
+that is blocked on a file-lock, L1 held by another process P1. Processes in the
+restarted container begin execution only after all processes have restored.
+If the blocked process P2 is restored first, it will prepare to return an
+-ERESTARTSYS from the fcntl() system call, but wait for P1 to be restored.
+When P1 is restored, it will re-acquire the file-lock L1 before P1 and P2 begin
+actual execution.
+
+This ensures that even if P2 is scheduled to run before P1, P2 will go
+back to waiting for the file-lock L1.
--
1.6.0.4
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH 0/5][v5][cr] Checkpoint/restart file locks
[not found] ` <1288333001-28838-1-git-send-email-sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
` (4 preceding siblings ...)
2010-10-29 6:16 ` [PATCH 5/5][v5][cr]: Document design of C/R of file-locks Sukadev Bhattiprolu
@ 2010-10-29 14:31 ` Lin Ming
2011-01-11 1:37 ` [Patch 0/2] Checkpoint and restart of " Oren Laadan
` (2 subsequent siblings)
8 siblings, 0 replies; 12+ messages in thread
From: Lin Ming @ 2010-10-29 14:31 UTC (permalink / raw)
To: Sukadev Bhattiprolu
Cc: Matthew Wilcox, Containers, Jamie Lokier,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, Dan Smith,
Steven Whitehouse
On Fri, Oct 29, 2010 at 2:16 PM, Sukadev Bhattiprolu
<sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org> wrote:
> Checkpoint/restart file locks.
>
> Changelog[v5]:
> - This patchset only checkpoints/restores file locks. C/R of
> file-owner and file-leases will be addressed in follown patches.
> C/R of file-owner information must deal with nested-containers
> and, will need a way to C/R struct pids. C/R of file-leases depends
> on C/R of file-owner information.
>
>
> Sukadev Bhattiprolu (5):
> Move file_lock macros into linux/fs.h
> Define flock_set()
> Define flock64_set()
> Checkpoint/restore file-locks
> Document design of C/R of file-locks and leases
>
> Documentation/checkpoint/file-locks | 52 ++++++
> fs/checkpoint.c | 318 +++++++++++++++++++++++++++++++++--
> fs/locks.c | 89 ++++++----
> include/linux/checkpoint_hdr.h | 17 ++
> include/linux/fs.h | 10 +
> 5 files changed, 433 insertions(+), 53 deletions(-)
> create mode 100644 Documentation/checkpoint/file-locks
Hi,
Which tree are these patches against?
I can't apply them neither to Linus tree(18cb657c) nor
vfs-2.6.git/for-linus branch(a4cdbd8b).
Lin Ming
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 0/5][v5][cr] Checkpoint/restart file locks
[not found] ` <AANLkTiktKVuAtjPaHT9d+7ATpKn+jMonFYzeEmd85L3o-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2010-10-29 18:35 ` Sukadev Bhattiprolu
0 siblings, 0 replies; 12+ messages in thread
From: Sukadev Bhattiprolu @ 2010-10-29 18:35 UTC (permalink / raw)
To: Lin Ming
Cc: Matthew Wilcox, Containers, Jamie Lokier,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, Dan Smith,
Steven Whitehouse
Lin Ming [lin-7mNXGst3two@public.gmane.org] wrote:
| Hi,
|
| Which tree are these patches against?
|
| I can't apply them neither to Linus tree(18cb657c) nor
| vfs-2.6.git/for-linus branch(a4cdbd8b).
These apply to the checkpoint/restart tree:
git://git.ncl.cs.columbia.edu/pub/git/linux-cr.git
They need the checkpoint/restart infrastructure which is not in the main
line kernel yet. We have been using the [cr] prefix to identify these
patches, but will add a pointer to the above tree in the future.
Thanks,
Sukadev
^ permalink raw reply [flat|nested] 12+ messages in thread
* [Patch 0/2] Checkpoint and restart of file locks
[not found] ` <1288333001-28838-1-git-send-email-sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
` (5 preceding siblings ...)
2010-10-29 14:31 ` [PATCH 0/5][v5][cr] Checkpoint/restart file locks Lin Ming
@ 2011-01-11 1:37 ` Oren Laadan
[not found] ` <1294709825-23357-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2011-01-11 1:37 ` [PATCH 1/2] Refactor fcntl_setlk() and fcntl_setlk64() Oren Laadan
2011-01-11 1:37 ` [PATCH 2/2] c/r: add checkpoint support for file-locks Oren Laadan
8 siblings, 1 reply; 12+ messages in thread
From: Oren Laadan @ 2011-01-11 1:37 UTC (permalink / raw)
To: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
Cc: Sukadev Bhattiprolu
The following two patches add c/r support for POSIX file locks. They
are based on the series posted by Suka some time ago:
https://lists.linux-foundation.org/pipermail/containers/2010-October/025855.html
Suka:
When I read your post I thought that boht posix and flock were
supported, but looking at the code I saw only posix. Am I missing
anything ?
Also, I'm not sure that all the input is well sanitized during
restart - for example, the possible values for the lock flags.
Any thoughts ?
Lastly, do you have test cases to run against this patch and test
it ?
Thanks,
Oren.
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH 1/2] Refactor fcntl_setlk() and fcntl_setlk64()
[not found] ` <1288333001-28838-1-git-send-email-sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
` (6 preceding siblings ...)
2011-01-11 1:37 ` [Patch 0/2] Checkpoint and restart of " Oren Laadan
@ 2011-01-11 1:37 ` Oren Laadan
2011-01-11 1:37 ` [PATCH 2/2] c/r: add checkpoint support for file-locks Oren Laadan
8 siblings, 0 replies; 12+ messages in thread
From: Oren Laadan @ 2011-01-11 1:37 UTC (permalink / raw)
To: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
Cc: Sukadev Bhattiprolu
From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Extract core functionality of fcntl_setlk() and fcntl_setlk64() into
separate functions, flock_set() and flock64_set() respective. These
functions can be also used when restarting a checkpointed application
and restoring its file-locks.
Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Acked-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
fs/locks.c | 76 +++++++++++++++++++++++++++++++--------------------
include/linux/fs.h | 4 +++
2 files changed, 50 insertions(+), 30 deletions(-)
diff --git a/fs/locks.c b/fs/locks.c
index e30ad46..c3fc56d 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1807,14 +1807,13 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
return error;
}
-/* Apply the lock described by l to an open file descriptor.
+/* Apply the lock described by @flock to an open file descriptor.
* This implements both the F_SETLK and F_SETLKW commands of fcntl().
*/
-int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
- struct flock __user *l)
+int do_fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
+ struct flock *flock)
{
struct file_lock *file_lock = locks_alloc_lock();
- struct flock flock;
struct inode *inode;
struct file *f;
int error;
@@ -1822,13 +1821,6 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
if (file_lock == NULL)
return -ENOLCK;
- /*
- * This might block, so we do it before checking the inode.
- */
- error = -EFAULT;
- if (copy_from_user(&flock, l, sizeof(flock)))
- goto out;
-
inode = filp->f_path.dentry->d_inode;
/* Don't allow mandatory locks on files that may be memory mapped
@@ -1840,7 +1832,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
}
again:
- error = flock_to_posix_lock(filp, file_lock, &flock);
+ error = flock_to_posix_lock(filp, file_lock, flock);
if (error)
goto out;
if (cmd == F_SETLKW) {
@@ -1848,7 +1840,7 @@ again:
}
error = -EBADF;
- switch (flock.l_type) {
+ switch (flock->l_type) {
case F_RDLCK:
if (!(filp->f_mode & FMODE_READ))
goto out;
@@ -1878,8 +1870,8 @@ again:
spin_lock(¤t->files->file_lock);
f = fcheck(fd);
spin_unlock(¤t->files->file_lock);
- if (!error && f != filp && flock.l_type != F_UNLCK) {
- flock.l_type = F_UNLCK;
+ if (!error && f != filp && flock->l_type != F_UNLCK) {
+ flock->l_type = F_UNLCK;
goto again;
}
@@ -1888,6 +1880,23 @@ out:
return error;
}
+int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
+ struct flock __user *l)
+{
+ int error;
+ struct flock flock;
+
+ /*
+ * This might block, so we do it before checking the inode
+ * in flock_set().
+ */
+ error = -EFAULT;
+ if (copy_from_user(&flock, l, sizeof(flock)))
+ return error;
+
+ return do_fcntl_setlk(fd, filp, cmd, &flock);
+}
+
#if BITS_PER_LONG == 32
/* Report the first existing lock that would conflict with l.
* This implements the F_GETLK command of fcntl().
@@ -1925,14 +1934,13 @@ out:
return error;
}
-/* Apply the lock described by l to an open file descriptor.
+/* Apply the lock described by @flock to an open file descriptor.
* This implements both the F_SETLK and F_SETLKW commands of fcntl().
*/
-int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
- struct flock64 __user *l)
+int do_fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
+ struct flock64 *flock)
{
struct file_lock *file_lock = locks_alloc_lock();
- struct flock64 flock;
struct inode *inode;
struct file *f;
int error;
@@ -1940,13 +1948,6 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
if (file_lock == NULL)
return -ENOLCK;
- /*
- * This might block, so we do it before checking the inode.
- */
- error = -EFAULT;
- if (copy_from_user(&flock, l, sizeof(flock)))
- goto out;
-
inode = filp->f_path.dentry->d_inode;
/* Don't allow mandatory locks on files that may be memory mapped
@@ -1958,7 +1959,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
}
again:
- error = flock64_to_posix_lock(filp, file_lock, &flock);
+ error = flock64_to_posix_lock(filp, file_lock, flock);
if (error)
goto out;
if (cmd == F_SETLKW64) {
@@ -1966,7 +1967,7 @@ again:
}
error = -EBADF;
- switch (flock.l_type) {
+ switch (flock->l_type) {
case F_RDLCK:
if (!(filp->f_mode & FMODE_READ))
goto out;
@@ -1991,8 +1992,8 @@ again:
spin_lock(¤t->files->file_lock);
f = fcheck(fd);
spin_unlock(¤t->files->file_lock);
- if (!error && f != filp && flock.l_type != F_UNLCK) {
- flock.l_type = F_UNLCK;
+ if (!error && f != filp && flock->l_type != F_UNLCK) {
+ flock->l_type = F_UNLCK;
goto again;
}
@@ -2002,6 +2003,21 @@ out:
}
#endif /* BITS_PER_LONG == 32 */
+int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
+ struct flock64 __user *l)
+{
+ struct flock64 flock;
+
+ /*
+ * This might block, so we do it before checking the inode in
+ * flock64_set().
+ */
+ if (copy_from_user(&flock, l, sizeof(flock)))
+ return -EFAULT;
+
+ return do_fcntl_setlk64(fd, filp, cmd, &flock);
+}
+
/*
* This function is called when the file is being removed
* from the task's fd array. POSIX locks belonging to this task
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3d8bf75..7f38be1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1121,11 +1121,15 @@ extern void send_sigio(struct fown_struct *fown, int fd, int band);
extern int fcntl_getlk(struct file *, struct flock __user *);
extern int fcntl_setlk(unsigned int, struct file *, unsigned int,
struct flock __user *);
+extern int do_fcntl_setlk(unsigned int, struct file *, unsigned int,
+ struct flock *);
#if BITS_PER_LONG == 32
extern int fcntl_getlk64(struct file *, struct flock64 __user *);
extern int fcntl_setlk64(unsigned int, struct file *, unsigned int,
struct flock64 __user *);
+extern int do_fcntl_set(unsigned int, struct file *, unsigned int,
+ struct flock64 *);
#endif
extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg);
--
1.7.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 2/2] c/r: add checkpoint support for file-locks
[not found] ` <1288333001-28838-1-git-send-email-sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
` (7 preceding siblings ...)
2011-01-11 1:37 ` [PATCH 1/2] Refactor fcntl_setlk() and fcntl_setlk64() Oren Laadan
@ 2011-01-11 1:37 ` Oren Laadan
8 siblings, 0 replies; 12+ messages in thread
From: Oren Laadan @ 2011-01-11 1:37 UTC (permalink / raw)
To: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
Cc: Sukadev Bhattiprolu
This patch adds suppprt for POSIX file locks only. Support for flock
file locks, file-owner, and fileleases will be addresses later.
While checkpointing each file-descriptor, find all the locks on the
file and save information about the lock in the checkpoint-image.
During restart of the application, read the saved file-locks from the
checkpoint image and for each POSIX lock, call flock_set() to set the
lock on the file.
As pointed out by Matt Helsley, no special handling is necessary for a
process P2 in the checkpointed container that is blocked on a lock, L1
held by another process P1. Processes in the restarted container begin
execution only after all processes have restored. If the blocked process
P2 is restored first, it will prepare to return an -ERESTARTSYS from the
fcntl() system call, but wait for P1 to be restored. When P1 is restored,
it will re-acquire the lock L1 before P1 and P2 begin actual execution.
This ensures that even if P2 is scheduled to run before P1, P2 will go
back to waiting for the lock L1.
Based on patches originally by Sukadev Bhattiprolu.
Changelog[v23-rc1]:
- Major rework and clean up the code
- Merge 23/64 bit code in a simpler way
- Merge locks c/r documenation into this patch
- Move code to fs/locks.c where it belongs
- Do not hold lock_flocks() while writing data out
Changelog[v5]:
- Combine checkpoint and restart patches for easier review
Changelog[v4]:
- For consistency with other such objects, replace the "marker lock"
an explicit count of file-locks for each file
Changelog[v3]:
- Add a missing (loff_t) type cast and use a macro to set the
marker/dummy file lock
Changelog[v2]:
- [Matt Helsley]: Use fixed sizes (__s64) instead of 'loff_t' in
'struct ckpt_hdr_file_lock'.
- [Matt Helsley, Serge Hallyn]: Highlight new use of BKL (using
lock_flocks() macros as suggested by Serge).
- [Matt Helsley]: Reorg code a bit to simplify error handling.
- [Matt Helsley]: Reorg code to initialize marker-lock (Pass a NULL
lock to checkpoint_one_lock() to indicate marker).
Cc: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
Documentation/checkpoint/file-system.txt | 59 +++++++
fs/checkpoint.c | 22 +--
fs/locks.c | 268 ++++++++++++++++++++++++++++++
include/linux/checkpoint_hdr.h | 14 ++
include/linux/fs.h | 5 +
5 files changed, 355 insertions(+), 13 deletions(-)
create mode 100644 Documentation/checkpoint/file-system.txt
diff --git a/Documentation/checkpoint/file-system.txt b/Documentation/checkpoint/file-system.txt
new file mode 100644
index 0000000..a2d4c18
--- /dev/null
+++ b/Documentation/checkpoint/file-system.txt
@@ -0,0 +1,59 @@
+
+Filesystem consistency across C/R
+=================================
+
+To checkpoint/restart a process that is using any filesystem resource, the
+kernel assumes that the file system state at the time of restart is consistent
+with its state at the time of checkpoint. In general, this consistency can be
+achieved by:
+
+ a. running the application inside a container (to ensure no process
+ outside the container modifies the filesystem/IPC or other states)
+
+ b. freezing the application before checkpoint
+ c. taking a snapshot of the file system while application is frozen
+ d. checkpointing the application while it is frozen
+
+ e. restoring the file system state to its snapshot
+ f. restart the application inside a container
+
+i.e the kernel assumes that file system state is consistent but it does/can
+NOT verify that it is. The administrator must provide this consistency taking
+into account the file system type including whether it is local or remote,
+and the tools available in the file system (snapshot tools in btrfs or rsync
+etc).
+
+For distributed applications operating on distributed filesystems, it is
+expected that an external mechanism will coordinate the freeze/checkpoint/
+snapshot/restart across the nodes. IOW, the current semantics in the kernel
+provide for C/R on a single node.
+
+
+Checkpoint/restart of file-locks
+================================
+
+To checkpoint file-locks in an application, we start with each file-descriptor
+and count the number of file-locks on that file-descriptor. We save this count
+in the checkpoint image, and then information about each file-lock on the
+file-descriptor.
+
+When restarting the application from the checkpoint, we read the file-lock
+count for each file-descriptor and then read the information about each
+file-lock. For each file-lock, we call flock_set() to set a new file-lock.
+
+No special handling is necessary for a process P2 in the checkpointed container
+that is blocked on a file-lock, L1 held by another process P1. Processes in the
+restarted container begin execution only after all processes have restored.
+If the blocked process P2 is restored first, it will prepare to return an
+-ERESTARTSYS from the fcntl() system call, but wait for P1 to be restored.
+When P1 is restored, it will re-acquire the file-lock L1 before P1 and P2 begin
+actual execution.
+
+This ensures that even if P2 is scheduled to run before P1, P2 will go
+back to waiting for the file-lock L1.
+
+STATUS:
+[done] POSIX file locks
+[todo] FLOCK file locks
+[todo] file leases
+[todo] file ownership
diff --git a/fs/checkpoint.c b/fs/checkpoint.c
index fd539c5..02bbd61 100644
--- a/fs/checkpoint.c
+++ b/fs/checkpoint.c
@@ -281,18 +281,6 @@ static int checkpoint_file_desc(struct ckpt_ctx *ctx,
}
rcu_read_unlock();
- ret = find_locks_with_owner(file, files);
- /*
- * find_locks_with_owner() returns an error when there
- * are no locks found, so we *want* it to return an error
- * code. Its success means we have to fail the checkpoint.
- */
- if (!ret) {
- ret = -EBADF;
- ckpt_err(ctx, ret, "%(T)fd %d has file lock or lease\n", fd);
- goto out;
- }
-
/* sanity check (although this shouldn't happen) */
ret = -EBADF;
if (!file) {
@@ -327,6 +315,10 @@ static int checkpoint_file_desc(struct ckpt_ctx *ctx,
h->fd_close_on_exec = coe;
ret = ckpt_write_obj(ctx, &h->h);
+ if (ret < 0)
+ goto out;
+
+ ret = checkpoint_file_locks(ctx, files, file);
out:
ckpt_hdr_put(ctx, h);
if (file)
@@ -838,8 +830,12 @@ static int restore_file_desc(struct ckpt_ctx *ctx)
}
set_close_on_exec(h->fd_descriptor, h->fd_close_on_exec);
- ret = 0;
+
+ ret = restore_file_locks(ctx, file, h->fd_descriptor);
out:
+ if (ret < 0)
+ ckpt_err(ctx, ret, "Failed fd %d\n", h->fd_descriptor);
+
ckpt_hdr_put(ctx, h);
return ret;
}
diff --git a/fs/locks.c b/fs/locks.c
index c3fc56d..f4373f8 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -126,6 +126,7 @@
#include <linux/time.h>
#include <linux/rcupdate.h>
#include <linux/pid_namespace.h>
+#include <linux/checkpoint.h>
#include <asm/uaccess.h>
@@ -2388,6 +2389,273 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
EXPORT_SYMBOL(lock_may_write);
+#ifdef CONFIG_CHECKPOINT
+static int checkpoint_count_locks(struct ckpt_ctx *ctx,
+ struct files_struct *files,
+ struct file *file)
+{
+ struct inode *inode;
+ struct file_lock **lockpp;
+ struct file_lock *lockp;
+ int count = 0;
+
+ inode = file->f_path.dentry->d_inode;
+
+ lock_flocks();
+ for_each_lock(inode, lockpp) {
+ lockp = *lockpp;
+ if (lockp->fl_owner != files)
+ continue;
+ ckpt_debug("Lock [%lld, %lld, %d, 0x%x]\n", lockp->fl_start,
+ lockp->fl_end, lockp->fl_type, lockp->fl_flags);
+ if (lockp->fl_owner != files)
+ continue;
+ if (!IS_POSIX(lockp)) {
+ ckpt_err(ctx, -EBADF, "%(T), checkpoint of lock "
+ "[%lld, %lld, %d, 0x%x] failed\n",
+ lockp->fl_start, lockp->fl_end,
+ lockp->fl_type, lockp->fl_flags);
+ count = -EBADF;
+ break;
+ }
+ count++;
+ }
+ unlock_flocks();
+ return count;
+}
+
+#define CKPT_FLOCK_CHUNK (8096 / (int) sizeof(struct ckpt_file_lock))
+
+static int checkpoint_file_locks_chunk(struct ckpt_ctx *ctx,
+ struct files_struct *files,
+ struct file *file,
+ struct file_lock **last,
+ struct ckpt_file_lock *hh)
+{
+ struct inode *inode;
+ struct file_lock **lockpp;
+ struct file_lock *lockp;
+ int count = 0;
+ int found = 0;
+
+ inode = file->f_path.dentry->d_inode;
+
+ if (*last == NULL) /* start from the beginning */
+ found = 1;
+
+ lock_flocks();
+
+ for_each_lock(inode, lockpp) {
+ lockp = *lockpp;
+ if (!found) { /* start where we previous pass left off */
+ if (lockp == *last)
+ found = 1;
+ continue;
+ }
+ if (lockp->fl_owner != files)
+ continue;
+ if (!IS_POSIX(lockp)) { /* something must have changed */
+ count = -EBUSY;
+ break;
+ }
+
+ hh->fl_start = lockp->fl_start;
+ hh->fl_end = lockp->fl_end;
+ hh->fl_type = lockp->fl_type;
+ hh->fl_flags = lockp->fl_flags;
+ hh++;
+
+ *last = lockp;
+
+ if (++count == CKPT_FLOCK_CHUNK)
+ break;
+ }
+
+ unlock_flocks();
+
+ return count;
+}
+
+int checkpoint_file_locks(struct ckpt_ctx *ctx, struct files_struct *files,
+ struct file *file)
+{
+ struct ckpt_hdr_file_locks *h;
+ struct ckpt_file_lock *hh;
+ struct file_lock *last = NULL;
+ int nr_locks, len;
+ int ret;
+
+ nr_locks = checkpoint_count_locks(ctx, files, file);
+ if (nr_locks < 0)
+ return nr_locks;
+
+ h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE_LOCKS);
+ if (!h)
+ return -ENOMEM;
+ h->nr_locks = nr_locks;
+ ret = ckpt_write_obj(ctx, &h->h);
+ ckpt_hdr_put(ctx, h);
+
+ if (ret || !nr_locks)
+ return ret;
+
+ len = nr_locks * sizeof(*hh) + sizeof(*h);
+ ret = ckpt_write_obj_type(ctx, NULL, len, CKPT_HDR_BUFFER);
+ if (ret < 0)
+ return ret;
+
+ hh = kzalloc(sizeof(*hh) * CKPT_FLOCK_CHUNK, GFP_KERNEL);
+ if (!hh)
+ return -ENOMEM;
+
+ while (nr_locks > 0) {
+ ret = checkpoint_file_locks_chunk(ctx, files, file, &last, hh);
+ if (ret <= 0) /* zero means no more locks */
+ break;
+ nr_locks -= ret;
+ ret = ckpt_kwrite(ctx, hh, ret * sizeof(*hh));
+ if (ret < 0)
+ break;
+ }
+
+ kfree(hh);
+
+ if (nr_locks != 0)
+ ret = -EBUSY;
+ if (ret)
+ ckpt_err(ctx, ret, "Checkpointing file locks\n");
+
+ return ret;
+}
+
+/*
+ * NOTE: For BITS_PER_LONG == 32, always use 'struct flock64' to
+ * restore, even if the original lock was 'struct flock'. Anyway,
+ * we first convert both to a posix_file_lock before processing.
+ *
+ * E.g., NFS for instance uses IS_SETLK() instead of cmd==F_SETLK.
+ *
+ * TODO: do other fs implement F_SETLK but not F_SETLK64 ? if so,
+ * then restore_one_lock() will fail :(
+ */
+static int restore_one_lock(struct ckpt_ctx *ctx, struct file *file,
+ int fd, struct ckpt_file_lock *h)
+{
+#if BITS_PER_LONG == 32
+ struct flock64 fl;
+#else
+ struct flock fl;
+#endif
+ int ret;
+
+ /* TODO: sanitize the remaining possible flags */
+ if ((h->fl_type & (FL_POSIX | FL_FLOCK | FL_LEASE)) != FL_POSIX)
+ return -EINVAL;
+
+ /* TODO: we don't expect leases, nor F_INPROGRESS */
+ switch(h->fl_type) {
+ case F_RDLCK:
+ case F_WRLCK:
+ case F_UNLCK:
+ break;
+ default:
+ ckpt_err(ctx, -EINVAL, "Bad posix lock 0x%x\n", h->fl_type);
+ return -EINVAL;
+ }
+
+ memset(&fl, 0, sizeof(fl));
+ fl.l_type = h->fl_type;
+ fl.l_start = h->fl_start;
+ if (h->fl_end != OFFSET_MAX)
+ fl.l_len = h->fl_end - h->fl_start + 1;
+ fl.l_whence = SEEK_SET;
+
+ /* TODO: init ->l_sysid, l_pid fields */
+
+ ckpt_debug("posix lock [%lld, %lld, %d]\n",
+ fl.l_start, fl.l_len, fl.l_type);
+
+ /*
+ * Use F_SETLK because we should not have to wait for the
+ * lock. If another process holds the lock, it indicates that
+ * filesystem-state is not consistent with what it was at
+ * checkpoint. In which case we better fail.
+ */
+#if BITS_PER_LONG == 32
+ ret = do_fcntl_setlk64(fd, file, F_SETLK64, &fl);
+#else
+ ret = do_fcntl_setlk(fd, file, F_SETLK64, &fl);
+#endif
+ if (ret < 0)
+ ckpt_err(ctx, ret, "Failed posix lock %d\n", (int)h->fl_type);
+
+ return ret;
+}
+
+static int restore_file_locks_chunk(struct ckpt_ctx *ctx, struct file *file,
+ int fd, struct ckpt_file_lock *hh, int nr)
+{
+ int i, ret;
+
+ ret = ckpt_kread(ctx, hh, nr * sizeof(*hh));
+ if (ret < 0)
+ return ret;
+
+ for (i = 0; i < nr; i++) {
+ if (!(hh[i].fl_flags & FL_POSIX))
+ return -EINVAL;
+
+ ret = restore_one_lock(ctx, file, fd, &hh[i]);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+int restore_file_locks(struct ckpt_ctx *ctx, struct file *file, int fd)
+{
+ struct ckpt_hdr_file_locks *h;
+ struct ckpt_file_lock *hh;
+ int nr_locks, ret;
+
+ h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_FILE_LOCKS);
+ if (IS_ERR(h))
+ return PTR_ERR(h);
+
+ nr_locks = h->nr_locks;
+ ckpt_hdr_put(ctx, h);
+
+ if (!nr_locks)
+ return 0;
+
+ ret = _ckpt_read_obj_type(ctx, NULL, 0, CKPT_HDR_BUFFER);
+ if (ret < 0)
+ return ret;
+ /* make sure the size is what we expect */
+ if (nr_locks != (ret / sizeof(*hh)))
+ return -EINVAL;
+
+ hh = kzalloc(sizeof(*hh) * CKPT_FLOCK_CHUNK, GFP_KERNEL);
+ if (!hh)
+ return -ENOMEM;
+
+ while (nr_locks > 0) {
+ int n = min(nr_locks, CKPT_FLOCK_CHUNK);
+
+ ret = restore_file_locks_chunk(ctx, file, fd, hh, n);
+ if (ret < 0)
+ nr_locks = ret;
+
+ nr_locks -= n;
+ }
+
+ kfree(hh);
+
+ return nr_locks;
+}
+#endif /* CONFIG_CHECKPOINT */
+
static int __init filelock_init(void)
{
filelock_cache = kmem_cache_create("file_lock_cache",
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index b12d586..f7e233d 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -160,6 +160,8 @@ enum {
#define CKPT_HDR_TTY_LDISC CKPT_HDR_TTY_LDISC
CKPT_HDR_EPOLL_ITEMS, /* must be after file-table */
#define CKPT_HDR_EPOLL_ITEMS CKPT_HDR_EPOLL_ITEMS
+ CKPT_HDR_FILE_LOCKS,
+#define CKPT_HDR_FILE_LOCKS CKPT_HDR_FILE_LOCKS
CKPT_HDR_MM = 401,
#define CKPT_HDR_MM CKPT_HDR_MM
@@ -620,6 +622,18 @@ struct ckpt_hdr_file_eventfd {
__u32 flags;
} __attribute__((aligned(8)));
+struct ckpt_file_lock {
+ __s64 fl_start;
+ __s64 fl_end;
+ __u8 fl_type;
+ __u8 fl_flags;
+} __attribute__((aligned(8)));
+
+struct ckpt_hdr_file_locks {
+ struct ckpt_hdr h;
+ __u32 nr_locks;
+} __attribute__((aligned(8)));
+
/* socket */
struct ckpt_hdr_socket {
struct ckpt_hdr h;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7f38be1..f368e30 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2394,6 +2394,11 @@ void inode_set_bytes(struct inode *inode, loff_t bytes);
#ifdef CONFIG_CHECKPOINT
extern int generic_file_checkpoint(struct ckpt_ctx *ctx, struct file *file);
+extern int checkpoint_file_locks(struct ckpt_ctx *ctx,
+ struct files_struct *files,
+ struct file *file);
+extern int restore_file_locks(struct ckpt_ctx *ctx, struct file *file, int fd);
+
#endif
extern int vfs_readdir(struct file *, filldir_t, void *);
--
1.7.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [Patch 0/2] Checkpoint and restart of file locks
[not found] ` <1294709825-23357-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2011-01-12 0:17 ` Sukadev Bhattiprolu
[not found] ` <20110112001700.GB7229-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
0 siblings, 1 reply; 12+ messages in thread
From: Sukadev Bhattiprolu @ 2011-01-12 0:17 UTC (permalink / raw)
To: Oren Laadan; +Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
Oren Laadan [orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org] wrote:
|
| The following two patches add c/r support for POSIX file locks. They
| are based on the series posted by Suka some time ago:
| https://lists.linux-foundation.org/pipermail/containers/2010-October/025855.html
|
| Suka:
|
| When I read your post I thought that boht posix and flock were
| supported, but looking at the code I saw only posix. Am I missing
| anything ?
Hmm. I have been working with only the posix locks for starters since
it checkpoints/restarts the more general, 'struct file_lock' object.
But I should make that explicit. If a patch description implies that
it covers flocks can be checkpointed, please let me know and I will
fix it.
C/R of leases/file-owner info is blocked on C/R of 'struct pids'.
|
| Also, I'm not sure that all the input is well sanitized during
| restart - for example, the possible values for the lock flags.
| Any thoughts ?
I have this check in restore_file_locks():
+ ret = -EBADF;
+ if (h->fl_flags & FL_POSIX)
+ ret = restore_one_posix_lock(ctx, file, fd,
The other fields (lock type for instance) will be validated by flock64_set()
as for a new fcntl() call - no ?
|
| Lastly, do you have test cases to run against this patch and test
| it ?
Yes, I added some tests to cr-tests a while ago and then updated them.
Will find the updates and post them.
Sukadev
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch 0/2] Checkpoint and restart of file locks
[not found] ` <20110112001700.GB7229-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2011-01-12 0:30 ` Oren Laadan
0 siblings, 0 replies; 12+ messages in thread
From: Oren Laadan @ 2011-01-12 0:30 UTC (permalink / raw)
To: Sukadev Bhattiprolu
Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
On 01/11/2011 07:17 PM, Sukadev Bhattiprolu wrote:
> Oren Laadan [orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org] wrote:
> |
> | The following two patches add c/r support for POSIX file locks. They
> | are based on the series posted by Suka some time ago:
> | https://lists.linux-foundation.org/pipermail/containers/2010-October/025855.html
> |
> | Suka:
> |
> | When I read your post I thought that boht posix and flock were
> | supported, but looking at the code I saw only posix. Am I missing
> | anything ?
>
> Hmm. I have been working with only the posix locks for starters since
> it checkpoints/restarts the more general, 'struct file_lock' object.
>
> But I should make that explicit. If a patch description implies that
> it covers flocks can be checkpointed, please let me know and I will
> fix it.
>
Ok, just checking - I already modified the patch description
and the inline comments.
> C/R of leases/file-owner info is blocked on C/R of 'struct pids'.
It's in the working ...
>
> |
> | Also, I'm not sure that all the input is well sanitized during
> | restart - for example, the possible values for the lock flags.
> | Any thoughts ?
>
> I have this check in restore_file_locks():
>
> + ret = -EBADF;
> + if (h->fl_flags & FL_POSIX)
> + ret = restore_one_posix_lock(ctx, file, fd,
In this specific example, I fixed it to ensure that only one of
FL_POSIX, FL_FLOCK and FL_LEASE is is present.
(Actually, now I see that I used fl_type, and I should fix it to
be fl_flags, but the intent was good ...)
>
> The other fields (lock type for instance) will be validated by flock64_set()
> as for a new fcntl() call - no ?
That's what I was asking :o
>
> |
> | Lastly, do you have test cases to run against this patch and test
> | it ?
>
> Yes, I added some tests to cr-tests a while ago and then updated them.
> Will find the updates and post them.
Thanks,
Oren.
^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2011-01-12 0:30 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
[not found] <1288333001-28838-1-git-send-email-sukadev@linux.vnet.ibm.com>
[not found] ` <1288333001-28838-1-git-send-email-sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2010-10-29 6:16 ` [PATCH 1/5][v5][cr]: Move file_lock macros into linux/fs.h Sukadev Bhattiprolu
2010-10-29 6:16 ` [PATCH 2/5][v5][cr]: Define flock_set() Sukadev Bhattiprolu
2010-10-29 6:16 ` [PATCH 3/5][v5][cr]: Define flock64_set() Sukadev Bhattiprolu
2010-10-29 6:16 ` [PATCH 4/5][v5][cr]: Checkpoint/restore file-locks Sukadev Bhattiprolu
2010-10-29 6:16 ` [PATCH 5/5][v5][cr]: Document design of C/R of file-locks Sukadev Bhattiprolu
2010-10-29 14:31 ` [PATCH 0/5][v5][cr] Checkpoint/restart file locks Lin Ming
2011-01-11 1:37 ` [Patch 0/2] Checkpoint and restart of " Oren Laadan
[not found] ` <1294709825-23357-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2011-01-12 0:17 ` Sukadev Bhattiprolu
[not found] ` <20110112001700.GB7229-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2011-01-12 0:30 ` Oren Laadan
2011-01-11 1:37 ` [PATCH 1/2] Refactor fcntl_setlk() and fcntl_setlk64() Oren Laadan
2011-01-11 1:37 ` [PATCH 2/2] c/r: add checkpoint support for file-locks Oren Laadan
[not found] ` <AANLkTiktKVuAtjPaHT9d+7ATpKn+jMonFYzeEmd85L3o@mail.gmail.com>
[not found] ` <AANLkTiktKVuAtjPaHT9d+7ATpKn+jMonFYzeEmd85L3o-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2010-10-29 18:35 ` [PATCH 0/5][v5][cr] Checkpoint/restart file locks Sukadev Bhattiprolu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox