* [PATCH 1/5][v5][cr]: Move file_lock macros into linux/fs.h
2010-10-29 6:16 [PATCH 0/5][v5][cr] Checkpoint/restart file locks Sukadev Bhattiprolu
@ 2010-10-29 6:16 ` Sukadev Bhattiprolu
2010-10-29 6:16 ` [PATCH 2/5][v5][cr]: Define flock_set() Sukadev Bhattiprolu
` (4 subsequent siblings)
5 siblings, 0 replies; 8+ messages in thread
From: Sukadev Bhattiprolu @ 2010-10-29 6:16 UTC (permalink / raw)
To: Oren Laadan
Cc: Serge Hallyn, Matt Helsley, Dan Smith, Matthew Wilcox,
Jamie Lokier, Steven Whitehouse, linux-fsdevel, Containers
Move IS_POSIX(), IS_FLOCK(), IS_LEASE() and 'for_each_lock()' into
include/linux/fs.h since these are also needed to checkpoint/restart
file-locks.
Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
---
fs/locks.c | 7 -------
include/linux/fs.h | 7 +++++++
2 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/fs/locks.c b/fs/locks.c
index 9cd859e..da53795 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -130,16 +130,9 @@
#include <asm/uaccess.h>
-#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
-#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
-#define IS_LEASE(fl) (fl->fl_flags & FL_LEASE)
-
int leases_enable = 1;
int lease_break_time = 45;
-#define for_each_lock(inode, lockp) \
- for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next)
-
static LIST_HEAD(file_lock_list);
static LIST_HEAD(blocked_list);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ee725ff..909a535 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1088,6 +1088,13 @@ struct file_lock {
} fl_u;
};
+/*
+ * Lock-class predicates: test the fl_flags of the struct file_lock that
+ * 'fl' points to. The argument is parenthesized so that any pointer
+ * expression (e.g. '*lockpp') can be passed.
+ */
+#define IS_POSIX(fl)	((fl)->fl_flags & FL_POSIX)
+#define IS_FLOCK(fl)	((fl)->fl_flags & FL_FLOCK)
+#define IS_LEASE(fl)	((fl)->fl_flags & FL_LEASE)
+
+/*
+ * Walk the lock list hanging off inode->i_flock. 'lockp' is a
+ * 'struct file_lock **' that points at each link in turn, so '*lockp'
+ * is the current lock; the walk stops at the first NULL link.
+ */
+#define for_each_lock(inode, lockp) \
+	for ((lockp) = &(inode)->i_flock; *(lockp) != NULL; \
+	     (lockp) = &(*(lockp))->fl_next)
+
/* The following constant reflects the upper bound of the file/locking space */
#ifndef OFFSET_MAX
#define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1)))
--
1.6.0.4
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 2/5][v5][cr]: Define flock_set()
2010-10-29 6:16 [PATCH 0/5][v5][cr] Checkpoint/restart file locks Sukadev Bhattiprolu
2010-10-29 6:16 ` [PATCH 1/5][v5][cr]: Move file_lock macros into linux/fs.h Sukadev Bhattiprolu
@ 2010-10-29 6:16 ` Sukadev Bhattiprolu
2010-10-29 6:16 ` [PATCH 3/5][v5][cr]: Define flock64_set() Sukadev Bhattiprolu
` (3 subsequent siblings)
5 siblings, 0 replies; 8+ messages in thread
From: Sukadev Bhattiprolu @ 2010-10-29 6:16 UTC (permalink / raw)
To: Oren Laadan
Cc: Serge Hallyn, Matt Helsley, Dan Smith, Matthew Wilcox,
Jamie Lokier, Steven Whitehouse, linux-fsdevel, Containers
Extract core functionality of fcntl_setlk() into a separate function,
flock_set(). flock_set() can also be used when restarting a checkpointed
application and restoring its file-locks.
Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
---
fs/locks.c | 44 +++++++++++++++++++++++++++-----------------
include/linux/fs.h | 1 +
2 files changed, 28 insertions(+), 17 deletions(-)
diff --git a/fs/locks.c b/fs/locks.c
index da53795..6c6ced4 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1758,14 +1758,10 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
return error;
}
-/* Apply the lock described by l to an open file descriptor.
- * This implements both the F_SETLK and F_SETLKW commands of fcntl().
- */
-int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
- struct flock __user *l)
+int flock_set(unsigned int fd, struct file *filp, unsigned int cmd,
+ struct flock *flock)
{
struct file_lock *file_lock = locks_alloc_lock();
- struct flock flock;
struct inode *inode;
struct file *f;
int error;
@@ -1773,13 +1769,6 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
if (file_lock == NULL)
return -ENOLCK;
- /*
- * This might block, so we do it before checking the inode.
- */
- error = -EFAULT;
- if (copy_from_user(&flock, l, sizeof(flock)))
- goto out;
-
inode = filp->f_path.dentry->d_inode;
/* Don't allow mandatory locks on files that may be memory mapped
@@ -1791,7 +1780,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
}
again:
- error = flock_to_posix_lock(filp, file_lock, &flock);
+ error = flock_to_posix_lock(filp, file_lock, flock);
if (error)
goto out;
if (cmd == F_SETLKW) {
@@ -1799,7 +1788,7 @@ again:
}
error = -EBADF;
- switch (flock.l_type) {
+ switch (flock->l_type) {
case F_RDLCK:
if (!(filp->f_mode & FMODE_READ))
goto out;
@@ -1829,8 +1818,8 @@ again:
spin_lock(&current->files->file_lock);
f = fcheck(fd);
spin_unlock(&current->files->file_lock);
- if (!error && f != filp && flock.l_type != F_UNLCK) {
- flock.l_type = F_UNLCK;
+ if (!error && f != filp && flock->l_type != F_UNLCK) {
+ flock->l_type = F_UNLCK;
goto again;
}
@@ -1839,6 +1828,27 @@ out:
return error;
}
+/*
+ * fcntl() F_SETLK / F_SETLKW entry point: bring the caller's lock
+ * description into kernel memory and let flock_set() apply it to the
+ * open file. Returns -EFAULT if the user copy fails, otherwise whatever
+ * flock_set() returns.
+ */
+int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
+		struct flock __user *l)
+{
+	struct flock kflock;
+
+	/*
+	 * The user copy might block, so it is done here, before
+	 * flock_set() checks the inode.
+	 */
+	if (copy_from_user(&kflock, l, sizeof(kflock)))
+		return -EFAULT;
+
+	return flock_set(fd, filp, cmd, &kflock);
+}
+
+
#if BITS_PER_LONG == 32
/* Report the first existing lock that would conflict with l.
* This implements the F_GETLK command of fcntl().
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 909a535..5e9ea17 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1112,6 +1112,7 @@ extern void send_sigio(struct fown_struct *fown, int fd, int band);
extern int fcntl_getlk(struct file *, struct flock __user *);
extern int fcntl_setlk(unsigned int, struct file *, unsigned int,
struct flock __user *);
+extern int flock_set(unsigned int, struct file *, unsigned int, struct flock *);
#if BITS_PER_LONG == 32
extern int fcntl_getlk64(struct file *, struct flock64 __user *);
--
1.6.0.4
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 3/5][v5][cr]: Define flock64_set()
2010-10-29 6:16 [PATCH 0/5][v5][cr] Checkpoint/restart file locks Sukadev Bhattiprolu
2010-10-29 6:16 ` [PATCH 1/5][v5][cr]: Move file_lock macros into linux/fs.h Sukadev Bhattiprolu
2010-10-29 6:16 ` [PATCH 2/5][v5][cr]: Define flock_set() Sukadev Bhattiprolu
@ 2010-10-29 6:16 ` Sukadev Bhattiprolu
2010-10-29 6:16 ` [PATCH 4/5][v5][cr]: Checkpoint/restore file-locks Sukadev Bhattiprolu
` (2 subsequent siblings)
5 siblings, 0 replies; 8+ messages in thread
From: Sukadev Bhattiprolu @ 2010-10-29 6:16 UTC (permalink / raw)
To: Oren Laadan
Cc: Serge Hallyn, Matt Helsley, Dan Smith, Matthew Wilcox,
Jamie Lokier, Steven Whitehouse, linux-fsdevel, Containers
Extract core functionality of fcntl_setlk64() into a separate function,
flock64_set(). flock64_set() can also be used when restarting a checkpointed
application and restoring its file-locks.
Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
---
fs/locks.c | 38 ++++++++++++++++++++++++--------------
include/linux/fs.h | 2 ++
2 files changed, 26 insertions(+), 14 deletions(-)
diff --git a/fs/locks.c b/fs/locks.c
index 6c6ced4..34b0e14 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1889,11 +1889,10 @@ out:
/* Apply the lock described by l to an open file descriptor.
* This implements both the F_SETLK and F_SETLKW commands of fcntl().
*/
-int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
- struct flock64 __user *l)
+int flock64_set(unsigned int fd, struct file *filp, unsigned int cmd,
+ struct flock64 *flock)
{
struct file_lock *file_lock = locks_alloc_lock();
- struct flock64 flock;
struct inode *inode;
struct file *f;
int error;
@@ -1901,13 +1900,6 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
if (file_lock == NULL)
return -ENOLCK;
- /*
- * This might block, so we do it before checking the inode.
- */
- error = -EFAULT;
- if (copy_from_user(&flock, l, sizeof(flock)))
- goto out;
-
inode = filp->f_path.dentry->d_inode;
/* Don't allow mandatory locks on files that may be memory mapped
@@ -1919,7 +1911,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
}
again:
- error = flock64_to_posix_lock(filp, file_lock, &flock);
+ error = flock64_to_posix_lock(filp, file_lock, flock);
if (error)
goto out;
if (cmd == F_SETLKW64) {
@@ -1927,7 +1919,7 @@ again:
}
error = -EBADF;
- switch (flock.l_type) {
+ switch (flock->l_type) {
case F_RDLCK:
if (!(filp->f_mode & FMODE_READ))
goto out;
@@ -1952,8 +1944,8 @@ again:
spin_lock(&current->files->file_lock);
f = fcheck(fd);
spin_unlock(&current->files->file_lock);
- if (!error && f != filp && flock.l_type != F_UNLCK) {
- flock.l_type = F_UNLCK;
+ if (!error && f != filp && flock->l_type != F_UNLCK) {
+ flock->l_type = F_UNLCK;
goto again;
}
@@ -1961,6 +1953,24 @@ out:
locks_free_lock(file_lock);
return error;
}
+
+/* Apply the lock described by l to an open file descriptor.
+ * This is the 'struct flock64' counterpart of fcntl_setlk(), built only
+ * when BITS_PER_LONG == 32; the command is handed unchanged to
+ * flock64_set(), which tests it against F_SETLKW64.
+ *
+ * Returns -EFAULT if the lock description cannot be copied from user
+ * space, otherwise the result of flock64_set().
+ */
+int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
+ struct flock64 __user *l)
+{
+ struct flock64 flock;
+
+ /*
+ * This might block, so we do it before checking the inode in
+ * flock64_set().
+ */
+ if (copy_from_user(&flock, l, sizeof(flock)))
+ return -EFAULT;
+
+ return flock64_set(fd, filp, cmd, &flock);
+}
#endif /* BITS_PER_LONG == 32 */
/*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5e9ea17..3f72462 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1118,6 +1118,8 @@ extern int flock_set(unsigned int, struct file *, unsigned int, struct flock *);
extern int fcntl_getlk64(struct file *, struct flock64 __user *);
extern int fcntl_setlk64(unsigned int, struct file *, unsigned int,
struct flock64 __user *);
+extern int flock64_set(unsigned int, struct file *, unsigned int,
+ struct flock64 *);
#endif
extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg);
--
1.6.0.4
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 4/5][v5][cr]: Checkpoint/restore file-locks
2010-10-29 6:16 [PATCH 0/5][v5][cr] Checkpoint/restart file locks Sukadev Bhattiprolu
` (2 preceding siblings ...)
2010-10-29 6:16 ` [PATCH 3/5][v5][cr]: Define flock64_set() Sukadev Bhattiprolu
@ 2010-10-29 6:16 ` Sukadev Bhattiprolu
2010-10-29 6:16 ` [PATCH 5/5][v5][cr]: Document design of C/R of file-locks Sukadev Bhattiprolu
2010-10-29 14:31 ` [PATCH 0/5][v5][cr] Checkpoint/restart file locks Lin Ming
5 siblings, 0 replies; 8+ messages in thread
From: Sukadev Bhattiprolu @ 2010-10-29 6:16 UTC (permalink / raw)
To: Oren Laadan
Cc: Serge Hallyn, Matt Helsley, Dan Smith, Matthew Wilcox,
Jamie Lokier, Steven Whitehouse, linux-fsdevel, Containers
While checkpointing each file-descriptor, find all the locks on the
file and save information about the lock in the checkpoint-image.
During restart of the application, read the saved file-locks from the
checkpoint image and for each POSIX lock, call flock_set() to set the
lock on the file.
As pointed out by Matt Helsley, no special handling is necessary for a
process P2 in the checkpointed container that is blocked on a lock, L1
held by another process P1. Processes in the restarted container begin
execution only after all processes have been restored. If the blocked process
P2 is restored first, it will prepare to return an -ERESTARTSYS from the
fcntl() system call, but wait for P1 to be restored. When P1 is restored,
it will re-acquire the lock L1 before P1 and P2 begin actual execution.
This ensures that even if P2 is scheduled to run before P1, P2 will go
back to waiting for the lock L1.
Changelog[v5]:
[Oren Laadan]: Combine checkpoint and restart patches into one
for easier review
Changelog[v4]:
[Oren Laadan]: For consistency with other such objects, replace
the "marker lock" checkpoint with a checkpoint of a count of the
file-locks before the first file-lock of each file.
Changelog[v3]:
[Oren Laadan] Add a missing (loff_t) type cast and use a macro
to set the marker/dummy file lock
Changelog[v2]:
[Matt Helsley]: Use fixed sizes (__s64) instead of 'loff_t' in
'struct ckpt_hdr_file_lock'.
[Matt Helsley, Serge Hallyn]: Highlight new use of BKL (using
lock_flocks() macros as suggested by Serge).
[Matt Helsley]: Reorg code a bit to simplify error handling.
[Matt Helsley]: Reorg code to initialize marker-lock (Pass a
NULL lock to checkpoint_one_lock() to indicate marker).
Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
---
fs/checkpoint.c | 318 ++++++++++++++++++++++++++++++++++++++--
include/linux/checkpoint_hdr.h | 17 ++
2 files changed, 320 insertions(+), 15 deletions(-)
diff --git a/fs/checkpoint.c b/fs/checkpoint.c
index 87d7c6e..898a016 100644
--- a/fs/checkpoint.c
+++ b/fs/checkpoint.c
@@ -26,8 +26,19 @@
#include <linux/checkpoint.h>
#include <linux/eventpoll.h>
#include <linux/eventfd.h>
+#include <linux/smp_lock.h>
#include <net/sock.h>
+/*
+ * TODO: This code uses the BKL for consistency with other uses of
+ * 'for_each_lock()'. But since the BKL may be replaced with another
+ * lock in the future, use lock_flocks() macros instead. lock_flocks()
+ * are currently used in BKL-fix sand boxes and when those changes
+ * are merged, the following macros can be removed
+ */
+#define lock_flocks() lock_kernel()
+#define unlock_flocks() unlock_kernel()
+
/**************************************************************************
* Checkpoint
*/
@@ -249,8 +260,120 @@ static int checkpoint_file(struct ckpt_ctx *ctx, void *ptr)
return ret;
}
+/*
+ * Write one file-lock to the checkpoint image as a CKPT_HDR_FILE_LOCK
+ * record carrying the lock's start, end, type and flags.
+ *
+ * Returns -EBADF if @lock is not a POSIX lock, -ENOMEM if the record
+ * header cannot be obtained, otherwise the result of ckpt_write_obj().
+ */
+static int checkpoint_one_file_lock(struct ckpt_ctx *ctx,
+ struct file_lock *lock)
+{
+ int rc;
+ struct ckpt_hdr_file_lock *h;
+
+ if (!IS_POSIX(lock)) {
+ /* Hmm, we should have caught this while counting locks */
+ return -EBADF;
+ }
+
+ h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE_LOCK);
+ if (!h)
+ return -ENOMEM;
+
+ h->fl_start = lock->fl_start;
+ h->fl_end = lock->fl_end;
+ h->fl_type = lock->fl_type;
+ h->fl_flags = lock->fl_flags;
+
+ rc = ckpt_write_obj(ctx, &h->h);
+
+ /* The header is released whether or not the write succeeded. */
+ ckpt_hdr_put(ctx, h);
+
+ return rc;
+}
+
+/*
+ * Write a CKPT_HDR_FILE_LOCK_COUNT record whose nr_locks field is
+ * @num_locks.
+ *
+ * Returns -ENOMEM if the record header cannot be obtained, otherwise the
+ * result of ckpt_write_obj().
+ */
+static int checkpoint_file_lock_count(struct ckpt_ctx *ctx, int num_locks)
+{
+ int rc;
+ struct ckpt_hdr_file_lock_count *h;
+
+ h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE_LOCK_COUNT);
+ if (!h)
+ return -ENOMEM;
+
+ h->nr_locks = num_locks;
+
+ rc = ckpt_write_obj(ctx, &h->h);
+
+ /* The header is released whether or not the write succeeded. */
+ ckpt_hdr_put(ctx, h);
+
+ return rc;
+}
+
+/*
+ * Checkpoint the file-locks that @files owns on @file.
+ *
+ * The inode's lock list is walked twice under lock_flocks(): the first
+ * pass counts the locks whose fl_owner is @files, the count is written to
+ * the image, and the second pass writes one record per owned lock.
+ *
+ * Only POSIX locks can be checkpointed; if @files owns any other kind of
+ * lock on the file the checkpoint fails with -EBADF. Otherwise the return
+ * value is 0 or the error from writing the count or one of the locks.
+ */
+int
+checkpoint_file_locks(struct ckpt_ctx *ctx, struct files_struct *files,
+		struct file *file)
+{
+	int n;
+	int rc;
+	struct inode *inode;
+	struct file_lock **lockpp;
+	struct file_lock *lockp;
+
+	lock_flocks();
+
+	/*
+	 * First count the number of file-locks on this file
+	 */
+	n = 0;
+	rc = -EBADF;
+	inode = file->f_path.dentry->d_inode;
+	for_each_lock(inode, lockpp) {
+		lockp = *lockpp;
+
+		/* Locks held by other owners are not ours to checkpoint. */
+		if (lockp->fl_owner != files)
+			continue;
+
+		ckpt_debug("Lock [%lld, %lld, %d, 0x%x]\n", lockp->fl_start,
+				lockp->fl_end, lockp->fl_type, lockp->fl_flags);
+
+		if (!IS_POSIX(lockp)) {
+			/* rc is still -EBADF here */
+			ckpt_err(ctx, rc, "%(T), checkpoint of lock "
+					"[%lld, %lld, %d, 0x%x] failed\n",
+					lockp->fl_start, lockp->fl_end,
+					lockp->fl_type, lockp->fl_flags);
+			goto out;
+		}
+
+		n++;
+	}
+
+	/*
+	 * Checkpoint the count of file-locks
+	 */
+	rc = checkpoint_file_lock_count(ctx, n);
+	if (rc < 0) {
+		ckpt_err(ctx, rc, "%(T), checkpoint file-lock count failed\n");
+		goto out;
+	}
+
+	/*
+	 * Make a second pass and checkpoint file-locks themselves.
+	 */
+	for_each_lock(inode, lockpp) {
+		lockp = *lockpp;
+		if (lockp->fl_owner != files)
+			continue;
+
+		rc = checkpoint_one_file_lock(ctx, lockp);
+		if (rc < 0)
+			goto out;
+	}
+
+out:
+	unlock_flocks();
+	return rc;
+}
+
/**
- * ckpt_write_file_desc - dump the state of a given file descriptor
+ * checkpoint_file_desc - dump the state of a given file descriptor
* @ctx: checkpoint context
* @files: files_struct pointer
* @fd: file descriptor
@@ -282,18 +405,6 @@ static int checkpoint_file_desc(struct ckpt_ctx *ctx,
}
rcu_read_unlock();
- ret = find_locks_with_owner(file, files);
- /*
- * find_locks_with_owner() returns an error when there
- * are no locks found, so we *want* it to return an error
- * code. Its success means we have to fail the checkpoint.
- */
- if (!ret) {
- ret = -EBADF;
- ckpt_err(ctx, ret, "%(T)fd %d has file lock or lease\n", fd);
- goto out;
- }
-
/* sanity check (although this shouldn't happen) */
ret = -EBADF;
if (!file) {
@@ -328,6 +439,11 @@ static int checkpoint_file_desc(struct ckpt_ctx *ctx,
h->fd_close_on_exec = coe;
ret = ckpt_write_obj(ctx, &h->h);
+ if (ret < 0)
+ goto out;
+
+ ret = checkpoint_file_locks(ctx, files, file);
+
out:
ckpt_hdr_put(ctx, h);
if (file)
@@ -792,8 +908,176 @@ static void *restore_file(struct ckpt_ctx *ctx)
return (void *)file;
}
+#if BITS_PER_LONG == 32
+
+/*
+ * NOTE: Even if we checkpointed a lock that was set with 'struct flock'
+ * restore the lock using 'struct flock64'. Note that both these lock
+ * types are first converted to a posix_file_lock before processing so
+ * converting to 'struct flock64' is (hopefully) not a problem.
+ * NFS for instance uses IS_SETLK() instead of cmd == F_SETLK.
+ *
+ * TODO: Are there filesystems that implement F_SETLK but not F_SETLK64 ?
+ * If there are, restore_one_posix_lock() will fail.
+ */
+/*
+ * Fill @fl from the checkpointed lock record @h.
+ *
+ * @fl is zeroed first; l_whence is always SEEK_SET. Returns 0 on success
+ * or -EINVAL if the record's type is not F_RDLCK, F_WRLCK or F_UNLCK.
+ */
+static int
+ckpt_hdr_file_lock_to_flock64(struct ckpt_hdr_file_lock *h, struct flock64 *fl)
+{
+ /*
+ * We checkpoint the 'raw' fl_type which in case of leases includes
+ * the F_INPROGRESS flag. But for posix-locks, the fl_type should
+ * be simple.
+ */
+ switch(h->fl_type) {
+ case F_RDLCK:
+ case F_WRLCK:
+ case F_UNLCK:
+ break;
+ default:
+ ckpt_debug("Bad posix lock type 0x%x ?\n", h->fl_type);
+ return -EINVAL;
+ }
+
+ memset(fl, 0, sizeof(*fl));
+ fl->l_type = h->fl_type;
+ fl->l_start = h->fl_start;
+ /*
+ * An end of OFFSET_MAX is encoded as l_len == 0; any other end is
+ * converted from an inclusive end offset to a byte count.
+ */
+ fl->l_len = h->fl_end == OFFSET_MAX ? 0 : h->fl_end - h->fl_start + 1;
+ fl->l_whence = SEEK_SET;
+
+ /* TODO: Init ->l_sysid, l_pid fields */
+ ckpt_debug("Restoring filelock [%lld, %lld, %d]\n", fl->l_start,
+ fl->l_len, fl->l_type);
+
+ return 0;
+}
+
+/*
+ * Re-acquire one checkpointed POSIX lock on @file (BITS_PER_LONG == 32
+ * variant, which goes through 'struct flock64').
+ *
+ * Returns the error from converting @h, otherwise the result of
+ * flock64_set(); failures are reported through ckpt_err().
+ */
+static int restore_one_posix_lock(struct ckpt_ctx *ctx, struct file *file,
+ int fd, struct ckpt_hdr_file_lock *h)
+{
+ struct flock64 fl;
+ int ret;
+
+ ret = ckpt_hdr_file_lock_to_flock64(h, &fl);
+ if (ret < 0) {
+ ckpt_err(ctx, ret, "%(T) Unexpected flock\n");
+ return ret;
+ }
+
+ /*
+ * Use F_SETLK because we should not have to wait for the lock. If
+ * another process holds the lock, it indicates that filesystem-state
+ * is not consistent with what it was at checkpoint. In which case we
+ * better fail.
+ */
+ ret = flock64_set(fd, file, F_SETLK64, &fl);
+ if (ret)
+ ckpt_err(ctx, ret, "flock64_set(): %d\n", (int)h->fl_type);
+
+ return ret;
+}
+
+#else
+
+/*
+ * Fill @fl from the checkpointed lock record @h so that it can be handed
+ * to flock_set().
+ *
+ * @fl is zeroed first; l_whence is always SEEK_SET. Returns 0 on success
+ * or -EINVAL if the record's type is not F_RDLCK, F_WRLCK or F_UNLCK.
+ */
+static int
+ckpt_hdr_file_lock_to_flock(struct ckpt_hdr_file_lock *h, struct flock *fl)
+{
+	/*
+	 * We checkpoint the 'raw' fl_type which in case of leases includes
+	 * the F_INPROGRESS flag. But for posix-locks, the fl_type should
+	 * be simple.
+	 */
+	switch (h->fl_type) {
+	case F_RDLCK:
+	case F_WRLCK:
+	case F_UNLCK:
+		break;
+	default:
+		ckpt_debug("Bad posix lock type 0x%x ?\n", h->fl_type);
+		return -EINVAL;
+	}
+
+	memset(fl, 0, sizeof(*fl));
+
+	fl->l_type = h->fl_type;
+	fl->l_start = h->fl_start;
+	/*
+	 * The end offset lives in the checkpoint record, not in 'struct
+	 * flock'. An end of OFFSET_MAX is encoded as l_len == 0; any other
+	 * end is converted from an inclusive end offset to a byte count.
+	 */
+	fl->l_len = h->fl_end == OFFSET_MAX ? 0 : h->fl_end - h->fl_start + 1;
+	fl->l_whence = SEEK_SET;
+
+	/* Cast explicitly: 'struct flock' offsets need not be long long. */
+	ckpt_debug("Restoring filelock [%lld, %lld, %d]\n",
+			(long long)fl->l_start, (long long)fl->l_len,
+			fl->l_type);
+
+	/* TODO: Init ->l_sysid, l_pid fields */
+
+	return 0;
+}
+
+/*
+ * Re-acquire one checkpointed POSIX lock on @file (variant used when
+ * BITS_PER_LONG != 32, which goes through 'struct flock').
+ *
+ * Returns the error from converting @h, otherwise the result of
+ * flock_set(); failures are reported through ckpt_err().
+ */
+static int restore_one_posix_lock(struct ckpt_ctx *ctx, struct file *file,
+		int fd, struct ckpt_hdr_file_lock *h)
+{
+	struct flock fl;
+	int ret;
+
+	ret = ckpt_hdr_file_lock_to_flock(h, &fl);
+	if (ret < 0) {
+		ckpt_err(ctx, ret, "%(T) Unexpected flock\n");
+		/* No enclosing loop or switch here, so return directly. */
+		return ret;
+	}
+
+	/*
+	 * Use F_SETLK because we should not have to wait for the lock. If
+	 * another process holds the lock, it indicates that filesystem-state
+	 * is not consistent with what it was at checkpoint. In which case we
+	 * better fail.
+	 */
+	ret = flock_set(fd, file, F_SETLK, &fl);
+	if (ret)
+		ckpt_err(ctx, ret, "flock_set(): %d\n", (int)h->fl_type);
+
+	return ret;
+}
+#endif
+
+/*
+ * Restore the file-locks recorded for descriptor @fd / @file.
+ *
+ * Reads the CKPT_HDR_FILE_LOCK_COUNT record, then that many
+ * CKPT_HDR_FILE_LOCK records, re-acquiring each POSIX lock as it is read.
+ *
+ * Returns 0 if every lock was restored (or there were none), the read
+ * error if a record cannot be read, -EBADF for a record that is not a
+ * POSIX lock, or the error from restore_one_posix_lock().
+ */
+static int restore_file_locks(struct ckpt_ctx *ctx, struct file *file, int fd)
+{
+	unsigned int i;		/* unsigned to match the __u32 nr_locks */
+	int ret;
+	struct ckpt_hdr_file_lock *h;
+	struct ckpt_hdr_file_lock_count *hfc;
+
+	hfc = ckpt_read_obj_type(ctx, sizeof(*hfc), CKPT_HDR_FILE_LOCK_COUNT);
+	if (IS_ERR(hfc))
+		return PTR_ERR(hfc);
+
+	ret = 0;
+	for (i = 0; i < hfc->nr_locks; i++) {
+
+		h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_FILE_LOCK);
+		if (IS_ERR(h)) {
+			ret = PTR_ERR(h);
+			goto out;
+		}
+
+		ckpt_debug("Lock [%lld, %lld, %d, 0x%x]\n", h->fl_start,
+				h->fl_end, (int)h->fl_type, h->fl_flags);
+
+		/* Only POSIX locks are supported; anything else is an error. */
+		ret = -EBADF;
+		if (h->fl_flags & FL_POSIX)
+			ret = restore_one_posix_lock(ctx, file, fd, h);
+
+		/* Release the record before acting on the result. */
+		ckpt_hdr_put(ctx, h);
+
+		if (ret < 0) {
+			ckpt_err(ctx, ret, "%(T)\n");
+			goto out;
+		}
+
+	}
+out:
+	ckpt_hdr_put(ctx, hfc);
+	return ret;
+}
+
/**
- * ckpt_read_file_desc - restore the state of a given file descriptor
+ * restore_file_desc - restore the state of a given file descriptor
* @ctx: checkpoint context
*
* Restores the state of a file descriptor; looks up the objref (in the
@@ -839,7 +1123,11 @@ static int restore_file_desc(struct ckpt_ctx *ctx)
}
set_close_on_exec(h->fd_descriptor, h->fd_close_on_exec);
- ret = 0;
+
+ ret = restore_file_locks(ctx, file, h->fd_descriptor);
+ if (ret < 0)
+ ckpt_err(ctx, ret, "Error on fd %d\n", h->fd_descriptor);
+
out:
ckpt_hdr_put(ctx, h);
return ret;
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 049bb82..7b3267c 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -159,6 +159,10 @@ enum {
#define CKPT_HDR_TTY_LDISC CKPT_HDR_TTY_LDISC
CKPT_HDR_EPOLL_ITEMS, /* must be after file-table */
#define CKPT_HDR_EPOLL_ITEMS CKPT_HDR_EPOLL_ITEMS
+ CKPT_HDR_FILE_LOCK_COUNT,
+#define CKPT_HDR_FILE_LOCK_COUNT CKPT_HDR_FILE_LOCK_COUNT
+ CKPT_HDR_FILE_LOCK,
+#define CKPT_HDR_FILE_LOCK CKPT_HDR_FILE_LOCK
CKPT_HDR_MM = 401,
#define CKPT_HDR_MM CKPT_HDR_MM
@@ -614,6 +618,19 @@ struct ckpt_hdr_file_generic {
struct ckpt_hdr_file common;
} __attribute__((aligned(8)));
+/* Number of CKPT_HDR_FILE_LOCK records that follow for one file. */
+struct ckpt_hdr_file_lock_count {
+	struct ckpt_hdr h;
+	__u32 nr_locks;
+} __attribute__((aligned(8)));	/* same alignment as the other headers here */
+
+/*
+ * One checkpointed file-lock: the fl_start, fl_end, fl_type and fl_flags
+ * of the kernel's struct file_lock, stored in fixed-size fields.
+ */
+struct ckpt_hdr_file_lock {
+	struct ckpt_hdr h;
+	__s64 fl_start;
+	__s64 fl_end;
+	__u8 fl_type;
+	__u8 fl_flags;
+} __attribute__((aligned(8)));	/* keep the record size ABI-independent */
+
struct ckpt_hdr_file_pipe {
struct ckpt_hdr_file common;
__s32 pipe_objref;
--
1.6.0.4
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 5/5][v5][cr]: Document design of C/R of file-locks
2010-10-29 6:16 [PATCH 0/5][v5][cr] Checkpoint/restart file locks Sukadev Bhattiprolu
` (3 preceding siblings ...)
2010-10-29 6:16 ` [PATCH 4/5][v5][cr]: Checkpoint/restore file-locks Sukadev Bhattiprolu
@ 2010-10-29 6:16 ` Sukadev Bhattiprolu
2010-10-29 14:31 ` [PATCH 0/5][v5][cr] Checkpoint/restart file locks Lin Ming
5 siblings, 0 replies; 8+ messages in thread
From: Sukadev Bhattiprolu @ 2010-10-29 6:16 UTC (permalink / raw)
To: Oren Laadan
Cc: Serge Hallyn, Matt Helsley, Dan Smith, Matthew Wilcox,
Jamie Lokier, Steven Whitehouse, linux-fsdevel, Containers
Summarize the file-system consistency requirements and the design of
the C/R of file-locks and leases.
Changelog[v5]:
- This version of the patchset only checkpoints/restores file-locks.
C/R of file-owner information requires additional work with struct
pids and will be addressed in a follow-on patch. C/R of file-leases
depends on C/R of file-owner info, so the design information on
C/R of file-leases has been removed from the Documentation for now.
Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
---
Documentation/checkpoint/file-locks | 52 +++++++++++++++++++++++++++++++++++
1 files changed, 52 insertions(+), 0 deletions(-)
create mode 100644 Documentation/checkpoint/file-locks
diff --git a/Documentation/checkpoint/file-locks b/Documentation/checkpoint/file-locks
new file mode 100644
index 0000000..ccffdef
--- /dev/null
+++ b/Documentation/checkpoint/file-locks
@@ -0,0 +1,52 @@
+
+Filesystem consistency across C/R.
+==================================
+
+To checkpoint/restart a process that is using any filesystem resource, the
+kernel assumes that the file system state at the time of restart is consistent
+with its state at the time of checkpoint. In general, this consistency can be
+achieved by:
+
+ a. running the application inside a container (to ensure no process
+ outside the container modifies the filesystem/IPC or other states)
+
+ b. freezing the application before checkpoint
+ c. taking a snapshot of the file system while application is frozen
+ d. checkpointing the application while it is frozen
+
+ e. restoring the file system state to its snapshot
+ f. restart the application inside a container
+
+i.e., the kernel assumes that file system state is consistent but it does/can
+NOT verify that it is. The administrator must provide this consistency taking
+into account the file system type including whether it is local or remote,
+and the tools available in the file system (snapshot tools in btrfs or rsync
+etc).
+
+For distributed applications operating on distributed filesystems, it is
+expected that an external mechanism will coordinate the freeze/checkpoint/
+snapshot/restart across the nodes. IOW, the current semantics in the kernel
+provide for C/R on a single node.
+
+Checkpoint/restart of file-locks.
+=================================
+
+To checkpoint file-locks in an application, we start with each file-descriptor
+and count the number of file-locks on that file-descriptor. We save this count
+in the checkpoint image, and then information about each file-lock on the
+file-descriptor.
+
+When restarting the application from the checkpoint, we read the file-lock
+count for each file-descriptor and then read the information about each
+file-lock. For each file-lock, we call flock_set() to set a new file-lock.
+
+No special handling is necessary for a process P2 in the checkpointed container
+that is blocked on a file-lock, L1 held by another process P1. Processes in the
+restarted container begin execution only after all processes have been restored.
+If the blocked process P2 is restored first, it will prepare to return an
+-ERESTARTSYS from the fcntl() system call, but wait for P1 to be restored.
+When P1 is restored, it will re-acquire the file-lock L1 before P1 and P2 begin
+actual execution.
+
+This ensures that even if P2 is scheduled to run before P1, P2 will go
+back to waiting for the file-lock L1.
--
1.6.0.4
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH 0/5][v5][cr] Checkpoint/restart file locks
2010-10-29 6:16 [PATCH 0/5][v5][cr] Checkpoint/restart file locks Sukadev Bhattiprolu
` (4 preceding siblings ...)
2010-10-29 6:16 ` [PATCH 5/5][v5][cr]: Document design of C/R of file-locks Sukadev Bhattiprolu
@ 2010-10-29 14:31 ` Lin Ming
2010-10-29 18:35 ` Sukadev Bhattiprolu
5 siblings, 1 reply; 8+ messages in thread
From: Lin Ming @ 2010-10-29 14:31 UTC (permalink / raw)
To: Sukadev Bhattiprolu
Cc: Oren Laadan, Serge Hallyn, Matt Helsley, Dan Smith,
Matthew Wilcox, Jamie Lokier, Steven Whitehouse, linux-fsdevel,
Containers
On Fri, Oct 29, 2010 at 2:16 PM, Sukadev Bhattiprolu
<sukadev@linux.vnet.ibm.com> wrote:
> Checkpoint/restart file locks.
>
> Changelog[v5]:
> - This patchset only checkpoints/restores file locks. C/R of
> file-owner and file-leases will be addressed in follow-on patches.
> C/R of file-owner information must deal with nested-containers
> and will need a way to C/R struct pids. C/R of file-leases depends
> on C/R of file-owner information.
>
>
> Sukadev Bhattiprolu (5):
> Move file_lock macros into linux/fs.h
> Define flock_set()
> Define flock64_set()
> Checkpoint/restore file-locks
> Document design of C/R of file-locks and leases
>
> Documentation/checkpoint/file-locks | 52 ++++++
> fs/checkpoint.c | 318 +++++++++++++++++++++++++++++++++--
> fs/locks.c | 89 ++++++----
> include/linux/checkpoint_hdr.h | 17 ++
> include/linux/fs.h | 10 +
> 5 files changed, 433 insertions(+), 53 deletions(-)
> create mode 100644 Documentation/checkpoint/file-locks
Hi,
Which tree are these patches against?
I can apply them neither to Linus' tree (18cb657c) nor to the
vfs-2.6.git/for-linus branch (a4cdbd8b).
Lin Ming
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 0/5][v5][cr] Checkpoint/restart file locks
2010-10-29 14:31 ` [PATCH 0/5][v5][cr] Checkpoint/restart file locks Lin Ming
@ 2010-10-29 18:35 ` Sukadev Bhattiprolu
0 siblings, 0 replies; 8+ messages in thread
From: Sukadev Bhattiprolu @ 2010-10-29 18:35 UTC (permalink / raw)
To: Lin Ming
Cc: Matthew Wilcox, Containers, Jamie Lokier, linux-fsdevel,
Dan Smith, Steven Whitehouse
Lin Ming [lin@ming.vg] wrote:
| Hi,
|
| Which tree are these patches against?
|
| I can apply them neither to Linus' tree (18cb657c) nor to the
| vfs-2.6.git/for-linus branch (a4cdbd8b).
These apply to the checkpoint/restart tree:
git://git.ncl.cs.columbia.edu/pub/git/linux-cr.git
They need the checkpoint/restart infrastructure, which is not in the
mainline kernel yet. We have been using the [cr] prefix to identify these
patches, but will add a pointer to the above tree in the future.
Thanks,
Sukadev
^ permalink raw reply [flat|nested] 8+ messages in thread