All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/4] Revert "sysvipc-shm: correctly handle deleted (active) ipc shared memory"
@ 2009-04-15 16:02 Serge E. Hallyn
       [not found] ` <20090415160221.GA31929-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 5+ messages in thread
From: Serge E. Hallyn @ 2009-04-15 16:02 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Linux Containers

( this patchset applies on top of v14-3-dev to turn the deferqueue into
a generic mechanism )

From 274f58b8f265dc54e13728aa742c4852bde44ec1 Mon Sep 17 00:00:00 2001
From: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Date: Tue, 14 Apr 2009 16:06:41 -0700
Subject: [PATCH 1/4] Revert "sysvipc-shm: correctly handle deleted (active) ipc shared memory"

This reverts commit 373eb88b3d338edc5039d34a30a4574f9a9fdfba.
---
 ipc/ckpt_shm.c |   44 ++++----------------------------------------
 1 files changed, 4 insertions(+), 40 deletions(-)

diff --git a/ipc/ckpt_shm.c b/ipc/ckpt_shm.c
index c5b7f60..ee9b77a 100644
--- a/ipc/ckpt_shm.c
+++ b/ipc/ckpt_shm.c
@@ -145,25 +145,6 @@ int cr_write_ipc_shm(struct cr_ctx *ctx, struct ipc_namespace *ipcns)
  * ipc restart
  */
 
-struct cr_dq_ipcshm_del {
-	struct ipc_namespace *ipcns;
-	int id;
-};
-
-static int cr_ipc_shm_delete(void *data)
-{
-	struct cr_dq_ipcshm_del *dq = (struct cr_dq_ipcshm_del *) data;
-	mm_segment_t old_fs;
-	int ret;
-
-	old_fs = get_fs();
-	set_fs(get_ds());
-	ret = shmctl_down(dq->ipcns, dq->id, IPC_RMID, NULL, 0);
-	set_fs(old_fs);
-
-	return ret;
-}
-
 int cr_ipc_shm_attach(struct file *file,
 		      unsigned long vm_addr,
 		      unsigned long vm_flags)
@@ -243,25 +224,7 @@ static int cr_do_read_ipc_shm(struct cr_ctx *ctx)
 	if (hh->flags & SHM_HUGETLB)	/* FIXME: support SHM_HUGETLB */
 		goto out;
 
-	/*
-	 * SHM_DEST means that the shm is to be deleted after creation.
-	 * However, deleting before it's actually attached is quite silly.
-	 * Instead, we defer this task to until restart has succeeded.
-	 */
-	if (hh->perms.mode & SHM_DEST) {
-		struct cr_dq_ipcshm_del dq;
-
-		/* to not confuse the rest of the code */
-		hh->perms.mode &= ~SHM_DEST;
-
-		dq.ipcns = current->nsproxy->ipc_ns;
-		dq.id = hh->perms.id;
-
-		ret = cr_deferqueue_add(ctx, cr_ipc_shm_delete,
-				       0, &dq, sizeof(dq));
-		if (ret < 0)
-			goto out;
-	}
+	/* FIXME: this will fail for deleted ipc shm segments */
 
 	shmflag = hh->flags | hh->perms.mode | IPC_CREAT | IPC_EXCL;
 	cr_debug("shm: do_shmget size %lld flag %#x id %d\n",
@@ -272,6 +235,7 @@ static int cr_do_read_ipc_shm(struct cr_ctx *ctx)
 		goto out;
 
 	down_write(&shm_ids->rw_mutex);
+
 	ret = -EIDRM;
 	perms = ipc_lock(shm_ids, hh->perms.id);
 	if (IS_ERR(perms)) {	/* this should not happen .. but be safe */
@@ -297,9 +261,9 @@ static int cr_do_read_ipc_shm(struct cr_ctx *ctx)
 	/* deposit in objhash and read contents in */
 	ret = cr_obj_add_ref(ctx, file, hh->objref, CR_OBJ_FILE, 0);
 	if (ret < 0)
-		goto fput;
+		goto file;
 	ret = cr_read_shmem_contents(ctx, file->f_dentry->d_inode);
- fput:
+ file:
 	fput(file);
  out:
 	cr_hbuf_put(ctx, sizeof(*hh));
-- 
1.5.4.3

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/4] Revert "Infrastructure for work postponed to the end of checkpoint/restart"
       [not found] ` <20090415160221.GA31929-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-04-15 16:02   ` Serge E. Hallyn
  2009-04-15 16:02   ` [PATCH 3/4] deferqueue: generic queue to defer work Serge E. Hallyn
  2009-04-15 16:03   ` [PATCH 4/4] sysvipc-shm: correctly handle deleted (active) ipc shared memory Serge E. Hallyn
  2 siblings, 0 replies; 5+ messages in thread
From: Serge E. Hallyn @ 2009-04-15 16:02 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Linux Containers

This reverts commit cf909e388fc45c3c462ce5f0dbf9b83988a5c1ea.

Conflicts:

	checkpoint/Makefile
---
 checkpoint/Makefile        |    2 +-
 checkpoint/checkpoint.c    |    4 ---
 checkpoint/deferqueue.c    |   62 --------------------------------------------
 checkpoint/restart.c       |    4 ---
 checkpoint/sys.c           |    7 -----
 include/linux/checkpoint.h |    9 ------
 6 files changed, 1 insertions(+), 87 deletions(-)
 delete mode 100644 checkpoint/deferqueue.c

diff --git a/checkpoint/Makefile b/checkpoint/Makefile
index e64784e..23b1e3c 100644
--- a/checkpoint/Makefile
+++ b/checkpoint/Makefile
@@ -2,7 +2,7 @@
 # Makefile for linux checkpoint/restart.
 #
 
-obj-$(CONFIG_CHECKPOINT) += sys.o objhash.o deferqueue.o \
+obj-$(CONFIG_CHECKPOINT) += sys.o objhash.o \
 		checkpoint.o restart.o \
 		ckpt_task.o rstr_task.o \
 		ckpt_mem.o rstr_mem.o \
diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index 47d5bd1..7382cc3 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -550,10 +550,6 @@ int do_checkpoint(struct cr_ctx *ctx, pid_t pid)
 	if (ret < 0)
 		goto out;
 
-	ret = cr_deferqueue_run(ctx);
-	if (ret < 0)
-		goto out;
-
 	ctx->crid = atomic_inc_return(&cr_ctx_count);
 
 	/* on success, return (unique) checkpoint identifier */
diff --git a/checkpoint/deferqueue.c b/checkpoint/deferqueue.c
deleted file mode 100644
index a02d577..0000000
--- a/checkpoint/deferqueue.c
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- *  Checkpoint-restart - infrastructure to manage deferred work
- *
- *  Copyright (C) 2009 Oren Laadan
- *
- *  This file is subject to the terms and conditions of the GNU General Public
- *  License.  See the file COPYING in the main directory of the Linux
- *  distribution for more details.
- */
-
-#include <linux/list.h>
-#include <linux/checkpoint.h>
-
-struct cr_deferqueue {
-	cr_deferqueue_func_t function;
-	unsigned int flags;
-	struct list_head list;
-	char data[0];
-};
-
-int cr_deferqueue_add(struct cr_ctx *ctx, cr_deferqueue_func_t function,
-		     unsigned int flags, void *data, int size)
-{
-	struct cr_deferqueue *wq;
-
-	wq = kmalloc(sizeof(wq) + size, GFP_KERNEL);
-	if (!wq)
-		return -ENOMEM;
-
-	wq->function = function;
-	wq->flags = flags;
-	memcpy(wq->data, data, size);
-
-	cr_debug("adding work %p function %p\n", wq, wq->function);
-	list_add_tail(&ctx->deferqueue, &wq->list);
-	return 0;
-}
-
-/*
- * cr_deferqueue_run - perform all work in the work queue
- * @ctx: checkpoint context
- *
- * returns: number of works performed, or < 0 on error
- */
-int cr_deferqueue_run(struct cr_ctx *ctx)
-{
-	struct cr_deferqueue *wq, *n;
-	int nr = 0;
-	int ret;
-
-	list_for_each_entry_safe(wq, n, &ctx->deferqueue, list) {
-		cr_debug("doing work %p function %p\n", wq, wq->function);
-		ret = wq->function(wq->data);
-		if (ret < 0)
-			cr_debug("wq function failed %d\n", ret);
-		list_del(&wq->list);
-		kfree(wq);
-		nr++;
-	}
-
-	return nr;
-}
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index dad257e..5293b9a 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -484,10 +484,6 @@ static int do_restart_root(struct cr_ctx *ctx, pid_t pid)
 	if (ret < 0)
 		return ret;
 
-	ret = cr_deferqueue_run(ctx);
-	if (ret < 0)
-		return ret;
-
 	return cr_read_tail(ctx);
 }
 
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index afcbf75..63ee55e 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -171,14 +171,8 @@ static void cr_task_arr_free(struct cr_ctx *ctx)
 
 static void cr_ctx_free(struct cr_ctx *ctx)
 {
-	int ret;
-
 	BUG_ON(atomic_read(&ctx->refcount));
 
-	ret = cr_deferqueue_run(ctx);
-	if (ret != 0)
-		cr_debug("deferred deferqueue had %d entries", ret);
-
 	if (ctx->file)
 		fput(ctx->file);
 
@@ -217,7 +211,6 @@ static struct cr_ctx *cr_ctx_alloc(int fd, unsigned long flags)
 	atomic_set(&ctx->refcount, 0);
 	INIT_LIST_HEAD(&ctx->pgarr_list);
 	INIT_LIST_HEAD(&ctx->pgarr_pool);
-	INIT_LIST_HEAD(&ctx->deferqueue);
 	init_waitqueue_head(&ctx->waitq);
 
 	err = -EBADF;
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 898176c..7e8d4e0 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -42,7 +42,6 @@ struct cr_ctx {
 	atomic_t refcount;
 
 	struct cr_objhash *objhash;	/* hash for shared objects */
-	struct list_head deferqueue;	/* list of deferred works */
 
 	struct list_head pgarr_list;	/* page array to dump VMA contents */
 	struct list_head pgarr_pool;	/* pool of empty page arrays chain */
@@ -75,14 +74,6 @@ extern void cr_hbuf_put(struct cr_ctx *ctx, int n);
 extern void cr_ctx_get(struct cr_ctx *ctx);
 extern void cr_ctx_put(struct cr_ctx *ctx);
 
-/* deferred tasks */
-
-typedef int (*cr_deferqueue_func_t)(void *);
-
-extern int cr_deferqueue_run(struct cr_ctx *ctx);
-extern int cr_deferqueue_add(struct cr_ctx *ctx, cr_deferqueue_func_t func,
-			     unsigned int flags, void *data, int size);
-
 /* shared objects handling */
 
 enum {
-- 
1.5.4.3

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 3/4] deferqueue: generic queue to defer work
       [not found] ` <20090415160221.GA31929-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-04-15 16:02   ` [PATCH 2/4] Revert "Infrastructure for work postponed to the end of checkpoint/restart" Serge E. Hallyn
@ 2009-04-15 16:02   ` Serge E. Hallyn
       [not found]     ` <20090415160254.GB31983-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-04-15 16:03   ` [PATCH 4/4] sysvipc-shm: correctly handle deleted (active) ipc shared memory Serge E. Hallyn
  2 siblings, 1 reply; 5+ messages in thread
From: Serge E. Hallyn @ 2009-04-15 16:02 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Linux Containers

Add a interface to postpone an action until the end of the entire
checkpoint or restart operation. This is useful when during the
scan of tasks an operation cannot be performed in place, to avoid
the need for a second scan.

One use case is when restoring an ipc shared memory region that has
been deleted (but is still attached), during restart it needs to be
create, attached and then deleted. However, creation and attachment
are performed in distinct locations, so deletion can not be performed
on the spot. Instead, this work (delete) is deferred until later.
(This example is in one of the following patches).

The interface is as follows:

deferqueue_create(void):
  Allocated a new deferqueue.

deferqueue_run(deferqueue):
  Execute all the pending works in the queue. Returns the number of
  works executed, or an error.

deferqueue_add(deferqueue, function, data, size):
  Enqueue a postponed work. @function is the function to do the work,
  which will be called with @data as an argument. @size tells the
  size of data.

deferqueue_destroy(deferqueue):
  Free the deferqueue and any queued items.

Why aren't we using the existing kernel workqueue mechanism?  We need
to defer to work until the end of the operation: not earlier, since we
need other things to be in place; not later, to not block waiting for
it. However, the workqueue schedules the work for 'some time later'.
Also, the kernel workqueue may run in any task context, but we require
many times that an operation be run in the context of some specific
restarting task (e.g., restoring IPC state of a certain ipc_ns).

Instead, this mechanism is a simple way for the c/r operation as a
whole, and later a task in particular, to defer some action until
later (but not arbitrarily later) _in the restart_ operation.

Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 checkpoint/Kconfig         |    5 ++
 include/linux/deferqueue.h |   31 ++++++++++++++
 kernel/Makefile            |    1 +
 kernel/deferqueue.c        |   94 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 131 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/deferqueue.h
 create mode 100644 kernel/deferqueue.c

diff --git a/checkpoint/Kconfig b/checkpoint/Kconfig
index 1761b0a..53ed6fa 100644
--- a/checkpoint/Kconfig
+++ b/checkpoint/Kconfig
@@ -2,9 +2,14 @@
 # implemented the hooks for processor state etc. needed by the
 # core checkpoint/restart code.
 
+config DEFERQUEUE
+	bool
+	default n
+
 config CHECKPOINT
 	bool "Enable checkpoint/restart (EXPERIMENTAL)"
 	depends on CHECKPOINT_SUPPORT && EXPERIMENTAL
+	select DEFERQUEUE
 	help
 	  Application checkpoint/restart is the ability to save the
 	  state of a running application so that it can later resume
diff --git a/include/linux/deferqueue.h b/include/linux/deferqueue.h
new file mode 100644
index 0000000..fbdc897
--- /dev/null
+++ b/include/linux/deferqueue.h
@@ -0,0 +1,31 @@
+/*
+ * workqueue.h --- work queue handling for Linux.
+ */
+
+#ifndef _LINUX_DEFERQUEUE_H
+#define _LINUX_DEFERQUEUE_H
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+typedef int (*deferqueue_func_t)(void *);
+
+struct deferqueue_entry {
+	deferqueue_func_t function;
+	struct list_head list;
+	char data[0];
+};
+
+struct deferqueue_head {
+	spinlock_t lock;
+	struct list_head list;
+};
+
+struct deferqueue_head *deferqueue_create(void);
+void deferqueue_destroy(struct deferqueue_head *h);
+int deferqueue_add(struct deferqueue_head *head, deferqueue_func_t function,
+		void *data, int size);
+int deferqueue_run(struct deferqueue_head *head);
+
+#endif
diff --git a/kernel/Makefile b/kernel/Makefile
index e4791b3..0848374 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -22,6 +22,7 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
 endif
 
+obj-$(CONFIG_DEFERQUEUE) += deferqueue.o
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
diff --git a/kernel/deferqueue.c b/kernel/deferqueue.c
new file mode 100644
index 0000000..35abab0
--- /dev/null
+++ b/kernel/deferqueue.c
@@ -0,0 +1,94 @@
+/*
+ *  Checkpoint-restart - infrastructure to manage deferred work
+ *
+ *  This differs from a workqueue in that the work must be deferred
+ *  until specifically run by the caller.
+ *
+ *  As the only user currently is checkpoint/restart, which has
+ *  very simple usage, the locking is kept simple.  Adding rules
+ *  is protected by the head->lock.  But deferqueue_run() is only
+ *  called once, after all entries have been added.  So it is not
+ *  protected.  Similarly, _destroy is only called once when the
+ *  cr_ctx is releeased, so it is not locked or refcounted.  These
+ *  can of course be added if needed by other users.
+ *
+ *  Copyright (C) 2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/deferqueue.h>
+
+struct deferqueue_head *deferqueue_create(void)
+{
+	struct deferqueue_head *h = kmalloc(sizeof(*h), GFP_KERNEL);
+	if (h) {
+		spin_lock_init(&h->lock);
+		INIT_LIST_HEAD(&h->list);
+	}
+	return h;
+}
+
+void deferqueue_destroy(struct deferqueue_head *h)
+{
+	if (!list_empty(&h->list)) {
+		struct deferqueue_entry *wq, *n;
+
+		pr_debug("%s: freeing non-empty queue\n", __func__);
+		list_for_each_entry_safe(wq, n, &h->list, list) {
+			list_del(&wq->list);
+			kfree(wq);
+		}
+	}
+	kfree(h);
+}
+
+int deferqueue_add(struct deferqueue_head *head, deferqueue_func_t function,
+		void *data, int size)
+{
+	struct deferqueue_entry *wq;
+
+	wq = kmalloc(sizeof(wq) + size, GFP_KERNEL);
+	if (!wq)
+		return -ENOMEM;
+
+	wq->function = function;
+	memcpy(wq->data, data, size);
+
+	pr_debug("%s: adding work %p function %p\n", __func__, wq,
+			wq->function);
+	spin_lock(&head->lock);
+	list_add_tail(&head->list, &wq->list);
+	spin_unlock(&head->lock);
+	return 0;
+}
+
+/*
+ * deferqueue_run - perform all work in the work queue
+ * @head: deferqueue_head from which to run
+ *
+ * returns: number of works performed, or < 0 on error
+ */
+int deferqueue_run(struct deferqueue_head *head)
+{
+	struct deferqueue_entry *wq, *n;
+	int nr = 0;
+	int ret;
+
+	list_for_each_entry_safe(wq, n, &head->list, list) {
+		pr_debug("doing work %p function %p\n", wq, wq->function);
+		ret = wq->function(wq->data);
+		if (ret < 0)
+			pr_debug("wq function failed %d\n", ret);
+		list_del(&wq->list);
+		kfree(wq);
+		nr++;
+	}
+
+	return nr;
+}
-- 
1.5.4.3

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 4/4] sysvipc-shm: correctly handle deleted (active) ipc shared memory
       [not found] ` <20090415160221.GA31929-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-04-15 16:02   ` [PATCH 2/4] Revert "Infrastructure for work postponed to the end of checkpoint/restart" Serge E. Hallyn
  2009-04-15 16:02   ` [PATCH 3/4] deferqueue: generic queue to defer work Serge E. Hallyn
@ 2009-04-15 16:03   ` Serge E. Hallyn
  2 siblings, 0 replies; 5+ messages in thread
From: Serge E. Hallyn @ 2009-04-15 16:03 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Linux Containers

During restart, an ipc shared region may have SHM_DEST, indicating
that it has been originally deleted (while still active). In this
case the task of deleting the region after restoring it is postponed
until the end of the restart; otherwise, it would be quite silly to
delete it at that time, because it will be ... gone :o

Changelog:
	Apr 14 2009: Serge converts to generic deferqueue

Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 checkpoint/sys.c           |   11 ++++++++++
 include/linux/checkpoint.h |    1 +
 ipc/ckpt_shm.c             |   45 ++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 63ee55e..3dfb7d9 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -16,6 +16,7 @@
 #include <linux/uaccess.h>
 #include <linux/capability.h>
 #include <linux/checkpoint.h>
+#include <linux/deferqueue.h>
 
 #include "checkpoint_mem.h"
 
@@ -171,8 +172,14 @@ static void cr_task_arr_free(struct cr_ctx *ctx)
 
 static void cr_ctx_free(struct cr_ctx *ctx)
 {
+	int ret;
+
 	BUG_ON(atomic_read(&ctx->refcount));
 
+	ret = deferqueue_run(ctx->deferqueue);
+	if (ret != 0)
+		cr_debug("deferred deferqueue had %d entries", ret);
+
 	if (ctx->file)
 		fput(ctx->file);
 
@@ -219,6 +226,10 @@ static struct cr_ctx *cr_ctx_alloc(int fd, unsigned long flags)
 		goto err;
 
 	err = -ENOMEM;
+	ctx->deferqueue = deferqueue_create();
+	if (!ctx->deferqueue)
+		goto err;
+
 	ctx->hbuf = kmalloc(CR_HBUF_TOTAL, GFP_KERNEL);
 	if (!ctx->hbuf)
 		goto err;
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 7e8d4e0..82d2a40 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -42,6 +42,7 @@ struct cr_ctx {
 	atomic_t refcount;
 
 	struct cr_objhash *objhash;	/* hash for shared objects */
+	struct deferqueue_head *deferqueue; /* list of deferred work */
 
 	struct list_head pgarr_list;	/* page array to dump VMA contents */
 	struct list_head pgarr_pool;	/* pool of empty page arrays chain */
diff --git a/ipc/ckpt_shm.c b/ipc/ckpt_shm.c
index ee9b77a..a944b67 100644
--- a/ipc/ckpt_shm.c
+++ b/ipc/ckpt_shm.c
@@ -18,6 +18,7 @@
 #include <linux/syscalls.h>
 #include <linux/nsproxy.h>
 #include <linux/ipc_namespace.h>
+#include <linux/deferqueue.h>
 
 #include <linux/msg.h>	/* needed for util.h that uses 'struct msg_msg' */
 #include "util.h"
@@ -145,6 +146,25 @@ int cr_write_ipc_shm(struct cr_ctx *ctx, struct ipc_namespace *ipcns)
  * ipc restart
  */
 
+struct cr_dq_ipcshm_del {
+	struct ipc_namespace *ipcns;
+	int id;
+};
+
+static int cr_ipc_shm_delete(void *data)
+{
+	struct cr_dq_ipcshm_del *dq = (struct cr_dq_ipcshm_del *) data;
+	mm_segment_t old_fs;
+	int ret;
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	ret = shmctl_down(dq->ipcns, dq->id, IPC_RMID, NULL, 0);
+	set_fs(old_fs);
+
+	return ret;
+}
+
 int cr_ipc_shm_attach(struct file *file,
 		      unsigned long vm_addr,
 		      unsigned long vm_flags)
@@ -224,7 +244,25 @@ static int cr_do_read_ipc_shm(struct cr_ctx *ctx)
 	if (hh->flags & SHM_HUGETLB)	/* FIXME: support SHM_HUGETLB */
 		goto out;
 
-	/* FIXME: this will fail for deleted ipc shm segments */
+	/*
+	 * SHM_DEST means that the shm is to be deleted after creation.
+	 * However, deleting before it's actually attached is quite silly.
+	 * Instead, we defer this task to until restart has succeeded.
+	 */
+	if (hh->perms.mode & SHM_DEST) {
+		struct cr_dq_ipcshm_del dq;
+
+		/* to not confuse the rest of the code */
+		hh->perms.mode &= ~SHM_DEST;
+
+		dq.ipcns = current->nsproxy->ipc_ns;
+		dq.id = hh->perms.id;
+
+		ret = deferqueue_add(ctx->deferqueue, cr_ipc_shm_delete,
+				       &dq, sizeof(dq));
+		if (ret < 0)
+			goto out;
+	}
 
 	shmflag = hh->flags | hh->perms.mode | IPC_CREAT | IPC_EXCL;
 	cr_debug("shm: do_shmget size %lld flag %#x id %d\n",
@@ -235,7 +273,6 @@ static int cr_do_read_ipc_shm(struct cr_ctx *ctx)
 		goto out;
 
 	down_write(&shm_ids->rw_mutex);
-
 	ret = -EIDRM;
 	perms = ipc_lock(shm_ids, hh->perms.id);
 	if (IS_ERR(perms)) {	/* this should not happen .. but be safe */
@@ -261,9 +298,9 @@ static int cr_do_read_ipc_shm(struct cr_ctx *ctx)
 	/* deposit in objhash and read contents in */
 	ret = cr_obj_add_ref(ctx, file, hh->objref, CR_OBJ_FILE, 0);
 	if (ret < 0)
-		goto file;
+		goto fput;
 	ret = cr_read_shmem_contents(ctx, file->f_dentry->d_inode);
- file:
+ fput:
 	fput(file);
  out:
 	cr_hbuf_put(ctx, sizeof(*hh));
-- 
1.5.4.3

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH 3/4] deferqueue: generic queue to defer work
       [not found]     ` <20090415160254.GB31983-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-04-17 20:33       ` Oren Laadan
  0 siblings, 0 replies; 5+ messages in thread
From: Oren Laadan @ 2009-04-17 20:33 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Linux Containers


Thanks, taken.

Serge E. Hallyn wrote:
> Add a interface to postpone an action until the end of the entire
> checkpoint or restart operation. This is useful when during the
> scan of tasks an operation cannot be performed in place, to avoid
> the need for a second scan.
> 
> One use case is when restoring an ipc shared memory region that has
> been deleted (but is still attached), during restart it needs to be
> create, attached and then deleted. However, creation and attachment
> are performed in distinct locations, so deletion can not be performed
> on the spot. Instead, this work (delete) is deferred until later.
> (This example is in one of the following patches).
> 
> The interface is as follows:
> 
> deferqueue_create(void):
>   Allocated a new deferqueue.
> 
> deferqueue_run(deferqueue):
>   Execute all the pending works in the queue. Returns the number of
>   works executed, or an error.
> 
> deferqueue_add(deferqueue, function, data, size):
>   Enqueue a postponed work. @function is the function to do the work,
>   which will be called with @data as an argument. @size tells the
>   size of data.
> 
> deferqueue_destroy(deferqueue):
>   Free the deferqueue and any queued items.
> 
> Why aren't we using the existing kernel workqueue mechanism?  We need
> to defer to work until the end of the operation: not earlier, since we
> need other things to be in place; not later, to not block waiting for
> it. However, the workqueue schedules the work for 'some time later'.
> Also, the kernel workqueue may run in any task context, but we require
> many times that an operation be run in the context of some specific
> restarting task (e.g., restoring IPC state of a certain ipc_ns).
> 
> Instead, this mechanism is a simple way for the c/r operation as a
> whole, and later a task in particular, to defer some action until
> later (but not arbitrarily later) _in the restart_ operation.
> 
> Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
> Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> ---
>  checkpoint/Kconfig         |    5 ++
>  include/linux/deferqueue.h |   31 ++++++++++++++
>  kernel/Makefile            |    1 +
>  kernel/deferqueue.c        |   94 ++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 131 insertions(+), 0 deletions(-)
>  create mode 100644 include/linux/deferqueue.h
>  create mode 100644 kernel/deferqueue.c
> 
> diff --git a/checkpoint/Kconfig b/checkpoint/Kconfig
> index 1761b0a..53ed6fa 100644
> --- a/checkpoint/Kconfig
> +++ b/checkpoint/Kconfig
> @@ -2,9 +2,14 @@
>  # implemented the hooks for processor state etc. needed by the
>  # core checkpoint/restart code.
>  
> +config DEFERQUEUE
> +	bool
> +	default n
> +
>  config CHECKPOINT
>  	bool "Enable checkpoint/restart (EXPERIMENTAL)"
>  	depends on CHECKPOINT_SUPPORT && EXPERIMENTAL
> +	select DEFERQUEUE
>  	help
>  	  Application checkpoint/restart is the ability to save the
>  	  state of a running application so that it can later resume
> diff --git a/include/linux/deferqueue.h b/include/linux/deferqueue.h
> new file mode 100644
> index 0000000..fbdc897
> --- /dev/null
> +++ b/include/linux/deferqueue.h
> @@ -0,0 +1,31 @@
> +/*
> + * workqueue.h --- work queue handling for Linux.
> + */
> +
> +#ifndef _LINUX_DEFERQUEUE_H
> +#define _LINUX_DEFERQUEUE_H
> +
> +#include <linux/list.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +
> +typedef int (*deferqueue_func_t)(void *);
> +
> +struct deferqueue_entry {
> +	deferqueue_func_t function;
> +	struct list_head list;
> +	char data[0];
> +};
> +
> +struct deferqueue_head {
> +	spinlock_t lock;
> +	struct list_head list;
> +};
> +
> +struct deferqueue_head *deferqueue_create(void);
> +void deferqueue_destroy(struct deferqueue_head *h);
> +int deferqueue_add(struct deferqueue_head *head, deferqueue_func_t function,
> +		void *data, int size);
> +int deferqueue_run(struct deferqueue_head *head);
> +
> +#endif
> diff --git a/kernel/Makefile b/kernel/Makefile
> index e4791b3..0848374 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -22,6 +22,7 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg
>  CFLAGS_REMOVE_sched_clock.o = -pg
>  endif
>  
> +obj-$(CONFIG_DEFERQUEUE) += deferqueue.o
>  obj-$(CONFIG_FREEZER) += freezer.o
>  obj-$(CONFIG_PROFILING) += profile.o
>  obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
> diff --git a/kernel/deferqueue.c b/kernel/deferqueue.c
> new file mode 100644
> index 0000000..35abab0
> --- /dev/null
> +++ b/kernel/deferqueue.c
> @@ -0,0 +1,94 @@
> +/*
> + *  Checkpoint-restart - infrastructure to manage deferred work
> + *
> + *  This differs from a workqueue in that the work must be deferred
> + *  until specifically run by the caller.
> + *
> + *  As the only user currently is checkpoint/restart, which has
> + *  very simple usage, the locking is kept simple.  Adding rules
> + *  is protected by the head->lock.  But deferqueue_run() is only
> + *  called once, after all entries have been added.  So it is not
> + *  protected.  Similarly, _destroy is only called once when the
> + *  cr_ctx is releeased, so it is not locked or refcounted.  These
> + *  can of course be added if needed by other users.
> + *
> + *  Copyright (C) 2009 Oren Laadan
> + *
> + *  This file is subject to the terms and conditions of the GNU General Public
> + *  License.  See the file COPYING in the main directory of the Linux
> + *  distribution for more details.
> + *
> + */
> +
> +#include <linux/module.h>
> +#include <linux/kernel.h>
> +#include <linux/deferqueue.h>
> +
> +struct deferqueue_head *deferqueue_create(void)
> +{
> +	struct deferqueue_head *h = kmalloc(sizeof(*h), GFP_KERNEL);
> +	if (h) {
> +		spin_lock_init(&h->lock);
> +		INIT_LIST_HEAD(&h->list);
> +	}
> +	return h;
> +}
> +
> +void deferqueue_destroy(struct deferqueue_head *h)
> +{
> +	if (!list_empty(&h->list)) {
> +		struct deferqueue_entry *wq, *n;
> +
> +		pr_debug("%s: freeing non-empty queue\n", __func__);
> +		list_for_each_entry_safe(wq, n, &h->list, list) {
> +			list_del(&wq->list);
> +			kfree(wq);
> +		}
> +	}
> +	kfree(h);
> +}
> +
> +int deferqueue_add(struct deferqueue_head *head, deferqueue_func_t function,
> +		void *data, int size)
> +{
> +	struct deferqueue_entry *wq;
> +
> +	wq = kmalloc(sizeof(wq) + size, GFP_KERNEL);
> +	if (!wq)
> +		return -ENOMEM;
> +
> +	wq->function = function;
> +	memcpy(wq->data, data, size);
> +
> +	pr_debug("%s: adding work %p function %p\n", __func__, wq,
> +			wq->function);
> +	spin_lock(&head->lock);
> +	list_add_tail(&head->list, &wq->list);
> +	spin_unlock(&head->lock);
> +	return 0;
> +}
> +
> +/*
> + * deferqueue_run - perform all work in the work queue
> + * @head: deferqueue_head from which to run
> + *
> + * returns: number of works performed, or < 0 on error
> + */
> +int deferqueue_run(struct deferqueue_head *head)
> +{
> +	struct deferqueue_entry *wq, *n;
> +	int nr = 0;
> +	int ret;
> +
> +	list_for_each_entry_safe(wq, n, &head->list, list) {
> +		pr_debug("doing work %p function %p\n", wq, wq->function);
> +		ret = wq->function(wq->data);
> +		if (ret < 0)
> +			pr_debug("wq function failed %d\n", ret);
> +		list_del(&wq->list);
> +		kfree(wq);
> +		nr++;
> +	}
> +
> +	return nr;
> +}

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2009-04-17 20:33 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-04-15 16:02 [PATCH 1/4] Revert "sysvipc-shm: correctly handle deleted (active) ipc shared memory" Serge E. Hallyn
     [not found] ` <20090415160221.GA31929-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-04-15 16:02   ` [PATCH 2/4] Revert "Infrastructure for work postponed to the end of checkpoint/restart" Serge E. Hallyn
2009-04-15 16:02   ` [PATCH 3/4] deferqueue: generic queue to defer work Serge E. Hallyn
     [not found]     ` <20090415160254.GB31983-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-04-17 20:33       ` Oren Laadan
2009-04-15 16:03   ` [PATCH 4/4] sysvipc-shm: correctly handle deleted (active) ipc shared memory Serge E. Hallyn

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.