* [Ocfs2-devel] [PATCH 1/2] OCFS2: timer to queue scan of all orphan slots
2009-06-02 6:11 [Ocfs2-devel] Patches that adds delayed orphan scan timer Srinivas Eeda
@ 2009-06-02 6:11 ` Srinivas Eeda
2009-06-02 18:26 ` Sunil Mushran
0 siblings, 1 reply; 18+ messages in thread
From: Srinivas Eeda @ 2009-06-02 6:11 UTC (permalink / raw)
To: ocfs2-devel
In the current implementation, unlink is a two step process.
1) The deleting node requests an EX on the dentry lock and places the file in the
orphan directory. The lock request causes other nodes to downconvert to NULL
and flag the inode as orphaned.
2) Each node that has inode cached will see the ORPHANED flag during iput and
initiates a trylock on OPENLOCK. The node that does the final iput gets the
OPENLOCK, and it wipes the file.
But when there is memory pressure, a dentry could get flushed. During dput, it
removes the lock. So this node will not get a downconvert message on dentry lock
and hence will not flag the inode as ORPHANED.
If this node does the final iput it is not aware that the file got ORPHANED and
hence will not try to wipe the file. This causes orphans to be left around.
The following fix runs a periodic scan on the orphan slots. The scan is done by
one node at a time. It is done once every X milliseconds, where X is a value between
ORPHAN_SCAN_SCHEDULE_TIMEOUT/2 and ORPHAN_SCAN_SCHEDULE_TIMEOUT.
Each time the scan is done by different node so eventually the node that has the
inode cached will get to wipe the file.
Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com>
---
fs/ocfs2/dlmglue.c | 43 ++++++++++++++++++
fs/ocfs2/dlmglue.h | 3 +
fs/ocfs2/journal.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++
fs/ocfs2/journal.h | 4 ++
fs/ocfs2/ocfs2.h | 10 ++++
fs/ocfs2/ocfs2_lockid.h | 5 ++
fs/ocfs2/super.c | 9 ++++
7 files changed, 188 insertions(+), 0 deletions(-)
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e15fc7d..663d779 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -248,6 +248,10 @@ static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
.flags = 0,
};
+static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
+ .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
+};
+
static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
.get_osb = ocfs2_get_dentry_osb,
.post_unlock = ocfs2_dentry_post_unlock,
@@ -637,6 +641,15 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
&ocfs2_nfs_sync_lops, osb);
}
+static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
+ struct ocfs2_super *osb)
+{
+ ocfs2_lock_res_init_once(res);
+ ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, res->l_name);
+ ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN,
+ &ocfs2_orphan_scan_lops, osb);
+}
+
void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
struct ocfs2_file_private *fp)
{
@@ -2352,6 +2365,33 @@ void ocfs2_inode_unlock(struct inode *inode,
mlog_exit_void();
}
+/* lvb_imtime_packed is used to track a sequence number instead of mtime */
+int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u64 *seqno, int ex)
+{
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_meta_lvb *lvb;
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+ int status = 0;
+
+ lockres = &osb->osb_delayed_scan.ds_lockres;
+ status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ *seqno = be64_to_cpu(lvb->lvb_imtime_packed);
+ return status;
+}
+
+void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u64 seqno, int ex)
+{
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_meta_lvb *lvb;
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+
+ lockres = &osb->osb_delayed_scan.ds_lockres;
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ lvb->lvb_imtime_packed = cpu_to_be64(seqno);
+ ocfs2_cluster_unlock(osb, lockres, level);
+}
+
int ocfs2_super_lock(struct ocfs2_super *osb,
int ex)
{
@@ -2842,6 +2882,7 @@ local:
ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
+ ocfs2_orphan_scan_lock_res_init(&osb->osb_delayed_scan.ds_lockres, osb);
osb->cconn = conn;
@@ -2878,6 +2919,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
ocfs2_lock_res_free(&osb->osb_super_lockres);
ocfs2_lock_res_free(&osb->osb_rename_lockres);
ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
+ ocfs2_lock_res_free(&osb->osb_delayed_scan.ds_lockres);
ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
osb->cconn = NULL;
@@ -3061,6 +3103,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
+ ocfs2_simple_drop_lockres(osb, &osb->osb_delayed_scan.ds_lockres);
}
int ocfs2_drop_inode_locks(struct inode *inode)
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index e1fd572..7f26847 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -113,6 +113,9 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
int ex);
void ocfs2_super_unlock(struct ocfs2_super *osb,
int ex);
+int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u64 *seqno, int ex);
+void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u64 seqno, int ex);
+
int ocfs2_rename_lock(struct ocfs2_super *osb);
void ocfs2_rename_unlock(struct ocfs2_super *osb);
int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index a20a0f1..cee42ed 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -28,6 +28,8 @@
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
+#include <linux/time.h>
+#include <linux/random.h>
#define MLOG_MASK_PREFIX ML_JOURNAL
#include <cluster/masklog.h>
@@ -52,6 +54,8 @@
DEFINE_SPINLOCK(trans_inc_lock);
+#define ORPHAN_SCAN_SCHEDULE_TIMEOUT 600000
+
static int ocfs2_force_read_journal(struct inode *inode);
static int ocfs2_recover_node(struct ocfs2_super *osb,
int node_num, int slot_num);
@@ -1841,6 +1845,116 @@ bail:
return status;
}
+/*
+ * Scan timer should get fired twice within ORPHAN_SCAN_SCHEDULE_TIMEOUT, so
+ * return half of ORPHAN_SCAN_SCHEDULE_TIMEOUT. Add some randomness to the
+ * timeout to minimize multiple nodes firing the timer at the same time.
+ */
+static inline unsigned long ocfs2_orphan_scan_timeout(void)
+{
+ unsigned long time;
+
+ get_random_bytes(&time, sizeof(time));
+ time = (time % 5000) + (ORPHAN_SCAN_SCHEDULE_TIMEOUT / 2);
+ return msecs_to_jiffies(time);
+}
+
+/*
+ * ocfs2_queue_delayed_orphan_scan calls ocfs2_queue_recovery_completion for
+ * every slot which queues a recovery of slot on ocfs2_wq thread. This is done
+ * to cleanup any orphans that are left over in orphan slots.
+ *
+ * ocfs2_queue_delayed_orphan_scan gets called twice within a timeout value
+ * defined by ORPHAN_SCAN_SCHEDULE_TIMEOUT. It gets an EX lock on ds_lockres and
+ * checks sequence number stored in LVB. If the sequence number is changed it
+ * means some node has done the scan. So, it skips the scan and tracks the
+ * sequence number. If the sequence number didn't change, means a scan didn't
+ * happen, so the node queues a scan and increments the sequence number in LVB.
+ */
+void ocfs2_queue_delayed_orphan_scan(struct ocfs2_super *osb)
+{
+ struct ocfs2_delayed_orphan_scan *ds;
+ int level = DLM_LOCK_EX;
+ int status, i;
+ u64 seqno;
+
+ ds = &osb->osb_delayed_scan;
+
+ /* get an EX on orphan scan lock and sequence number in LVB */
+ status = ocfs2_orphan_scan_lock(osb, &seqno, level);
+ if (status < 0) {
+ if (status != -EAGAIN)
+ mlog_errno(status);
+ goto out;
+ }
+
+ /*
+ * Check the sequence number in LVB. If it's different than what we knew
+ * it means some node did the scan, so just track the seq# and skip
+ * the scan. If the seq# didn't change, a scan didn't happen, so
+ * continue and queue the scans.
+ */
+ if (ds->ds_seqno != seqno) {
+ ds->ds_seqno = seqno;
+ goto unlock;
+ }
+
+ for (i = 0; i < osb->max_slots; i++)
+ ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
+ NULL);
+ /*
+ * We queued a recovery on orphan slots, so increment the sequence
+ * number and update LVB so other node will skip the scan for a while
+ */
+ seqno++;
+unlock:
+ ocfs2_orphan_scan_unlock(osb, seqno, level);
+out:
+ return;
+}
+
+/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT/2 msecs */
+void ocfs2_delayed_orphan_scan_work(struct work_struct *work)
+{
+ struct ocfs2_delayed_orphan_scan *ds;
+ struct ocfs2_super *osb;
+
+ ds = container_of(work, struct ocfs2_delayed_orphan_scan,
+ ds_delayed_orphan_scan_work.work);
+ osb = ds->ds_osb;
+
+ mutex_lock(&ds->ds_lock);
+ ocfs2_queue_delayed_orphan_scan(osb);
+ schedule_delayed_work(&ds->ds_delayed_orphan_scan_work,
+ ocfs2_orphan_scan_timeout());
+ mutex_unlock(&ds->ds_lock);
+}
+
+void ocfs2_delayed_orphan_scan_stop(struct ocfs2_super *osb)
+{
+ struct ocfs2_delayed_orphan_scan *ds;
+
+ ds = &osb->osb_delayed_scan;
+ mutex_lock(&ds->ds_lock);
+ cancel_delayed_work(&ds->ds_delayed_orphan_scan_work);
+ mutex_unlock(&ds->ds_lock);
+}
+
+int ocfs2_delayed_orphan_scan_init(struct ocfs2_super *osb)
+{
+ struct ocfs2_delayed_orphan_scan *ds;
+
+ ds = &osb->osb_delayed_scan;
+ ds->ds_osb = osb;
+ mutex_init(&ds->ds_lock);
+
+ INIT_DELAYED_WORK(&ds->ds_delayed_orphan_scan_work,
+ ocfs2_delayed_orphan_scan_work);
+ schedule_delayed_work(&ds->ds_delayed_orphan_scan_work,
+ ocfs2_orphan_scan_timeout());
+ return 0;
+}
+
struct ocfs2_orphan_filldir_priv {
struct inode *head;
struct ocfs2_super *osb;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 619dd7f..8b62b97 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -144,6 +144,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
}
/* Exported only for the journal struct init code in super.c. Do not call. */
+int ocfs2_delayed_orphan_scan_init(struct ocfs2_super *osb);
+void ocfs2_delayed_orphan_scan_stop(struct ocfs2_super *osb);
+void ocfs2_delayed_orphan_scan_exit(struct ocfs2_super *osb);
+
void ocfs2_complete_recovery(struct work_struct *work);
void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 1386281..7dc23de 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -151,6 +151,14 @@ struct ocfs2_lock_res {
#endif
};
+struct ocfs2_delayed_orphan_scan {
+ struct mutex ds_lock;
+ struct ocfs2_super *ds_osb;
+ struct ocfs2_lock_res ds_lockres; /* lock to synchronize scans */
+ struct delayed_work ds_delayed_orphan_scan_work;
+ u64 ds_seqno; /* incremented on every scan */
+};
+
struct ocfs2_dlm_debug {
struct kref d_refcnt;
struct dentry *d_locking_state;
@@ -341,6 +349,8 @@ struct ocfs2_super
unsigned int *osb_orphan_wipes;
wait_queue_head_t osb_wipe_event;
+ struct ocfs2_delayed_orphan_scan osb_delayed_scan;
+
/* used to protect metaecc calculation check of xattr. */
spinlock_t osb_xattr_lock;
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index a53ce87..fcdba09 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -48,6 +48,7 @@ enum ocfs2_lock_type {
OCFS2_LOCK_TYPE_FLOCK,
OCFS2_LOCK_TYPE_QINFO,
OCFS2_LOCK_TYPE_NFS_SYNC,
+ OCFS2_LOCK_TYPE_ORPHAN_SCAN,
OCFS2_NUM_LOCK_TYPES
};
@@ -85,6 +86,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
case OCFS2_LOCK_TYPE_NFS_SYNC:
c = 'Y';
break;
+ case OCFS2_LOCK_TYPE_ORPHAN_SCAN:
+ c = 'P';
+ break;
default:
c = '\0';
}
@@ -104,6 +108,7 @@ static char *ocfs2_lock_type_strings[] = {
[OCFS2_LOCK_TYPE_OPEN] = "Open",
[OCFS2_LOCK_TYPE_FLOCK] = "Flock",
[OCFS2_LOCK_TYPE_QINFO] = "Quota",
+ [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
};
static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 79ff8d9..06e139e 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1802,6 +1802,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
ocfs2_truncate_log_shutdown(osb);
+ ocfs2_delayed_orphan_scan_stop(osb);
+
/* This will disable recovery and flush any recovery work. */
ocfs2_recovery_exit(osb);
@@ -1957,6 +1959,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
+ status = ocfs2_delayed_orphan_scan_init(osb);
+ if (status) {
+ mlog(ML_ERROR, "Unable to initialize delayed orphan scan\n");
+ mlog_errno(status);
+ goto bail;
+ }
+
init_waitqueue_head(&osb->checkpoint_event);
atomic_set(&osb->needs_checkpoint, 0);
--
1.5.6.5
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [Ocfs2-devel] [PATCH 1/2] OCFS2: timer to queue scan of all orphan slots
2009-06-02 6:11 ` [Ocfs2-devel] [PATCH 1/2] OCFS2: timer to queue scan of all orphan slots Srinivas Eeda
@ 2009-06-02 18:26 ` Sunil Mushran
2009-06-02 18:34 ` Sunil Mushran
0 siblings, 1 reply; 18+ messages in thread
From: Sunil Mushran @ 2009-06-02 18:26 UTC (permalink / raw)
To: ocfs2-devel
Srinivas Eeda wrote:
> In the current implementation, unlink is a two step process.
> 1) The deleting node requests an EX on dentry lock and place the file in the
> orphan directory. The lock request causes other nodes to downcovert to NULL,
> and flag the inode as orphaned.
>
> 2) Each node that has inode cached will see the ORPHANED flag during iput and
> initiates a trylock on OPENLOCK. The node that does the final iput gets the
> OPENLOCK, and it wipes the file.
>
> But when there is memory pressure, a dentry could get flushed. During dput, it
> removes the lock. So this node will not get a downconvert message on dentry lock
> and hence will not flag the inode as ORPHANED.
>
> If this node does the final iput it is not aware that the file got ORPHANED and
> hence will not try to wipe the file. This causes orpahns to be around.
>
> The following fix runs a periodic scan on the orphan slots. The scan is done by
> one node at a time. It is done once every X seconds, where X is a value between
> ORPHAN_SCAN_SCHEDULE_TIMEOUT/2 and ORPHAN_SCAN_SCHEDULE_TIMEOUT milliseconds.
> Each time the scan is done by different node so eventually the node that has the
> inode cached will get to wipe the file.
>
> Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com>
>
How about this wording:
When a dentry is unlinked, the unlinking node takes an EX on the dentry lock
before moving the dentry to the orphan directory. The other nodes, that
all had
a PR on the same dentry lock, flag the corresponding inode as
MAYBE_ORPHANED
during the downconvert. The inode is finally deleted when the last node
to iput
the inode notices the MAYBE_ORPHANED flag.
However, if a node that is actively using an inode comes under memory
pressure
that makes it shrink the dcache and thus free that dentry and its
corresponding
dentry lock, it will not be notified of the unlinking of the inode on
another node.
If it so happens that this same node performs the final iput on the
inode, it
will fail to delete that orphaned inode.
This patch fixes this shortcoming by introducing a periodic scan of the
orphan
directories to delete such inodes. Care has been taken to distribute the
workload
across the cluster so that no one node has to perform the task all the
time.
> ---
> fs/ocfs2/dlmglue.c | 43 ++++++++++++++++++
> fs/ocfs2/dlmglue.h | 3 +
> fs/ocfs2/journal.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++
> fs/ocfs2/journal.h | 4 ++
> fs/ocfs2/ocfs2.h | 10 ++++
> fs/ocfs2/ocfs2_lockid.h | 5 ++
> fs/ocfs2/super.c | 9 ++++
> 7 files changed, 188 insertions(+), 0 deletions(-)
>
> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
> index e15fc7d..663d779 100644
> --- a/fs/ocfs2/dlmglue.c
> +++ b/fs/ocfs2/dlmglue.c
> @@ -248,6 +248,10 @@ static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
> .flags = 0,
> };
>
> +static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
> + .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
> +};
> +
> static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
> .get_osb = ocfs2_get_dentry_osb,
> .post_unlock = ocfs2_dentry_post_unlock,
> @@ -637,6 +641,15 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
> &ocfs2_nfs_sync_lops, osb);
> }
>
> +static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
> + struct ocfs2_super *osb)
> +{
> + ocfs2_lock_res_init_once(res);
> + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, res->l_name);
> + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN,
> + &ocfs2_orphan_scan_lops, osb);
> +}
> +
> void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
> struct ocfs2_file_private *fp)
> {
> @@ -2352,6 +2365,33 @@ void ocfs2_inode_unlock(struct inode *inode,
> mlog_exit_void();
> }
>
> +/* lvb_imtime_packed is used to track a sequence number instead of mtime */
>
Don't overload the meta_lvb. Create an orphan_scan_lvb. We have one
for the quota lock too.
> +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u64 *seqno, int ex)
> +{
> + struct ocfs2_lock_res *lockres;
> + struct ocfs2_meta_lvb *lvb;
> + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
> + int status = 0;
> +
> + lockres = &osb->osb_delayed_scan.ds_lockres;
> + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
> + lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
> + *seqno = be64_to_cpu(lvb->lvb_imtime_packed);
> + return status;
> +}
> +
> +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u64 seqno, int ex)
> +{
> + struct ocfs2_lock_res *lockres;
> + struct ocfs2_meta_lvb *lvb;
> + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
> +
> + lockres = &osb->osb_delayed_scan.ds_lockres;
> + lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
> + lvb->lvb_imtime_packed = cpu_to_be64(seqno);
> + ocfs2_cluster_unlock(osb, lockres, level);
> +}
> +
> int ocfs2_super_lock(struct ocfs2_super *osb,
> int ex)
> {
> @@ -2842,6 +2882,7 @@ local:
> ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
> ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
> ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
> + ocfs2_orphan_scan_lock_res_init(&osb->osb_delayed_scan.ds_lockres, osb);
>
> osb->cconn = conn;
>
> @@ -2878,6 +2919,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
> ocfs2_lock_res_free(&osb->osb_super_lockres);
> ocfs2_lock_res_free(&osb->osb_rename_lockres);
> ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
> + ocfs2_lock_res_free(&osb->osb_delayed_scan.ds_lockres);
>
> ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
> osb->cconn = NULL;
> @@ -3061,6 +3103,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
> ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
> ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
> ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
> + ocfs2_simple_drop_lockres(osb, &osb->osb_delayed_scan.ds_lockres);
> }
>
> int ocfs2_drop_inode_locks(struct inode *inode)
> diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
> index e1fd572..7f26847 100644
> --- a/fs/ocfs2/dlmglue.h
> +++ b/fs/ocfs2/dlmglue.h
> @@ -113,6 +113,9 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
> int ex);
> void ocfs2_super_unlock(struct ocfs2_super *osb,
> int ex);
> +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u64 *seqno, int ex);
> +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u64 seqno, int ex);
> +
> int ocfs2_rename_lock(struct ocfs2_super *osb);
> void ocfs2_rename_unlock(struct ocfs2_super *osb);
> int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
> diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
> index a20a0f1..cee42ed 100644
> --- a/fs/ocfs2/journal.c
> +++ b/fs/ocfs2/journal.c
> @@ -28,6 +28,8 @@
> #include <linux/slab.h>
> #include <linux/highmem.h>
> #include <linux/kthread.h>
> +#include <linux/time.h>
> +#include <linux/random.h>
>
> #define MLOG_MASK_PREFIX ML_JOURNAL
> #include <cluster/masklog.h>
> @@ -52,6 +54,8 @@
>
> DEFINE_SPINLOCK(trans_inc_lock);
>
> +#define ORPHAN_SCAN_SCHEDULE_TIMEOUT 600000
> +
> static int ocfs2_force_read_journal(struct inode *inode);
> static int ocfs2_recover_node(struct ocfs2_super *osb,
> int node_num, int slot_num);
> @@ -1841,6 +1845,116 @@ bail:
> return status;
> }
>
> +/*
> + * Scan timer should get fired twice within ORPHAN_SCAN_SCHEDULE_TIMEOUT, so
> + * return half of ORPHAN_SCAN_SCHEDULE_TIMEOUT. Add some randomness to the
> + * timeout to minimize multple nodes firing the timer at the same time.
> + */
> +static inline unsigned long ocfs2_orphan_scan_timeout(void)
> +{
> + unsigned long time;
> +
> + get_random_bytes(&time, sizeof(time));
> + time = (time % 5000) + (ORPHAN_SCAN_SCHEDULE_TIMEOUT / 2);
> + return msecs_to_jiffies(time);
> +}
>
Why not just make it half the value, 300000. This math is only necessary
if we make this timeout end-user configurable. If so, we do the division
during input. Improves code readability as we do not have to keep explaining
that the timer is fired twice every timeout. Now the timer fires at every
timeout but we only submit the scan job if the seq has not changed on
back-to-back fires.
> +
> +/*
> + * ocfs2_queue_delayed_orphan_scan calls ocfs2_queue_recovery_completion for
> + * every slot which queues a recovery of slot on ocfs2_wq thread. This is done
> + * to cleanup any orphans that are left over in orphan slots.
> + *
> + * ocfs2_queue_delayed_orphan_scan gets called twice within a timeout value
> + * defined by ORPHAN_SCAN_SCHEDULE_TIMEOUT. It gets an EX lock on ds_lockres and
> + * checks sequence number stored in LVB. If the sequence number is changed it
> + * means some node has done the scan. So, it skips the scan and tracks the
> + * sequence number. If the sequence number didn't change, means a scan didn't
> + * happen, so the node queues a scan and increments the sequence number in LVB.
> + */
> +void ocfs2_queue_delayed_orphan_scan(struct ocfs2_super *osb)
> +{
> + struct ocfs2_delayed_orphan_scan *ds;
> + int level = DLM_LOCK_EX;
> + int status, i;
> + u64 seqno;
> +
> + ds = &osb->osb_delayed_scan;
> +
> + /* get an EX on orphan scan lock and sequence number in LVB */
> + status = ocfs2_orphan_scan_lock(osb, &seqno, level);
Hard code DLM_LOCK_EX. This way you can get rid of the comment.
> + if (status < 0) {
> + if (status != -EAGAIN)
> + mlog_errno(status);
> + goto out;
> + }
> +
> + /*
> + * Check the sequence number in LVB. If it's different than what we knew
> + * it means some node did the scan, so just track the seq# and skip
> + * the scan. If the seq# didn't change, a scan didn't happen, so
> + * continue and queue the scans.
> + */
> + if (ds->ds_seqno != seqno) {
> + ds->ds_seqno = seqno;
> + goto unlock;
> + }
> +
>
The comment is repeated. The one above is sufficient.
> + for (i = 0; i < osb->max_slots; i++)
> + ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
> + NULL);
> + /*
> + * We queued a recovery on orphan slots, so increment the sequence
> + * number and update LVB so other node will skip the scan for a while
> + */
> + seqno++;
> +unlock:
> + ocfs2_orphan_scan_unlock(osb, seqno, level);
> +out:
> + return;
> +}
> +
> +/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT/2 millsec */
> +void ocfs2_delayed_orphan_scan_work(struct work_struct *work)
> +{
> + struct ocfs2_delayed_orphan_scan *ds;
> + struct ocfs2_super *osb;
> +
> + ds = container_of(work, struct ocfs2_delayed_orphan_scan,
> + ds_delayed_orphan_scan_work.work);
> + osb = ds->ds_osb;
> +
> + mutex_lock(&ds->ds_lock);
> + ocfs2_queue_delayed_orphan_scan(osb);
> + schedule_delayed_work(&ds->ds_delayed_orphan_scan_work,
> + ocfs2_orphan_scan_timeout());
> + mutex_unlock(&ds->ds_lock);
> +}
> +
> +void ocfs2_delayed_orphan_scan_stop(struct ocfs2_super *osb)
> +{
> + struct ocfs2_delayed_orphan_scan *ds;
> +
> + ds = &osb->osb_delayed_scan;
> + mutex_lock(&ds->ds_lock);
> + cancel_delayed_work(&ds->ds_delayed_orphan_scan_work);
> + mutex_unlock(&ds->ds_lock);
> +}
> +
> +int ocfs2_delayed_orphan_scan_init(struct ocfs2_super *osb)
> +{
> + struct ocfs2_delayed_orphan_scan *ds;
> +
> + ds = &osb->osb_delayed_scan;
> + ds->ds_osb = osb;
> + mutex_init(&ds->ds_lock);
> +
> + INIT_DELAYED_WORK(&ds->ds_delayed_orphan_scan_work,
> + ocfs2_delayed_orphan_scan_work);
> + schedule_delayed_work(&ds->ds_delayed_orphan_scan_work,
> + ocfs2_orphan_scan_timeout());
> + return 0;
> +}
> +
> struct ocfs2_orphan_filldir_priv {
> struct inode *head;
> struct ocfs2_super *osb;
> diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
> index 619dd7f..8b62b97 100644
> --- a/fs/ocfs2/journal.h
> +++ b/fs/ocfs2/journal.h
> @@ -144,6 +144,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
> }
>
> /* Exported only for the journal struct init code in super.c. Do not call. */
> +int ocfs2_delayed_orphan_scan_init(struct ocfs2_super *osb);
> +void ocfs2_delayed_orphan_scan_stop(struct ocfs2_super *osb);
> +void ocfs2_delayed_orphan_scan_exit(struct ocfs2_super *osb);
> +
> void ocfs2_complete_recovery(struct work_struct *work);
> void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
>
> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
> index 1386281..7dc23de 100644
> --- a/fs/ocfs2/ocfs2.h
> +++ b/fs/ocfs2/ocfs2.h
> @@ -151,6 +151,14 @@ struct ocfs2_lock_res {
> #endif
> };
>
> +struct ocfs2_delayed_orphan_scan {
>
delayed is unnecessary. struct ocfs2_orphan_scan is enough.
> + struct mutex ds_lock;
> + struct ocfs2_super *ds_osb;
> + struct ocfs2_lock_res ds_lockres; /* lock to synchronize scans */
> + struct delayed_work ds_delayed_orphan_scan_work;
>
Again, remove delayed. ds_orphan_scan_work is good.
> + u64 ds_seqno; /* incremented on every scan */
>
$ echo $[4*1024*1024*1024/$[60*24*365]]
8171
Even if we fire once every second, it will take us 8171 years to wrap
u32. ;)
> +};
> +
> struct ocfs2_dlm_debug {
> struct kref d_refcnt;
> struct dentry *d_locking_state;
> @@ -341,6 +349,8 @@ struct ocfs2_super
> unsigned int *osb_orphan_wipes;
> wait_queue_head_t osb_wipe_event;
>
> + struct ocfs2_delayed_orphan_scan osb_delayed_scan;
> +
> /* used to protect metaecc calculation check of xattr. */
> spinlock_t osb_xattr_lock;
>
> diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
> index a53ce87..fcdba09 100644
> --- a/fs/ocfs2/ocfs2_lockid.h
> +++ b/fs/ocfs2/ocfs2_lockid.h
> @@ -48,6 +48,7 @@ enum ocfs2_lock_type {
> OCFS2_LOCK_TYPE_FLOCK,
> OCFS2_LOCK_TYPE_QINFO,
> OCFS2_LOCK_TYPE_NFS_SYNC,
> + OCFS2_LOCK_TYPE_ORPHAN_SCAN,
> OCFS2_NUM_LOCK_TYPES
> };
>
> @@ -85,6 +86,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
> case OCFS2_LOCK_TYPE_NFS_SYNC:
> c = 'Y';
> break;
> + case OCFS2_LOCK_TYPE_ORPHAN_SCAN:
> + c = 'P';
> + break;
> default:
> c = '\0';
> }
> @@ -104,6 +108,7 @@ static char *ocfs2_lock_type_strings[] = {
> [OCFS2_LOCK_TYPE_OPEN] = "Open",
> [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
> [OCFS2_LOCK_TYPE_QINFO] = "Quota",
> + [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
> };
>
> static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
> index 79ff8d9..06e139e 100644
> --- a/fs/ocfs2/super.c
> +++ b/fs/ocfs2/super.c
> @@ -1802,6 +1802,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
>
> ocfs2_truncate_log_shutdown(osb);
>
> + ocfs2_delayed_orphan_scan_stop(osb);
> +
> /* This will disable recovery and flush any recovery work. */
> ocfs2_recovery_exit(osb);
>
> @@ -1957,6 +1959,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
> goto bail;
> }
>
> + status = ocfs2_delayed_orphan_scan_init(osb);
> + if (status) {
> + mlog(ML_ERROR, "Unable to initialize delayed orphan scan\n");
> + mlog_errno(status);
> + goto bail;
> + }
> +
> init_waitqueue_head(&osb->checkpoint_event);
> atomic_set(&osb->needs_checkpoint, 0);
>
>
^ permalink raw reply [flat|nested] 18+ messages in thread
* [Ocfs2-devel] [PATCH 1/2] OCFS2: timer to queue scan of all orphan slots
2009-06-02 18:26 ` Sunil Mushran
@ 2009-06-02 18:34 ` Sunil Mushran
0 siblings, 0 replies; 18+ messages in thread
From: Sunil Mushran @ 2009-06-02 18:34 UTC (permalink / raw)
To: ocfs2-devel
Sunil Mushran wrote:
>> + u64 ds_seqno; /* incremented on every scan */
>>
>
> $ echo $[4*1024*1024*1024/$[60*24*365]]
> 8171
> Even if we fire once every second, it will take us 8171 years to wrap
> u32. ;)
>
$ echo $[4*1024*1024*1024/$[60*60*24*365]]
136
duh... it's 136 years. But long enough, I think.
^ permalink raw reply [flat|nested] 18+ messages in thread
* [Ocfs2-devel] [PATCH 1/2] ocfs2: timer to queue scan of all orphan slots
2009-06-02 23:37 [Ocfs2-devel] Patches that adds delayed orphan scan timer (rev 2) Srinivas Eeda
@ 2009-06-02 23:37 ` Srinivas Eeda
2009-06-03 0:27 ` Sunil Mushran
0 siblings, 1 reply; 18+ messages in thread
From: Srinivas Eeda @ 2009-06-02 23:37 UTC (permalink / raw)
To: ocfs2-devel
When a dentry is unlinked, the unlinking node takes an EX on the dentry lock
before moving the dentry to the orphan directory. The other nodes, that all had
a PR on the same dentry lock, flag the corresponding inode as MAYBE_ORPHANED
during the downconvert. The inode is finally deleted when the last node to iput
the inode notices the MAYBE_ORPHANED flag.
However, if a node that is actively using an inode comes under memory pressure
that makes it shrink the dcache and thus free that dentry and its corresponding
dentry lock, it will not be notified of the unlinking of the inode on another
node. If it so happens that this same node performs the final iput on the inode,
it will fail to delete that orphaned inode.
This patch fixes this shortcoming by introducing a periodic scan of the orphan
directories to delete such inodes. Care has been taken to distribute the
workload across the cluster so that no one node has to perform the task all the
time.
Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com>
---
fs/ocfs2/dlmglue.c | 47 +++++++++++++++++++++
fs/ocfs2/dlmglue.h | 11 +++++
fs/ocfs2/journal.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++
fs/ocfs2/journal.h | 4 ++
fs/ocfs2/ocfs2.h | 10 ++++
fs/ocfs2/ocfs2_lockid.h | 5 ++
fs/ocfs2/super.c | 9 ++++
7 files changed, 192 insertions(+), 0 deletions(-)
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e15fc7d..6f1dabd 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -248,6 +248,10 @@ static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
.flags = 0,
};
+static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
+ .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
+};
+
static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
.get_osb = ocfs2_get_dentry_osb,
.post_unlock = ocfs2_dentry_post_unlock,
@@ -637,6 +641,15 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
&ocfs2_nfs_sync_lops, osb);
}
+static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
+ struct ocfs2_super *osb)
+{
+ ocfs2_lock_res_init_once(res);
+ ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, res->l_name);
+ ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN,
+ &ocfs2_orphan_scan_lops, osb);
+}
+
void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
struct ocfs2_file_private *fp)
{
@@ -2352,6 +2365,37 @@ void ocfs2_inode_unlock(struct inode *inode,
mlog_exit_void();
}
+int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int ex)
+{
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_orphan_scan_lvb *lvb;
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+ int status = 0;
+
+ lockres = &osb->osb_orphan_scan.os_lockres;
+ status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+
+ if (lvb->lvb_version != OCFS2_ORPHAN_LVB_VERSION)
+ mlog(ML_ERROR, "ORPHAN LVB Mismatch!\n");
+
+ *seqno = be32_to_cpu(lvb->lvb_seqno);
+ return status;
+}
+
+void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno, int ex)
+{
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_orphan_scan_lvb *lvb;
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+
+ lockres = &osb->osb_orphan_scan.os_lockres;
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
+ lvb->lvb_seqno = cpu_to_be32(seqno);
+ ocfs2_cluster_unlock(osb, lockres, level);
+}
+
int ocfs2_super_lock(struct ocfs2_super *osb,
int ex)
{
@@ -2842,6 +2886,7 @@ local:
ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
+ ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
osb->cconn = conn;
@@ -2878,6 +2923,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
ocfs2_lock_res_free(&osb->osb_super_lockres);
ocfs2_lock_res_free(&osb->osb_rename_lockres);
ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
+ ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);
ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
osb->cconn = NULL;
@@ -3061,6 +3107,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
+ ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres);
}
int ocfs2_drop_inode_locks(struct inode *inode)
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index e1fd572..5cc36c1 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -62,6 +62,14 @@ struct ocfs2_qinfo_lvb {
__be32 lvb_free_entry;
};
+#define OCFS2_ORPHAN_LVB_VERSION 1
+
+struct ocfs2_orphan_scan_lvb {
+ __u8 lvb_version;
+ __u8 lvb_reserved[3];
+ __be32 lvb_seqno;
+};
+
/* ocfs2_inode_lock_full() 'arg_flags' flags */
/* don't wait on recovery. */
#define OCFS2_META_LOCK_RECOVERY (0x01)
@@ -113,6 +121,9 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
int ex);
void ocfs2_super_unlock(struct ocfs2_super *osb,
int ex);
+int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int ex);
+void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno, int ex);
+
int ocfs2_rename_lock(struct ocfs2_super *osb);
void ocfs2_rename_unlock(struct ocfs2_super *osb);
int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index a20a0f1..63b54b7 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -28,6 +28,8 @@
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
+#include <linux/time.h>
+#include <linux/random.h>
#define MLOG_MASK_PREFIX ML_JOURNAL
#include <cluster/masklog.h>
@@ -52,6 +54,8 @@
DEFINE_SPINLOCK(trans_inc_lock);
+#define ORPHAN_SCAN_SCHEDULE_TIMEOUT 300000
+
static int ocfs2_force_read_journal(struct inode *inode);
static int ocfs2_recover_node(struct ocfs2_super *osb,
int node_num, int slot_num);
@@ -1841,6 +1845,108 @@ bail:
return status;
}
+/*
+ * Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT. Add some
+ * randomness to the timeout to minimize multiple nodes firing the timer at the
+ * same time.
+ */
+static inline unsigned long ocfs2_orphan_scan_timeout(void)
+{
+ unsigned long time;
+
+ get_random_bytes(&time, sizeof(time));
+ time = ORPHAN_SCAN_SCHEDULE_TIMEOUT + (time % 5000);
+ return msecs_to_jiffies(time);
+}
+
+/*
+ * ocfs2_queue_orphan_scan calls ocfs2_queue_recovery_completion for
+ * every slot which queues a recovery of slot on ocfs2_wq thread. This is done
+ * to cleanup any orphans that are left over in orphan slots.
+ *
+ * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT
+ * milliseconds. It gets an EX lock on os_lockres and checks the sequence
+ * number stored in the LVB. If the sequence number has changed, some other
+ * node has done the scan, so this node skips the scan and records the new
+ * sequence number. If it has not changed, no scan has happened, so this node
+ * queues a scan and increments the sequence number in the LVB.
+ */
+void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
+{
+ struct ocfs2_orphan_scan *os;
+ int status, i;
+ u32 seqno;
+
+ os = &osb->osb_orphan_scan;
+
+ status = ocfs2_orphan_scan_lock(osb, &seqno, DLM_LOCK_EX);
+ if (status < 0) {
+ if (status != -EAGAIN)
+ mlog_errno(status);
+ goto out;
+ }
+
+ if (os->os_seqno != seqno) {
+ os->os_seqno = seqno;
+ goto unlock;
+ }
+
+ for (i = 0; i < osb->max_slots; i++)
+ ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
+ NULL);
+ /*
+ * We queued a recovery on orphan slots, increment the sequence
+ * number and update LVB so other node will skip the scan for a while
+ */
+ seqno++;
+unlock:
+ ocfs2_orphan_scan_unlock(osb, seqno, DLM_LOCK_EX);
+out:
+ return;
+}
+
+/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millisec */
+void ocfs2_orphan_scan_work(struct work_struct *work)
+{
+ struct ocfs2_orphan_scan *os;
+ struct ocfs2_super *osb;
+
+ os = container_of(work, struct ocfs2_orphan_scan,
+ os_orphan_scan_work.work);
+ osb = os->os_osb;
+
+ mutex_lock(&os->os_lock);
+ ocfs2_queue_orphan_scan(osb);
+ schedule_delayed_work(&os->os_orphan_scan_work,
+ ocfs2_orphan_scan_timeout());
+ mutex_unlock(&os->os_lock);
+}
+
+void ocfs2_orphan_scan_stop(struct ocfs2_super *osb)
+{
+ struct ocfs2_orphan_scan *os;
+
+ os = &osb->osb_orphan_scan;
+ mutex_lock(&os->os_lock);
+ cancel_delayed_work(&os->os_orphan_scan_work);
+ mutex_unlock(&os->os_lock);
+}
+
+int ocfs2_orphan_scan_init(struct ocfs2_super *osb)
+{
+ struct ocfs2_orphan_scan *os;
+
+ os = &osb->osb_orphan_scan;
+ os->os_osb = osb;
+ mutex_init(&os->os_lock);
+
+ INIT_DELAYED_WORK(&os->os_orphan_scan_work,
+ ocfs2_orphan_scan_work);
+ schedule_delayed_work(&os->os_orphan_scan_work,
+ ocfs2_orphan_scan_timeout());
+ return 0;
+}
+
struct ocfs2_orphan_filldir_priv {
struct inode *head;
struct ocfs2_super *osb;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 619dd7f..3483202 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -144,6 +144,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
}
/* Exported only for the journal struct init code in super.c. Do not call. */
+int ocfs2_orphan_scan_init(struct ocfs2_super *osb);
+void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
+void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
+
void ocfs2_complete_recovery(struct work_struct *work);
void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 1386281..373fb1c 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -151,6 +151,14 @@ struct ocfs2_lock_res {
#endif
};
+struct ocfs2_orphan_scan {
+ struct mutex os_lock;
+ struct ocfs2_super *os_osb;
+ struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */
+ struct delayed_work os_orphan_scan_work;
+ u32 os_seqno; /* incremented on every scan */
+};
+
struct ocfs2_dlm_debug {
struct kref d_refcnt;
struct dentry *d_locking_state;
@@ -341,6 +349,8 @@ struct ocfs2_super
unsigned int *osb_orphan_wipes;
wait_queue_head_t osb_wipe_event;
+ struct ocfs2_orphan_scan osb_orphan_scan;
+
/* used to protect metaecc calculation check of xattr. */
spinlock_t osb_xattr_lock;
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index a53ce87..fcdba09 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -48,6 +48,7 @@ enum ocfs2_lock_type {
OCFS2_LOCK_TYPE_FLOCK,
OCFS2_LOCK_TYPE_QINFO,
OCFS2_LOCK_TYPE_NFS_SYNC,
+ OCFS2_LOCK_TYPE_ORPHAN_SCAN,
OCFS2_NUM_LOCK_TYPES
};
@@ -85,6 +86,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
case OCFS2_LOCK_TYPE_NFS_SYNC:
c = 'Y';
break;
+ case OCFS2_LOCK_TYPE_ORPHAN_SCAN:
+ c = 'P';
+ break;
default:
c = '\0';
}
@@ -104,6 +108,7 @@ static char *ocfs2_lock_type_strings[] = {
[OCFS2_LOCK_TYPE_OPEN] = "Open",
[OCFS2_LOCK_TYPE_FLOCK] = "Flock",
[OCFS2_LOCK_TYPE_QINFO] = "Quota",
+ [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
};
static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 79ff8d9..44ac27e 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1802,6 +1802,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
ocfs2_truncate_log_shutdown(osb);
+ ocfs2_orphan_scan_stop(osb);
+
/* This will disable recovery and flush any recovery work. */
ocfs2_recovery_exit(osb);
@@ -1957,6 +1959,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
+ status = ocfs2_orphan_scan_init(osb);
+ if (status) {
+ mlog(ML_ERROR, "Unable to initialize delayed orphan scan\n");
+ mlog_errno(status);
+ goto bail;
+ }
+
init_waitqueue_head(&osb->checkpoint_event);
atomic_set(&osb->needs_checkpoint, 0);
--
1.5.6.5
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [Ocfs2-devel] [PATCH 1/2] ocfs2: timer to queue scan of all orphan slots
2009-06-02 23:37 ` [Ocfs2-devel] [PATCH 1/2] ocfs2: timer to queue scan of all orphan slots Srinivas Eeda
@ 2009-06-03 0:27 ` Sunil Mushran
0 siblings, 0 replies; 18+ messages in thread
From: Sunil Mushran @ 2009-06-03 0:27 UTC (permalink / raw)
To: ocfs2-devel
Srinivas Eeda wrote:
> When a dentry is unlinked, the unlinking node takes an EX on the dentry lock
> before moving the dentry to the orphan directory. The other nodes, that all had
> a PR on the same dentry lock, flag the corresponding inode as MAYBE_ORPHANED
> during the downconvert. The inode is finally deleted when the last node to iput
> the inode notices the MAYBE_ORPHANED flag.
>
> However, if a node that is actively using an inode comes under memory pressure
> that makes it shrink the dcache and thus free that dentry and its corresponding
> dentry lock. It will not be notified of the unlinking of the inode on another
> node. If it so happens that this same node performs the final iput on the inode,
> it will fail to delete that orphaned inode.
>
Second para does not read right:
A problem arises if a node is forced to free dentry locks because of memory
pressure. If this happens, the node will no longer get downconvert notifications
for the dentries that have been unlinked on another node. If it also happens that
that node is actively using the corresponding inode and happens to be the one
performing the last iput on that inode, it will fail to delete the inode as it
will not have the MAYBE_ORPHANED flag set.
> This patch fixes this shortcoming by introducing a periodic scan of the orphan
> directories to delete such inodes. Care has been taken to distribute the
> workload across the cluster so that no one node has to perform the task all the
> time.
>
> Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com>
> ---
> fs/ocfs2/dlmglue.c | 47 +++++++++++++++++++++
> fs/ocfs2/dlmglue.h | 11 +++++
> fs/ocfs2/journal.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++
> fs/ocfs2/journal.h | 4 ++
> fs/ocfs2/ocfs2.h | 10 ++++
> fs/ocfs2/ocfs2_lockid.h | 5 ++
> fs/ocfs2/super.c | 9 ++++
> 7 files changed, 192 insertions(+), 0 deletions(-)
>
> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
> index e15fc7d..6f1dabd 100644
> --- a/fs/ocfs2/dlmglue.c
> +++ b/fs/ocfs2/dlmglue.c
> @@ -248,6 +248,10 @@ static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
> .flags = 0,
> };
>
> +static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
> + .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
> +};
> +
> static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
> .get_osb = ocfs2_get_dentry_osb,
> .post_unlock = ocfs2_dentry_post_unlock,
> @@ -637,6 +641,15 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
> &ocfs2_nfs_sync_lops, osb);
> }
>
> +static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
> + struct ocfs2_super *osb)
> +{
> + ocfs2_lock_res_init_once(res);
> + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, res->l_name);
> + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN,
> + &ocfs2_orphan_scan_lops, osb);
> +}
> +
> void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
> struct ocfs2_file_private *fp)
> {
> @@ -2352,6 +2365,37 @@ void ocfs2_inode_unlock(struct inode *inode,
> mlog_exit_void();
> }
>
> +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int ex)
> +{
> + struct ocfs2_lock_res *lockres;
> + struct ocfs2_orphan_scan_lvb *lvb;
> + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
> + int status = 0;
> +
> + lockres = &osb->osb_orphan_scan.os_lockres;
> + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
> + lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
> +
> + if (lvb->lvb_version != OCFS2_ORPHAN_LVB_VERSION)
> + mlog(ML_ERROR, "ORPHAN LVB Mismatch!\n");
>
BUG_ON(lvb->lvb_version != OCFS2_ORPHAN_LVB_VERSION);
We want to catch this error during testing.
> +
> + *seqno = be32_to_cpu(lvb->lvb_seqno);
> + return status;
> +}
> +
> +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno, int ex)
> +{
> + struct ocfs2_lock_res *lockres;
> + struct ocfs2_orphan_scan_lvb *lvb;
> + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
> +
> + lockres = &osb->osb_orphan_scan.os_lockres;
> + lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
> + lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
> + lvb->lvb_seqno = cpu_to_be32(seqno);
> + ocfs2_cluster_unlock(osb, lockres, level);
> +}
> +
> int ocfs2_super_lock(struct ocfs2_super *osb,
> int ex)
> {
> @@ -2842,6 +2886,7 @@ local:
> ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
> ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
> ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
> + ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
>
> osb->cconn = conn;
>
> @@ -2878,6 +2923,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
> ocfs2_lock_res_free(&osb->osb_super_lockres);
> ocfs2_lock_res_free(&osb->osb_rename_lockres);
> ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
> + ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);
>
> ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
> osb->cconn = NULL;
> @@ -3061,6 +3107,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
> ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
> ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
> ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
> + ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres);
> }
>
> int ocfs2_drop_inode_locks(struct inode *inode)
> diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
> index e1fd572..5cc36c1 100644
> --- a/fs/ocfs2/dlmglue.h
> +++ b/fs/ocfs2/dlmglue.h
> @@ -62,6 +62,14 @@ struct ocfs2_qinfo_lvb {
> __be32 lvb_free_entry;
> };
>
> +#define OCFS2_ORPHAN_LVB_VERSION 1
> +
> +struct ocfs2_orphan_scan_lvb {
> + __u8 lvb_version;
> + __u8 lvb_reserved[3];
> + __be32 lvb_seqno;
>
lvb_os_seqno will be better. We could use seqno in other lvb's too.
> +};
> +
> /* ocfs2_inode_lock_full() 'arg_flags' flags */
> /* don't wait on recovery. */
> #define OCFS2_META_LOCK_RECOVERY (0x01)
> @@ -113,6 +121,9 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
> int ex);
> void ocfs2_super_unlock(struct ocfs2_super *osb,
> int ex);
> +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int ex);
> +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno, int ex);
> +
> int ocfs2_rename_lock(struct ocfs2_super *osb);
> void ocfs2_rename_unlock(struct ocfs2_super *osb);
> int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
> diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
> index a20a0f1..63b54b7 100644
> --- a/fs/ocfs2/journal.c
> +++ b/fs/ocfs2/journal.c
> @@ -28,6 +28,8 @@
> #include <linux/slab.h>
> #include <linux/highmem.h>
> #include <linux/kthread.h>
> +#include <linux/time.h>
> +#include <linux/random.h>
>
> #define MLOG_MASK_PREFIX ML_JOURNAL
> #include <cluster/masklog.h>
> @@ -52,6 +54,8 @@
>
> DEFINE_SPINLOCK(trans_inc_lock);
>
> +#define ORPHAN_SCAN_SCHEDULE_TIMEOUT 300000
> +
> static int ocfs2_force_read_journal(struct inode *inode);
> static int ocfs2_recover_node(struct ocfs2_super *osb,
> int node_num, int slot_num);
> @@ -1841,6 +1845,108 @@ bail:
> return status;
> }
>
> +/*
> + * Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT. Add some
> + * randomness to the timeout to minimize multple nodes firing the timer at the
> + * same time.
> + */
> +static inline unsigned long ocfs2_orphan_scan_timeout(void)
> +{
> + unsigned long time;
> +
> + get_random_bytes(&time, sizeof(time));
> + time = ORPHAN_SCAN_SCHEDULE_TIMEOUT + (time % 5000);
> + return msecs_to_jiffies(time);
> +}
> +
> +/*
> + * ocfs2_queue_orphan_scan calls ocfs2_queue_recovery_completion for
> + * every slot which queues a recovery of slot on ocfs2_wq thread. This is done
> + * to cleanup any orphans that are left over in orphan slots.
> + *
> + * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT seconds
> + * It gets an EX lock on os_lockres and checks sequence number stored in LVB. If
> + * the sequence number is changed it means some node has done the scan. Skip the
> + * scan and tracks the sequence number. If the sequence number didn't change,
> + * means a scan didn't happen, so the node queues a scan and increments the
> + * sequence number in LVB.
> + */
> +void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
> +{
> + struct ocfs2_orphan_scan *os;
> + int status, i;
> + u32 seqno;
> +
> + os = &osb->osb_orphan_scan;
> +
> + status = ocfs2_orphan_scan_lock(osb, &seqno, DLM_LOCK_EX);
> + if (status < 0) {
> + if (status != -EAGAIN)
> + mlog_errno(status);
> + goto out;
> + }
> +
> + if (os->os_seqno != seqno) {
> + os->os_seqno = seqno;
> + goto unlock;
> + }
> +
> + for (i = 0; i < osb->max_slots; i++)
> + ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
> + NULL);
> + /*
> + * We queued a recovery on orphan slots, increment the sequence
> + * number and update LVB so other node will skip the scan for a while
> + */
> + seqno++;
> +unlock:
> + ocfs2_orphan_scan_unlock(osb, seqno, DLM_LOCK_EX);
> +out:
> + return;
> +}
> +
> +/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT/2 millsec */
> +void ocfs2_orphan_scan_work(struct work_struct *work)
> +{
> + struct ocfs2_orphan_scan *os;
> + struct ocfs2_super *osb;
> +
> + os = container_of(work, struct ocfs2_orphan_scan,
> + os_orphan_scan_work.work);
> + osb = os->os_osb;
> +
> + mutex_lock(&os->os_lock);
> + ocfs2_queue_orphan_scan(osb);
> + schedule_delayed_work(&os->os_orphan_scan_work,
> + ocfs2_orphan_scan_timeout());
> + mutex_unlock(&os->os_lock);
> +}
> +
> +void ocfs2_orphan_scan_stop(struct ocfs2_super *osb)
> +{
> + struct ocfs2_orphan_scan *os;
> +
> + os = &osb->osb_orphan_scan;
> + mutex_lock(&os->os_lock);
> + cancel_delayed_work(&os->os_orphan_scan_work);
> + mutex_unlock(&os->os_lock);
> +}
> +
> +int ocfs2_orphan_scan_init(struct ocfs2_super *osb)
> +{
> + struct ocfs2_orphan_scan *os;
> +
> + os = &osb->osb_orphan_scan;
> + os->os_osb = osb;
> + mutex_init(&os->os_lock);
> +
> + INIT_DELAYED_WORK(&os->os_orphan_scan_work,
> + ocfs2_orphan_scan_work);
> + schedule_delayed_work(&os->os_orphan_scan_work,
> + ocfs2_orphan_scan_timeout());
> + return 0;
> +}
> +
> struct ocfs2_orphan_filldir_priv {
> struct inode *head;
> struct ocfs2_super *osb;
> diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
> index 619dd7f..3483202 100644
> --- a/fs/ocfs2/journal.h
> +++ b/fs/ocfs2/journal.h
> @@ -144,6 +144,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
> }
>
> /* Exported only for the journal struct init code in super.c. Do not call. */
> +int ocfs2_orphan_scan_init(struct ocfs2_super *osb);
> +void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
> +void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
> +
> void ocfs2_complete_recovery(struct work_struct *work);
> void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
>
> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
> index 1386281..373fb1c 100644
> --- a/fs/ocfs2/ocfs2.h
> +++ b/fs/ocfs2/ocfs2.h
> @@ -151,6 +151,14 @@ struct ocfs2_lock_res {
> #endif
> };
>
> +struct ocfs2_orphan_scan {
> + struct mutex os_lock;
> + struct ocfs2_super *os_osb;
> + struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */
> + struct delayed_work os_orphan_scan_work;
> + u32 os_seqno; /* incremented on every scan */
> +};
> +
> struct ocfs2_dlm_debug {
> struct kref d_refcnt;
> struct dentry *d_locking_state;
> @@ -341,6 +349,8 @@ struct ocfs2_super
> unsigned int *osb_orphan_wipes;
> wait_queue_head_t osb_wipe_event;
>
> + struct ocfs2_orphan_scan osb_orphan_scan;
> +
> /* used to protect metaecc calculation check of xattr. */
> spinlock_t osb_xattr_lock;
>
> diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
> index a53ce87..fcdba09 100644
> --- a/fs/ocfs2/ocfs2_lockid.h
> +++ b/fs/ocfs2/ocfs2_lockid.h
> @@ -48,6 +48,7 @@ enum ocfs2_lock_type {
> OCFS2_LOCK_TYPE_FLOCK,
> OCFS2_LOCK_TYPE_QINFO,
> OCFS2_LOCK_TYPE_NFS_SYNC,
> + OCFS2_LOCK_TYPE_ORPHAN_SCAN,
> OCFS2_NUM_LOCK_TYPES
> };
>
> @@ -85,6 +86,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
> case OCFS2_LOCK_TYPE_NFS_SYNC:
> c = 'Y';
> break;
> + case OCFS2_LOCK_TYPE_ORPHAN_SCAN:
> + c = 'P';
> + break;
> default:
> c = '\0';
> }
> @@ -104,6 +108,7 @@ static char *ocfs2_lock_type_strings[] = {
> [OCFS2_LOCK_TYPE_OPEN] = "Open",
> [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
> [OCFS2_LOCK_TYPE_QINFO] = "Quota",
> + [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
> };
>
> static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
> index 79ff8d9..44ac27e 100644
> --- a/fs/ocfs2/super.c
> +++ b/fs/ocfs2/super.c
> @@ -1802,6 +1802,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
>
> ocfs2_truncate_log_shutdown(osb);
>
> + ocfs2_orphan_scan_stop(osb);
> +
> /* This will disable recovery and flush any recovery work. */
> ocfs2_recovery_exit(osb);
>
> @@ -1957,6 +1959,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
> goto bail;
> }
>
> + status = ocfs2_orphan_scan_init(osb);
> + if (status) {
> + mlog(ML_ERROR, "Unable to initialize delayed orphan scan\n");
> + mlog_errno(status);
> + goto bail;
> + }
> +
> init_waitqueue_head(&osb->checkpoint_event);
> atomic_set(&osb->needs_checkpoint, 0);
>
>
^ permalink raw reply [flat|nested] 18+ messages in thread
* [Ocfs2-devel] Patches that adds delayed orphan scan timer (rev 3)
@ 2009-06-04 0:02 Srinivas Eeda
2009-06-04 0:02 ` [Ocfs2-devel] [PATCH 1/2] ocfs2: timer to queue scan of all orphan slots Srinivas Eeda
` (2 more replies)
0 siblings, 3 replies; 18+ messages in thread
From: Srinivas Eeda @ 2009-06-04 0:02 UTC (permalink / raw)
To: ocfs2-devel
Resending after implementing review comments.
^ permalink raw reply [flat|nested] 18+ messages in thread
* [Ocfs2-devel] [PATCH 1/2] ocfs2: timer to queue scan of all orphan slots
2009-06-04 0:02 [Ocfs2-devel] Patches that adds delayed orphan scan timer (rev 3) Srinivas Eeda
@ 2009-06-04 0:02 ` Srinivas Eeda
2009-06-04 0:16 ` Sunil Mushran
2009-06-10 5:37 ` Tao Ma
2009-06-04 0:02 ` [Ocfs2-devel] [PATCH 2/2] ocfs2 patch to track delayed orphan scan timer statistics Srinivas Eeda
2009-06-04 2:27 ` [Ocfs2-devel] Patches that adds delayed orphan scan timer (rev 3) Joel Becker
2 siblings, 2 replies; 18+ messages in thread
From: Srinivas Eeda @ 2009-06-04 0:02 UTC (permalink / raw)
To: ocfs2-devel
When a dentry is unlinked, the unlinking node takes an EX on the dentry lock
before moving the dentry to the orphan directory. The other nodes, that all had
a PR on the same dentry lock, flag the corresponding inode as MAYBE_ORPHANED
during the downconvert. The inode is finally deleted when the last node to iput
the inode notices the MAYBE_ORPHANED flag.
A problem arises if a node is forced to free dentry locks because of memory
pressure. If this happens, the node will no longer get downconvert notifications
for the dentries that have been unlinked on another node. If it also happens
that this node is actively using the corresponding inode and happens to be the one
performing the last iput on that inode, it will fail to delete the inode as it
will not have the MAYBE_ORPHANED flag set.
This patch fixes this shortcoming by introducing a periodic scan of the orphan
directories to delete such inodes. Care has been taken to distribute the
workload across the cluster so that no one node has to perform the task all the
time.
Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com>
---
fs/ocfs2/dlmglue.c | 51 ++++++++++++++++++++++
fs/ocfs2/dlmglue.h | 11 +++++
fs/ocfs2/journal.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++
fs/ocfs2/journal.h | 4 ++
fs/ocfs2/ocfs2.h | 10 ++++
fs/ocfs2/ocfs2_lockid.h | 5 ++
fs/ocfs2/super.c | 9 ++++
7 files changed, 196 insertions(+), 0 deletions(-)
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e15fc7d..0f35b83 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -248,6 +248,10 @@ static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
.flags = 0,
};
+static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
+ .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
+};
+
static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
.get_osb = ocfs2_get_dentry_osb,
.post_unlock = ocfs2_dentry_post_unlock,
@@ -637,6 +641,19 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
&ocfs2_nfs_sync_lops, osb);
}
+static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
+ struct ocfs2_super *osb)
+{
+ struct ocfs2_orphan_scan_lvb *lvb;
+
+ ocfs2_lock_res_init_once(res);
+ ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, res->l_name);
+ ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN,
+ &ocfs2_orphan_scan_lops, osb);
+ lvb = ocfs2_dlm_lvb(&res->l_lksb);
+ lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
+}
+
void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
struct ocfs2_file_private *fp)
{
@@ -2352,6 +2369,37 @@ void ocfs2_inode_unlock(struct inode *inode,
mlog_exit_void();
}
+int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int ex)
+{
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_orphan_scan_lvb *lvb;
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+ int status = 0;
+
+ lockres = &osb->osb_orphan_scan.os_lockres;
+ status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+ if (status < 0)
+ return status;
+
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ if (lvb->lvb_version == OCFS2_ORPHAN_LVB_VERSION)
+ *seqno = be32_to_cpu(lvb->lvb_os_seqno);
+ return status;
+}
+
+void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno, int ex)
+{
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_orphan_scan_lvb *lvb;
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+
+ lockres = &osb->osb_orphan_scan.os_lockres;
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
+ lvb->lvb_os_seqno = cpu_to_be32(seqno);
+ ocfs2_cluster_unlock(osb, lockres, level);
+}
+
int ocfs2_super_lock(struct ocfs2_super *osb,
int ex)
{
@@ -2842,6 +2890,7 @@ local:
ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
+ ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
osb->cconn = conn;
@@ -2878,6 +2927,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
ocfs2_lock_res_free(&osb->osb_super_lockres);
ocfs2_lock_res_free(&osb->osb_rename_lockres);
ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
+ ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);
ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
osb->cconn = NULL;
@@ -3061,6 +3111,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
+ ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres);
}
int ocfs2_drop_inode_locks(struct inode *inode)
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index e1fd572..31b90d7 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -62,6 +62,14 @@ struct ocfs2_qinfo_lvb {
__be32 lvb_free_entry;
};
+#define OCFS2_ORPHAN_LVB_VERSION 1
+
+struct ocfs2_orphan_scan_lvb {
+ __u8 lvb_version;
+ __u8 lvb_reserved[3];
+ __be32 lvb_os_seqno;
+};
+
/* ocfs2_inode_lock_full() 'arg_flags' flags */
/* don't wait on recovery. */
#define OCFS2_META_LOCK_RECOVERY (0x01)
@@ -113,6 +121,9 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
int ex);
void ocfs2_super_unlock(struct ocfs2_super *osb,
int ex);
+int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int ex);
+void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno, int ex);
+
int ocfs2_rename_lock(struct ocfs2_super *osb);
void ocfs2_rename_unlock(struct ocfs2_super *osb);
int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index a20a0f1..dc7cea3 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -28,6 +28,8 @@
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
+#include <linux/time.h>
+#include <linux/random.h>
#define MLOG_MASK_PREFIX ML_JOURNAL
#include <cluster/masklog.h>
@@ -52,6 +54,8 @@
DEFINE_SPINLOCK(trans_inc_lock);
+#define ORPHAN_SCAN_SCHEDULE_TIMEOUT 300000
+
static int ocfs2_force_read_journal(struct inode *inode);
static int ocfs2_recover_node(struct ocfs2_super *osb,
int node_num, int slot_num);
@@ -1841,6 +1845,108 @@ bail:
return status;
}
+/*
+ * Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT. Add some
+ * randomness to the timeout to minimize multiple nodes firing the timer at the
+ * same time.
+ */
+static inline unsigned long ocfs2_orphan_scan_timeout(void)
+{
+ unsigned long time;
+
+ get_random_bytes(&time, sizeof(time));
+ time = ORPHAN_SCAN_SCHEDULE_TIMEOUT + (time % 5000);
+ return msecs_to_jiffies(time);
+}
+
+/*
+ * ocfs2_queue_orphan_scan calls ocfs2_queue_recovery_completion for
+ * every slot which queues a recovery of slot on ocfs2_wq thread. This is done
+ * to cleanup any orphans that are left over in orphan slots.
+ *
+ * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT
+ * milliseconds. It gets an EX lock on os_lockres and checks the sequence
+ * number stored in the LVB. If the sequence number has changed, some other
+ * node has done the scan, so this node skips the scan and records the new
+ * sequence number. If it has not changed, no scan has happened, so this node
+ * queues a scan and increments the sequence number in the LVB.
+ */
+void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
+{
+ struct ocfs2_orphan_scan *os;
+ int status, i;
+ u32 seqno = 0;
+
+ os = &osb->osb_orphan_scan;
+
+ status = ocfs2_orphan_scan_lock(osb, &seqno, DLM_LOCK_EX);
+ if (status < 0) {
+ if (status != -EAGAIN)
+ mlog_errno(status);
+ goto out;
+ }
+
+ if (os->os_seqno != seqno) {
+ os->os_seqno = seqno;
+ goto unlock;
+ }
+
+ for (i = 0; i < osb->max_slots; i++)
+ ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
+ NULL);
+ /*
+ * We queued a recovery on orphan slots, increment the sequence
+ * number and update LVB so other node will skip the scan for a while
+ */
+ seqno++;
+unlock:
+ ocfs2_orphan_scan_unlock(osb, seqno, DLM_LOCK_EX);
+out:
+ return;
+}
+
+/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millisec */
+void ocfs2_orphan_scan_work(struct work_struct *work)
+{
+ struct ocfs2_orphan_scan *os;
+ struct ocfs2_super *osb;
+
+ os = container_of(work, struct ocfs2_orphan_scan,
+ os_orphan_scan_work.work);
+ osb = os->os_osb;
+
+ mutex_lock(&os->os_lock);
+ ocfs2_queue_orphan_scan(osb);
+ schedule_delayed_work(&os->os_orphan_scan_work,
+ ocfs2_orphan_scan_timeout());
+ mutex_unlock(&os->os_lock);
+}
+
+void ocfs2_orphan_scan_stop(struct ocfs2_super *osb)
+{
+ struct ocfs2_orphan_scan *os;
+
+ os = &osb->osb_orphan_scan;
+ mutex_lock(&os->os_lock);
+ cancel_delayed_work(&os->os_orphan_scan_work);
+ mutex_unlock(&os->os_lock);
+}
+
+int ocfs2_orphan_scan_init(struct ocfs2_super *osb)
+{
+ struct ocfs2_orphan_scan *os;
+
+ os = &osb->osb_orphan_scan;
+ os->os_osb = osb;
+ mutex_init(&os->os_lock);
+
+ INIT_DELAYED_WORK(&os->os_orphan_scan_work,
+ ocfs2_orphan_scan_work);
+ schedule_delayed_work(&os->os_orphan_scan_work,
+ ocfs2_orphan_scan_timeout());
+ return 0;
+}
+
struct ocfs2_orphan_filldir_priv {
struct inode *head;
struct ocfs2_super *osb;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 619dd7f..3483202 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -144,6 +144,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
}
/* Exported only for the journal struct init code in super.c. Do not call. */
+int ocfs2_orphan_scan_init(struct ocfs2_super *osb);
+void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
+void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
+
void ocfs2_complete_recovery(struct work_struct *work);
void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 1386281..373fb1c 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -151,6 +151,14 @@ struct ocfs2_lock_res {
#endif
};
+struct ocfs2_orphan_scan {
+ struct mutex os_lock;
+ struct ocfs2_super *os_osb;
+ struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */
+ struct delayed_work os_orphan_scan_work;
+ u32 os_seqno; /* incremented on every scan */
+};
+
struct ocfs2_dlm_debug {
struct kref d_refcnt;
struct dentry *d_locking_state;
@@ -341,6 +349,8 @@ struct ocfs2_super
unsigned int *osb_orphan_wipes;
wait_queue_head_t osb_wipe_event;
+ struct ocfs2_orphan_scan osb_orphan_scan;
+
/* used to protect metaecc calculation check of xattr. */
spinlock_t osb_xattr_lock;
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index a53ce87..fcdba09 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -48,6 +48,7 @@ enum ocfs2_lock_type {
OCFS2_LOCK_TYPE_FLOCK,
OCFS2_LOCK_TYPE_QINFO,
OCFS2_LOCK_TYPE_NFS_SYNC,
+ OCFS2_LOCK_TYPE_ORPHAN_SCAN,
OCFS2_NUM_LOCK_TYPES
};
@@ -85,6 +86,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
case OCFS2_LOCK_TYPE_NFS_SYNC:
c = 'Y';
break;
+ case OCFS2_LOCK_TYPE_ORPHAN_SCAN:
+ c = 'P';
+ break;
default:
c = '\0';
}
@@ -104,6 +108,7 @@ static char *ocfs2_lock_type_strings[] = {
[OCFS2_LOCK_TYPE_OPEN] = "Open",
[OCFS2_LOCK_TYPE_FLOCK] = "Flock",
[OCFS2_LOCK_TYPE_QINFO] = "Quota",
+ [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
};
static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 79ff8d9..44ac27e 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1802,6 +1802,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
ocfs2_truncate_log_shutdown(osb);
+ ocfs2_orphan_scan_stop(osb);
+
/* This will disable recovery and flush any recovery work. */
ocfs2_recovery_exit(osb);
@@ -1957,6 +1959,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
+ status = ocfs2_orphan_scan_init(osb);
+ if (status) {
+ mlog(ML_ERROR, "Unable to initialize delayed orphan scan\n");
+ mlog_errno(status);
+ goto bail;
+ }
+
init_waitqueue_head(&osb->checkpoint_event);
atomic_set(&osb->needs_checkpoint, 0);
--
1.5.6.5
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [Ocfs2-devel] [PATCH 2/2] ocfs2 patch to track delayed orphan scan timer statistics
2009-06-04 0:02 [Ocfs2-devel] Patches that adds delayed orphan scan timer (rev 3) Srinivas Eeda
2009-06-04 0:02 ` [Ocfs2-devel] [PATCH 1/2] ocfs2: timer to queue scan of all orphan slots Srinivas Eeda
@ 2009-06-04 0:02 ` Srinivas Eeda
2009-06-04 2:27 ` [Ocfs2-devel] Patches that adds delayed orphan scan timer (rev 3) Joel Becker
2 siblings, 0 replies; 18+ messages in thread
From: Srinivas Eeda @ 2009-06-04 0:02 UTC (permalink / raw)
To: ocfs2-devel
Patch to track delayed orphan scan timer statistics.
Modifies ocfs2_osb_dump to print the following:
Orphan Scan=> Local: 10 Global: 21 Last Scan: 67 seconds ago
Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com>
Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
---
fs/ocfs2/journal.c | 4 ++++
fs/ocfs2/ocfs2.h | 4 +++-
fs/ocfs2/super.c | 8 ++++++++
3 files changed, 15 insertions(+), 1 deletions(-)
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index dc7cea3..4d2e28c 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1899,6 +1899,8 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
* number and update LVB so other node will skip the scan for a while
*/
seqno++;
+ os->os_count++;
+ os->os_scantime = CURRENT_TIME;
unlock:
ocfs2_orphan_scan_unlock(osb, seqno, DLM_LOCK_EX);
out:
@@ -1938,6 +1940,8 @@ int ocfs2_orphan_scan_init(struct ocfs2_super *osb)
os = &osb->osb_orphan_scan;
os->os_osb = osb;
+ os->os_count = 0;
+ os->os_scantime = CURRENT_TIME;
mutex_init(&os->os_lock);
INIT_DELAYED_WORK(&os->os_orphan_scan_work,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 373fb1c..3355b1a 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -156,7 +156,9 @@ struct ocfs2_orphan_scan {
struct ocfs2_super *os_osb;
struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */
struct delayed_work os_orphan_scan_work;
- u32 os_seqno; /* incremented on every scan */
+ struct timespec os_scantime; /* time this node ran the scan */
+ u32 os_count; /* tracks node specific scans */
+ u32 os_seqno; /* tracks cluster wide scans */
};
struct ocfs2_dlm_debug {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 44ac27e..d05f3ca 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -208,6 +208,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
int i;
struct ocfs2_cluster_connection *cconn = osb->cconn;
struct ocfs2_recovery_map *rm = osb->recovery_map;
+ struct ocfs2_orphan_scan *os;
out += snprintf(buf + out, len - out,
"%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n",
@@ -309,6 +310,13 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
i, osb->slot_recovery_generations[i]);
}
+ os = &osb->osb_orphan_scan;
+ out += snprintf(buf + out, len - out, "Orphan Scan=> ");
+ out += snprintf(buf + out, len - out, "Local: %u Global: %u ",
+ os->os_count, os->os_seqno);
+ out += snprintf(buf + out, len - out, " Last Scan: %lu seconds ago\n",
+ (get_seconds() - os->os_scantime.tv_sec));
+
return out;
}
--
1.5.6.5
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [Ocfs2-devel] [PATCH 1/2] ocfs2: timer to queue scan of all orphan slots
2009-06-04 0:02 ` [Ocfs2-devel] [PATCH 1/2] ocfs2: timer to queue scan of all orphan slots Srinivas Eeda
@ 2009-06-04 0:16 ` Sunil Mushran
2009-06-10 5:37 ` Tao Ma
1 sibling, 0 replies; 18+ messages in thread
From: Sunil Mushran @ 2009-06-04 0:16 UTC (permalink / raw)
To: ocfs2-devel
Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Srinivas Eeda wrote:
> When a dentry is unlinked, the unlinking node takes an EX on the dentry lock
> before moving the dentry to the orphan directory. The other nodes, that all had
> a PR on the same dentry lock, flag the corresponding inode as MAYBE_ORPHANED
> during the downconvert. The inode is finally deleted when the last node to iput
> the inode notices the MAYBE_ORPHANED flag.
>
> A problem arises if a node is forced to free dentry locks because of memory
> pressure. If this happens, the node will no longer get downconvert notifications
> for the dentries that have been unlinked on another node. If it also happens
> that node is actively using the corresponding inode and happens to be the one
> performing the last iput on that inode, it will fail to delete the inode as it
> will not have the MAYBE_ORPHANED flag set.
>
> This patch fixes this shortcoming by introducing a periodic scan of the orphan
> directories to delete such inodes. Care has been taken to distribute the
> workload across the cluster so that no one node has to perform the task all the
> time.
>
> Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com>
> ---
> fs/ocfs2/dlmglue.c | 51 ++++++++++++++++++++++
> fs/ocfs2/dlmglue.h | 11 +++++
> fs/ocfs2/journal.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++
> fs/ocfs2/journal.h | 4 ++
> fs/ocfs2/ocfs2.h | 10 ++++
> fs/ocfs2/ocfs2_lockid.h | 5 ++
> fs/ocfs2/super.c | 9 ++++
> 7 files changed, 196 insertions(+), 0 deletions(-)
>
> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
> index e15fc7d..0f35b83 100644
> --- a/fs/ocfs2/dlmglue.c
> +++ b/fs/ocfs2/dlmglue.c
> @@ -248,6 +248,10 @@ static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
> .flags = 0,
> };
>
> +static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
> + .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
> +};
> +
> static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
> .get_osb = ocfs2_get_dentry_osb,
> .post_unlock = ocfs2_dentry_post_unlock,
> @@ -637,6 +641,19 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
> &ocfs2_nfs_sync_lops, osb);
> }
>
> +static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
> + struct ocfs2_super *osb)
> +{
> + struct ocfs2_orphan_scan_lvb *lvb;
> +
> + ocfs2_lock_res_init_once(res);
> + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, res->l_name);
> + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN,
> + &ocfs2_orphan_scan_lops, osb);
> + lvb = ocfs2_dlm_lvb(&res->l_lksb);
> + lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
> +}
> +
> void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
> struct ocfs2_file_private *fp)
> {
> @@ -2352,6 +2369,37 @@ void ocfs2_inode_unlock(struct inode *inode,
> mlog_exit_void();
> }
>
> +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int ex)
> +{
> + struct ocfs2_lock_res *lockres;
> + struct ocfs2_orphan_scan_lvb *lvb;
> + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
> + int status = 0;
> +
> + lockres = &osb->osb_orphan_scan.os_lockres;
> + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
> + if (status < 0)
> + return status;
> +
> + lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
> + if (lvb->lvb_version == OCFS2_ORPHAN_LVB_VERSION)
> + *seqno = be32_to_cpu(lvb->lvb_os_seqno);
> + return status;
> +}
> +
> +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno, int ex)
> +{
> + struct ocfs2_lock_res *lockres;
> + struct ocfs2_orphan_scan_lvb *lvb;
> + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
> +
> + lockres = &osb->osb_orphan_scan.os_lockres;
> + lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
> + lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
> + lvb->lvb_os_seqno = cpu_to_be32(seqno);
> + ocfs2_cluster_unlock(osb, lockres, level);
> +}
> +
> int ocfs2_super_lock(struct ocfs2_super *osb,
> int ex)
> {
> @@ -2842,6 +2890,7 @@ local:
> ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
> ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
> ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
> + ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
>
> osb->cconn = conn;
>
> @@ -2878,6 +2927,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
> ocfs2_lock_res_free(&osb->osb_super_lockres);
> ocfs2_lock_res_free(&osb->osb_rename_lockres);
> ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
> + ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);
>
> ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
> osb->cconn = NULL;
> @@ -3061,6 +3111,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
> ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
> ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
> ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
> + ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres);
> }
>
> int ocfs2_drop_inode_locks(struct inode *inode)
> diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
> index e1fd572..31b90d7 100644
> --- a/fs/ocfs2/dlmglue.h
> +++ b/fs/ocfs2/dlmglue.h
> @@ -62,6 +62,14 @@ struct ocfs2_qinfo_lvb {
> __be32 lvb_free_entry;
> };
>
> +#define OCFS2_ORPHAN_LVB_VERSION 1
> +
> +struct ocfs2_orphan_scan_lvb {
> + __u8 lvb_version;
> + __u8 lvb_reserved[3];
> + __be32 lvb_os_seqno;
> +};
> +
> /* ocfs2_inode_lock_full() 'arg_flags' flags */
> /* don't wait on recovery. */
> #define OCFS2_META_LOCK_RECOVERY (0x01)
> @@ -113,6 +121,9 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
> int ex);
> void ocfs2_super_unlock(struct ocfs2_super *osb,
> int ex);
> +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int ex);
> +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno, int ex);
> +
> int ocfs2_rename_lock(struct ocfs2_super *osb);
> void ocfs2_rename_unlock(struct ocfs2_super *osb);
> int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
> diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
> index a20a0f1..dc7cea3 100644
> --- a/fs/ocfs2/journal.c
> +++ b/fs/ocfs2/journal.c
> @@ -28,6 +28,8 @@
> #include <linux/slab.h>
> #include <linux/highmem.h>
> #include <linux/kthread.h>
> +#include <linux/time.h>
> +#include <linux/random.h>
>
> #define MLOG_MASK_PREFIX ML_JOURNAL
> #include <cluster/masklog.h>
> @@ -52,6 +54,8 @@
>
> DEFINE_SPINLOCK(trans_inc_lock);
>
> +#define ORPHAN_SCAN_SCHEDULE_TIMEOUT 300000
> +
> static int ocfs2_force_read_journal(struct inode *inode);
> static int ocfs2_recover_node(struct ocfs2_super *osb,
> int node_num, int slot_num);
> @@ -1841,6 +1845,108 @@ bail:
> return status;
> }
>
> +/*
> + * Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT. Add some
> + * randomness to the timeout to minimize multple nodes firing the timer at the
> + * same time.
> + */
> +static inline unsigned long ocfs2_orphan_scan_timeout(void)
> +{
> + unsigned long time;
> +
> + get_random_bytes(&time, sizeof(time));
> + time = ORPHAN_SCAN_SCHEDULE_TIMEOUT + (time % 5000);
> + return msecs_to_jiffies(time);
> +}
> +
> +/*
> + * ocfs2_queue_orphan_scan calls ocfs2_queue_recovery_completion for
> + * every slot which queues a recovery of slot on ocfs2_wq thread. This is done
> + * to cleanup any orphans that are left over in orphan slots.
> + *
> + * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT seconds
> + * It gets an EX lock on os_lockres and checks sequence number stored in LVB. If
> + * the sequence number is changed it means some node has done the scan. Skip the
> + * scan and tracks the sequence number. If the sequence number didn't change,
> + * means a scan didn't happen, so the node queues a scan and increments the
> + * sequence number in LVB.
> + */
> +void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
> +{
> + struct ocfs2_orphan_scan *os;
> + int status, i;
> + u32 seqno = 0;
> +
> + os = &osb->osb_orphan_scan;
> +
> + status = ocfs2_orphan_scan_lock(osb, &seqno, DLM_LOCK_EX);
> + if (status < 0) {
> + if (status != -EAGAIN)
> + mlog_errno(status);
> + goto out;
> + }
> +
> + if (os->os_seqno != seqno) {
> + os->os_seqno = seqno;
> + goto unlock;
> + }
> +
> + for (i = 0; i < osb->max_slots; i++)
> + ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
> + NULL);
> + /*
> + * We queued a recovery on orphan slots, increment the sequence
> + * number and update LVB so other node will skip the scan for a while
> + */
> + seqno++;
> +unlock:
> + ocfs2_orphan_scan_unlock(osb, seqno, DLM_LOCK_EX);
> +out:
> + return;
> +}
> +
> +/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millsec */
> +void ocfs2_orphan_scan_work(struct work_struct *work)
> +{
> + struct ocfs2_orphan_scan *os;
> + struct ocfs2_super *osb;
> +
> + os = container_of(work, struct ocfs2_orphan_scan,
> + os_orphan_scan_work.work);
> + osb = os->os_osb;
> +
> + mutex_lock(&os->os_lock);
> + ocfs2_queue_orphan_scan(osb);
> + schedule_delayed_work(&os->os_orphan_scan_work,
> + ocfs2_orphan_scan_timeout());
> + mutex_unlock(&os->os_lock);
> +}
> +
> +void ocfs2_orphan_scan_stop(struct ocfs2_super *osb)
> +{
> + struct ocfs2_orphan_scan *os;
> +
> + os = &osb->osb_orphan_scan;
> + mutex_lock(&os->os_lock);
> + cancel_delayed_work(&os->os_orphan_scan_work);
> + mutex_unlock(&os->os_lock);
> +}
> +
> +int ocfs2_orphan_scan_init(struct ocfs2_super *osb)
> +{
> + struct ocfs2_orphan_scan *os;
> +
> + os = &osb->osb_orphan_scan;
> + os->os_osb = osb;
> + mutex_init(&os->os_lock);
> +
> + INIT_DELAYED_WORK(&os->os_orphan_scan_work,
> + ocfs2_orphan_scan_work);
> + schedule_delayed_work(&os->os_orphan_scan_work,
> + ocfs2_orphan_scan_timeout());
> + return 0;
> +}
> +
> struct ocfs2_orphan_filldir_priv {
> struct inode *head;
> struct ocfs2_super *osb;
> diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
> index 619dd7f..3483202 100644
> --- a/fs/ocfs2/journal.h
> +++ b/fs/ocfs2/journal.h
> @@ -144,6 +144,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
> }
>
> /* Exported only for the journal struct init code in super.c. Do not call. */
> +int ocfs2_orphan_scan_init(struct ocfs2_super *osb);
> +void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
> +void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
> +
> void ocfs2_complete_recovery(struct work_struct *work);
> void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
>
> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
> index 1386281..373fb1c 100644
> --- a/fs/ocfs2/ocfs2.h
> +++ b/fs/ocfs2/ocfs2.h
> @@ -151,6 +151,14 @@ struct ocfs2_lock_res {
> #endif
> };
>
> +struct ocfs2_orphan_scan {
> + struct mutex os_lock;
> + struct ocfs2_super *os_osb;
> + struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */
> + struct delayed_work os_orphan_scan_work;
> + u32 os_seqno; /* incremented on every scan */
> +};
> +
> struct ocfs2_dlm_debug {
> struct kref d_refcnt;
> struct dentry *d_locking_state;
> @@ -341,6 +349,8 @@ struct ocfs2_super
> unsigned int *osb_orphan_wipes;
> wait_queue_head_t osb_wipe_event;
>
> + struct ocfs2_orphan_scan osb_orphan_scan;
> +
> /* used to protect metaecc calculation check of xattr. */
> spinlock_t osb_xattr_lock;
>
> diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
> index a53ce87..fcdba09 100644
> --- a/fs/ocfs2/ocfs2_lockid.h
> +++ b/fs/ocfs2/ocfs2_lockid.h
> @@ -48,6 +48,7 @@ enum ocfs2_lock_type {
> OCFS2_LOCK_TYPE_FLOCK,
> OCFS2_LOCK_TYPE_QINFO,
> OCFS2_LOCK_TYPE_NFS_SYNC,
> + OCFS2_LOCK_TYPE_ORPHAN_SCAN,
> OCFS2_NUM_LOCK_TYPES
> };
>
> @@ -85,6 +86,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
> case OCFS2_LOCK_TYPE_NFS_SYNC:
> c = 'Y';
> break;
> + case OCFS2_LOCK_TYPE_ORPHAN_SCAN:
> + c = 'P';
> + break;
> default:
> c = '\0';
> }
> @@ -104,6 +108,7 @@ static char *ocfs2_lock_type_strings[] = {
> [OCFS2_LOCK_TYPE_OPEN] = "Open",
> [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
> [OCFS2_LOCK_TYPE_QINFO] = "Quota",
> + [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
> };
>
> static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
> index 79ff8d9..44ac27e 100644
> --- a/fs/ocfs2/super.c
> +++ b/fs/ocfs2/super.c
> @@ -1802,6 +1802,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
>
> ocfs2_truncate_log_shutdown(osb);
>
> + ocfs2_orphan_scan_stop(osb);
> +
> /* This will disable recovery and flush any recovery work. */
> ocfs2_recovery_exit(osb);
>
> @@ -1957,6 +1959,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
> goto bail;
> }
>
> + status = ocfs2_orphan_scan_init(osb);
> + if (status) {
> + mlog(ML_ERROR, "Unable to initialize delayed orphan scan\n");
> + mlog_errno(status);
> + goto bail;
> + }
> +
> init_waitqueue_head(&osb->checkpoint_event);
> atomic_set(&osb->needs_checkpoint, 0);
>
>
^ permalink raw reply [flat|nested] 18+ messages in thread
* [Ocfs2-devel] Patches that adds delayed orphan scan timer (rev 3)
2009-06-04 0:02 [Ocfs2-devel] Patches that adds delayed orphan scan timer (rev 3) Srinivas Eeda
2009-06-04 0:02 ` [Ocfs2-devel] [PATCH 1/2] ocfs2: timer to queue scan of all orphan slots Srinivas Eeda
2009-06-04 0:02 ` [Ocfs2-devel] [PATCH 2/2] ocfs2 patch to track delayed orphan scan timer statistics Srinivas Eeda
@ 2009-06-04 2:27 ` Joel Becker
2 siblings, 0 replies; 18+ messages in thread
From: Joel Becker @ 2009-06-04 2:27 UTC (permalink / raw)
To: ocfs2-devel
On Wed, Jun 03, 2009 at 05:02:54PM -0700, Srinivas Eeda wrote:
> Resending after implementing review comments.
These patches are now in the merge-window branch of ocfs2.git.
They also live on the orphan-scan topic branch.
Joel
--
"I think it would be a good idea."
- Mahatma Gandhi, when asked what he thought of Western
civilization
Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127
^ permalink raw reply [flat|nested] 18+ messages in thread
* [Ocfs2-devel] [PATCH 1/2] ocfs2: timer to queue scan of all orphan slots
2009-06-04 6:40 [Ocfs2-devel] Backport that adds delayed orphan scan timer to 1.4 Srinivas Eeda
@ 2009-06-04 6:40 ` Srinivas Eeda
2009-06-09 23:47 ` Sunil Mushran
0 siblings, 1 reply; 18+ messages in thread
From: Srinivas Eeda @ 2009-06-04 6:40 UTC (permalink / raw)
To: ocfs2-devel
When a dentry is unlinked, the unlinking node takes an EX on the dentry lock
before moving the dentry to the orphan directory. The other nodes, that all had
a PR on the same dentry lock, flag the corresponding inode as MAYBE_ORPHANED
during the downconvert. The inode is finally deleted when the last node to iput
the inode notices the MAYBE_ORPHANED flag.
A problem arises if a node is forced to free dentry locks because of memory
pressure. If this happens, the node will no longer get downconvert notifications
for the dentries that have been unlinked on another node. If it also happens
that node is actively using the corresponding inode and happens to be the one
performing the last iput on that inode, it will fail to delete the inode as it
will not have the MAYBE_ORPHANED flag set.
This patch fixes this shortcoming by introducing a periodic scan of the orphan
directories to delete such inodes. Care has been taken to distribute the
workload across the cluster so that no one node has to perform the task all the
time.
Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com>
---
fs/ocfs2/dlmglue.c | 51 ++++++++++++++++++++++
fs/ocfs2/dlmglue.h | 10 ++++
fs/ocfs2/journal.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++
fs/ocfs2/journal.h | 4 ++
fs/ocfs2/ocfs2.h | 9 ++++
fs/ocfs2/ocfs2_lockid.h | 5 ++
fs/ocfs2/super.c | 9 ++++
7 files changed, 194 insertions(+), 0 deletions(-)
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index bacb092..72463d8 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -246,6 +246,10 @@ static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
.flags = 0,
};
+static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
+ .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
+};
+
static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
.get_osb = ocfs2_get_dentry_osb,
.post_unlock = ocfs2_dentry_post_unlock,
@@ -632,6 +636,19 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
&ocfs2_rename_lops, osb);
}
+static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
+ struct ocfs2_super *osb)
+{
+ struct ocfs2_orphan_scan_lvb *lvb;
+
+ ocfs2_lock_res_init_once(res);
+ ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, res->l_name);
+ ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN,
+ &ocfs2_orphan_scan_lops, osb);
+ lvb = (struct ocfs2_orphan_scan_lvb *)res->l_lksb.lvb;
+ lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
+}
+
void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
struct ocfs2_file_private *fp)
{
@@ -2212,6 +2229,37 @@ void ocfs2_inode_unlock(struct inode *inode,
mlog_exit_void();
}
+int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int ex)
+{
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_orphan_scan_lvb *lvb;
+ int level = ex ? LKM_EXMODE : LKM_PRMODE;
+ int status = 0;
+
+ lockres = &osb->osb_orphan_scan.os_lockres;
+ status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+ if (status < 0)
+ return status;
+
+ lvb = (struct ocfs2_orphan_scan_lvb *)lockres->l_lksb.lvb;
+ if (lvb->lvb_version == OCFS2_ORPHAN_LVB_VERSION)
+ *seqno = be32_to_cpu(lvb->lvb_os_seqno);
+ return status;
+}
+
+void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno, int ex)
+{
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_orphan_scan_lvb *lvb;
+ int level = ex ? LKM_EXMODE : LKM_PRMODE;
+
+ lockres = &osb->osb_orphan_scan.os_lockres;
+ lvb = (struct ocfs2_orphan_scan_lvb *)lockres->l_lksb.lvb;
+ lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
+ lvb->lvb_os_seqno = cpu_to_be32(seqno);
+ ocfs2_cluster_unlock(osb, lockres, level);
+}
+
int ocfs2_super_lock(struct ocfs2_super *osb,
int ex)
{
@@ -2676,6 +2724,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
local:
ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
+ ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
osb->dlm = dlm;
@@ -2706,6 +2755,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
ocfs2_lock_res_free(&osb->osb_super_lockres);
ocfs2_lock_res_free(&osb->osb_rename_lockres);
+ ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);
dlm_unregister_domain(osb->dlm);
osb->dlm = NULL;
@@ -2904,6 +2954,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
{
ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
+ ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres);
}
int ocfs2_drop_inode_locks(struct inode *inode)
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index e3cf902..a197c09 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -49,6 +49,14 @@ struct ocfs2_meta_lvb {
__be32 lvb_reserved2;
};
+#define OCFS2_ORPHAN_LVB_VERSION 1
+
+struct ocfs2_orphan_scan_lvb {
+ __u8 lvb_version;
+ __u8 lvb_reserved[3];
+ __be32 lvb_os_seqno;
+};
+
/* ocfs2_inode_lock_full() 'arg_flags' flags */
/* don't wait on recovery. */
#define OCFS2_META_LOCK_RECOVERY (0x01)
@@ -97,6 +105,8 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
int ex);
void ocfs2_super_unlock(struct ocfs2_super *osb,
int ex);
+int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int ex);
+void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno, int ex);
int ocfs2_rename_lock(struct ocfs2_super *osb);
void ocfs2_rename_unlock(struct ocfs2_super *osb);
int ocfs2_dentry_lock(struct dentry *dentry, int ex);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 484ccb5..320d577 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -28,6 +28,8 @@
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
+#include <linux/time.h>
+#include <linux/random.h>
#define MLOG_MASK_PREFIX ML_JOURNAL
#include <cluster/masklog.h>
@@ -50,6 +52,8 @@
DEFINE_SPINLOCK(trans_inc_lock);
+#define ORPHAN_SCAN_SCHEDULE_TIMEOUT 300000
+
static int ocfs2_force_read_journal(struct inode *inode);
static int ocfs2_recover_node(struct ocfs2_super *osb,
int node_num);
@@ -1440,6 +1444,108 @@ bail:
return status;
}
+/*
+ * Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT. Add some
+ * randomness to the timeout to minimize multiple nodes firing the timer at the
+ * same time.
+ */
+static inline unsigned long ocfs2_orphan_scan_timeout(void)
+{
+ unsigned long time;
+
+ get_random_bytes(&time, sizeof(time));
+ time = ORPHAN_SCAN_SCHEDULE_TIMEOUT + (time % 5000);
+ return msecs_to_jiffies(time);
+}
+
+/*
+ * ocfs2_queue_orphan_scan calls ocfs2_queue_recovery_completion for
+ * every slot which queues a recovery of slot on ocfs2_wq thread. This is done
+ * to cleanup any orphans that are left over in orphan slots.
+ *
+ * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT milliseconds.
+ * It gets an EX lock on os_lockres and checks sequence number stored in LVB. If
+ * the sequence number is changed it means some node has done the scan. Skip the
+ * scan and tracks the sequence number. If the sequence number didn't change,
+ * means a scan didn't happen, so the node queues a scan and increments the
+ * sequence number in LVB.
+ */
+void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
+{
+ struct ocfs2_orphan_scan *os;
+ int status, i;
+ u32 seqno = 0;
+
+ os = &osb->osb_orphan_scan;
+
+ status = ocfs2_orphan_scan_lock(osb, &seqno, LKM_EXMODE);
+ if (status < 0) {
+ if (status != -EAGAIN)
+ mlog_errno(status);
+ goto out;
+ }
+
+ if (os->os_seqno != seqno) {
+ os->os_seqno = seqno;
+ goto unlock;
+ }
+
+ for (i = 0; i < osb->max_slots; i++)
+ ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL);
+
+ /*
+ * We queued a recovery on orphan slots, increment the sequence
+ * number and update LVB so other node will skip the scan for a while
+ */
+ seqno++;
+unlock:
+ ocfs2_orphan_scan_unlock(osb, seqno, LKM_EXMODE);
+out:
+ return;
+}
+
+/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT milliseconds */
+void ocfs2_orphan_scan_work(kapi_work_struct_t *work)
+{
+ struct ocfs2_orphan_scan *os;
+ struct ocfs2_super *osb;
+
+ os = work_to_object(work, struct ocfs2_orphan_scan,
+ os_orphan_scan_work.work);
+ osb = os->os_osb;
+
+ mutex_lock(&os->os_lock);
+ ocfs2_queue_orphan_scan(osb);
+ schedule_delayed_work(&os->os_orphan_scan_work,
+ ocfs2_orphan_scan_timeout());
+ mutex_unlock(&os->os_lock);
+}
+
+void ocfs2_orphan_scan_stop(struct ocfs2_super *osb)
+{
+ struct ocfs2_orphan_scan *os;
+
+ os = &osb->osb_orphan_scan;
+ mutex_lock(&os->os_lock);
+ cancel_delayed_work(&os->os_orphan_scan_work);
+ mutex_unlock(&os->os_lock);
+}
+
+int ocfs2_orphan_scan_init(struct ocfs2_super *osb)
+{
+ struct ocfs2_orphan_scan *os;
+
+ os = &osb->osb_orphan_scan;
+ os->os_osb = osb;
+ mutex_init(&os->os_lock);
+
+ KAPI_INIT_DELAYED_WORK(&os->os_orphan_scan_work,
+ ocfs2_orphan_scan_work, os);
+ schedule_delayed_work(&os->os_orphan_scan_work,
+ ocfs2_orphan_scan_timeout());
+ return 0;
+}
+
struct ocfs2_orphan_filldir_priv {
struct inode *head;
struct ocfs2_super *osb;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index df8e9de..fa18a84 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -133,6 +133,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
}
/* Exported only for the journal struct init code in super.c. Do not call. */
+int ocfs2_orphan_scan_init(struct ocfs2_super *osb);
+void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
+void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
+
void ocfs2_complete_recovery(kapi_work_struct_t *work);
int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index e84185d..7989966 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -145,6 +145,14 @@ struct ocfs2_lock_res {
#endif
};
+struct ocfs2_orphan_scan {
+ struct mutex os_lock;
+ struct ocfs2_super *os_osb;
+ struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */
+ struct delayed_work os_orphan_scan_work;
+ u32 os_seqno; /* incremented on every scan */
+};
+
struct ocfs2_dlm_debug {
struct kref d_refcnt;
struct dentry *d_locking_state;
@@ -319,6 +327,7 @@ struct ocfs2_super
struct ocfs2_node_map osb_recovering_orphan_dirs;
unsigned int *osb_orphan_wipes;
wait_queue_head_t osb_wipe_event;
+ struct ocfs2_orphan_scan osb_orphan_scan;
/* the group we used to allocate inodes. */
u64 osb_inode_alloc_group;
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 86f3e37..89e2645 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -46,6 +46,7 @@ enum ocfs2_lock_type {
OCFS2_LOCK_TYPE_DENTRY,
OCFS2_LOCK_TYPE_OPEN,
OCFS2_LOCK_TYPE_FLOCK,
+ OCFS2_LOCK_TYPE_ORPHAN_SCAN,
OCFS2_NUM_LOCK_TYPES
};
@@ -77,6 +78,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
case OCFS2_LOCK_TYPE_FLOCK:
c = 'F';
break;
+ case OCFS2_LOCK_TYPE_ORPHAN_SCAN:
+ c = 'P';
+ break;
default:
c = '\0';
}
@@ -95,6 +99,7 @@ static char *ocfs2_lock_type_strings[] = {
[OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
[OCFS2_LOCK_TYPE_OPEN] = "Open",
[OCFS2_LOCK_TYPE_FLOCK] = "Flock",
+ [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
};
static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index a421e7d..cd66b4d 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1495,6 +1495,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
ocfs2_truncate_log_shutdown(osb);
+ ocfs2_orphan_scan_stop(osb);
+
/* disable any new recovery threads and wait for any currently
* running ones to exit. Do this before setting the vol_state. */
mutex_lock(&osb->recovery_lock);
@@ -1640,6 +1642,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
osb->disable_recovery = 0;
osb->recovery_thread_task = NULL;
+ status = ocfs2_orphan_scan_init(osb);
+ if (status) {
+ mlog(ML_ERROR, "Unable to initialize delayed orphan scan\n");
+ mlog_errno(status);
+ goto bail;
+ }
+
init_waitqueue_head(&osb->checkpoint_event);
atomic_set(&osb->needs_checkpoint, 0);
--
1.5.6.5
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [Ocfs2-devel] [PATCH 1/2] ocfs2: timer to queue scan of all orphan slots
2009-06-04 6:40 ` [Ocfs2-devel] [PATCH 1/2] ocfs2: timer to queue scan of all orphan slots Srinivas Eeda
@ 2009-06-09 23:47 ` Sunil Mushran
0 siblings, 0 replies; 18+ messages in thread
From: Sunil Mushran @ 2009-06-09 23:47 UTC (permalink / raw)
To: ocfs2-devel
Srini,
I was re-reviewing these patches as part of the 1.4 merge and something
caught my eye: specifically, this may slow down umount unnecessarily.
Consider the case where scan_work() fires a tick before scan_stop().
Currently, scan_stop() will cancel the newly queued scan_work() but do
nothing about the tasks queued by the earlier firing. The umount thread will
have to wait for all those tasks to complete.
One solution is to add a flag, atomic_t os_stop_scan, in struct
ocfs2_orphan_scan. Call atomic_set() at the top of ocfs2_orphan_scan_stop().
In ocfs2_queue_orphan_scan(), check if the flag is set before and after
ocfs2_orphan_scan_lock(). If set, exit without queuing the tasks.
Secondly, call scan_stop() earlier in umount. Definitely before truncatelog
shutdown. Actually make it even before localalloc shutdown.
Make the patch atop what is in Joel's git tree already.
Thanks
Sunil
Srinivas Eeda wrote:
> +void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
> +{
> + struct ocfs2_orphan_scan *os;
> + int status, i;
> + u32 seqno = 0;
> +
> + os = &osb->osb_orphan_scan;
> +
> + status = ocfs2_orphan_scan_lock(osb, &seqno, LKM_EXMODE);
> + if (status < 0) {
> + if (status != -EAGAIN)
> + mlog_errno(status);
> + goto out;
> + }
> +
> + if (os->os_seqno != seqno) {
> + os->os_seqno = seqno;
> + goto unlock;
> + }
> +
> + for (i = 0; i < osb->max_slots; i++)
> + ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL);
> +
> + /*
> + * We queued a recovery on orphan slots, increment the sequence
> + * number and update LVB so other node will skip the scan for a while
> + */
> + seqno++;
> +unlock:
> + ocfs2_orphan_scan_unlock(osb, seqno, LKM_EXMODE);
> +out:
> + return;
> +}
> +
> +/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millsec */
> +void ocfs2_orphan_scan_work(kapi_work_struct_t *work)
> +{
> + struct ocfs2_orphan_scan *os;
> + struct ocfs2_super *osb;
> +
> + os = work_to_object(work, struct ocfs2_orphan_scan,
> + os_orphan_scan_work.work);
> + osb = os->os_osb;
> +
> + mutex_lock(&os->os_lock);
> + ocfs2_queue_orphan_scan(osb);
> + schedule_delayed_work(&os->os_orphan_scan_work,
> + ocfs2_orphan_scan_timeout());
> + mutex_unlock(&os->os_lock);
> +}
> +
> +void ocfs2_orphan_scan_stop(struct ocfs2_super *osb)
> +{
> + struct ocfs2_orphan_scan *os;
> +
> + os = &osb->osb_orphan_scan;
> + mutex_lock(&os->os_lock);
> + cancel_delayed_work(&os->os_orphan_scan_work);
> + mutex_unlock(&os->os_lock);
> +}
> +
> +int ocfs2_orphan_scan_init(struct ocfs2_super *osb)
> +{
> + struct ocfs2_orphan_scan *os;
> +
> + os = &osb->osb_orphan_scan;
> + os->os_osb = osb;
> + mutex_init(&os->os_lock);
> +
> + KAPI_INIT_DELAYED_WORK(&os->os_orphan_scan_work,
> + ocfs2_orphan_scan_work, os);
> + schedule_delayed_work(&os->os_orphan_scan_work,
> + ocfs2_orphan_scan_timeout());
> + return 0;
> +}
> +
>
> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
> index e84185d..7989966 100644
> --- a/fs/ocfs2/ocfs2.h
> +++ b/fs/ocfs2/ocfs2.h
> @@ -145,6 +145,14 @@ struct ocfs2_lock_res {
> #endif
> };
>
> +struct ocfs2_orphan_scan {
> + struct mutex os_lock;
> + struct ocfs2_super *os_osb;
> + struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */
> + struct delayed_work os_orphan_scan_work;
> + u32 os_seqno; /* incremented on every scan */
> +};
> +
>
> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
> index a421e7d..cd66b4d 100644
> --- a/fs/ocfs2/super.c
> +++ b/fs/ocfs2/super.c
> @@ -1495,6 +1495,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
>
> ocfs2_truncate_log_shutdown(osb);
>
> + ocfs2_orphan_scan_stop(osb);
> +
> /* disable any new recovery threads and wait for any currently
> * running ones to exit. Do this before setting the vol_state. */
> mutex_lock(&osb->recovery_lock);
> @@ -1640,6 +1642,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
> osb->disable_recovery = 0;
> osb->recovery_thread_task = NULL;
>
> + status = ocfs2_orphan_scan_init(osb);
> + if (status) {
> + mlog(ML_ERROR, "Unable to initialize delayed orphan scan\n");
> + mlog_errno(status);
> + goto bail;
> + }
> +
> init_waitqueue_head(&osb->checkpoint_event);
> atomic_set(&osb->needs_checkpoint, 0);
>
^ permalink raw reply [flat|nested] 18+ messages in thread
* [Ocfs2-devel] [PATCH 1/2] ocfs2: timer to queue scan of all orphan slots
2009-06-04 0:02 ` [Ocfs2-devel] [PATCH 1/2] ocfs2: timer to queue scan of all orphan slots Srinivas Eeda
2009-06-04 0:16 ` Sunil Mushran
@ 2009-06-10 5:37 ` Tao Ma
2009-06-10 6:50 ` Srinivas Eeda
2009-06-10 7:58 ` Joel Becker
1 sibling, 2 replies; 18+ messages in thread
From: Tao Ma @ 2009-06-10 5:37 UTC (permalink / raw)
To: ocfs2-devel
Hi Srini/Joel/Sunil,
I also have some thoughts on this. I hope it isn't too late.
Currently, the orphan scan just iterates over all the slots and calls
ocfs2_queue_recovery_completion, but I don't think it is proper for a
node to query the slot of another mounted node, since that node will query
its own slot by itself. What's more, it will affect reflink greatly.
In my current implementation of reflink, it works like this:
1. create an inode in the orphan dir.
2. reflink all the extents.
3. move the inode from the orphan dir to the destination.
For efficiency, I lock the orphan dir only in steps 1 and 3, and release the
lock during step 2, because reflink may take a long time and we don't want to
block other "unlink" processes. And in step 1, the created inode looks just
like a deleted one, so that a crash in step 2 won't prevent it from
being deleted by fsck or recovery.
But with your patch, we may have a race in step 2 in which your recovery
deletes the inode created in step 1. So my suggestion is that your
orphan scan just skip the slots of other mounted nodes so it won't affect
those nodes' ongoing reflink. As for the node itself, it is very easy to
postpone the orphan scan by setting a flag in ocfs2_super while a reflink is
ongoing (I will do it).
Make sense?
Regards,
Tao
Srinivas Eeda wrote:
> When a dentry is unlinked, the unlinking node takes an EX on the dentry lock
> before moving the dentry to the orphan directory. The other nodes, that all had
> a PR on the same dentry lock, flag the corresponding inode as MAYBE_ORPHANED
> during the downconvert. The inode is finally deleted when the last node to iput
> the inode notices the MAYBE_ORPHANED flag.
>
> A problem arises if a node is forced to free dentry locks because of memory
> pressure. If this happens, the node will no longer get downconvert notifications
> for the dentries that have been unlinked on another node. If it also happens
> that node is actively using the corresponding inode and happens to be the one
> performing the last iput on that inode, it will fail to delete the inode as it
> will not have the MAYBE_ORPHANED flag set.
>
> This patch fixes this shortcoming by introducing a periodic scan of the orphan
> directories to delete such inodes. Care has been taken to distribute the
> workload across the cluster so that no one node has to perform the task all the
> time.
>
> Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com>
> ---
> fs/ocfs2/dlmglue.c | 51 ++++++++++++++++++++++
> fs/ocfs2/dlmglue.h | 11 +++++
> fs/ocfs2/journal.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++
> fs/ocfs2/journal.h | 4 ++
> fs/ocfs2/ocfs2.h | 10 ++++
> fs/ocfs2/ocfs2_lockid.h | 5 ++
> fs/ocfs2/super.c | 9 ++++
> 7 files changed, 196 insertions(+), 0 deletions(-)
>
> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
> index e15fc7d..0f35b83 100644
> --- a/fs/ocfs2/dlmglue.c
> +++ b/fs/ocfs2/dlmglue.c
> @@ -248,6 +248,10 @@ static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
> .flags = 0,
> };
>
> +static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
> + .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
> +};
> +
> static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
> .get_osb = ocfs2_get_dentry_osb,
> .post_unlock = ocfs2_dentry_post_unlock,
> @@ -637,6 +641,19 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
> &ocfs2_nfs_sync_lops, osb);
> }
>
> +static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
> + struct ocfs2_super *osb)
> +{
> + struct ocfs2_orphan_scan_lvb *lvb;
> +
> + ocfs2_lock_res_init_once(res);
> + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, res->l_name);
> + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN,
> + &ocfs2_orphan_scan_lops, osb);
> + lvb = ocfs2_dlm_lvb(&res->l_lksb);
> + lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
> +}
> +
> void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
> struct ocfs2_file_private *fp)
> {
> @@ -2352,6 +2369,37 @@ void ocfs2_inode_unlock(struct inode *inode,
> mlog_exit_void();
> }
>
> +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int ex)
> +{
> + struct ocfs2_lock_res *lockres;
> + struct ocfs2_orphan_scan_lvb *lvb;
> + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
> + int status = 0;
> +
> + lockres = &osb->osb_orphan_scan.os_lockres;
> + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
> + if (status < 0)
> + return status;
> +
> + lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
> + if (lvb->lvb_version == OCFS2_ORPHAN_LVB_VERSION)
> + *seqno = be32_to_cpu(lvb->lvb_os_seqno);
> + return status;
> +}
> +
> +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno, int ex)
> +{
> + struct ocfs2_lock_res *lockres;
> + struct ocfs2_orphan_scan_lvb *lvb;
> + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
> +
> + lockres = &osb->osb_orphan_scan.os_lockres;
> + lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
> + lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
> + lvb->lvb_os_seqno = cpu_to_be32(seqno);
> + ocfs2_cluster_unlock(osb, lockres, level);
> +}
> +
> int ocfs2_super_lock(struct ocfs2_super *osb,
> int ex)
> {
> @@ -2842,6 +2890,7 @@ local:
> ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
> ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
> ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
> + ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
>
> osb->cconn = conn;
>
> @@ -2878,6 +2927,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
> ocfs2_lock_res_free(&osb->osb_super_lockres);
> ocfs2_lock_res_free(&osb->osb_rename_lockres);
> ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
> + ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);
>
> ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
> osb->cconn = NULL;
> @@ -3061,6 +3111,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
> ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
> ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
> ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
> + ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres);
> }
>
> int ocfs2_drop_inode_locks(struct inode *inode)
> diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
> index e1fd572..31b90d7 100644
> --- a/fs/ocfs2/dlmglue.h
> +++ b/fs/ocfs2/dlmglue.h
> @@ -62,6 +62,14 @@ struct ocfs2_qinfo_lvb {
> __be32 lvb_free_entry;
> };
>
> +#define OCFS2_ORPHAN_LVB_VERSION 1
> +
> +struct ocfs2_orphan_scan_lvb {
> + __u8 lvb_version;
> + __u8 lvb_reserved[3];
> + __be32 lvb_os_seqno;
> +};
> +
> /* ocfs2_inode_lock_full() 'arg_flags' flags */
> /* don't wait on recovery. */
> #define OCFS2_META_LOCK_RECOVERY (0x01)
> @@ -113,6 +121,9 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
> int ex);
> void ocfs2_super_unlock(struct ocfs2_super *osb,
> int ex);
> +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int ex);
> +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno, int ex);
> +
> int ocfs2_rename_lock(struct ocfs2_super *osb);
> void ocfs2_rename_unlock(struct ocfs2_super *osb);
> int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
> diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
> index a20a0f1..dc7cea3 100644
> --- a/fs/ocfs2/journal.c
> +++ b/fs/ocfs2/journal.c
> @@ -28,6 +28,8 @@
> #include <linux/slab.h>
> #include <linux/highmem.h>
> #include <linux/kthread.h>
> +#include <linux/time.h>
> +#include <linux/random.h>
>
> #define MLOG_MASK_PREFIX ML_JOURNAL
> #include <cluster/masklog.h>
> @@ -52,6 +54,8 @@
>
> DEFINE_SPINLOCK(trans_inc_lock);
>
> +#define ORPHAN_SCAN_SCHEDULE_TIMEOUT 300000
> +
> static int ocfs2_force_read_journal(struct inode *inode);
> static int ocfs2_recover_node(struct ocfs2_super *osb,
> int node_num, int slot_num);
> @@ -1841,6 +1845,108 @@ bail:
> return status;
> }
>
> +/*
> + * Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT. Add some
> + * randomness to the timeout to minimize multple nodes firing the timer at the
> + * same time.
> + */
> +static inline unsigned long ocfs2_orphan_scan_timeout(void)
> +{
> + unsigned long time;
> +
> + get_random_bytes(&time, sizeof(time));
> + time = ORPHAN_SCAN_SCHEDULE_TIMEOUT + (time % 5000);
> + return msecs_to_jiffies(time);
> +}
> +
> +/*
> + * ocfs2_queue_orphan_scan calls ocfs2_queue_recovery_completion for
> + * every slot which queues a recovery of slot on ocfs2_wq thread. This is done
> + * to cleanup any orphans that are left over in orphan slots.
> + *
> + * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT seconds
> + * It gets an EX lock on os_lockres and checks sequence number stored in LVB. If
> + * the sequence number is changed it means some node has done the scan. Skip the
> + * scan and tracks the sequence number. If the sequence number didn't change,
> + * means a scan didn't happen, so the node queues a scan and increments the
> + * sequence number in LVB.
> + */
> +void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
> +{
> + struct ocfs2_orphan_scan *os;
> + int status, i;
> + u32 seqno = 0;
> +
> + os = &osb->osb_orphan_scan;
> +
> + status = ocfs2_orphan_scan_lock(osb, &seqno, DLM_LOCK_EX);
> + if (status < 0) {
> + if (status != -EAGAIN)
> + mlog_errno(status);
> + goto out;
> + }
> +
> + if (os->os_seqno != seqno) {
> + os->os_seqno = seqno;
> + goto unlock;
> + }
> +
> + for (i = 0; i < osb->max_slots; i++)
> + ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
> + NULL);
> + /*
> + * We queued a recovery on orphan slots, increment the sequence
> + * number and update LVB so other node will skip the scan for a while
> + */
> + seqno++;
> +unlock:
> + ocfs2_orphan_scan_unlock(osb, seqno, DLM_LOCK_EX);
> +out:
> + return;
> +}
> +
> +/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millsec */
> +void ocfs2_orphan_scan_work(struct work_struct *work)
> +{
> + struct ocfs2_orphan_scan *os;
> + struct ocfs2_super *osb;
> +
> + os = container_of(work, struct ocfs2_orphan_scan,
> + os_orphan_scan_work.work);
> + osb = os->os_osb;
> +
> + mutex_lock(&os->os_lock);
> + ocfs2_queue_orphan_scan(osb);
> + schedule_delayed_work(&os->os_orphan_scan_work,
> + ocfs2_orphan_scan_timeout());
> + mutex_unlock(&os->os_lock);
> +}
> +
> +void ocfs2_orphan_scan_stop(struct ocfs2_super *osb)
> +{
> + struct ocfs2_orphan_scan *os;
> +
> + os = &osb->osb_orphan_scan;
> + mutex_lock(&os->os_lock);
> + cancel_delayed_work(&os->os_orphan_scan_work);
> + mutex_unlock(&os->os_lock);
> +}
> +
> +int ocfs2_orphan_scan_init(struct ocfs2_super *osb)
> +{
> + struct ocfs2_orphan_scan *os;
> +
> + os = &osb->osb_orphan_scan;
> + os->os_osb = osb;
> + mutex_init(&os->os_lock);
> +
> + INIT_DELAYED_WORK(&os->os_orphan_scan_work,
> + ocfs2_orphan_scan_work);
> + schedule_delayed_work(&os->os_orphan_scan_work,
> + ocfs2_orphan_scan_timeout());
> + return 0;
> +}
> +
> struct ocfs2_orphan_filldir_priv {
> struct inode *head;
> struct ocfs2_super *osb;
> diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
> index 619dd7f..3483202 100644
> --- a/fs/ocfs2/journal.h
> +++ b/fs/ocfs2/journal.h
> @@ -144,6 +144,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
> }
>
> /* Exported only for the journal struct init code in super.c. Do not call. */
> +int ocfs2_orphan_scan_init(struct ocfs2_super *osb);
> +void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
> +void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
> +
> void ocfs2_complete_recovery(struct work_struct *work);
> void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
>
> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
> index 1386281..373fb1c 100644
> --- a/fs/ocfs2/ocfs2.h
> +++ b/fs/ocfs2/ocfs2.h
> @@ -151,6 +151,14 @@ struct ocfs2_lock_res {
> #endif
> };
>
> +struct ocfs2_orphan_scan {
> + struct mutex os_lock;
> + struct ocfs2_super *os_osb;
> + struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */
> + struct delayed_work os_orphan_scan_work;
> + u32 os_seqno; /* incremented on every scan */
> +};
> +
> struct ocfs2_dlm_debug {
> struct kref d_refcnt;
> struct dentry *d_locking_state;
> @@ -341,6 +349,8 @@ struct ocfs2_super
> unsigned int *osb_orphan_wipes;
> wait_queue_head_t osb_wipe_event;
>
> + struct ocfs2_orphan_scan osb_orphan_scan;
> +
> /* used to protect metaecc calculation check of xattr. */
> spinlock_t osb_xattr_lock;
>
> diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
> index a53ce87..fcdba09 100644
> --- a/fs/ocfs2/ocfs2_lockid.h
> +++ b/fs/ocfs2/ocfs2_lockid.h
> @@ -48,6 +48,7 @@ enum ocfs2_lock_type {
> OCFS2_LOCK_TYPE_FLOCK,
> OCFS2_LOCK_TYPE_QINFO,
> OCFS2_LOCK_TYPE_NFS_SYNC,
> + OCFS2_LOCK_TYPE_ORPHAN_SCAN,
> OCFS2_NUM_LOCK_TYPES
> };
>
> @@ -85,6 +86,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
> case OCFS2_LOCK_TYPE_NFS_SYNC:
> c = 'Y';
> break;
> + case OCFS2_LOCK_TYPE_ORPHAN_SCAN:
> + c = 'P';
> + break;
> default:
> c = '\0';
> }
> @@ -104,6 +108,7 @@ static char *ocfs2_lock_type_strings[] = {
> [OCFS2_LOCK_TYPE_OPEN] = "Open",
> [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
> [OCFS2_LOCK_TYPE_QINFO] = "Quota",
> + [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
> };
>
> static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
> index 79ff8d9..44ac27e 100644
> --- a/fs/ocfs2/super.c
> +++ b/fs/ocfs2/super.c
> @@ -1802,6 +1802,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
>
> ocfs2_truncate_log_shutdown(osb);
>
> + ocfs2_orphan_scan_stop(osb);
> +
> /* This will disable recovery and flush any recovery work. */
> ocfs2_recovery_exit(osb);
>
> @@ -1957,6 +1959,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
> goto bail;
> }
>
> + status = ocfs2_orphan_scan_init(osb);
> + if (status) {
> + mlog(ML_ERROR, "Unable to initialize delayed orphan scan\n");
> + mlog_errno(status);
> + goto bail;
> + }
> +
> init_waitqueue_head(&osb->checkpoint_event);
> atomic_set(&osb->needs_checkpoint, 0);
>
^ permalink raw reply [flat|nested] 18+ messages in thread
* [Ocfs2-devel] [PATCH 1/2] ocfs2: timer to queue scan of all orphan slots
2009-06-10 5:37 ` Tao Ma
@ 2009-06-10 6:50 ` Srinivas Eeda
2009-06-10 7:38 ` Tao Ma
2009-06-10 7:58 ` Joel Becker
1 sibling, 1 reply; 18+ messages in thread
From: Srinivas Eeda @ 2009-06-10 6:50 UTC (permalink / raw)
To: ocfs2-devel
Tao Ma wrote:
> Hi Srini/Joel/Sunil,
> I also have some thoughts for it. Wish it isn't too late.
>
> Currently, orphan scan just iterate all the slots and call
> ocfs2_queue_recovery_completion, but I don't think it is proper for a
> node to query another mounted one since that node will query it by
> itself. What's more, it will affect reflink greatly.
> In my current implementation of reflink, It will work like this:
> 1. create a inode in orphan dir
> 2. reflink all the extents.
> 3. move the inode from orphan dir to the destination.
>
> For efficiency, I just lock orphan dir in step 1 and 3, and release
> the lock in step 2 in case reflink will take a long time and we don't
> block other "unlink" process. And in step 1, the created inode looks
> really like a deleted one so that any crash in step 2 won't prevent it
> from being deleted by fsck or recovery.
>
> But with your patch, we may have a race in step 2 that your recovery
> will delete the inode created in step 1. So my suggestion is that your
> orphan scan just skip the mounted node so it won't affect other nodes'
> ongoing reflink. As for the node itself, it is very easy to postpone
> the orphan scan by setting a flag in ocfs2_super when reflink is
> ongoing(I will do it).
>
> Make sense?
Yes, I can restrict the node to recovering its own slot and offline slots. I
can make the node recover its own slot every time the timer fires
and the offline slots in a round-robin way (the current way).
>
> Regards,
> Tao
>
> Srinivas Eeda wrote:
>> When a dentry is unlinked, the unlinking node takes an EX on the
>> dentry lock
>> before moving the dentry to the orphan directory. The other nodes,
>> that all had
>> a PR on the same dentry lock, flag the corresponding inode as
>> MAYBE_ORPHANED
>> during the downconvert. The inode is finally deleted when the last
>> node to iput
>> the inode notices the MAYBE_ORPHANED flag.
>>
>> A problem arises if a node is forced to free dentry locks because of
>> memory
>> pressure. If this happens, the node will no longer get downconvert
>> notifications
>> for the dentries that have been unlinked on another node. If it also
>> happens
>> that node is actively using the corresponding inode and happens to be
>> the one
>> performing the last iput on that inode, it will fail to delete the
>> inode as it
>> will not have the MAYBE_ORPHANED flag set.
>>
>> This patch fixes this shortcoming by introducing a periodic scan of
>> the orphan
>> directories to delete such inodes. Care has been taken to distribute the
>> workload across the cluster so that no one node has to perform the
>> task all the
>> time.
>>
>> Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com>
>> ---
>> fs/ocfs2/dlmglue.c | 51 ++++++++++++++++++++++
>> fs/ocfs2/dlmglue.h | 11 +++++
>> fs/ocfs2/journal.c | 106
>> +++++++++++++++++++++++++++++++++++++++++++++++
>> fs/ocfs2/journal.h | 4 ++
>> fs/ocfs2/ocfs2.h | 10 ++++
>> fs/ocfs2/ocfs2_lockid.h | 5 ++
>> fs/ocfs2/super.c | 9 ++++
>> 7 files changed, 196 insertions(+), 0 deletions(-)
>>
>> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
>> index e15fc7d..0f35b83 100644
>> --- a/fs/ocfs2/dlmglue.c
>> +++ b/fs/ocfs2/dlmglue.c
>> @@ -248,6 +248,10 @@ static struct ocfs2_lock_res_ops
>> ocfs2_nfs_sync_lops = {
>> .flags = 0,
>> };
>>
>> +static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
>> + .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
>> +};
>> +
>> static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
>> .get_osb = ocfs2_get_dentry_osb,
>> .post_unlock = ocfs2_dentry_post_unlock,
>> @@ -637,6 +641,19 @@ static void ocfs2_nfs_sync_lock_res_init(struct
>> ocfs2_lock_res *res,
>> &ocfs2_nfs_sync_lops, osb);
>> }
>>
>> +static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
>> + struct ocfs2_super *osb)
>> +{
>> + struct ocfs2_orphan_scan_lvb *lvb;
>> +
>> + ocfs2_lock_res_init_once(res);
>> + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0,
>> res->l_name);
>> + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN,
>> + &ocfs2_orphan_scan_lops, osb);
>> + lvb = ocfs2_dlm_lvb(&res->l_lksb);
>> + lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
>> +}
>> +
>> void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
>> struct ocfs2_file_private *fp)
>> {
>> @@ -2352,6 +2369,37 @@ void ocfs2_inode_unlock(struct inode *inode,
>> mlog_exit_void();
>> }
>>
>> +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int ex)
>> +{
>> + struct ocfs2_lock_res *lockres;
>> + struct ocfs2_orphan_scan_lvb *lvb;
>> + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
>> + int status = 0;
>> +
>> + lockres = &osb->osb_orphan_scan.os_lockres;
>> + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); + if
>> (status < 0)
>> + return status;
>> +
>> + lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
>> + if (lvb->lvb_version == OCFS2_ORPHAN_LVB_VERSION)
>> + *seqno = be32_to_cpu(lvb->lvb_os_seqno); + return status;
>> +}
>> +
>> +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno,
>> int ex)
>> +{
>> + struct ocfs2_lock_res *lockres;
>> + struct ocfs2_orphan_scan_lvb *lvb;
>> + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
>> +
>> + lockres = &osb->osb_orphan_scan.os_lockres;
>> + lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
>> + lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
>> + lvb->lvb_os_seqno = cpu_to_be32(seqno);
>> + ocfs2_cluster_unlock(osb, lockres, level); +}
>> +
>> int ocfs2_super_lock(struct ocfs2_super *osb,
>> int ex)
>> {
>> @@ -2842,6 +2890,7 @@ local:
>> ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
>> ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
>> ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
>> +
>> ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
>>
>> osb->cconn = conn;
>>
>> @@ -2878,6 +2927,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
>> ocfs2_lock_res_free(&osb->osb_super_lockres);
>> ocfs2_lock_res_free(&osb->osb_rename_lockres);
>> ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
>> + ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);
>>
>> ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
>> osb->cconn = NULL;
>> @@ -3061,6 +3111,7 @@ static void ocfs2_drop_osb_locks(struct
>> ocfs2_super *osb)
>> ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
>> ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
>> ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
>> + ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres);
>> }
>>
>> int ocfs2_drop_inode_locks(struct inode *inode)
>> diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
>> index e1fd572..31b90d7 100644
>> --- a/fs/ocfs2/dlmglue.h
>> +++ b/fs/ocfs2/dlmglue.h
>> @@ -62,6 +62,14 @@ struct ocfs2_qinfo_lvb {
>> __be32 lvb_free_entry;
>> };
>>
>> +#define OCFS2_ORPHAN_LVB_VERSION 1
>> +
>> +struct ocfs2_orphan_scan_lvb {
>> + __u8 lvb_version;
>> + __u8 lvb_reserved[3];
>> + __be32 lvb_os_seqno;
>> +};
>> +
>> /* ocfs2_inode_lock_full() 'arg_flags' flags */
>> /* don't wait on recovery. */
>> #define OCFS2_META_LOCK_RECOVERY (0x01)
>> @@ -113,6 +121,9 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
>> int ex);
>> void ocfs2_super_unlock(struct ocfs2_super *osb,
>> int ex);
>> +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int
>> ex);
>> +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno,
>> int ex);
>> +
>> int ocfs2_rename_lock(struct ocfs2_super *osb);
>> void ocfs2_rename_unlock(struct ocfs2_super *osb);
>> int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
>> diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
>> index a20a0f1..dc7cea3 100644
>> --- a/fs/ocfs2/journal.c
>> +++ b/fs/ocfs2/journal.c
>> @@ -28,6 +28,8 @@
>> #include <linux/slab.h>
>> #include <linux/highmem.h>
>> #include <linux/kthread.h>
>> +#include <linux/time.h>
>> +#include <linux/random.h>
>>
>> #define MLOG_MASK_PREFIX ML_JOURNAL
>> #include <cluster/masklog.h>
>> @@ -52,6 +54,8 @@
>>
>> DEFINE_SPINLOCK(trans_inc_lock);
>>
>> +#define ORPHAN_SCAN_SCHEDULE_TIMEOUT 300000
>> +
>> static int ocfs2_force_read_journal(struct inode *inode);
>> static int ocfs2_recover_node(struct ocfs2_super *osb,
>> int node_num, int slot_num);
>> @@ -1841,6 +1845,108 @@ bail:
>> return status;
>> }
>>
>> +/*
>> + * Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT.
>> Add some
>> + * randomness to the timeout to minimize multple nodes firing the
>> timer at the
>> + * same time.
>> + */
>> +static inline unsigned long ocfs2_orphan_scan_timeout(void)
>> +{
>> + unsigned long time;
>> +
>> + get_random_bytes(&time, sizeof(time));
>> + time = ORPHAN_SCAN_SCHEDULE_TIMEOUT + (time % 5000);
>> + return msecs_to_jiffies(time);
>> +}
>> +
>> +/*
>> + * ocfs2_queue_orphan_scan calls ocfs2_queue_recovery_completion for
>> + * every slot which queues a recovery of slot on ocfs2_wq thread.
>> This is done
>> + * to cleanup any orphans that are left over in orphan slots.
>> + *
>> + * ocfs2_queue_orphan_scan gets called every
>> ORPHAN_SCAN_SCHEDULE_TIMEOUT seconds
>> + * It gets an EX lock on os_lockres and checks sequence number
>> stored in LVB. If
>> + * the sequence number is changed it means some node has done the
>> scan. Skip the
>> + * scan and tracks the sequence number. If the sequence number
>> didn't change,
>> + * means a scan didn't happen, so the node queues a scan and
>> increments the
>> + * sequence number in LVB.
>> + */ +void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
>> +{
>> + struct ocfs2_orphan_scan *os;
>> + int status, i;
>> + u32 seqno = 0;
>> +
>> + os = &osb->osb_orphan_scan;
>> +
>> + status = ocfs2_orphan_scan_lock(osb, &seqno, DLM_LOCK_EX);
>> + if (status < 0) {
>> + if (status != -EAGAIN)
>> + mlog_errno(status);
>> + goto out;
>> + }
>> +
>> + if (os->os_seqno != seqno) {
>> + os->os_seqno = seqno;
>> + goto unlock;
>> + }
>> +
>> + for (i = 0; i < osb->max_slots; i++)
>> + ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
>> + NULL);
>> + /*
>> + * We queued a recovery on orphan slots, increment the sequence
>> + * number and update LVB so other node will skip the scan for a
>> while
>> + */
>> + seqno++;
>> +unlock:
>> + ocfs2_orphan_scan_unlock(osb, seqno, DLM_LOCK_EX);
>> +out:
>> + return;
>> +}
>> +
>> +/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT
>> millsec */
>> +void ocfs2_orphan_scan_work(struct work_struct *work)
>> +{
>> + struct ocfs2_orphan_scan *os;
>> + struct ocfs2_super *osb;
>> +
>> + os = container_of(work, struct ocfs2_orphan_scan,
>> + os_orphan_scan_work.work);
>> + osb = os->os_osb;
>> +
>> + mutex_lock(&os->os_lock);
>> + ocfs2_queue_orphan_scan(osb);
>> + schedule_delayed_work(&os->os_orphan_scan_work,
>> + ocfs2_orphan_scan_timeout());
>> + mutex_unlock(&os->os_lock);
>> +}
>> +
>> +void ocfs2_orphan_scan_stop(struct ocfs2_super *osb)
>> +{
>> + struct ocfs2_orphan_scan *os;
>> +
>> + os = &osb->osb_orphan_scan;
>> + mutex_lock(&os->os_lock);
>> + cancel_delayed_work(&os->os_orphan_scan_work);
>> + mutex_unlock(&os->os_lock);
>> +}
>> +
>> +int ocfs2_orphan_scan_init(struct ocfs2_super *osb)
>> +{
>> + struct ocfs2_orphan_scan *os;
>> +
>> + os = &osb->osb_orphan_scan;
>> + os->os_osb = osb;
>> + mutex_init(&os->os_lock);
>> +
>> + INIT_DELAYED_WORK(&os->os_orphan_scan_work,
>> + ocfs2_orphan_scan_work);
>> + schedule_delayed_work(&os->os_orphan_scan_work,
>> + ocfs2_orphan_scan_timeout());
>> + return 0;
>> +}
>> +
>> struct ocfs2_orphan_filldir_priv {
>> struct inode *head;
>> struct ocfs2_super *osb;
>> diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
>> index 619dd7f..3483202 100644
>> --- a/fs/ocfs2/journal.h
>> +++ b/fs/ocfs2/journal.h
>> @@ -144,6 +144,10 @@ static inline void ocfs2_inode_set_new(struct
>> ocfs2_super *osb,
>> }
>>
>> /* Exported only for the journal struct init code in super.c. Do not
>> call. */
>> +int ocfs2_orphan_scan_init(struct ocfs2_super *osb);
>> +void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
>> +void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
>> +
>> void ocfs2_complete_recovery(struct work_struct *work);
>> void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
>>
>> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
>> index 1386281..373fb1c 100644
>> --- a/fs/ocfs2/ocfs2.h
>> +++ b/fs/ocfs2/ocfs2.h
>> @@ -151,6 +151,14 @@ struct ocfs2_lock_res {
>> #endif
>> };
>>
>> +struct ocfs2_orphan_scan {
>> + struct mutex os_lock;
>> + struct ocfs2_super *os_osb;
>> + struct ocfs2_lock_res os_lockres; /* lock to synchronize
>> scans */
>> + struct delayed_work os_orphan_scan_work;
>> + u32 os_seqno; /* incremented on every scan */
>> +};
>> +
>> struct ocfs2_dlm_debug {
>> struct kref d_refcnt;
>> struct dentry *d_locking_state;
>> @@ -341,6 +349,8 @@ struct ocfs2_super
>> unsigned int *osb_orphan_wipes;
>> wait_queue_head_t osb_wipe_event;
>>
>> + struct ocfs2_orphan_scan osb_orphan_scan; +
>> /* used to protect metaecc calculation check of xattr. */
>> spinlock_t osb_xattr_lock;
>>
>> diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
>> index a53ce87..fcdba09 100644
>> --- a/fs/ocfs2/ocfs2_lockid.h
>> +++ b/fs/ocfs2/ocfs2_lockid.h
>> @@ -48,6 +48,7 @@ enum ocfs2_lock_type {
>> OCFS2_LOCK_TYPE_FLOCK,
>> OCFS2_LOCK_TYPE_QINFO,
>> OCFS2_LOCK_TYPE_NFS_SYNC,
>> + OCFS2_LOCK_TYPE_ORPHAN_SCAN,
>> OCFS2_NUM_LOCK_TYPES
>> };
>>
>> @@ -85,6 +86,9 @@ static inline char ocfs2_lock_type_char(enum
>> ocfs2_lock_type type)
>> case OCFS2_LOCK_TYPE_NFS_SYNC:
>> c = 'Y';
>> break;
>> + case OCFS2_LOCK_TYPE_ORPHAN_SCAN:
>> + c = 'P';
>> + break;
>> default:
>> c = '\0';
>> }
>> @@ -104,6 +108,7 @@ static char *ocfs2_lock_type_strings[] = {
>> [OCFS2_LOCK_TYPE_OPEN] = "Open",
>> [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
>> [OCFS2_LOCK_TYPE_QINFO] = "Quota",
>> + [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
>> };
>>
>> static inline const char *ocfs2_lock_type_string(enum
>> ocfs2_lock_type type)
>> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
>> index 79ff8d9..44ac27e 100644
>> --- a/fs/ocfs2/super.c
>> +++ b/fs/ocfs2/super.c
>> @@ -1802,6 +1802,8 @@ static void ocfs2_dismount_volume(struct
>> super_block *sb, int mnt_err)
>>
>> ocfs2_truncate_log_shutdown(osb);
>>
>> + ocfs2_orphan_scan_stop(osb);
>> +
>> /* This will disable recovery and flush any recovery work. */
>> ocfs2_recovery_exit(osb);
>>
>> @@ -1957,6 +1959,13 @@ static int ocfs2_initialize_super(struct
>> super_block *sb,
>> goto bail;
>> }
>>
>> + status = ocfs2_orphan_scan_init(osb);
>> + if (status) {
>> + mlog(ML_ERROR, "Unable to initialize delayed orphan scan\n");
>> + mlog_errno(status);
>> + goto bail;
>> + }
>> +
>> init_waitqueue_head(&osb->checkpoint_event);
>> atomic_set(&osb->needs_checkpoint, 0);
>>
^ permalink raw reply [flat|nested] 18+ messages in thread
* [Ocfs2-devel] [PATCH 1/2] ocfs2: timer to queue scan of all orphan slots
2009-06-10 6:50 ` Srinivas Eeda
@ 2009-06-10 7:38 ` Tao Ma
0 siblings, 0 replies; 18+ messages in thread
From: Tao Ma @ 2009-06-10 7:38 UTC (permalink / raw)
To: ocfs2-devel
Srinivas Eeda wrote:
> Tao Ma wrote:
>> Hi Srini/Joel/Sunil,
>> I also have some thoughts for it. Wish it isn't too late.
>>
>> Currently, orphan scan just iterate all the slots and call
>> ocfs2_queue_recovery_completion, but I don't think it is proper for a
>> node to query another mounted one since that node will query it by
>> itself. What's more, it will affect reflink greatly.
>> In my current implementation of reflink, It will work like this:
>> 1. create a inode in orphan dir
>> 2. reflink all the extents.
>> 3. move the inode from orphan dir to the destination.
>>
>> For efficiency, I just lock orphan dir in step 1 and 3, and release
>> the lock in step 2 in case reflink will take a long time and we don't
>> block other "unlink" process. And in step 1, the created inode looks
>> really like a deleted one so that any crash in step 2 won't prevent it
>> from being deleted by fsck or recovery.
>>
>> But with your patch, we may have a race in step 2 that your recovery
>> will delete the inode created in step 1. So my suggestion is that your
>> orphan scan just skip the mounted node so it won't affect other nodes'
>> ongoing reflink. As for the node itself, it is very easy to postpone
>> the orphan scan by setting a flag in ocfs2_super when reflink is
>> ongoing(I will do it).
>>
>> Make sense?
> Yes, I can restrict the node to recover its own and offline slots. I
> can make the node recover its own slot every time the timer fires
> and the offline slots in a round-robin way (the current way).
yeah, that would be cool.
Regards,
Tao
>>
>> Regards,
>> Tao
>>
>> Srinivas Eeda wrote:
>>> When a dentry is unlinked, the unlinking node takes an EX on the
>>> dentry lock
>>> before moving the dentry to the orphan directory. The other nodes,
>>> that all had
>>> a PR on the same dentry lock, flag the corresponding inode as
>>> MAYBE_ORPHANED
>>> during the downconvert. The inode is finally deleted when the last
>>> node to iput
>>> the inode notices the MAYBE_ORPHANED flag.
>>>
>>> A problem arises if a node is forced to free dentry locks because of
>>> memory
>>> pressure. If this happens, the node will no longer get downconvert
>>> notifications
>>> for the dentries that have been unlinked on another node. If it also
>>> happens
>>> that node is actively using the corresponding inode and happens to be
>>> the one
>>> performing the last iput on that inode, it will fail to delete the
>>> inode as it
>>> will not have the MAYBE_ORPHANED flag set.
>>>
>>> This patch fixes this shortcoming by introducing a periodic scan of
>>> the orphan
>>> directories to delete such inodes. Care has been taken to distribute the
>>> workload across the cluster so that no one node has to perform the
>>> task all the
>>> time.
>>>
>>> Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com>
>>> ---
>>> fs/ocfs2/dlmglue.c | 51 ++++++++++++++++++++++
>>> fs/ocfs2/dlmglue.h | 11 +++++
>>> fs/ocfs2/journal.c | 106
>>> +++++++++++++++++++++++++++++++++++++++++++++++
>>> fs/ocfs2/journal.h | 4 ++
>>> fs/ocfs2/ocfs2.h | 10 ++++
>>> fs/ocfs2/ocfs2_lockid.h | 5 ++
>>> fs/ocfs2/super.c | 9 ++++
>>> 7 files changed, 196 insertions(+), 0 deletions(-)
>>>
>>> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
>>> index e15fc7d..0f35b83 100644
>>> --- a/fs/ocfs2/dlmglue.c
>>> +++ b/fs/ocfs2/dlmglue.c
>>> @@ -248,6 +248,10 @@ static struct ocfs2_lock_res_ops
>>> ocfs2_nfs_sync_lops = {
>>> .flags = 0,
>>> };
>>>
>>> +static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
>>> + .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
>>> +};
>>> +
>>> static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
>>> .get_osb = ocfs2_get_dentry_osb,
>>> .post_unlock = ocfs2_dentry_post_unlock,
>>> @@ -637,6 +641,19 @@ static void ocfs2_nfs_sync_lock_res_init(struct
>>> ocfs2_lock_res *res,
>>> &ocfs2_nfs_sync_lops, osb);
>>> }
>>>
>>> +static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
>>> + struct ocfs2_super *osb)
>>> +{
>>> + struct ocfs2_orphan_scan_lvb *lvb;
>>> +
>>> + ocfs2_lock_res_init_once(res);
>>> + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0,
>>> res->l_name);
>>> + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN,
>>> + &ocfs2_orphan_scan_lops, osb);
>>> + lvb = ocfs2_dlm_lvb(&res->l_lksb);
>>> + lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
>>> +}
>>> +
>>> void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
>>> struct ocfs2_file_private *fp)
>>> {
>>> @@ -2352,6 +2369,37 @@ void ocfs2_inode_unlock(struct inode *inode,
>>> mlog_exit_void();
>>> }
>>>
>>> +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int ex)
>>> +{
>>> + struct ocfs2_lock_res *lockres;
>>> + struct ocfs2_orphan_scan_lvb *lvb;
>>> + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
>>> + int status = 0;
>>> +
>>> + lockres = &osb->osb_orphan_scan.os_lockres;
>>> + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
>>> + if (status < 0)
>>> + return status;
>>> +
>>> + lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
>>> + if (lvb->lvb_version == OCFS2_ORPHAN_LVB_VERSION)
>>> + *seqno = be32_to_cpu(lvb->lvb_os_seqno);
>>> + return status;
>>> +}
>>> +
>>> +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno,
>>> int ex)
>>> +{
>>> + struct ocfs2_lock_res *lockres;
>>> + struct ocfs2_orphan_scan_lvb *lvb;
>>> + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
>>> +
>>> + lockres = &osb->osb_orphan_scan.os_lockres;
>>> + lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
>>> + lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
>>> + lvb->lvb_os_seqno = cpu_to_be32(seqno);
>>> + ocfs2_cluster_unlock(osb, lockres, level);
>>> +}
>>> +
>>> int ocfs2_super_lock(struct ocfs2_super *osb,
>>> int ex)
>>> {
>>> @@ -2842,6 +2890,7 @@ local:
>>> ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
>>> ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
>>> ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
>>> +
>>> ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
>>>
>>> osb->cconn = conn;
>>>
>>> @@ -2878,6 +2927,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
>>> ocfs2_lock_res_free(&osb->osb_super_lockres);
>>> ocfs2_lock_res_free(&osb->osb_rename_lockres);
>>> ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
>>> + ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);
>>>
>>> ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
>>> osb->cconn = NULL;
>>> @@ -3061,6 +3111,7 @@ static void ocfs2_drop_osb_locks(struct
>>> ocfs2_super *osb)
>>> ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
>>> ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
>>> ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
>>> + ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres);
>>> }
>>>
>>> int ocfs2_drop_inode_locks(struct inode *inode)
>>> diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
>>> index e1fd572..31b90d7 100644
>>> --- a/fs/ocfs2/dlmglue.h
>>> +++ b/fs/ocfs2/dlmglue.h
>>> @@ -62,6 +62,14 @@ struct ocfs2_qinfo_lvb {
>>> __be32 lvb_free_entry;
>>> };
>>>
>>> +#define OCFS2_ORPHAN_LVB_VERSION 1
>>> +
>>> +struct ocfs2_orphan_scan_lvb {
>>> + __u8 lvb_version;
>>> + __u8 lvb_reserved[3];
>>> + __be32 lvb_os_seqno;
>>> +};
>>> +
>>> /* ocfs2_inode_lock_full() 'arg_flags' flags */
>>> /* don't wait on recovery. */
>>> #define OCFS2_META_LOCK_RECOVERY (0x01)
>>> @@ -113,6 +121,9 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
>>> int ex);
>>> void ocfs2_super_unlock(struct ocfs2_super *osb,
>>> int ex);
>>> +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int
>>> ex);
>>> +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno,
>>> int ex);
>>> +
>>> int ocfs2_rename_lock(struct ocfs2_super *osb);
>>> void ocfs2_rename_unlock(struct ocfs2_super *osb);
>>> int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
>>> diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
>>> index a20a0f1..dc7cea3 100644
>>> --- a/fs/ocfs2/journal.c
>>> +++ b/fs/ocfs2/journal.c
>>> @@ -28,6 +28,8 @@
>>> #include <linux/slab.h>
>>> #include <linux/highmem.h>
>>> #include <linux/kthread.h>
>>> +#include <linux/time.h>
>>> +#include <linux/random.h>
>>>
>>> #define MLOG_MASK_PREFIX ML_JOURNAL
>>> #include <cluster/masklog.h>
>>> @@ -52,6 +54,8 @@
>>>
>>> DEFINE_SPINLOCK(trans_inc_lock);
>>>
>>> +#define ORPHAN_SCAN_SCHEDULE_TIMEOUT 300000
>>> +
>>> static int ocfs2_force_read_journal(struct inode *inode);
>>> static int ocfs2_recover_node(struct ocfs2_super *osb,
>>> int node_num, int slot_num);
>>> @@ -1841,6 +1845,108 @@ bail:
>>> return status;
>>> }
>>>
>>> +/*
>>> + * Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT.
>>> Add some
>>> + * randomness to the timeout to minimize multiple nodes firing the
>>> timer at the
>>> + * same time.
>>> + */
>>> +static inline unsigned long ocfs2_orphan_scan_timeout(void)
>>> +{
>>> + unsigned long time;
>>> +
>>> + get_random_bytes(&time, sizeof(time));
>>> + time = ORPHAN_SCAN_SCHEDULE_TIMEOUT + (time % 5000);
>>> + return msecs_to_jiffies(time);
>>> +}
>>> +
>>> +/*
>>> + * ocfs2_queue_orphan_scan calls ocfs2_queue_recovery_completion for
>>> + * every slot which queues a recovery of slot on ocfs2_wq thread.
>>> This is done
>>> + * to cleanup any orphans that are left over in orphan slots.
>>> + *
>>> + * ocfs2_queue_orphan_scan gets called every
>>> ORPHAN_SCAN_SCHEDULE_TIMEOUT milliseconds
>>> + * It gets an EX lock on os_lockres and checks sequence number
>>> stored in LVB. If
>>> + * the sequence number is changed it means some node has done the
>>> scan. Skip the
>>> + * scan and tracks the sequence number. If the sequence number
>>> didn't change,
>>> + * means a scan didn't happen, so the node queues a scan and
>>> increments the
>>> + * sequence number in LVB.
>>> + */
>>> +void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
>>> +{
>>> + struct ocfs2_orphan_scan *os;
>>> + int status, i;
>>> + u32 seqno = 0;
>>> +
>>> + os = &osb->osb_orphan_scan;
>>> +
>>> + status = ocfs2_orphan_scan_lock(osb, &seqno, DLM_LOCK_EX);
>>> + if (status < 0) {
>>> + if (status != -EAGAIN)
>>> + mlog_errno(status);
>>> + goto out;
>>> + }
>>> +
>>> + if (os->os_seqno != seqno) {
>>> + os->os_seqno = seqno;
>>> + goto unlock;
>>> + }
>>> +
>>> + for (i = 0; i < osb->max_slots; i++)
>>> + ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
>>> + NULL);
>>> + /*
>>> + * We queued a recovery on orphan slots, increment the sequence
>>> + * number and update LVB so other node will skip the scan for a
>>> while
>>> + */
>>> + seqno++;
>>> +unlock:
>>> + ocfs2_orphan_scan_unlock(osb, seqno, DLM_LOCK_EX);
>>> +out:
>>> + return;
>>> +}
>>> +
>>> +/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT
>>> millisec */
>>> +void ocfs2_orphan_scan_work(struct work_struct *work)
>>> +{
>>> + struct ocfs2_orphan_scan *os;
>>> + struct ocfs2_super *osb;
>>> +
>>> + os = container_of(work, struct ocfs2_orphan_scan,
>>> + os_orphan_scan_work.work);
>>> + osb = os->os_osb;
>>> +
>>> + mutex_lock(&os->os_lock);
>>> + ocfs2_queue_orphan_scan(osb);
>>> + schedule_delayed_work(&os->os_orphan_scan_work,
>>> + ocfs2_orphan_scan_timeout());
>>> + mutex_unlock(&os->os_lock);
>>> +}
>>> +
>>> +void ocfs2_orphan_scan_stop(struct ocfs2_super *osb)
>>> +{
>>> + struct ocfs2_orphan_scan *os;
>>> +
>>> + os = &osb->osb_orphan_scan;
>>> + mutex_lock(&os->os_lock);
>>> + cancel_delayed_work(&os->os_orphan_scan_work);
>>> + mutex_unlock(&os->os_lock);
>>> +}
>>> +
>>> +int ocfs2_orphan_scan_init(struct ocfs2_super *osb)
>>> +{
>>> + struct ocfs2_orphan_scan *os;
>>> +
>>> + os = &osb->osb_orphan_scan;
>>> + os->os_osb = osb;
>>> + mutex_init(&os->os_lock);
>>> +
>>> + INIT_DELAYED_WORK(&os->os_orphan_scan_work,
>>> + ocfs2_orphan_scan_work);
>>> + schedule_delayed_work(&os->os_orphan_scan_work,
>>> + ocfs2_orphan_scan_timeout());
>>> + return 0;
>>> +}
>>> +
>>> struct ocfs2_orphan_filldir_priv {
>>> struct inode *head;
>>> struct ocfs2_super *osb;
>>> diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
>>> index 619dd7f..3483202 100644
>>> --- a/fs/ocfs2/journal.h
>>> +++ b/fs/ocfs2/journal.h
>>> @@ -144,6 +144,10 @@ static inline void ocfs2_inode_set_new(struct
>>> ocfs2_super *osb,
>>> }
>>>
>>> /* Exported only for the journal struct init code in super.c. Do not
>>> call. */
>>> +int ocfs2_orphan_scan_init(struct ocfs2_super *osb);
>>> +void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
>>> +void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
>>> +
>>> void ocfs2_complete_recovery(struct work_struct *work);
>>> void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
>>>
>>> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
>>> index 1386281..373fb1c 100644
>>> --- a/fs/ocfs2/ocfs2.h
>>> +++ b/fs/ocfs2/ocfs2.h
>>> @@ -151,6 +151,14 @@ struct ocfs2_lock_res {
>>> #endif
>>> };
>>>
>>> +struct ocfs2_orphan_scan {
>>> + struct mutex os_lock;
>>> + struct ocfs2_super *os_osb;
>>> + struct ocfs2_lock_res os_lockres; /* lock to synchronize
>>> scans */
>>> + struct delayed_work os_orphan_scan_work;
>>> + u32 os_seqno; /* incremented on every scan */
>>> +};
>>> +
>>> struct ocfs2_dlm_debug {
>>> struct kref d_refcnt;
>>> struct dentry *d_locking_state;
>>> @@ -341,6 +349,8 @@ struct ocfs2_super
>>> unsigned int *osb_orphan_wipes;
>>> wait_queue_head_t osb_wipe_event;
>>>
>>> + struct ocfs2_orphan_scan osb_orphan_scan;
>>> +
>>> /* used to protect metaecc calculation check of xattr. */
>>> spinlock_t osb_xattr_lock;
>>>
>>> diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
>>> index a53ce87..fcdba09 100644
>>> --- a/fs/ocfs2/ocfs2_lockid.h
>>> +++ b/fs/ocfs2/ocfs2_lockid.h
>>> @@ -48,6 +48,7 @@ enum ocfs2_lock_type {
>>> OCFS2_LOCK_TYPE_FLOCK,
>>> OCFS2_LOCK_TYPE_QINFO,
>>> OCFS2_LOCK_TYPE_NFS_SYNC,
>>> + OCFS2_LOCK_TYPE_ORPHAN_SCAN,
>>> OCFS2_NUM_LOCK_TYPES
>>> };
>>>
>>> @@ -85,6 +86,9 @@ static inline char ocfs2_lock_type_char(enum
>>> ocfs2_lock_type type)
>>> case OCFS2_LOCK_TYPE_NFS_SYNC:
>>> c = 'Y';
>>> break;
>>> + case OCFS2_LOCK_TYPE_ORPHAN_SCAN:
>>> + c = 'P';
>>> + break;
>>> default:
>>> c = '\0';
>>> }
>>> @@ -104,6 +108,7 @@ static char *ocfs2_lock_type_strings[] = {
>>> [OCFS2_LOCK_TYPE_OPEN] = "Open",
>>> [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
>>> [OCFS2_LOCK_TYPE_QINFO] = "Quota",
>>> + [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
>>> };
>>>
>>> static inline const char *ocfs2_lock_type_string(enum
>>> ocfs2_lock_type type)
>>> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
>>> index 79ff8d9..44ac27e 100644
>>> --- a/fs/ocfs2/super.c
>>> +++ b/fs/ocfs2/super.c
>>> @@ -1802,6 +1802,8 @@ static void ocfs2_dismount_volume(struct
>>> super_block *sb, int mnt_err)
>>>
>>> ocfs2_truncate_log_shutdown(osb);
>>>
>>> + ocfs2_orphan_scan_stop(osb);
>>> +
>>> /* This will disable recovery and flush any recovery work. */
>>> ocfs2_recovery_exit(osb);
>>>
>>> @@ -1957,6 +1959,13 @@ static int ocfs2_initialize_super(struct
>>> super_block *sb,
>>> goto bail;
>>> }
>>>
>>> + status = ocfs2_orphan_scan_init(osb);
>>> + if (status) {
>>> + mlog(ML_ERROR, "Unable to initialize delayed orphan scan\n");
>>> + mlog_errno(status);
>>> + goto bail;
>>> + }
>>> +
>>> init_waitqueue_head(&osb->checkpoint_event);
>>> atomic_set(&osb->needs_checkpoint, 0);
>>>
^ permalink raw reply [flat|nested] 18+ messages in thread
* [Ocfs2-devel] [PATCH 1/2] ocfs2: timer to queue scan of all orphan slots
2009-06-10 5:37 ` Tao Ma
2009-06-10 6:50 ` Srinivas Eeda
@ 2009-06-10 7:58 ` Joel Becker
2009-07-17 7:09 ` Tao Ma
1 sibling, 1 reply; 18+ messages in thread
From: Joel Becker @ 2009-06-10 7:58 UTC (permalink / raw)
To: ocfs2-devel
On Wed, Jun 10, 2009 at 01:37:53PM +0800, Tao Ma wrote:
> I also have some thoughts for it. Wish it isn't too late.
Well, if we come up with changes it will affect what I push, but
that's OK.
> Currently, orphan scan just iterate all the slots and call
> ocfs2_queue_recovery_completion, but I don't think it is proper for a node
> to query another mounted one since that node will query it by
> itself.
Node 1 has an inode it was using. The dentry went away due to
memory pressure. Node 1 closes the inode, but it's on the free list.
The node has the open lock.
Node 2 unlinks the inode. It grabs the dentry lock to notify
others, but node 1 has no dentry and doesn't get the message. It
trylocks the open lock, sees that another node has a PR, and does
nothing.
Later node 2 runs its orphan dir. It igets the inode, trylocks
the open lock, sees the PR still, and does nothing.
Basically, we have to trigger an orphan iput on node 1. The
only way for this to happen is if node 1 runs node 2's orphan dir. This
patch exists because that wasn't happening.
> What's more, it will affect reflink greatly.
> In my current implementation of reflink, It will work like this:
> 1. create a inode in orphan dir
> 2. reflink all the extents.
> 3. move the inode from orphan dir to the destination.
>
> For efficiency, I just lock orphan dir in step 1 and 3, and release the
> lock in step 2 in case reflink will take a long time and we don't block
> other "unlink" process. And in step 1, the created inode looks really like
> a deleted one so that any crash in step 2 won't prevent it from being
> deleted by fsck or recovery.
>
> But with your patch, we may have a race in step 2 that your recovery will
> delete the inode created in step 1. So my suggestion is that your orphan
> scan just skip the mounted node so it won't affect other nodes' ongoing
> reflink. As for the node itself, it is very easy to postpone the orphan
> scan by setting a flag in ocfs2_super when reflink is ongoing(I will do
> it).
You should have an in-core inode, right? That holds the open
lock, preventing the others from deleting it. If you crash, then your
open lock goes away, and it can be recovered.
More importantly, your orphan dir can be run on regular recovery
async as well. It has to work in all cases.
Joel
--
"I'm drifting and drifting
Just like a ship out on the sea.
Cause I ain't got nobody, baby,
In this world to care for me."
Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127
^ permalink raw reply [flat|nested] 18+ messages in thread
* [Ocfs2-devel] [PATCH 1/2] ocfs2: timer to queue scan of all orphan slots
2009-06-10 7:58 ` Joel Becker
@ 2009-07-17 7:09 ` Tao Ma
2009-07-17 7:45 ` Srinivas Eeda
0 siblings, 1 reply; 18+ messages in thread
From: Tao Ma @ 2009-07-17 7:09 UTC (permalink / raw)
To: ocfs2-devel
Hi Joel,
This reply may be really too late. :)
Joel Becker wrote:
> On Wed, Jun 10, 2009 at 01:37:53PM +0800, Tao Ma wrote:
>> I also have some thoughts for it. Wish it isn't too late.
>
> Well, if we come up with changes it will affect what I push, but
> that's OK.
>
>> Currently, orphan scan just iterate all the slots and call
>> ocfs2_queue_recovery_completion, but I don't think it is proper for a node
>> to query another mounted one since that node will query it by
>> itself.
>
> Node 1 has an inode it was using. The dentry went away due to
> memory pressure. Node 1 closes the inode, but it's on the free list.
> The node has the open lock.
> Node 2 unlinks the inode. It grabs the dentry lock to notify
> others, but node 1 has no dentry and doesn't get the message. It
> trylocks the open lock, sees that another node has a PR, and does
> nothing.
I just went through the orphan delete code, and I think that in this
case we should have already released the open lock on node 1. When the
dentry on node 1 went away, it did an iput. And when node 1 closes the
inode, it iputs and the open lock is already unlocked. So node 2 should
be OK to delete the file.
I guess the only case where orphan scan helps is when the dentry on node 1
went away while the file was open, and node 2 unlinked the file at that
time. Am I wrong?
> Later node 2 runs its orphan dir. It igets the inode, trylocks
> the open lock, sees the PR still, and does nothing.
> Basically, we have to trigger an orphan iput on node 1. The
> only way for this to happen is if node 1 runs node 2's orphan dir. This
> patch exists because that wasn't happening.
If the case I described above is right, orphan scan would work after
node 1 closes the inode. Node 2 will scan its slot and then try
iget->iput->try_open_lock->delete_inode, so the file will finally be
deleted. So we won't trigger an iput on node 1.
>
>> What's more, it will affect reflink greatly.
>> In my current implementation of reflink, It will work like this:
>> 1. create a inode in orphan dir
>> 2. reflink all the extents.
>> 3. move the inode from orphan dir to the destination.
>>
>> For efficiency, I just lock orphan dir in step 1 and 3, and release the
>> lock in step 2 in case reflink will take a long time and we don't block
>> other "unlink" process. And in step 1, the created inode looks really like
>> a deleted one so that any crash in step 2 won't prevent it from being
>> deleted by fsck or recovery.
>>
>> But with your patch, we may have a race in step 2 that your recovery will
>> delete the inode created in step 1. So my suggestion is that your orphan
>> scan just skip the mounted node so it won't affect other nodes' ongoing
>> reflink. As for the node itself, it is very easy to postpone the orphan
>> scan by setting a flag in ocfs2_super when reflink is ongoing(I will do
>> it).
>
> You should have an in-core inode, right? That holds the open
> lock, preventing the others from deleting it. If you crash, then your
> open lock goes away, and it can be recovered.
> More importantly, your orphan dir can be run on regular recovery
> async as well. It has to work in all cases.
Yes, I have already added the open lock, so orphan scan won't actually
affect reflink. I just want to clarify the scenario in which orphan scan
really works. ;)
Regards,
Tao
^ permalink raw reply [flat|nested] 18+ messages in thread
* [Ocfs2-devel] [PATCH 1/2] ocfs2: timer to queue scan of all orphan slots
2009-07-17 7:09 ` Tao Ma
@ 2009-07-17 7:45 ` Srinivas Eeda
0 siblings, 0 replies; 18+ messages in thread
From: Srinivas Eeda @ 2009-07-17 7:45 UTC (permalink / raw)
To: ocfs2-devel
Tao Ma wrote:
> Hi Joel,
> This reply may be really too late. :)
>
> Joel Becker wrote:
>> On Wed, Jun 10, 2009 at 01:37:53PM +0800, Tao Ma wrote:
>>> I also have some thoughts for it. Wish it isn't too late.
>>
>> Well, if we come up with changes it will affect what I push, but
>> that's OK.
>>
>>> Currently, orphan scan just iterate all the slots and call
>>> ocfs2_queue_recovery_completion, but I don't think it is proper for
>>> a node to query another mounted one since that node will query it by
>>> itself.
>>
>> Node 1 has an inode it was using. The dentry went away due to
>> memory pressure. Node 1 closes the inode, but it's on the free list.
>> The node has the open lock.
>> Node 2 unlinks the inode. It grabs the dentry lock to notify
>> others, but node 1 has no dentry and doesn't get the message. It
>> trylocks the open lock, sees that another node has a PR, and does
>> nothing.
> I just went through the codes of orphan delete, and I think in this
> case, we should have already released the open lock in node 1? When
> dentry in node 1 went away, it iput. And when node 1 close the inode,
> it iputs and open_lock is unlocked already. So node 2 should be OK to
> delete the file.
>
> I guess the only case orphan scan help is that dentry in node 1 went
> away while the file is opened and at that time node 2 unlink the file.
> Am I wrong?
Correct, but the file may not be open. The inode is in node 1's cache.
>> Later node 2 runs its orphan dir. It igets the inode, trylocks
>> the open lock, sees the PR still, and does nothing.
>> Basically, we have to trigger an orphan iput on node 1. The
>> only way for this to happen is if node 1 runs node 2's orphan dir. This
>> patch exists because that wasn't happening.
> If the above case I described is right, orphan scan would work after
> node 1 close the inode. node 2 will scan its slot, and then try
> iget->iput->try_open_lock->delete_inode, the file will be deleted
> finally. So we won't trigger an iput in node1.
Yes, the only problem is that the inode could be in node 1's cache for a
very long time. But yes, once node 1 flushes the inode and node 2 scans
the slot, node 2 will be able to delete the file. In the multi-node case,
the inode could be in the caches of multiple nodes.
>>
>>> What's more, it will affect reflink greatly.
>>> In my current implementation of reflink, It will work like this:
>>> 1. create a inode in orphan dir
>>> 2. reflink all the extents.
>>> 3. move the inode from orphan dir to the destination.
>>>
>>> For efficiency, I just lock orphan dir in step 1 and 3, and release
>>> the lock in step 2 in case reflink will take a long time and we
>>> don't block other "unlink" process. And in step 1, the created inode
>>> looks really like a deleted one so that any crash in step 2 won't
>>> prevent it from being deleted by fsck or recovery.
>>>
>>> But with your patch, we may have a race in step 2 that your recovery
>>> will delete the inode created in step 1. So my suggestion is that
>>> your orphan scan just skip the mounted node so it won't affect other
>>> nodes' ongoing reflink. As for the node itself, it is very easy to
>>> postpone the orphan scan by setting a flag in ocfs2_super when
>>> reflink is ongoing(I will do it).
>>
>> You should have an in-core inode, right? That holds the open
>> lock, preventing the others from deleting it. If you crash, then your
>> open lock goes away, and it can be recovered.
>> More importantly, your orphan dir can be run on regular recovery
>> async as well. It has to work in all cases.
> yes, I have already added open_lock. So orphan scan won't affect
> reflink actually. I just want to clarify the scenario orphan scan
> really works. ;)
>
> Regards,
> Tao
^ permalink raw reply [flat|nested] 18+ messages in thread
end of thread, other threads:[~2009-07-17 7:45 UTC | newest]
Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-06-04 0:02 [Ocfs2-devel] Patches that adds delayed orphan scan timer (rev 3) Srinivas Eeda
2009-06-04 0:02 ` [Ocfs2-devel] [PATCH 1/2] ocfs2: timer to queue scan of all orphan slots Srinivas Eeda
2009-06-04 0:16 ` Sunil Mushran
2009-06-10 5:37 ` Tao Ma
2009-06-10 6:50 ` Srinivas Eeda
2009-06-10 7:38 ` Tao Ma
2009-06-10 7:58 ` Joel Becker
2009-07-17 7:09 ` Tao Ma
2009-07-17 7:45 ` Srinivas Eeda
2009-06-04 0:02 ` [Ocfs2-devel] [PATCH 2/2] ocfs2 patch to track delayed orphan scan timer statistics Srinivas Eeda
2009-06-04 2:27 ` [Ocfs2-devel] Patches that adds delayed orphan scan timer (rev 3) Joel Becker
-- strict thread matches above, loose matches on Subject: below --
2009-06-04 6:40 [Ocfs2-devel] Backport that adds delayed orphan scan timer to 1.4 Srinivas Eeda
2009-06-04 6:40 ` [Ocfs2-devel] [PATCH 1/2] ocfs2: timer to queue scan of all orphan slots Srinivas Eeda
2009-06-09 23:47 ` Sunil Mushran
2009-06-02 23:37 [Ocfs2-devel] Patches that adds delayed orphan scan timer (rev 2) Srinivas Eeda
2009-06-02 23:37 ` [Ocfs2-devel] [PATCH 1/2] ocfs2: timer to queue scan of all orphan slots Srinivas Eeda
2009-06-03 0:27 ` Sunil Mushran
2009-06-02 6:11 [Ocfs2-devel] Patches that adds delayed orphan scan timer Srinivas Eeda
2009-06-02 6:11 ` [Ocfs2-devel] [PATCH 1/2] OCFS2: timer to queue scan of all orphan slots Srinivas Eeda
2009-06-02 18:26 ` Sunil Mushran
2009-06-02 18:34 ` Sunil Mushran
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.