* [Ocfs2-devel] [PATCH 1/1] OCFS2: fix for nfs getting stale inode.
@ 2008-10-23 4:19 wengang wang
2008-10-23 8:16 ` Joel Becker
2008-10-24 1:57 ` wengang wang
0 siblings, 2 replies; 8+ messages in thread
From: wengang wang @ 2008-10-23 4:19 UTC (permalink / raw)
To: ocfs2-devel
Ocfs2 supports exporting.
PROBLEM:
There are 2 problems
(1) Current version of ocfs2_get_dentry() may read from disk
the inode WITHOUT any cross cluster lock. This may lead to load a stale inode.
(2) for deleting an inode, ocfs2_remove_inode() doesn't sync/checkpoint to disk.
This also may lead ocfs2_get_dentry() from other node read out stale inode.
PROBLEM DETAIL:
for problem (1),
For inode deletion, after the vote--disk updating, on all nodes in the cluster
domain, there shouldn't be an in-memory inode in question or the in-memory inode
is with OCFS2_INODE_DELETE flag indicating this inode is deleted from other
node.
If the ocfs2_get_dentry() happens during the process of delete-voting and disk
inode deletion. it may introduce a situation that
(A) there is the in-memory inode and
(B) this inode is without OCFS2_INODE_DELETE.
For later operations on the stale inode, this may leads to crash because of the
mismatch of the in-memory generation against the on-disk one if a new inode
occupied the same block.
for problem (2),
in ocfs2_delete_inode(), after disk updates, ocfs2_remove_inode() doesn't
sync/checkpiont to make sure the IO has finished. ocfs2_get_dentry() may read out
a stale inode when JBD doesn't checkpoint yet(updates still only in memory).
SOLUTION:
(I) adds cross cluster lock for deletion and reading inode from nfs. Deletion
takes EX lock which blocks readings on the same inode block; readings take PR
lock which blocks deleting the same inode block.
(II) checkpoints disk updates for deletion within the cross cluster lock.
SOLUTION DETAILS:
By adding the cross cluster lock, reading a block via ocfs2_get_dentry() may be
blocked when a different block is under deleting from other nodes.
To abate that, a couple of such cross cluster locks are used. all blocks go to
those locks. It's unlucky for the reading of a block which is goes to the same
lock as a different block under deleting goes to.
Signed-off-by: Wengang wang <wen.gang.wang@oracle.com>
--
dlmglue.c | 113 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
dlmglue.h | 6 +++
export.c | 8 ++++
inode.c | 17 ++++++++
ocfs2.h | 7 +++
ocfs2_lockid.h | 4 ++
6 files changed, 152 insertions(+), 3 deletions(-)
Index: fs/ocfs2/dlmglue.h
===================================================================
--- fs/ocfs2/dlmglue.h (revision 3101)
+++ fs/ocfs2/dlmglue.h (working copy)
@@ -79,6 +79,12 @@ void ocfs2_super_unlock(struct ocfs2_sup
int ex);
int ocfs2_rename_lock(struct ocfs2_super *osb);
void ocfs2_rename_unlock(struct ocfs2_super *osb);
+
+int ocfs2_dealloc_lock(struct ocfs2_super *osb, u64 blkno,
+ int ex);
+void ocfs2_dealloc_unlock(struct ocfs2_super *osb, u64 blkno,
+ int ex);
+
void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
/* for the vote thread */
Index: fs/ocfs2/export.c
===================================================================
--- fs/ocfs2/export.c (revision 3101)
+++ fs/ocfs2/export.c (working copy)
@@ -49,6 +49,7 @@ static struct dentry *ocfs2_get_dentry(s
struct ocfs2_inode_handle *handle = vobjp;
struct inode *inode;
struct dentry *result;
+ int status;
mlog_entry("(0x%p, 0x%p)\n", sb, handle);
@@ -57,7 +58,14 @@ static struct dentry *ocfs2_get_dentry(s
return ERR_PTR(-ESTALE);
}
+ /* lock this disk block against updating it from other nodes */
+ status = ocfs2_dealloc_lock(OCFS2_SB(sb), (u64)handle->ih_blkno, 0);
+ if (status < 0) {
+ mlog_errno(status);
+ return ERR_PTR(status);
+ }
inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno);
+ ocfs2_dealloc_unlock(OCFS2_SB(sb), (u64)handle->ih_blkno, 0);
if (IS_ERR(inode))
return (void *)inode;
Index: fs/ocfs2/inode.c
===================================================================
--- fs/ocfs2/inode.c (revision 3101)
+++ fs/ocfs2/inode.c (working copy)
@@ -533,7 +533,9 @@ static int ocfs2_remove_inode(struct ino
mlog_errno(status);
bail_commit:
+ ocfs2_handle_set_sync(handle, 1);
ocfs2_commit_trans(handle);
+ ocfs2_checkpoint_inode(inode);
bail_unlock:
ocfs2_meta_unlock(inode_alloc_inode, 1);
mutex_unlock(&inode_alloc_inode->i_mutex);
@@ -829,6 +831,16 @@ void ocfs2_delete_inode(struct inode *in
goto bail;
}
+ /* prevents reading this disk block during the vote
+ * and disk updating */
+ status = ocfs2_dealloc_lock(OCFS2_SB(inode->i_sb),
+ (u64)inode->i_ino, 1);
+ if (status < 0) {
+ mlog_errno(status);
+ ocfs2_cleanup_delete_inode(inode, 0);
+ goto bail_unblock;
+ }
+
/* Lock down the inode. This gives us an up to date view of
* it's metadata (for verification), and allows us to
* serialize delete_inode votes. */
@@ -837,7 +849,7 @@ void ocfs2_delete_inode(struct inode *in
if (status != -ENOENT)
mlog_errno(status);
ocfs2_cleanup_delete_inode(inode, 0);
- goto bail_unblock;
+ goto bail_unlock_dealloc_lock;
}
/* Query the cluster. This will be the final decision made
@@ -874,6 +886,9 @@ void ocfs2_delete_inode(struct inode *in
bail_unlock_inode:
ocfs2_meta_unlock(inode, 1);
brelse(di_bh);
+bail_unlock_dealloc_lock:
+ ocfs2_dealloc_unlock(OCFS2_SB(inode->i_sb),
+ (u64)inode->i_ino, 1);
bail_unblock:
status = sigprocmask(SIG_SETMASK, &oldset, NULL);
if (status < 0)
Index: fs/ocfs2/ocfs2_lockid.h
===================================================================
--- fs/ocfs2/ocfs2_lockid.h (revision 3101)
+++ fs/ocfs2/ocfs2_lockid.h (working copy)
@@ -40,6 +40,7 @@ enum ocfs2_lock_type {
OCFS2_LOCK_TYPE_DATA,
OCFS2_LOCK_TYPE_SUPER,
OCFS2_LOCK_TYPE_RENAME,
+ OCFS2_LOCK_TYPE_DEALLOC,
OCFS2_NUM_LOCK_TYPES
};
@@ -59,6 +60,9 @@ static inline char ocfs2_lock_type_char(
case OCFS2_LOCK_TYPE_RENAME:
c = 'R';
break;
+ case OCFS2_LOCK_TYPE_DEALLOC:
+ c = 'E';
+ break;
default:
c = '\0';
}
Index: fs/ocfs2/ocfs2.h
===================================================================
--- fs/ocfs2/ocfs2.h (revision 3101)
+++ fs/ocfs2/ocfs2.h (working copy)
@@ -44,6 +44,8 @@
#include "endian.h"
#include "ocfs2_lockid.h"
+#define OCFS2_DEALLOC_NR 16
+
struct ocfs2_extent_map {
u32 em_clusters;
struct rb_root em_extents;
@@ -267,6 +269,11 @@ struct ocfs2_super
struct dlm_ctxt *dlm;
struct ocfs2_lock_res osb_super_lockres;
struct ocfs2_lock_res osb_rename_lockres;
+
+ /* holds block locks which protect updating/reading
+ * on the same disk block*/
+ struct ocfs2_lock_res osb_dealloc_lockres[OCFS2_DEALLOC_NR];
+
struct dlm_eviction_cb osb_eviction_cb;
struct ocfs2_dlm_debug *osb_dlm_debug;
Index: fs/ocfs2/dlmglue.c
===================================================================
--- fs/ocfs2/dlmglue.c (revision 3101)
+++ fs/ocfs2/dlmglue.c (working copy)
@@ -66,6 +66,9 @@ static void ocfs2_super_bast_func(void *
static void ocfs2_rename_ast_func(void *opaque);
static void ocfs2_rename_bast_func(void *opaque,
int level);
+static void ocfs2_dealloc_ast_func(void *opaque);
+static void ocfs2_dealloc_bast_func(void *opaquei,
+ int level);
/* so far, all locks have gotten along with the same unlock ast */
static void ocfs2_unlock_ast_func(void *opaque,
@@ -122,6 +125,13 @@ static struct ocfs2_lock_res_ops ocfs2_r
.unblock = ocfs2_unblock_osb_lock,
};
+static struct ocfs2_lock_res_ops ocfs2_dealloc_lops = {
+ .ast = ocfs2_dealloc_ast_func,
+ .bast = ocfs2_dealloc_bast_func,
+ .unlock_ast = ocfs2_unlock_ast_func,
+ .unblock = ocfs2_unblock_osb_lock,
+};
+
static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
{
return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -138,10 +148,16 @@ static inline int ocfs2_is_rename_lock(s
return lockres->l_type == OCFS2_LOCK_TYPE_RENAME;
}
+static inline int ocfs2_is_dealloc_lock(struct ocfs2_lock_res *lockres)
+{
+ return lockres->l_type == OCFS2_LOCK_TYPE_DEALLOC;
+}
+
static inline struct ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres)
{
BUG_ON(!ocfs2_is_super_lock(lockres)
- && !ocfs2_is_rename_lock(lockres));
+ && !ocfs2_is_rename_lock(lockres)
+ && !ocfs2_is_dealloc_lock(lockres));
return (struct ocfs2_super *) lockres->l_priv;
}
@@ -314,6 +330,16 @@ static void ocfs2_rename_lock_res_init(s
&ocfs2_rename_lops, osb);
}
+static void ocfs2_dealloc_lock_res_init(struct ocfs2_lock_res *res,
+ u64 blkno,
+ struct ocfs2_super *osb)
+{
+ /* Dealloc lockreses don't come from a slab so we call init
+ * once on it manually. */
+ ocfs2_lock_res_init_once(res);
+ ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_DEALLOC, blkno,
+ 0, &ocfs2_dealloc_lops, osb);
+}
void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
{
mlog_entry_void();
@@ -727,6 +753,36 @@ static void ocfs2_rename_bast_func(void
mlog_exit_void();
}
+static void ocfs2_dealloc_ast_func(void *opaque)
+{
+ struct ocfs2_lock_res *lockres = opaque;
+
+ mlog_entry_void();
+ mlog(0, "Dealloc AST fired\n");
+
+ BUG_ON(!ocfs2_is_dealloc_lock(lockres));
+
+ ocfs2_generic_ast_func(lockres, 1);
+ mlog_exit_void();
+}
+
+static void ocfs2_dealloc_bast_func(void *opaque,
+ int level)
+{
+ struct ocfs2_lock_res *lockres = opaque;
+ struct ocfs2_super *osb;
+
+ mlog_entry_void();
+ mlog(0, "Dealloc BAST fired\n");
+
+ BUG_ON(!ocfs2_is_dealloc_lock(lockres));
+
+ osb = ocfs2_lock_res_super(lockres);
+ ocfs2_generic_bast_func(osb, lockres, level);
+
+ mlog_exit_void();
+}
+
static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
int convert)
{
@@ -1729,6 +1785,39 @@ void ocfs2_rename_unlock(struct ocfs2_su
ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
}
+/* protects reading/updating the same block
+ * all blocks go to OCFS2_DEALLOC_NR locks
+ */
+int ocfs2_dealloc_lock(struct ocfs2_super *osb, u64 blkno, int ex)
+{
+ int status;
+ int level = ex ? LKM_EXMODE : LKM_PRMODE;
+ struct ocfs2_lock_res *lockres;
+
+ if (ocfs2_is_hard_readonly(osb))
+ return -EROFS;
+
+ if (ocfs2_mount_local(osb))
+ return 0;
+
+ lockres = &osb->osb_dealloc_lockres[blkno % OCFS2_DEALLOC_NR];
+ status = ocfs2_cluster_lock(osb, lockres, level, 0, NULL, 0);
+ if (status < 0)
+ mlog_errno(status);
+
+ return status;
+}
+
+void ocfs2_dealloc_unlock(struct ocfs2_super *osb, u64 blkno, int ex)
+{
+ struct ocfs2_lock_res *lockres;
+ int level = ex ? LKM_EXMODE : LKM_PRMODE;
+
+ lockres = &osb->osb_dealloc_lockres[blkno % OCFS2_DEALLOC_NR];
+ if (!ocfs2_mount_local(osb))
+ ocfs2_cluster_unlock(osb, lockres, level);
+}
+
/* Reference counting of the dlm debug structure. We want this because
* open references on the debug inodes can live on after a mount, so
* we can't rely on the ocfs2_super to always exist. */
@@ -1989,6 +2078,7 @@ static void ocfs2_dlm_shutdown_debug(str
int ocfs2_dlm_init(struct ocfs2_super *osb)
{
int status;
+ int i;
u32 dlm_key;
struct dlm_ctxt *dlm = NULL;
@@ -2030,6 +2120,11 @@ int ocfs2_dlm_init(struct ocfs2_super *o
local:
ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
+
+ for(i=0; i<OCFS2_DEALLOC_NR; i++) {
+ ocfs2_dealloc_lock_res_init(&osb->osb_dealloc_lockres[i],
+ (u64)i, osb);
+ }
osb->dlm = dlm;
@@ -2047,6 +2142,8 @@ bail:
void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
{
+ int i;
+
mlog_entry_void();
dlm_unregister_eviction_cb(&osb->osb_eviction_cb);
@@ -2060,6 +2157,9 @@ void ocfs2_dlm_shutdown(struct ocfs2_sup
ocfs2_lock_res_free(&osb->osb_super_lockres);
ocfs2_lock_res_free(&osb->osb_rename_lockres);
+ for(i=0; i<OCFS2_DEALLOC_NR; i++) {
+ ocfs2_lock_res_free(&osb->osb_dealloc_lockres[i]);
+ }
dlm_unregister_domain(osb->dlm);
osb->dlm = NULL;
@@ -2255,6 +2355,7 @@ void ocfs2_mark_lockres_freeing(struct o
static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
{
int status;
+ int i;
mlog_entry_void();
@@ -2269,7 +2370,15 @@ static void ocfs2_drop_osb_locks(struct
status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL);
if (status < 0)
mlog_errno(status);
-
+
+ for(i=0; i<OCFS2_DEALLOC_NR; i++) {
+ ocfs2_mark_lockres_freeing(&osb->osb_dealloc_lockres[i]);
+ status = ocfs2_drop_lock(osb, &osb->osb_dealloc_lockres[i],
+ NULL);
+ if (status < 0)
+ mlog_errno(status);
+ }
+
mlog_exit(status);
}
^ permalink raw reply [flat|nested] 8+ messages in thread* [Ocfs2-devel] [PATCH 1/1] OCFS2: fix for nfs getting stale inode.
2008-10-23 4:19 [Ocfs2-devel] [PATCH 1/1] OCFS2: fix for nfs getting stale inode wengang wang
@ 2008-10-23 8:16 ` Joel Becker
2008-10-23 8:33 ` wengang wang
2008-10-24 1:57 ` wengang wang
1 sibling, 1 reply; 8+ messages in thread
From: Joel Becker @ 2008-10-23 8:16 UTC (permalink / raw)
To: ocfs2-devel
On Thu, Oct 23, 2008 at 12:19:21PM +0800, wengang wang wrote:
> Ocfs2 supports exporting.
>
> PROBLEM:
> There are 2 problems
> (1) Current version of ocfs2_get_dentry() may read from disk
> the inode WITHOUT any cross cluster lock. This may lead to load a stale inode.
> (2) for deleting an inode, ocfs2_remove_inode() doesn't sync/checkpoint to disk.
> This also may lead ocfs2_get_dentry() from other node read out stale inode.
>
<snip>
> SOLUTION:
> (I) adds cross cluster lock for deletion and reading inode from nfs. Deletion
> takes EX lock which blocks readings on the same inode block; readings take PR
> lock which blocks deleting the same inode block.
> (II) checkpoints disk updates for deletion within the cross cluster lock.
Cluster locking in an already slow path really bothers me,
especially since I gotta believe we already have the state to do this
locally.
What's the problem other than ESTALE? That's perfectly valid in
the world of NFS.
Joel
--
"Up and down that road in our worn out shoes,
Talking bout good things and singing the blues."
Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127
^ permalink raw reply [flat|nested] 8+ messages in thread
* [Ocfs2-devel] [PATCH 1/1] OCFS2: fix for nfs getting stale inode.
2008-10-23 8:16 ` Joel Becker
@ 2008-10-23 8:33 ` wengang wang
2008-10-23 9:09 ` Joel Becker
0 siblings, 1 reply; 8+ messages in thread
From: wengang wang @ 2008-10-23 8:33 UTC (permalink / raw)
To: ocfs2-devel
Joel,
Joel Becker wrote:
> On Thu, Oct 23, 2008 at 12:19:21PM +0800, wengang wang wrote:
>
>> Ocfs2 supports exporting.
>>
>> PROBLEM:
>> There are 2 problems
>> (1) Current version of ocfs2_get_dentry() may read from disk
>> the inode WITHOUT any cross cluster lock. This may lead to load a stale inode.
>> (2) for deleting an inode, ocfs2_remove_inode() doesn't sync/checkpoint to disk.
>> This also may lead ocfs2_get_dentry() from other node read out stale inode.
>>
>>
> <snip>
>
>> SOLUTION:
>> (I) adds cross cluster lock for deletion and reading inode from nfs. Deletion
>> takes EX lock which blocks readings on the same inode block; readings take PR
>> lock which blocks deleting the same inode block.
>> (II) checkpoints disk updates for deletion within the cross cluster lock.
>>
>
> Cluster locking in an already slow path really bothers me,
> especially since I gotta believe we already have the state to do this
> locally.
>
surely, it hurts performance.
while, by my test, the ocfs2_get_dentry() is not called very frequently.
actually we can take the cluster lock only when we need do disk read,
instead of each time
ocfs2_get_dentry() is called.
> What's the problem other than ESTALE? That's perfectly valid in
> the world of NFS.
>
>
ESTALE is not a big problem, what is important is that:
it cause kernel panic during ocfs2_meta_lock_update() at later
operations when it updates metadata from disk.
code
---------------------------------------------------
...
mlog_bug_on_msg(inode->i_generation !=
le32_to_cpu(fe->i_generation),
"Invalid dinode %"MLFu64" disk
generation: %u "
"inode->i_generation: %u\n",
oi->ip_blkno, le32_to_cpu(fe->i_generation),
inode->i_generation);
...
---------------------------------------------------
see bug
https://bug.oraclecorp.com/pls/bug/webbug_edit.edit_info_top?rptno=7029797.
the patch is my fix for that bug.
by testing, seems it fixes that bug.
thanks
wengang.
^ permalink raw reply [flat|nested] 8+ messages in thread* [Ocfs2-devel] [PATCH 1/1] OCFS2: fix for nfs getting stale inode.
2008-10-23 8:33 ` wengang wang
@ 2008-10-23 9:09 ` Joel Becker
2008-10-23 9:22 ` wengang wang
0 siblings, 1 reply; 8+ messages in thread
From: Joel Becker @ 2008-10-23 9:09 UTC (permalink / raw)
To: ocfs2-devel
On Thu, Oct 23, 2008 at 04:33:32PM +0800, wengang wang wrote:
> Joel Becker wrote:
> > What's the problem other than ESTALE? That's perfectly valid in
> > the world of NFS.
> >
> >
> ESTALE is not a big problem, what is important is that:
> it cause kernel panic during ocfs2_meta_lock_update() at later
> operations when it updates metadata from disk.
This is the bug. Why is a stale inode getting meta_lock called
against it? That's what we should be preventing, no?
Joel
--
"I inject pure kryptonite into my brain.
It improves my kung fu, and it eases the pain."
Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127
^ permalink raw reply [flat|nested] 8+ messages in thread
* [Ocfs2-devel] [PATCH 1/1] OCFS2: fix for nfs getting stale inode.
2008-10-23 9:09 ` Joel Becker
@ 2008-10-23 9:22 ` wengang wang
0 siblings, 0 replies; 8+ messages in thread
From: wengang wang @ 2008-10-23 9:22 UTC (permalink / raw)
To: ocfs2-devel
Joel Becker wrote:
> On Thu, Oct 23, 2008 at 04:33:32PM +0800, wengang wang wrote:
>
>> Joel Becker wrote:
>>
>>> What's the problem other than ESTALE? That's perfectly valid in
>>> the world of NFS.
>>>
>>>
>>>
>> ESTALE is not a big problem, what is important is that:
>> it cause kernel panic during ocfs2_meta_lock_update() at later
>> operations when it updates metadata from disk.
>>
>
> This is the bug. Why is a stale inode getting meta_lock called
> against it? That's what we should be preventing, no?
>
>
for the time, system doesn't know whether it's stale yet.
the fix just makes the inode to be stale(OCFS2_INODE_DELETE), or make
sure there is no such a
stale inode in memory when it's stale(deleted or a new different inode
occupies the same block).
^ permalink raw reply [flat|nested] 8+ messages in thread
* [Ocfs2-devel] [PATCH 1/1] OCFS2: fix for nfs getting stale inode.
2008-10-23 4:19 [Ocfs2-devel] [PATCH 1/1] OCFS2: fix for nfs getting stale inode wengang wang
2008-10-23 8:16 ` Joel Becker
@ 2008-10-24 1:57 ` wengang wang
2008-10-24 2:09 ` wengang wang
1 sibling, 1 reply; 8+ messages in thread
From: wengang wang @ 2008-10-24 1:57 UTC (permalink / raw)
To: ocfs2-devel
Problem happens in this case:
NODE A NODE B
in ocfs2_delete_inode
(delete inode with ino 27777 gen 8888)
send delete-vote request to B
(now with cluster lock against
orphandir and inode 27777)
inode with ino 27777 is not in memory
response OK to A.
read inode 27777 into memory via ocfs2_get_dentry()
without any cluster lock.
update disk block 27777
unlock against orphandir and inode 27777
creates a new inode ino 27777 gen 8889
after a long time,
metalock against inode 27777
update metadata from disk
gen 8888 mismatches with 8889
panic.
thanks,
wengang.
wengang wang wrote:
> Ocfs2 supports exporting.
>
> PROBLEM:
> There are 2 problems
> (1) Current version of ocfs2_get_dentry() may read from disk
> the inode WITHOUT any cross cluster lock. This may lead to load a stale inode.
> (2) for deleting an inode, ocfs2_remove_inode() doesn't sync/checkpoint to disk.
> This also may lead ocfs2_get_dentry() from other node read out stale inode.
>
> PROBLEM DETAIL:
> for problem (1),
> For inode deletion, after the vote--disk updating, on all nodes in the cluster
> domain, there shouldn't be an in-memory inode in question or the in-memory inode
> is with OCFS2_INODE_DELETE flag indicating this inode is deleted from other
> node.
>
> If the ocfs2_get_dentry() happens during the process of delete-voting and disk
> inode deletion. it may introduce a situation that
> (A) there is the in-memory inode and
> (B) this inode is without OCFS2_INODE_DELETE.
>
> For later operations on the stale inode, this may leads to crash because of the
> mismatch of the in-memory generation against the on-disk one if a new inode
> occupied the same block.
>
> for problem (2),
> in ocfs2_delete_inode(), after disk updates, ocfs2_remove_inode() doesn't
> sync/checkpiont to make sure the IO has finished. ocfs2_get_dentry() may read out
> a stale inode when JBD doesn't checkpoint yet(updates still only in memory).
>
> SOLUTION:
> (I) adds cross cluster lock for deletion and reading inode from nfs. Deletion
> takes EX lock which blocks readings on the same inode block; readings take PR
> lock which blocks deleting the same inode block.
> (II) checkpoints disk updates for deletion within the cross cluster lock.
>
> SOLUTION DETAILS:
> By adding the cross cluster lock, reading a block via ocfs2_get_dentry() may be
> blocked when a different block is under deleting from other nodes.
> To abate that, a couple of such cross cluster locks are used. all blocks go to
> those locks. It's unlucky for the reading of a block which is goes to the same
> lock as a different block under deleting goes to.
>
>
> Signed-off-by: Wengang wang <wen.gang.wang@oracle.com>
> --
>
> dlmglue.c | 113 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
> dlmglue.h | 6 +++
> export.c | 8 ++++
> inode.c | 17 ++++++++
> ocfs2.h | 7 +++
> ocfs2_lockid.h | 4 ++
> 6 files changed, 152 insertions(+), 3 deletions(-)
>
> Index: fs/ocfs2/dlmglue.h
> ===================================================================
> --- fs/ocfs2/dlmglue.h (revision 3101)
> +++ fs/ocfs2/dlmglue.h (working copy)
> @@ -79,6 +79,12 @@ void ocfs2_super_unlock(struct ocfs2_sup
> int ex);
> int ocfs2_rename_lock(struct ocfs2_super *osb);
> void ocfs2_rename_unlock(struct ocfs2_super *osb);
> +
> +int ocfs2_dealloc_lock(struct ocfs2_super *osb, u64 blkno,
> + int ex);
> +void ocfs2_dealloc_unlock(struct ocfs2_super *osb, u64 blkno,
> + int ex);
> +
> void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
>
> /* for the vote thread */
> Index: fs/ocfs2/export.c
> ===================================================================
> --- fs/ocfs2/export.c (revision 3101)
> +++ fs/ocfs2/export.c (working copy)
> @@ -49,6 +49,7 @@ static struct dentry *ocfs2_get_dentry(s
> struct ocfs2_inode_handle *handle = vobjp;
> struct inode *inode;
> struct dentry *result;
> + int status;
>
> mlog_entry("(0x%p, 0x%p)\n", sb, handle);
>
> @@ -57,7 +58,14 @@ static struct dentry *ocfs2_get_dentry(s
> return ERR_PTR(-ESTALE);
> }
>
> + /* lock this disk block against updating it from other nodes */
> + status = ocfs2_dealloc_lock(OCFS2_SB(sb), (u64)handle->ih_blkno, 0);
> + if (status < 0) {
> + mlog_errno(status);
> + return ERR_PTR(status);
> + }
> inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno);
> + ocfs2_dealloc_unlock(OCFS2_SB(sb), (u64)handle->ih_blkno, 0);
>
> if (IS_ERR(inode))
> return (void *)inode;
> Index: fs/ocfs2/inode.c
> ===================================================================
> --- fs/ocfs2/inode.c (revision 3101)
> +++ fs/ocfs2/inode.c (working copy)
> @@ -533,7 +533,9 @@ static int ocfs2_remove_inode(struct ino
> mlog_errno(status);
>
> bail_commit:
> + ocfs2_handle_set_sync(handle, 1);
> ocfs2_commit_trans(handle);
> + ocfs2_checkpoint_inode(inode);
> bail_unlock:
> ocfs2_meta_unlock(inode_alloc_inode, 1);
> mutex_unlock(&inode_alloc_inode->i_mutex);
> @@ -829,6 +831,16 @@ void ocfs2_delete_inode(struct inode *in
> goto bail;
> }
>
> + /* prevents reading this disk block during the vote
> + * and disk updating */
> + status = ocfs2_dealloc_lock(OCFS2_SB(inode->i_sb),
> + (u64)inode->i_ino, 1);
> + if (status < 0) {
> + mlog_errno(status);
> + ocfs2_cleanup_delete_inode(inode, 0);
> + goto bail_unblock;
> + }
> +
> /* Lock down the inode. This gives us an up to date view of
> * it's metadata (for verification), and allows us to
> * serialize delete_inode votes. */
> @@ -837,7 +849,7 @@ void ocfs2_delete_inode(struct inode *in
> if (status != -ENOENT)
> mlog_errno(status);
> ocfs2_cleanup_delete_inode(inode, 0);
> - goto bail_unblock;
> + goto bail_unlock_dealloc_lock;
> }
>
> /* Query the cluster. This will be the final decision made
> @@ -874,6 +886,9 @@ void ocfs2_delete_inode(struct inode *in
> bail_unlock_inode:
> ocfs2_meta_unlock(inode, 1);
> brelse(di_bh);
> +bail_unlock_dealloc_lock:
> + ocfs2_dealloc_unlock(OCFS2_SB(inode->i_sb),
> + (u64)inode->i_ino, 1);
> bail_unblock:
> status = sigprocmask(SIG_SETMASK, &oldset, NULL);
> if (status < 0)
> Index: fs/ocfs2/ocfs2_lockid.h
> ===================================================================
> --- fs/ocfs2/ocfs2_lockid.h (revision 3101)
> +++ fs/ocfs2/ocfs2_lockid.h (working copy)
> @@ -40,6 +40,7 @@ enum ocfs2_lock_type {
> OCFS2_LOCK_TYPE_DATA,
> OCFS2_LOCK_TYPE_SUPER,
> OCFS2_LOCK_TYPE_RENAME,
> + OCFS2_LOCK_TYPE_DEALLOC,
> OCFS2_NUM_LOCK_TYPES
> };
>
> @@ -59,6 +60,9 @@ static inline char ocfs2_lock_type_char(
> case OCFS2_LOCK_TYPE_RENAME:
> c = 'R';
> break;
> + case OCFS2_LOCK_TYPE_DEALLOC:
> + c = 'E';
> + break;
> default:
> c = '\0';
> }
> Index: fs/ocfs2/ocfs2.h
> ===================================================================
> --- fs/ocfs2/ocfs2.h (revision 3101)
> +++ fs/ocfs2/ocfs2.h (working copy)
> @@ -44,6 +44,8 @@
> #include "endian.h"
> #include "ocfs2_lockid.h"
>
> +#define OCFS2_DEALLOC_NR 16
> +
> struct ocfs2_extent_map {
> u32 em_clusters;
> struct rb_root em_extents;
> @@ -267,6 +269,11 @@ struct ocfs2_super
> struct dlm_ctxt *dlm;
> struct ocfs2_lock_res osb_super_lockres;
> struct ocfs2_lock_res osb_rename_lockres;
> +
> + /* holds block locks which protect updating/reading
> + * on the same disk block*/
> + struct ocfs2_lock_res osb_dealloc_lockres[OCFS2_DEALLOC_NR];
> +
> struct dlm_eviction_cb osb_eviction_cb;
> struct ocfs2_dlm_debug *osb_dlm_debug;
>
> Index: fs/ocfs2/dlmglue.c
> ===================================================================
> --- fs/ocfs2/dlmglue.c (revision 3101)
> +++ fs/ocfs2/dlmglue.c (working copy)
> @@ -66,6 +66,9 @@ static void ocfs2_super_bast_func(void *
> static void ocfs2_rename_ast_func(void *opaque);
> static void ocfs2_rename_bast_func(void *opaque,
> int level);
> +static void ocfs2_dealloc_ast_func(void *opaque);
> +static void ocfs2_dealloc_bast_func(void *opaquei,
> + int level);
>
> /* so far, all locks have gotten along with the same unlock ast */
> static void ocfs2_unlock_ast_func(void *opaque,
> @@ -122,6 +125,13 @@ static struct ocfs2_lock_res_ops ocfs2_r
> .unblock = ocfs2_unblock_osb_lock,
> };
>
> +static struct ocfs2_lock_res_ops ocfs2_dealloc_lops = {
> + .ast = ocfs2_dealloc_ast_func,
> + .bast = ocfs2_dealloc_bast_func,
> + .unlock_ast = ocfs2_unlock_ast_func,
> + .unblock = ocfs2_unblock_osb_lock,
> +};
> +
> static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
> {
> return lockres->l_type == OCFS2_LOCK_TYPE_META ||
> @@ -138,10 +148,16 @@ static inline int ocfs2_is_rename_lock(s
> return lockres->l_type == OCFS2_LOCK_TYPE_RENAME;
> }
>
> +static inline int ocfs2_is_dealloc_lock(struct ocfs2_lock_res *lockres)
> +{
> + return lockres->l_type == OCFS2_LOCK_TYPE_DEALLOC;
> +}
> +
> static inline struct ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres)
> {
> BUG_ON(!ocfs2_is_super_lock(lockres)
> - && !ocfs2_is_rename_lock(lockres));
> + && !ocfs2_is_rename_lock(lockres)
> + && !ocfs2_is_dealloc_lock(lockres));
>
> return (struct ocfs2_super *) lockres->l_priv;
> }
> @@ -314,6 +330,16 @@ static void ocfs2_rename_lock_res_init(s
> &ocfs2_rename_lops, osb);
> }
>
> +static void ocfs2_dealloc_lock_res_init(struct ocfs2_lock_res *res,
> + u64 blkno,
> + struct ocfs2_super *osb)
> +{
> + /* Dealloc lockreses don't come from a slab so we call init
> + * once on it manually. */
> + ocfs2_lock_res_init_once(res);
> + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_DEALLOC, blkno,
> + 0, &ocfs2_dealloc_lops, osb);
> +}
> void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
> {
> mlog_entry_void();
> @@ -727,6 +753,36 @@ static void ocfs2_rename_bast_func(void
> mlog_exit_void();
> }
>
> +static void ocfs2_dealloc_ast_func(void *opaque)
> +{
> + struct ocfs2_lock_res *lockres = opaque;
> +
> + mlog_entry_void();
> + mlog(0, "Dealloc AST fired\n");
> +
> + BUG_ON(!ocfs2_is_dealloc_lock(lockres));
> +
> + ocfs2_generic_ast_func(lockres, 1);
> + mlog_exit_void();
> +}
> +
> +static void ocfs2_dealloc_bast_func(void *opaque,
> + int level)
> +{
> + struct ocfs2_lock_res *lockres = opaque;
> + struct ocfs2_super *osb;
> +
> + mlog_entry_void();
> + mlog(0, "Dealloc BAST fired\n");
> +
> + BUG_ON(!ocfs2_is_dealloc_lock(lockres));
> +
> + osb = ocfs2_lock_res_super(lockres);
> + ocfs2_generic_bast_func(osb, lockres, level);
> +
> + mlog_exit_void();
> +}
> +
> static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
> int convert)
> {
> @@ -1729,6 +1785,39 @@ void ocfs2_rename_unlock(struct ocfs2_su
> ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
> }
>
> +/* protects reading/updating the same block
> + * all blocks go to OCFS2_DEALLOC_NR locks
> + */
> +int ocfs2_dealloc_lock(struct ocfs2_super *osb, u64 blkno, int ex)
> +{
> + int status;
> + int level = ex ? LKM_EXMODE : LKM_PRMODE;
> + struct ocfs2_lock_res *lockres;
> +
> + if (ocfs2_is_hard_readonly(osb))
> + return -EROFS;
> +
> + if (ocfs2_mount_local(osb))
> + return 0;
> +
> + lockres = &osb->osb_dealloc_lockres[blkno % OCFS2_DEALLOC_NR];
> + status = ocfs2_cluster_lock(osb, lockres, level, 0, NULL, 0);
> + if (status < 0)
> + mlog_errno(status);
> +
> + return status;
> +}
> +
> +void ocfs2_dealloc_unlock(struct ocfs2_super *osb, u64 blkno, int ex)
> +{
> + struct ocfs2_lock_res *lockres;
> + int level = ex ? LKM_EXMODE : LKM_PRMODE;
> +
> + lockres = &osb->osb_dealloc_lockres[blkno % OCFS2_DEALLOC_NR];
> + if (!ocfs2_mount_local(osb))
> + ocfs2_cluster_unlock(osb, lockres, level);
> +}
> +
> /* Reference counting of the dlm debug structure. We want this because
> * open references on the debug inodes can live on after a mount, so
> * we can't rely on the ocfs2_super to always exist. */
> @@ -1989,6 +2078,7 @@ static void ocfs2_dlm_shutdown_debug(str
> int ocfs2_dlm_init(struct ocfs2_super *osb)
> {
> int status;
> + int i;
> u32 dlm_key;
> struct dlm_ctxt *dlm = NULL;
>
> @@ -2030,6 +2120,11 @@ int ocfs2_dlm_init(struct ocfs2_super *o
> local:
> ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
> ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
> +
> + for(i=0; i<OCFS2_DEALLOC_NR; i++) {
> + ocfs2_dealloc_lock_res_init(&osb->osb_dealloc_lockres[i],
> + (u64)i, osb);
> + }
>
> osb->dlm = dlm;
>
> @@ -2047,6 +2142,8 @@ bail:
>
> void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
> {
> + int i;
> +
> mlog_entry_void();
>
> dlm_unregister_eviction_cb(&osb->osb_eviction_cb);
> @@ -2060,6 +2157,9 @@ void ocfs2_dlm_shutdown(struct ocfs2_sup
>
> ocfs2_lock_res_free(&osb->osb_super_lockres);
> ocfs2_lock_res_free(&osb->osb_rename_lockres);
> + for(i=0; i<OCFS2_DEALLOC_NR; i++) {
> + ocfs2_lock_res_free(&osb->osb_dealloc_lockres[i]);
> + }
>
> dlm_unregister_domain(osb->dlm);
> osb->dlm = NULL;
> @@ -2255,6 +2355,7 @@ void ocfs2_mark_lockres_freeing(struct o
> static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
> {
> int status;
> + int i;
>
> mlog_entry_void();
>
> @@ -2269,7 +2370,15 @@ static void ocfs2_drop_osb_locks(struct
> status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL);
> if (status < 0)
> mlog_errno(status);
> -
> +
> + for(i=0; i<OCFS2_DEALLOC_NR; i++) {
> + ocfs2_mark_lockres_freeing(&osb->osb_dealloc_lockres[i]);
> + status = ocfs2_drop_lock(osb, &osb->osb_dealloc_lockres[i],
> + NULL);
> + if (status < 0)
> + mlog_errno(status);
> + }
> +
> mlog_exit(status);
> }
>
>
> _______________________________________________
> Ocfs2-devel mailing list
> Ocfs2-devel at oss.oracle.com
> http://oss.oracle.com/mailman/listinfo/ocfs2-devel
>
^ permalink raw reply [flat|nested] 8+ messages in thread* [Ocfs2-devel] [PATCH 1/1] OCFS2: fix for nfs getting stale inode.
2008-10-24 1:57 ` wengang wang
@ 2008-10-24 2:09 ` wengang wang
2008-10-29 1:03 ` wengang wang
0 siblings, 1 reply; 8+ messages in thread
From: wengang wang @ 2008-10-24 2:09 UTC (permalink / raw)
To: ocfs2-devel
format was destroyed, paste it as attachment.
thanks
wengang.
wengang wang wrote:
> Problem happens in this case:
>
> NODE A NODE B
>
> in ocfs2_delete_inode
> (delete inode with ino 27777 gen 8888)
> send delete-vote request to B
> (now with cluster lock against
> orphandir and inode 27777)
>
>
>
> inode with ino 27777 is not in memory
>
> response OK to A.
>
>
> read inode 27777 into memory via ocfs2_get_dentry()
>
> without any cluster lock.
>
>
> update disk block 27777
>
> unlock against orphandir and inode 27777
>
> creates a new inode ino 27777 gen 8889
>
>
> after a long time,
>
> metalock against inode 27777
>
> update metadata from disk
>
> gen 8888 mismatches with 8889
>
> panic.
>
>
> thanks,
> wengang.
>
>
> wengang wang wrote:
>
>> Ocfs2 supports exporting.
>>
>> PROBLEM:
>> There are 2 problems
>> (1) Current version of ocfs2_get_dentry() may read from disk
>> the inode WITHOUT any cross cluster lock. This may lead to load a stale inode.
>> (2) for deleting an inode, ocfs2_remove_inode() doesn't sync/checkpoint to disk.
>> This also may lead ocfs2_get_dentry() from other node read out stale inode.
>>
>> PROBLEM DETAIL:
>> for problem (1),
>> For inode deletion, after the vote--disk updating, on all nodes in the cluster
>> domain, there shouldn't be an in-memory inode in question or the in-memory inode
>> is with OCFS2_INODE_DELETE flag indicating this inode is deleted from other
>> node.
>>
>> If the ocfs2_get_dentry() happens during the process of delete-voting and disk
>> inode deletion. it may introduce a situation that
>> (A) there is the in-memory inode and
>> (B) this inode is without OCFS2_INODE_DELETE.
>>
>> For later operations on the stale inode, this may leads to crash because of the
>> mismatch of the in-memory generation against the on-disk one if a new inode
>> occupied the same block.
>>
>> for problem (2),
>> in ocfs2_delete_inode(), after disk updates, ocfs2_remove_inode() doesn't
>> sync/checkpiont to make sure the IO has finished. ocfs2_get_dentry() may read out
>> a stale inode when JBD doesn't checkpoint yet(updates still only in memory).
>>
>> SOLUTION:
>> (I) adds cross cluster lock for deletion and reading inode from nfs. Deletion
>> takes EX lock which blocks readings on the same inode block; readings take PR
>> lock which blocks deleting the same inode block.
>> (II) checkpoints disk updates for deletion within the cross cluster lock.
>>
>> SOLUTION DETAILS:
>> By adding the cross cluster lock, reading a block via ocfs2_get_dentry() may be
>> blocked when a different block is under deleting from other nodes.
>> To abate that, a couple of such cross cluster locks are used. all blocks go to
>> those locks. It's unlucky for the reading of a block which is goes to the same
>> lock as a different block under deleting goes to.
>>
>>
>> Signed-off-by: Wengang wang <wen.gang.wang@oracle.com>
>> --
>>
>> dlmglue.c | 113 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
>> dlmglue.h | 6 +++
>> export.c | 8 ++++
>> inode.c | 17 ++++++++
>> ocfs2.h | 7 +++
>> ocfs2_lockid.h | 4 ++
>> 6 files changed, 152 insertions(+), 3 deletions(-)
>>
>> Index: fs/ocfs2/dlmglue.h
>> ===================================================================
>> --- fs/ocfs2/dlmglue.h (revision 3101)
>> +++ fs/ocfs2/dlmglue.h (working copy)
>> @@ -79,6 +79,12 @@ void ocfs2_super_unlock(struct ocfs2_sup
>> int ex);
>> int ocfs2_rename_lock(struct ocfs2_super *osb);
>> void ocfs2_rename_unlock(struct ocfs2_super *osb);
>> +
>> +int ocfs2_dealloc_lock(struct ocfs2_super *osb, u64 blkno,
>> + int ex);
>> +void ocfs2_dealloc_unlock(struct ocfs2_super *osb, u64 blkno,
>> + int ex);
>> +
>> void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
>>
>> /* for the vote thread */
>> Index: fs/ocfs2/export.c
>> ===================================================================
>> --- fs/ocfs2/export.c (revision 3101)
>> +++ fs/ocfs2/export.c (working copy)
>> @@ -49,6 +49,7 @@ static struct dentry *ocfs2_get_dentry(s
>> struct ocfs2_inode_handle *handle = vobjp;
>> struct inode *inode;
>> struct dentry *result;
>> + int status;
>>
>> mlog_entry("(0x%p, 0x%p)\n", sb, handle);
>>
>> @@ -57,7 +58,14 @@ static struct dentry *ocfs2_get_dentry(s
>> return ERR_PTR(-ESTALE);
>> }
>>
>> + /* lock this disk block against updating it from other nodes */
>> + status = ocfs2_dealloc_lock(OCFS2_SB(sb), (u64)handle->ih_blkno, 0);
>> + if (status < 0) {
>> + mlog_errno(status);
>> + return ERR_PTR(status);
>> + }
>> inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno);
>> + ocfs2_dealloc_unlock(OCFS2_SB(sb), (u64)handle->ih_blkno, 0);
>>
>> if (IS_ERR(inode))
>> return (void *)inode;
>> Index: fs/ocfs2/inode.c
>> ===================================================================
>> --- fs/ocfs2/inode.c (revision 3101)
>> +++ fs/ocfs2/inode.c (working copy)
>> @@ -533,7 +533,9 @@ static int ocfs2_remove_inode(struct ino
>> mlog_errno(status);
>>
>> bail_commit:
>> + ocfs2_handle_set_sync(handle, 1);
>> ocfs2_commit_trans(handle);
>> + ocfs2_checkpoint_inode(inode);
>> bail_unlock:
>> ocfs2_meta_unlock(inode_alloc_inode, 1);
>> mutex_unlock(&inode_alloc_inode->i_mutex);
>> @@ -829,6 +831,16 @@ void ocfs2_delete_inode(struct inode *in
>> goto bail;
>> }
>>
>> + /* prevents reading this disk block during the vote
>> + * and disk updating */
>> + status = ocfs2_dealloc_lock(OCFS2_SB(inode->i_sb),
>> + (u64)inode->i_ino, 1);
>> + if (status < 0) {
>> + mlog_errno(status);
>> + ocfs2_cleanup_delete_inode(inode, 0);
>> + goto bail_unblock;
>> + }
>> +
>> /* Lock down the inode. This gives us an up to date view of
>> * it's metadata (for verification), and allows us to
>> * serialize delete_inode votes. */
>> @@ -837,7 +849,7 @@ void ocfs2_delete_inode(struct inode *in
>> if (status != -ENOENT)
>> mlog_errno(status);
>> ocfs2_cleanup_delete_inode(inode, 0);
>> - goto bail_unblock;
>> + goto bail_unlock_dealloc_lock;
>> }
>>
>> /* Query the cluster. This will be the final decision made
>> @@ -874,6 +886,9 @@ void ocfs2_delete_inode(struct inode *in
>> bail_unlock_inode:
>> ocfs2_meta_unlock(inode, 1);
>> brelse(di_bh);
>> +bail_unlock_dealloc_lock:
>> + ocfs2_dealloc_unlock(OCFS2_SB(inode->i_sb),
>> + (u64)inode->i_ino, 1);
>> bail_unblock:
>> status = sigprocmask(SIG_SETMASK, &oldset, NULL);
>> if (status < 0)
>> Index: fs/ocfs2/ocfs2_lockid.h
>> ===================================================================
>> --- fs/ocfs2/ocfs2_lockid.h (revision 3101)
>> +++ fs/ocfs2/ocfs2_lockid.h (working copy)
>> @@ -40,6 +40,7 @@ enum ocfs2_lock_type {
>> OCFS2_LOCK_TYPE_DATA,
>> OCFS2_LOCK_TYPE_SUPER,
>> OCFS2_LOCK_TYPE_RENAME,
>> + OCFS2_LOCK_TYPE_DEALLOC,
>> OCFS2_NUM_LOCK_TYPES
>> };
>>
>> @@ -59,6 +60,9 @@ static inline char ocfs2_lock_type_char(
>> case OCFS2_LOCK_TYPE_RENAME:
>> c = 'R';
>> break;
>> + case OCFS2_LOCK_TYPE_DEALLOC:
>> + c = 'E';
>> + break;
>> default:
>> c = '\0';
>> }
>> Index: fs/ocfs2/ocfs2.h
>> ===================================================================
>> --- fs/ocfs2/ocfs2.h (revision 3101)
>> +++ fs/ocfs2/ocfs2.h (working copy)
>> @@ -44,6 +44,8 @@
>> #include "endian.h"
>> #include "ocfs2_lockid.h"
>>
>> +#define OCFS2_DEALLOC_NR 16
>> +
>> struct ocfs2_extent_map {
>> u32 em_clusters;
>> struct rb_root em_extents;
>> @@ -267,6 +269,11 @@ struct ocfs2_super
>> struct dlm_ctxt *dlm;
>> struct ocfs2_lock_res osb_super_lockres;
>> struct ocfs2_lock_res osb_rename_lockres;
>> +
>> + /* holds block locks which protect updating/reading
>> + * on the same disk block*/
>> + struct ocfs2_lock_res osb_dealloc_lockres[OCFS2_DEALLOC_NR];
>> +
>> struct dlm_eviction_cb osb_eviction_cb;
>> struct ocfs2_dlm_debug *osb_dlm_debug;
>>
>> Index: fs/ocfs2/dlmglue.c
>> ===================================================================
>> --- fs/ocfs2/dlmglue.c (revision 3101)
>> +++ fs/ocfs2/dlmglue.c (working copy)
>> @@ -66,6 +66,9 @@ static void ocfs2_super_bast_func(void *
>> static void ocfs2_rename_ast_func(void *opaque);
>> static void ocfs2_rename_bast_func(void *opaque,
>> int level);
>> +static void ocfs2_dealloc_ast_func(void *opaque);
>> +static void ocfs2_dealloc_bast_func(void *opaquei,
>> + int level);
>>
>> /* so far, all locks have gotten along with the same unlock ast */
>> static void ocfs2_unlock_ast_func(void *opaque,
>> @@ -122,6 +125,13 @@ static struct ocfs2_lock_res_ops ocfs2_r
>> .unblock = ocfs2_unblock_osb_lock,
>> };
>>
>> +static struct ocfs2_lock_res_ops ocfs2_dealloc_lops = {
>> + .ast = ocfs2_dealloc_ast_func,
>> + .bast = ocfs2_dealloc_bast_func,
>> + .unlock_ast = ocfs2_unlock_ast_func,
>> + .unblock = ocfs2_unblock_osb_lock,
>> +};
>> +
>> static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
>> {
>> return lockres->l_type == OCFS2_LOCK_TYPE_META ||
>> @@ -138,10 +148,16 @@ static inline int ocfs2_is_rename_lock(s
>> return lockres->l_type == OCFS2_LOCK_TYPE_RENAME;
>> }
>>
>> +static inline int ocfs2_is_dealloc_lock(struct ocfs2_lock_res *lockres)
>> +{
>> + return lockres->l_type == OCFS2_LOCK_TYPE_DEALLOC;
>> +}
>> +
>> static inline struct ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres)
>> {
>> BUG_ON(!ocfs2_is_super_lock(lockres)
>> - && !ocfs2_is_rename_lock(lockres));
>> + && !ocfs2_is_rename_lock(lockres)
>> + && !ocfs2_is_dealloc_lock(lockres));
>>
>> return (struct ocfs2_super *) lockres->l_priv;
>> }
>> @@ -314,6 +330,16 @@ static void ocfs2_rename_lock_res_init(s
>> &ocfs2_rename_lops, osb);
>> }
>>
>> +static void ocfs2_dealloc_lock_res_init(struct ocfs2_lock_res *res,
>> + u64 blkno,
>> + struct ocfs2_super *osb)
>> +{
>> + /* Dealloc lockreses don't come from a slab so we call init
>> + * once on it manually. */
>> + ocfs2_lock_res_init_once(res);
>> + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_DEALLOC, blkno,
>> + 0, &ocfs2_dealloc_lops, osb);
>> +}
>> void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
>> {
>> mlog_entry_void();
>> @@ -727,6 +753,36 @@ static void ocfs2_rename_bast_func(void
>> mlog_exit_void();
>> }
>>
>> +static void ocfs2_dealloc_ast_func(void *opaque)
>> +{
>> + struct ocfs2_lock_res *lockres = opaque;
>> +
>> + mlog_entry_void();
>> + mlog(0, "Dealloc AST fired\n");
>> +
>> + BUG_ON(!ocfs2_is_dealloc_lock(lockres));
>> +
>> + ocfs2_generic_ast_func(lockres, 1);
>> + mlog_exit_void();
>> +}
>> +
>> +static void ocfs2_dealloc_bast_func(void *opaque,
>> + int level)
>> +{
>> + struct ocfs2_lock_res *lockres = opaque;
>> + struct ocfs2_super *osb;
>> +
>> + mlog_entry_void();
>> + mlog(0, "Dealloc BAST fired\n");
>> +
>> + BUG_ON(!ocfs2_is_dealloc_lock(lockres));
>> +
>> + osb = ocfs2_lock_res_super(lockres);
>> + ocfs2_generic_bast_func(osb, lockres, level);
>> +
>> + mlog_exit_void();
>> +}
>> +
>> static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
>> int convert)
>> {
>> @@ -1729,6 +1785,39 @@ void ocfs2_rename_unlock(struct ocfs2_su
>> ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
>> }
>>
>> +/* protects reading/updating the same block
>> + * all blocks go to OCFS2_DEALLOC_NR locks
>> + */
>> +int ocfs2_dealloc_lock(struct ocfs2_super *osb, u64 blkno, int ex)
>> +{
>> + int status;
>> + int level = ex ? LKM_EXMODE : LKM_PRMODE;
>> + struct ocfs2_lock_res *lockres;
>> +
>> + if (ocfs2_is_hard_readonly(osb))
>> + return -EROFS;
>> +
>> + if (ocfs2_mount_local(osb))
>> + return 0;
>> +
>> + lockres = &osb->osb_dealloc_lockres[blkno % OCFS2_DEALLOC_NR];
>> + status = ocfs2_cluster_lock(osb, lockres, level, 0, NULL, 0);
>> + if (status < 0)
>> + mlog_errno(status);
>> +
>> + return status;
>> +}
>> +
>> +void ocfs2_dealloc_unlock(struct ocfs2_super *osb, u64 blkno, int ex)
>> +{
>> + struct ocfs2_lock_res *lockres;
>> + int level = ex ? LKM_EXMODE : LKM_PRMODE;
>> +
>> + lockres = &osb->osb_dealloc_lockres[blkno % OCFS2_DEALLOC_NR];
>> + if (!ocfs2_mount_local(osb))
>> + ocfs2_cluster_unlock(osb, lockres, level);
>> +}
>> +
>> /* Reference counting of the dlm debug structure. We want this because
>> * open references on the debug inodes can live on after a mount, so
>> * we can't rely on the ocfs2_super to always exist. */
>> @@ -1989,6 +2078,7 @@ static void ocfs2_dlm_shutdown_debug(str
>> int ocfs2_dlm_init(struct ocfs2_super *osb)
>> {
>> int status;
>> + int i;
>> u32 dlm_key;
>> struct dlm_ctxt *dlm = NULL;
>>
>> @@ -2030,6 +2120,11 @@ int ocfs2_dlm_init(struct ocfs2_super *o
>> local:
>> ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
>> ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
>> +
>> + for(i=0; i<OCFS2_DEALLOC_NR; i++) {
>> + ocfs2_dealloc_lock_res_init(&osb->osb_dealloc_lockres[i],
>> + (u64)i, osb);
>> + }
>>
>> osb->dlm = dlm;
>>
>> @@ -2047,6 +2142,8 @@ bail:
>>
>> void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
>> {
>> + int i;
>> +
>> mlog_entry_void();
>>
>> dlm_unregister_eviction_cb(&osb->osb_eviction_cb);
>> @@ -2060,6 +2157,9 @@ void ocfs2_dlm_shutdown(struct ocfs2_sup
>>
>> ocfs2_lock_res_free(&osb->osb_super_lockres);
>> ocfs2_lock_res_free(&osb->osb_rename_lockres);
>> + for(i=0; i<OCFS2_DEALLOC_NR; i++) {
>> + ocfs2_lock_res_free(&osb->osb_dealloc_lockres[i]);
>> + }
>>
>> dlm_unregister_domain(osb->dlm);
>> osb->dlm = NULL;
>> @@ -2255,6 +2355,7 @@ void ocfs2_mark_lockres_freeing(struct o
>> static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
>> {
>> int status;
>> + int i;
>>
>> mlog_entry_void();
>>
>> @@ -2269,7 +2370,15 @@ static void ocfs2_drop_osb_locks(struct
>> status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL);
>> if (status < 0)
>> mlog_errno(status);
>> -
>> +
>> + for(i=0; i<OCFS2_DEALLOC_NR; i++) {
>> + ocfs2_mark_lockres_freeing(&osb->osb_dealloc_lockres[i]);
>> + status = ocfs2_drop_lock(osb, &osb->osb_dealloc_lockres[i],
>> + NULL);
>> + if (status < 0)
>> + mlog_errno(status);
>> + }
>> +
>> mlog_exit(status);
>> }
>>
>>
>> _______________________________________________
>> Ocfs2-devel mailing list
>> Ocfs2-devel at oss.oracle.com
>> http://oss.oracle.com/mailman/listinfo/ocfs2-devel
>>
>>
>
> _______________________________________________
> Ocfs2-devel mailing list
> Ocfs2-devel at oss.oracle.com
> http://oss.oracle.com/mailman/listinfo/ocfs2-devel
>
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: case
Url: http://oss.oracle.com/pipermail/ocfs2-devel/attachments/20081024/4e4979e1/attachment.pl
^ permalink raw reply [flat|nested] 8+ messages in thread* [Ocfs2-devel] [PATCH 1/1] OCFS2: fix for nfs getting stale inode.
2008-10-24 2:09 ` wengang wang
@ 2008-10-29 1:03 ` wengang wang
0 siblings, 0 replies; 8+ messages in thread
From: wengang wang @ 2008-10-29 1:03 UTC (permalink / raw)
To: ocfs2-devel
Hi Sunil and Mark,
any comment for this patch?
the patch is for a bug fix. the customer hits this problem repeatedly.
no matter if the patch is OK or not, please give me a response.
thanks,
wengang.
wengang wang wrote:
> format was destroyed, paste it as attachment.
>
> thanks
> wengang.
>
> wengang wang wrote:
>> Problem happens in this case:
>>
>> NODE A
>> NODE B
>>
>> in ocfs2_delete_inode
>> (delete inode with ino 27777 gen 8888)
>> send delete-vote request to B
>> (now with cluster lock against
>> orphandir and inode 27777)
>>
>>
>>
>> inode with ino 27777 is not in memory
>>
>> response OK to A.
>>
>>
>> read inode 27777 into memory via ocfs2_get_dentry()
>>
>> without any cluster lock.
>>
>>
>> update disk block 27777
>>
>> unlock against orphandir and inode 27777
>>
>> creates a new inode ino 27777 gen 8889
>>
>>
>> after a long time,
>>
>> metalock against inode 27777
>>
>> update metadata from disk
>>
>> gen 8888 mismatches with 8889
>>
>> panic.
>>
>>
>> thanks,
>> wengang.
>>
>>
>> wengang wang wrote:
>>
>>> Ocfs2 supports exporting.
>>> PROBLEM:
>>> There are 2 problems
>>> (1) Current version of ocfs2_get_dentry() may read from disk
>>> the inode WITHOUT any cross cluster lock. This may lead to load a
>>> stale inode.
>>> (2) for deleting an inode, ocfs2_remove_inode() doesn't
>>> sync/checkpoint to disk.
>>> This also may lead ocfs2_get_dentry() from other node read out stale
>>> inode.
>>>
>>> PROBLEM DETAIL:
>>> for problem (1),
>>> For inode deletion, after the vote--disk updating, on all nodes in
>>> the cluster domain, there shouldn't be an in-memory inode in
>>> question or the in-memory inode
>>> is with OCFS2_INODE_DELETE flag indicating this inode is deleted
>>> from other
>>> node.
>>>
>>> If the ocfs2_get_dentry() happens during the process of
>>> delete-voting and disk
>>> inode deletion. it may introduce a situation that
>>> (A) there is the in-memory inode and
>>> (B) this inode is without OCFS2_INODE_DELETE.
>>>
>>> For later operations on the stale inode, this may leads to crash
>>> because of the mismatch of the in-memory generation against the
>>> on-disk one if a new inode occupied the same block.
>>>
>>> for problem (2),
>>> in ocfs2_delete_inode(), after disk updates, ocfs2_remove_inode()
>>> doesn't sync/checkpiont to make sure the IO has finished.
>>> ocfs2_get_dentry() may read out
>>> a stale inode when JBD doesn't checkpoint yet(updates still only in
>>> memory).
>>>
>>> SOLUTION:
>>> (I) adds cross cluster lock for deletion and reading inode from nfs.
>>> Deletion
>>> takes EX lock which blocks readings on the same inode block;
>>> readings take PR
>>> lock which blocks deleting the same inode block.
>>> (II) checkpoints disk updates for deletion within the cross cluster
>>> lock.
>>>
>>> SOLUTION DETAILS:
>>> By adding the cross cluster lock, reading a block via
>>> ocfs2_get_dentry() may be
>>> blocked when a different block is under deleting from other nodes.
>>> To abate that, a couple of such cross cluster locks are used. all
>>> blocks go to those locks. It's unlucky for the reading of a block
>>> which is goes to the same lock as a different block under deleting
>>> goes to.
>>>
>>>
>>> Signed-off-by: Wengang wang <wen.gang.wang@oracle.com>
>>> --
>>>
>>> dlmglue.c | 113
>>> +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
>>> dlmglue.h | 6 +++
>>> export.c | 8 ++++
>>> inode.c | 17 ++++++++
>>> ocfs2.h | 7 +++
>>> ocfs2_lockid.h | 4 ++
>>> 6 files changed, 152 insertions(+), 3 deletions(-)
>>>
>>> Index: fs/ocfs2/dlmglue.h
>>> ===================================================================
>>> --- fs/ocfs2/dlmglue.h (revision 3101)
>>> +++ fs/ocfs2/dlmglue.h (working copy)
>>> @@ -79,6 +79,12 @@ void ocfs2_super_unlock(struct ocfs2_sup
>>> int ex);
>>> int ocfs2_rename_lock(struct ocfs2_super *osb);
>>> void ocfs2_rename_unlock(struct ocfs2_super *osb);
>>> +
>>> +int ocfs2_dealloc_lock(struct ocfs2_super *osb, u64 blkno,
>>> + int ex);
>>> +void ocfs2_dealloc_unlock(struct ocfs2_super *osb, u64 blkno,
>>> + int ex);
>>> +
>>> void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
>>>
>>> /* for the vote thread */
>>> Index: fs/ocfs2/export.c
>>> ===================================================================
>>> --- fs/ocfs2/export.c (revision 3101)
>>> +++ fs/ocfs2/export.c (working copy)
>>> @@ -49,6 +49,7 @@ static struct dentry *ocfs2_get_dentry(s
>>> struct ocfs2_inode_handle *handle = vobjp;
>>> struct inode *inode;
>>> struct dentry *result;
>>> + int status;
>>>
>>> mlog_entry("(0x%p, 0x%p)\n", sb, handle);
>>>
>>> @@ -57,7 +58,14 @@ static struct dentry *ocfs2_get_dentry(s
>>> return ERR_PTR(-ESTALE);
>>> }
>>>
>>> + /* lock this disk block against updating it from other nodes */
>>> + status = ocfs2_dealloc_lock(OCFS2_SB(sb),
>>> (u64)handle->ih_blkno, 0);
>>> + if (status < 0) {
>>> + mlog_errno(status);
>>> + return ERR_PTR(status);
>>> + }
>>> inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno);
>>> + ocfs2_dealloc_unlock(OCFS2_SB(sb), (u64)handle->ih_blkno, 0);
>>>
>>> if (IS_ERR(inode))
>>> return (void *)inode;
>>> Index: fs/ocfs2/inode.c
>>> ===================================================================
>>> --- fs/ocfs2/inode.c (revision 3101)
>>> +++ fs/ocfs2/inode.c (working copy)
>>> @@ -533,7 +533,9 @@ static int ocfs2_remove_inode(struct ino
>>> mlog_errno(status);
>>>
>>> bail_commit:
>>> + ocfs2_handle_set_sync(handle, 1);
>>> ocfs2_commit_trans(handle);
>>> + ocfs2_checkpoint_inode(inode);
>>> bail_unlock:
>>> ocfs2_meta_unlock(inode_alloc_inode, 1);
>>> mutex_unlock(&inode_alloc_inode->i_mutex);
>>> @@ -829,6 +831,16 @@ void ocfs2_delete_inode(struct inode *in
>>> goto bail;
>>> }
>>>
>>> + /* prevents reading this disk block during the vote + * and
>>> disk updating */
>>> + status = ocfs2_dealloc_lock(OCFS2_SB(inode->i_sb),
>>> + (u64)inode->i_ino, 1);
>>> + if (status < 0) {
>>> + mlog_errno(status);
>>> + ocfs2_cleanup_delete_inode(inode, 0);
>>> + goto bail_unblock;
>>> + }
>>> +
>>> /* Lock down the inode. This gives us an up to date view of
>>> * it's metadata (for verification), and allows us to
>>> * serialize delete_inode votes. */
>>> @@ -837,7 +849,7 @@ void ocfs2_delete_inode(struct inode *in
>>> if (status != -ENOENT)
>>> mlog_errno(status);
>>> ocfs2_cleanup_delete_inode(inode, 0);
>>> - goto bail_unblock;
>>> + goto bail_unlock_dealloc_lock;
>>> }
>>>
>>> /* Query the cluster. This will be the final decision made
>>> @@ -874,6 +886,9 @@ void ocfs2_delete_inode(struct inode *in
>>> bail_unlock_inode:
>>> ocfs2_meta_unlock(inode, 1);
>>> brelse(di_bh);
>>> +bail_unlock_dealloc_lock:
>>> + ocfs2_dealloc_unlock(OCFS2_SB(inode->i_sb),
>>> + (u64)inode->i_ino, 1);
>>> bail_unblock:
>>> status = sigprocmask(SIG_SETMASK, &oldset, NULL);
>>> if (status < 0)
>>> Index: fs/ocfs2/ocfs2_lockid.h
>>> ===================================================================
>>> --- fs/ocfs2/ocfs2_lockid.h (revision 3101)
>>> +++ fs/ocfs2/ocfs2_lockid.h (working copy)
>>> @@ -40,6 +40,7 @@ enum ocfs2_lock_type {
>>> OCFS2_LOCK_TYPE_DATA,
>>> OCFS2_LOCK_TYPE_SUPER,
>>> OCFS2_LOCK_TYPE_RENAME,
>>> + OCFS2_LOCK_TYPE_DEALLOC,
>>> OCFS2_NUM_LOCK_TYPES
>>> };
>>>
>>> @@ -59,6 +60,9 @@ static inline char ocfs2_lock_type_char(
>>> case OCFS2_LOCK_TYPE_RENAME:
>>> c = 'R';
>>> break;
>>> + case OCFS2_LOCK_TYPE_DEALLOC:
>>> + c = 'E';
>>> + break;
>>> default:
>>> c = '\0';
>>> }
>>> Index: fs/ocfs2/ocfs2.h
>>> ===================================================================
>>> --- fs/ocfs2/ocfs2.h (revision 3101)
>>> +++ fs/ocfs2/ocfs2.h (working copy)
>>> @@ -44,6 +44,8 @@
>>> #include "endian.h"
>>> #include "ocfs2_lockid.h"
>>>
>>> +#define OCFS2_DEALLOC_NR 16
>>> +
>>> struct ocfs2_extent_map {
>>> u32 em_clusters;
>>> struct rb_root em_extents;
>>> @@ -267,6 +269,11 @@ struct ocfs2_super
>>> struct dlm_ctxt *dlm;
>>> struct ocfs2_lock_res osb_super_lockres;
>>> struct ocfs2_lock_res osb_rename_lockres;
>>> +
>>> + /* holds block locks which protect updating/reading + * on
>>> the same disk block*/
>>> + struct ocfs2_lock_res osb_dealloc_lockres[OCFS2_DEALLOC_NR];
>>> +
>>> struct dlm_eviction_cb osb_eviction_cb;
>>> struct ocfs2_dlm_debug *osb_dlm_debug;
>>>
>>> Index: fs/ocfs2/dlmglue.c
>>> ===================================================================
>>> --- fs/ocfs2/dlmglue.c (revision 3101)
>>> +++ fs/ocfs2/dlmglue.c (working copy)
>>> @@ -66,6 +66,9 @@ static void ocfs2_super_bast_func(void *
>>> static void ocfs2_rename_ast_func(void *opaque);
>>> static void ocfs2_rename_bast_func(void *opaque,
>>> int level);
>>> +static void ocfs2_dealloc_ast_func(void *opaque);
>>> +static void ocfs2_dealloc_bast_func(void *opaquei,
>>> + int level);
>>>
>>> /* so far, all locks have gotten along with the same unlock ast */
>>> static void ocfs2_unlock_ast_func(void *opaque,
>>> @@ -122,6 +125,13 @@ static struct ocfs2_lock_res_ops ocfs2_r
>>> .unblock = ocfs2_unblock_osb_lock,
>>> };
>>>
>>> +static struct ocfs2_lock_res_ops ocfs2_dealloc_lops = {
>>> + .ast = ocfs2_dealloc_ast_func,
>>> + .bast = ocfs2_dealloc_bast_func,
>>> + .unlock_ast = ocfs2_unlock_ast_func,
>>> + .unblock = ocfs2_unblock_osb_lock,
>>> +};
>>> +
>>> static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
>>> {
>>> return lockres->l_type == OCFS2_LOCK_TYPE_META ||
>>> @@ -138,10 +148,16 @@ static inline int ocfs2_is_rename_lock(s
>>> return lockres->l_type == OCFS2_LOCK_TYPE_RENAME;
>>> }
>>>
>>> +static inline int ocfs2_is_dealloc_lock(struct ocfs2_lock_res
>>> *lockres)
>>> +{
>>> + return lockres->l_type == OCFS2_LOCK_TYPE_DEALLOC;
>>> +}
>>> +
>>> static inline struct ocfs2_super *ocfs2_lock_res_super(struct
>>> ocfs2_lock_res *lockres)
>>> {
>>> BUG_ON(!ocfs2_is_super_lock(lockres)
>>> - && !ocfs2_is_rename_lock(lockres));
>>> + && !ocfs2_is_rename_lock(lockres)
>>> + && !ocfs2_is_dealloc_lock(lockres));
>>>
>>> return (struct ocfs2_super *) lockres->l_priv;
>>> }
>>> @@ -314,6 +330,16 @@ static void ocfs2_rename_lock_res_init(s
>>> &ocfs2_rename_lops, osb);
>>> }
>>>
>>> +static void ocfs2_dealloc_lock_res_init(struct ocfs2_lock_res *res,
>>> + u64 blkno,
>>> + struct ocfs2_super *osb)
>>> +{
>>> + /* Dealloc lockreses don't come from a slab so we call init
>>> + * once on it manually. */
>>> + ocfs2_lock_res_init_once(res);
>>> + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_DEALLOC,
>>> blkno,
>>> + 0, &ocfs2_dealloc_lops, osb);
>>> +}
>>> void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
>>> {
>>> mlog_entry_void();
>>> @@ -727,6 +753,36 @@ static void ocfs2_rename_bast_func(void
>>> mlog_exit_void();
>>> }
>>>
>>> +static void ocfs2_dealloc_ast_func(void *opaque)
>>> +{
>>> + struct ocfs2_lock_res *lockres = opaque;
>>> +
>>> + mlog_entry_void();
>>> + mlog(0, "Dealloc AST fired\n");
>>> +
>>> + BUG_ON(!ocfs2_is_dealloc_lock(lockres));
>>> +
>>> + ocfs2_generic_ast_func(lockres, 1);
>>> + mlog_exit_void();
>>> +}
>>> +
>>> +static void ocfs2_dealloc_bast_func(void *opaque,
>>> + int level)
>>> +{
>>> + struct ocfs2_lock_res *lockres = opaque;
>>> + struct ocfs2_super *osb;
>>> +
>>> + mlog_entry_void();
>>> + mlog(0, "Dealloc BAST fired\n");
>>> +
>>> + BUG_ON(!ocfs2_is_dealloc_lock(lockres));
>>> +
>>> + osb = ocfs2_lock_res_super(lockres);
>>> + ocfs2_generic_bast_func(osb, lockres, level);
>>> +
>>> + mlog_exit_void();
>>> +}
>>> +
>>> static inline void ocfs2_recover_from_dlm_error(struct
>>> ocfs2_lock_res *lockres,
>>> int convert)
>>> {
>>> @@ -1729,6 +1785,39 @@ void ocfs2_rename_unlock(struct ocfs2_su
>>> ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
>>> }
>>>
>>> +/* protects reading/updating the same block
>>> + * all blocks go to OCFS2_DEALLOC_NR locks
>>> + */
>>> +int ocfs2_dealloc_lock(struct ocfs2_super *osb, u64 blkno, int ex)
>>> +{
>>> + int status;
>>> + int level = ex ? LKM_EXMODE : LKM_PRMODE;
>>> + struct ocfs2_lock_res *lockres;
>>> +
>>> + if (ocfs2_is_hard_readonly(osb))
>>> + return -EROFS;
>>> +
>>> + if (ocfs2_mount_local(osb))
>>> + return 0;
>>> +
>>> + lockres = &osb->osb_dealloc_lockres[blkno % OCFS2_DEALLOC_NR];
>>> + status = ocfs2_cluster_lock(osb, lockres, level, 0, NULL, 0);
>>> + if (status < 0)
>>> + mlog_errno(status);
>>> +
>>> + return status;
>>> +}
>>> +
>>> +void ocfs2_dealloc_unlock(struct ocfs2_super *osb, u64 blkno, int ex)
>>> +{
>>> + struct ocfs2_lock_res *lockres;
>>> + int level = ex ? LKM_EXMODE : LKM_PRMODE;
>>> +
>>> + lockres = &osb->osb_dealloc_lockres[blkno % OCFS2_DEALLOC_NR];
>>> + if (!ocfs2_mount_local(osb))
>>> + ocfs2_cluster_unlock(osb, lockres, level);
>>> +}
>>> +
>>> /* Reference counting of the dlm debug structure. We want this because
>>> * open references on the debug inodes can live on after a mount, so
>>> * we can't rely on the ocfs2_super to always exist. */
>>> @@ -1989,6 +2078,7 @@ static void ocfs2_dlm_shutdown_debug(str
>>> int ocfs2_dlm_init(struct ocfs2_super *osb)
>>> {
>>> int status;
>>> + int i;
>>> u32 dlm_key;
>>> struct dlm_ctxt *dlm = NULL;
>>>
>>> @@ -2030,6 +2120,11 @@ int ocfs2_dlm_init(struct ocfs2_super *o
>>> local:
>>> ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
>>> ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
>>> +
>>> + for(i=0; i<OCFS2_DEALLOC_NR; i++) {
>>> + ocfs2_dealloc_lock_res_init(&osb->osb_dealloc_lockres[i],
>>> + (u64)i, osb);
>>> + }
>>>
>>> osb->dlm = dlm;
>>>
>>> @@ -2047,6 +2142,8 @@ bail:
>>>
>>> void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
>>> {
>>> + int i;
>>> +
>>> mlog_entry_void();
>>>
>>> dlm_unregister_eviction_cb(&osb->osb_eviction_cb);
>>> @@ -2060,6 +2157,9 @@ void ocfs2_dlm_shutdown(struct ocfs2_sup
>>>
>>> ocfs2_lock_res_free(&osb->osb_super_lockres);
>>> ocfs2_lock_res_free(&osb->osb_rename_lockres);
>>> + for(i=0; i<OCFS2_DEALLOC_NR; i++) {
>>> + ocfs2_lock_res_free(&osb->osb_dealloc_lockres[i]);
>>> + }
>>>
>>> dlm_unregister_domain(osb->dlm);
>>> osb->dlm = NULL;
>>> @@ -2255,6 +2355,7 @@ void ocfs2_mark_lockres_freeing(struct o
>>> static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
>>> {
>>> int status;
>>> + int i;
>>>
>>> mlog_entry_void();
>>>
>>> @@ -2269,7 +2370,15 @@ static void ocfs2_drop_osb_locks(struct
>>> status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL);
>>> if (status < 0)
>>> mlog_errno(status);
>>> -
>>> + + for(i=0; i<OCFS2_DEALLOC_NR; i++) {
>>> + ocfs2_mark_lockres_freeing(&osb->osb_dealloc_lockres[i]);
>>> + status = ocfs2_drop_lock(osb, &osb->osb_dealloc_lockres[i],
>>> + NULL);
>>> + if (status < 0)
>>> + mlog_errno(status);
>>> + }
>>> +
>>> mlog_exit(status);
>>> }
>>>
>>>
>>> _______________________________________________
>>> Ocfs2-devel mailing list
>>> Ocfs2-devel at oss.oracle.com
>>> http://oss.oracle.com/mailman/listinfo/ocfs2-devel
>>>
>>
>> _______________________________________________
>> Ocfs2-devel mailing list
>> Ocfs2-devel at oss.oracle.com
>> http://oss.oracle.com/mailman/listinfo/ocfs2-devel
>>
> ------------------------------------------------------------------------
>
> _______________________________________________
> Ocfs2-devel mailing list
> Ocfs2-devel at oss.oracle.com
> http://oss.oracle.com/mailman/listinfo/ocfs2-devel
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2008-10-29 1:03 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-10-23 4:19 [Ocfs2-devel] [PATCH 1/1] OCFS2: fix for nfs getting stale inode wengang wang
2008-10-23 8:16 ` Joel Becker
2008-10-23 8:33 ` wengang wang
2008-10-23 9:09 ` Joel Becker
2008-10-23 9:22 ` wengang wang
2008-10-24 1:57 ` wengang wang
2008-10-24 2:09 ` wengang wang
2008-10-29 1:03 ` wengang wang
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.