[Ocfs2-devel] [PATCH 1/1] OCFS2: anti stale inode for nfs (V1).

All of lore.kernel.org
 help / color / mirror / Atom feed

From: wengang wang <wen.gang.wang@oracle.com>
To: ocfs2-devel@oss.oracle.com
Subject: [Ocfs2-devel] [PATCH 1/1] OCFS2: anti stale inode for nfs (V1).
Date: Fri, 05 Dec 2008 10:35:03 +0800	[thread overview]
Message-ID: <49389357.9000504@oracle.com> (raw)
In-Reply-To: <20081204200534.GA19340@mail.oracle.com>


Joel Becker wrote:
> [snip]
>> +#define mlog_warnno(st) do {						\
>> +	int _st = (st);							\
>> +	mlog(ML_NOTICE, "status = %lld\n", (long long)_st);		\
>> +} while (0)
>> +
>>     
>
> 	I don't think we want mlog_warnno().  Really, it should just be
> an mlog(0, ...) if anything.  We don't want to print for every stale
> inode.
>
>   
yes, I'm also considering about that. STALE is normal to nfs.
will change it.
>> @@ -48,23 +50,125 @@ struct ocfs2_inode_handle
>>  static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp)
>>  {
>>  	struct ocfs2_inode_handle *handle = vobjp;
>> -	struct inode *inode;
>> +	struct ocfs2_super *osb = OCFS2_SB(sb);
>>  	struct dentry *result;
>> -
>> +	struct inode *inode, *inode_alloc_inode;
>> +	struct buffer_head *inode_bh = NULL, *alloc_bh = NULL;
>> +	struct buffer_head *group_bh = NULL;
>> +	struct ocfs2_dinode *inode_fe, *alloc_fe;
>> +	struct ocfs2_group_desc *group;
>> +	u64 blkno = handle->ih_blkno, bg_blkno;
>> +	unsigned short suballoc_bit, suballoc_slot;
>> +	int status;
>> +	
>>  	mlog_entry("(0x%p, 0x%p)\n", sb, handle);
>>  
>> -	if (handle->ih_blkno == 0) {
>> -		mlog_errno(-ESTALE);
>> +	if (blkno == 0) {
>> +		mlog_warnno(-ESTALE);
>>  		return ERR_PTR(-ESTALE);
>>  	}
>>  
>> +	inode = ocfs2_ilookup(sb, ino_from_blkno(sb, blkno));
>> +	/* found in-memory inode, goes to check generation */
>> +	if (inode)
>> +		goto check_gen;
>> +
>> +	/* dirty reads(hit disk) the inode to get suballoc_slot and
>> +	 * suballoc_bit 
>> +	 * */
>> +	status = ocfs2_read_block(osb, blkno, &inode_bh, 0, NULL);
>> +	if (status < 0) {
>> +		mlog_errno(status);
>> +		return ERR_PTR(status);
>> +	}
>> +
>> +	inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
>> +	if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
>> +		mlog(0, "Invalid dinode #%llu: signature = %.*s\n",
>> +		     (unsigned long long)blkno, 7, inode_fe->i_signature);
>> +		status = -EINVAL;
>> +		goto rls_bh;
>> +	}
>> +
>> +	suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
>> +	suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
>> +
>> +	/* checks suballoc_bit in bitmap is still SET */
>> +	inode_alloc_inode =
>> +		ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
>> +					    suballoc_slot);
>> +	if (!inode_alloc_inode) {
>> +		status = -EEXIST;
>> +		goto rls_bh;
>> +	}
>> +
>> +	mutex_lock(&inode_alloc_inode->i_mutex);
>> +	status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
>> +	if (status < 0) 
>> +		goto unlock_mutex;
>> +
>> +	alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
>> +	BUG_ON((suballoc_bit + 1) >
>> +			ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
>>     
>
> 	I'd rather leave ocfs2_bits_per_group() static to suballoc.c.
>
>   
I will try to do that better...
>> +
>> +	bg_blkno = ocfs2_which_suballoc_group(blkno, suballoc_bit);
>> +	status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED, 
>> +				  inode_alloc_inode);
>> +	if (status < 0) 
>> +		goto inode_unlock;
>> +
>> +	group = (struct ocfs2_group_desc *) group_bh->b_data;
>> +	status = ocfs2_check_group_descriptor(sb, alloc_fe, group);
>> +	if (status < 0) 
>> +		goto rls_group_bh;
>> +
>> +	/* if the bit is clear, it's a stale inode */
>> +	if (!ocfs2_test_bit(suballoc_bit, (unsigned long *)group->bg_bitmap)) {
>> +		status = -ESTALE;
>> +		goto rls_group_bh;
>> +	}
>> +
>>     
>
> 	In fact, why not make this entire logic a new function in
> suballoc.c called ocfs2_test_suballoc_bit().  You pass it a locked
> suballocator inode, the blkno, and the suballoc_bit.  It returns the
> status of the bit.
>
>   
hm, yes, good idea.
>>  	inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0);
>>  
>> -	if (IS_ERR(inode))
>> +	/* if suballoc slot changed since last ocfs2_read_block(), we may have
>> +	 * the lock on a wrong suballoc inode. but if so, since the inode was 
>> +	 * changed, the inode we want must be stale then. 
>> +	 * while, we don't check that here(since ocfs2_iget() doesn't bring
>> +	 * ocfs2_dinode to us, if we do check, we needs other code). instead,
>> +	 * we check on generation later. that is because if the suballoc slot
>> +	 * changed, the generation must changed(since they are in the same disk
>> +	 * sector). 
>> +	 * */ 
>>     
>
> 	inode's don't change suballoc inodes.  In fact, your call to
> ocfs2_check_group_descriptor() will crash the filesystem if it
> determines that the group descriptor and the suballoc inode don't match.
> But that can't happen in a correctly functioning filesystem.
>
>   
Ok, I see.  I will remove the comments.
>> +check_gen:
>>  	if (handle->ih_generation != inode->i_generation) {
>>  		iput(inode);
>> +		mlog_warnno(-ESTALE);
>>  		return ERR_PTR(-ESTALE);
>>  	}
>>     
>
> 	This is what I mean.  We don't want to log every stale inode.
> Stale inodes are a normal part of NFS operation.
>   
yes, will change that.
>   
>   
>> Index: fs/ocfs2/inode.c
>> ===================================================================
>> --- fs/ocfs2/inode.c	(revision 38)
>> +++ fs/ocfs2/inode.c	(working copy)
>> @@ -111,6 +111,17 @@ void ocfs2_get_inode_flags(struct ocfs2_
>>  		oi->ip_attr |= OCFS2_DIRSYNC_FL;
>>  }
>>  
>> +struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
>> +{
>> +	struct ocfs2_find_inode_args args;
>> +
>> +	args.fi_blkno = blkno;
>> +	args.fi_flags = 0;
>> +	args.fi_ino = ino_from_blkno(sb, blkno);
>> +	args.fi_sysfile_type = 0;
>> +
>> +	return ilookup5(sb, blkno, ocfs2_find_actor, &args);
>> +}
>>     
>
> 	This is good.
>
>   
>>  struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
>>  			 int sysfile_type)
>>  {
>> @@ -571,6 +582,9 @@ out:
>>  	return status;
>>  }
>>  
>> +/* callers must have cluster lock inode_alloc_inode taken before calling
>> + * ocfs2_remove_inode
>> + * */
>>  static int ocfs2_remove_inode(struct inode *inode,
>>  			      struct buffer_head *di_bh,
>>  			      struct inode *orphan_dir_inode,
>> @@ -592,11 +606,12 @@ static int ocfs2_remove_inode(struct ino
>>  		goto bail;
>>  	}
>>  
>> -	mutex_lock(&inode_alloc_inode->i_mutex);
>> -	status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1);
>> +	status = ocfs2_read_block(OCFS2_SB(inode_alloc_inode->i_sb),
>> +				  OCFS2_I(inode_alloc_inode)->ip_blkno,
>> +				  &inode_alloc_bh,
>> +				  OCFS2_BH_CACHED,
>> +				  inode_alloc_inode);
>>     
>
> 	If you require callers to lock the alloc inode, have them pass
> in the alloc inode and bh to the function.  Then you don't have to call
> get_system_file_inode or read_block.
>
>   
Yes, I had thought so.
while, the grandparent knows the alloc inode and direct parent doesn't use
it at. so gave up.

I will consider it further..
>> Index: fs/ocfs2/dlmglue.c
>> ===================================================================
>> --- fs/ocfs2/dlmglue.c	(revision 38)
>> +++ fs/ocfs2/dlmglue.c	(working copy)
>> @@ -1940,19 +1940,25 @@ static int ocfs2_inode_lock_update(struc
>>  			status = -EIO;
>>  			goto bail_refresh;
>>  		}
>> -		mlog_bug_on_msg(inode->i_generation !=
>> -				le32_to_cpu(fe->i_generation),
>> -				"Invalid dinode %llu disk generation: %u "
>> -				"inode->i_generation: %u\n",
>> -				(unsigned long long)oi->ip_blkno,
>> -				le32_to_cpu(fe->i_generation),
>> -				inode->i_generation);
>> -		mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) ||
>> -				!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)),
>> -				"Stale dinode %llu dtime: %llu flags: 0x%x\n",
>> -				(unsigned long long)oi->ip_blkno,
>> -				(unsigned long long)le64_to_cpu(fe->i_dtime),
>> -				le32_to_cpu(fe->i_flags));
>> +		if (inode->i_generation != le32_to_cpu(fe->i_generation)) {
>> +			mlog(ML_NOTICE, "Invalid dinode %llu "
>> +			     "disk generation: %u inode->i_generation: %u\n",
>> +			     (unsigned long long)oi->ip_blkno,
>> +			     le32_to_cpu(fe->i_generation),
>> +			     inode->i_generation);
>> +			status = -ESTALE;
>> +			goto bail_refresh;
>> +		}
>> +		if (le64_to_cpu(fe->i_dtime) ||
>> +		    !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
>> +			mlog(ML_NOTICE,
>> +			     "Stale dinode %llu dtime: %llu flags: 0x%x\n",
>> +			     (unsigned long long)oi->ip_blkno,
>> +			     (unsigned long long)le64_to_cpu(fe->i_dtime),
>> +			     le32_to_cpu(fe->i_flags));
>> +			status = -ESTALE;
>> +			goto bail_refresh;
>>     
>
> 	Once again, we don't want to ML_NOTICE these -ESTALE conditions.
> -ESTALE is a normal thing.  Make that mlog(0, ...).
>
>   
ok.

thanks,
wengang.

     prev parent reply	other threads:[~2008-12-05  2:35 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-12-04 10:58 [Ocfs2-devel] [PATCH 1/1] OCFS2: anti stale inode for nfs (V1) wengang wang
2008-12-04 20:05 ` Joel Becker
2008-12-05  2:35   ` wengang wang [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=49389357.9000504@oracle.com \
    --to=wen.gang.wang@oracle.com \
    --cc=ocfs2-devel@oss.oracle.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.