From: Zheng Liu <gnehzuil.liu@gmail.com>
To: T Makphaibulchoke <tmac@hp.com>
Cc: tytso@mit.edu, adilger.kernel@dilger.ca,
linux-ext4@vger.kernel.org, linux-kernel@vger.kernel.org,
aswin@hp.com, aswin_proj@lists.hp.com
Subject: Re: [PATCH 2/2] fs/ext4/namei.c: reducing contention on s_orphan_lock mmutex
Date: Thu, 3 Oct 2013 10:05:26 +0800 [thread overview]
Message-ID: <20131003020526.GA22501@gmail.com> (raw)
In-Reply-To: <1380728219-60784-2-git-send-email-tmac@hp.com>
Hello,
On Wed, Oct 02, 2013 at 09:36:59AM -0600, T Makphaibulchoke wrote:
> Instead of using a single per super block mutex, s_orphan_lock, to serialize
> all orphan list updates, a separate mutex and spinlock are used to
> protect the on disk and in memory orphan lists respecvitely.
>
> At the same time, a per inode mutex is used to serialize orphan list
> updates of a single inode.
>
> Signed-off-by: T. Makphaibulchoke <tmac@hp.com>
> ---
> fs/ext4/namei.c | 139 ++++++++++++++++++++++++++++++++++++++++----------------
> 1 file changed, 100 insertions(+), 39 deletions(-)
>
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 35f55a0..d7d3d0f 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -2554,12 +2554,24 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
> struct super_block *sb = inode->i_sb;
> struct ext4_iloc iloc;
> int err = 0, rc;
> + struct ext4_inode_info *ext4_inode = EXT4_I(inode);
> + struct ext4_sb_info *ext4_sb = EXT4_SB(sb);
> + struct mutex *inode_orphan_mutex;
> + spinlock_t *orphan_lock;
> + struct mutex *ondisk_orphan_mutex;
> + struct list_head *prev;
> + unsigned int next_inum;
> + struct inode *next_inode;
>
> - if (!EXT4_SB(sb)->s_journal)
> + if (ext4_sb->s_journal)
^^^^^^^^
typo: !ext4_sb->s_journal
I am not sure whether or not this will impact the result because when
journal is enabled the inode will not be added into orphan list.
Regards,
- Zheng
> return 0;
>
> - mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
> - if (!list_empty(&EXT4_I(inode)->i_orphan))
> + inode_orphan_mutex = &ext4_inode->i_orphan_mutex;
> + orphan_lock = &ext4_sb->s_orphan_lock;
> + ondisk_orphan_mutex = &ext4_sb->s_ondisk_orphan_lock;
> + mutex_lock(inode_orphan_mutex);
> + prev = ext4_inode->i_orphan.prev;
> + if (prev != &ext4_inode->i_orphan)
> goto out_unlock;
>
> /*
> @@ -2579,17 +2591,28 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
> err = ext4_reserve_inode_write(handle, inode, &iloc);
> if (err)
> goto out_unlock;
> + mutex_lock(ondisk_orphan_mutex);
> /*
> * Due to previous errors inode may be already a part of on-disk
> * orphan list. If so skip on-disk list modification.
> */
> if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
> - (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
> + (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
> + mutex_unlock(ondisk_orphan_mutex);
> goto mem_insert;
> + }
>
> /* Insert this inode at the head of the on-disk orphan list... */
> - NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
> - EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
> + next_inum = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
> + NEXT_ORPHAN(inode) = next_inum;
> + next_inode = ilookup(sb, next_inum);
> + if (next_inode)
> + EXT4_I(next_inode)->i_prev_orphan = inode->i_ino;
> + ext4_sb->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
> + ext4_inode->i_prev_orphan = 0;
> + mutex_unlock(ondisk_orphan_mutex);
> + if (next_inode)
> + iput(next_inode);
> err = ext4_handle_dirty_super(handle, sb);
> rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
> if (!err)
> @@ -2604,14 +2627,17 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
> * This is safe: on error we're going to ignore the orphan list
> * anyway on the next recovery. */
> mem_insert:
> - if (!err)
> - list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
> + if (!err) {
> + spin_lock(orphan_lock);
> + list_add(&ext4_inode->i_orphan, &ext4_sb->s_orphan);
> + spin_unlock(orphan_lock);
> + }
>
> jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
> jbd_debug(4, "orphan inode %lu will point to %d\n",
> inode->i_ino, NEXT_ORPHAN(inode));
> out_unlock:
> - mutex_unlock(&EXT4_SB(sb)->s_orphan_lock);
> + mutex_unlock(inode_orphan_mutex);
> ext4_std_error(inode->i_sb, err);
> return err;
> }
> @@ -2624,71 +2650,106 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
> {
> struct list_head *prev;
> struct ext4_inode_info *ei = EXT4_I(inode);
> - struct ext4_sb_info *sbi;
> + struct inode *i_next = NULL;
> + struct inode *i_prev = NULL;
> + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> __u32 ino_next;
> + unsigned int ino_prev;
> struct ext4_iloc iloc;
> int err = 0;
> + struct mutex *inode_orphan_mutex;
> + spinlock_t *orphan_lock;
> + struct mutex *ondisk_orphan_mutex;
>
> - if ((!EXT4_SB(inode->i_sb)->s_journal) &&
> - !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS))
> + if ((!sbi->s_journal) &&
> + !(sbi->s_mount_state & EXT4_ORPHAN_FS))
> return 0;
>
> - mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
> - if (list_empty(&ei->i_orphan))
> - goto out;
> -
> - ino_next = NEXT_ORPHAN(inode);
> + inode_orphan_mutex = &ei->i_orphan_mutex;
> + orphan_lock = &sbi->s_orphan_lock;
> + ondisk_orphan_mutex = &sbi->s_ondisk_orphan_lock;
> + mutex_lock(inode_orphan_mutex);
> prev = ei->i_orphan.prev;
> - sbi = EXT4_SB(inode->i_sb);
> + if (prev == &ei->i_orphan) {
> + mutex_unlock(inode_orphan_mutex);
> + return err;
> + }
>
> jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
>
> + spin_lock(orphan_lock);
> list_del_init(&ei->i_orphan);
> + spin_unlock(orphan_lock);
>
> /* If we're on an error path, we may not have a valid
> * transaction handle with which to update the orphan list on
> * disk, but we still need to remove the inode from the linked
> * list in memory. */
> - if (!handle)
> - goto out;
> + if (!handle) {
> + mutex_unlock(inode_orphan_mutex);
> + return err;
> + }
>
> err = ext4_reserve_inode_write(handle, inode, &iloc);
> - if (err)
> + if (err) {
> + mutex_unlock(inode_orphan_mutex);
> goto out_err;
> + }
>
> - if (prev == &sbi->s_orphan) {
> + mutex_lock(ondisk_orphan_mutex);
> + ino_next = NEXT_ORPHAN(inode);
> + if (ino_next > 0) {
> + i_next = ilookup(inode->i_sb, ino_next);
> + assert(i_next);
> + }
> + ino_prev = ei->i_prev_orphan;
> + if (!ino_prev) {
> jbd_debug(4, "superblock will point to %u\n", ino_next);
> BUFFER_TRACE(sbi->s_sbh, "get_write_access");
> err = ext4_journal_get_write_access(handle, sbi->s_sbh);
> - if (err)
> - goto out_brelse;
> - sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
> - err = ext4_handle_dirty_super(handle, inode->i_sb);
> + if (!err) {
> + sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
> + if (i_next)
> + EXT4_I(i_next)->i_prev_orphan = 0;
> + }
> + mutex_unlock(ondisk_orphan_mutex);
> + if (!err)
> + err = ext4_handle_dirty_super(handle, inode->i_sb);
> } else {
> struct ext4_iloc iloc2;
> - struct inode *i_prev =
> - &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
> -
> - jbd_debug(4, "orphan inode %lu will point to %u\n",
> - i_prev->i_ino, ino_next);
> - err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
> - if (err)
> - goto out_brelse;
> - NEXT_ORPHAN(i_prev) = ino_next;
> - err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
> + i_prev = ilookup(inode->i_sb, ino_prev);
> +
> + assert(i_prev);
> + if (i_prev) {
> + jbd_debug(4, "orphan inode %lu will point to %u\n",
> + i_prev->i_ino, ino_next);
> + err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
> + } else
> + err = -ENOENT;
> + if (!err) {
> + NEXT_ORPHAN(i_prev) = ino_next;
> + if (i_next)
> + EXT4_I(i_next)->i_prev_orphan = ino_prev;
> + }
> + mutex_unlock(ondisk_orphan_mutex);
> + if (i_prev && !err)
> + err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
> }
> if (err)
> goto out_brelse;
> NEXT_ORPHAN(inode) = 0;
> + mutex_unlock(inode_orphan_mutex);
> err = ext4_mark_iloc_dirty(handle, inode, &iloc);
> -
> out_err:
> + if (i_next)
> + iput(i_next);
> + if (i_prev)
> + iput(i_prev);
> ext4_std_error(inode->i_sb, err);
> -out:
> - mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
> return err;
>
> out_brelse:
> + mutex_unlock(inode_orphan_mutex);
> brelse(iloc.bh);
> goto out_err;
> }
> --
> 1.7.11.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
next prev parent reply other threads:[~2013-10-03 2:05 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-10-02 15:36 [PATCH 1/2] fs/ext4: adding and initalizing new members of ext4_inode_info and ext4_sb_info T Makphaibulchoke
2013-10-02 15:36 ` [PATCH 2/2] fs/ext4/namei.c: reducing contention on s_orphan_lock mmutex T Makphaibulchoke
2013-10-03 2:05 ` Zheng Liu [this message]
2013-10-03 8:31 ` Thavatchai Makphaibulchoke
2013-10-04 0:41 ` Andreas Dilger
2013-10-03 23:08 ` Thavatchai Makphaibulchoke
2013-10-04 0:37 ` [PATCH 1/2] fs/ext4: adding and initalizing new members of ext4_inode_info and ext4_sb_info Andreas Dilger
2013-10-03 23:14 ` Thavatchai Makphaibulchoke
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20131003020526.GA22501@gmail.com \
--to=gnehzuil.liu@gmail.com \
--cc=adilger.kernel@dilger.ca \
--cc=aswin@hp.com \
--cc=aswin_proj@lists.hp.com \
--cc=linux-ext4@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=tmac@hp.com \
--cc=tytso@mit.edu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.