From mboxrd@z Thu Jan  1 00:00:00 1970
From: Tao Ma <tao.ma@oracle.com>
Date: Wed, 10 Jun 2009 15:38:03 +0800
Subject: [Ocfs2-devel] [PATCH 1/2] ocfs2: timer to queue scan of all
 orphan slots
In-Reply-To: <4A2F57AA.8090209@oracle.com>
References: <1244073776-18748-1-git-send-email-srinivas.eeda@oracle.com>
	<1244073776-18748-2-git-send-email-srinivas.eeda@oracle.com>
	<4A2F46B1.5090301@oracle.com> <4A2F57AA.8090209@oracle.com>
Message-ID: <4A2F62DB.4030509@oracle.com>
List-Id: <ocfs2-devel.oss.oracle.com>
MIME-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
To: ocfs2-devel@oss.oracle.com


Srinivas Eeda wrote:
> Tao Ma wrote:
>> Hi Srini/Joel/Sunil,
>>     I also have some thoughts for it. Wish it isn't too late.
>>
>>     Currently, orphan scan just iterate all the slots and call 
>> ocfs2_queue_recovery_completion, but I don't think it is proper for a 
>> node to query another mounted one since that node will query it by 
>> itself.    What's more, it will affect reflink greatly.
>> In my current implementation of reflink, It will work like this:
>> 1. create a inode in orphan dir
>> 2. reflink all the extents.
>> 3. move the inode from orphan dir to the destination.
>>
>> For efficiency, I just lock orphan dir in step 1 and 3, and release 
>> the lock in step 2 in case reflink will take a long time and we don't 
>> block other "unlink" process. And in step 1, the created inode looks 
>> really like a deleted one so that any crash in step 2 won't prevent it 
>> from being deleted by fsck or recovery.
>>
>> But with your patch, we may have a race in step 2 that your recovery 
>> will delete the inode created in step 1. So my suggestion is that your 
>> orphan scan just skip the mounted node so it won't affect other nodes' 
>> ongoing reflink. As for the node itself, it is very easy to postpone 
>> the orphan scan by setting a flag in ocfs2_super when reflink is 
>> ongoing(I will do it).
>>
>> Make sense?
> Yes, I can restrict the node to recover it's own and offline slots. I 
> can make the node to recover it's own slot every time the timer fires 
> and offline slots in round robin way(current way)
yeah, that would be cool.

Regards,
Tao
>>
>> Regards,
>> Tao
>>
>> Srinivas Eeda wrote:
>>> When a dentry is unlinked, the unlinking node takes an EX on the 
>>> dentry lock
>>> before moving the dentry to the orphan directory. The other nodes, 
>>> that all had
>>> a PR on the same dentry lock, flag the corresponding inode as 
>>> MAYBE_ORPHANED
>>> during the downconvert. The inode is finally deleted when the last 
>>> node to iput
>>> the inode notices the MAYBE_ORPHANED flag.
>>>
>>> A problem arises if a node is forced to free dentry locks because of 
>>> memory
>>> pressure. If this happens, the node will no longer get downconvert 
>>> notifications
>>> for the dentries that have been unlinked on another node. If it also 
>>> happens
>>> that node is actively using the corresponding inode and happens to be 
>>> the one
>>> performing the last iput on that inode, it will fail to delete the 
>>> inode as it
>>> will not have the MAYBE_ORPHANED flag set.
>>>
>>> This patch fixes this shortcoming by introducing a periodic scan of 
>>> the orphan
>>> directories to delete such inodes. Care has been taken to distribute the
>>> workload across the cluster so that no one node has to perform the 
>>> task all the
>>> time.
>>>
>>> Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com>
>>> ---
>>>  fs/ocfs2/dlmglue.c      |   51 ++++++++++++++++++++++
>>>  fs/ocfs2/dlmglue.h      |   11 +++++
>>>  fs/ocfs2/journal.c      |  106 
>>> +++++++++++++++++++++++++++++++++++++++++++++++
>>>  fs/ocfs2/journal.h      |    4 ++
>>>  fs/ocfs2/ocfs2.h        |   10 ++++
>>>  fs/ocfs2/ocfs2_lockid.h |    5 ++
>>>  fs/ocfs2/super.c        |    9 ++++
>>>  7 files changed, 196 insertions(+), 0 deletions(-)
>>>
>>> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
>>> index e15fc7d..0f35b83 100644
>>> --- a/fs/ocfs2/dlmglue.c
>>> +++ b/fs/ocfs2/dlmglue.c
>>> @@ -248,6 +248,10 @@ static struct ocfs2_lock_res_ops 
>>> ocfs2_nfs_sync_lops = {
>>>      .flags        = 0,
>>>  };
>>>  
>>> +static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
>>> +    .flags        = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
>>> +};
>>> +
>>>  static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
>>>      .get_osb    = ocfs2_get_dentry_osb,
>>>      .post_unlock    = ocfs2_dentry_post_unlock,
>>> @@ -637,6 +641,19 @@ static void ocfs2_nfs_sync_lock_res_init(struct 
>>> ocfs2_lock_res *res,
>>>                     &ocfs2_nfs_sync_lops, osb);
>>>  }
>>>  
>>> +static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
>>> +                        struct ocfs2_super *osb)
>>> +{
>>> +    struct ocfs2_orphan_scan_lvb *lvb;
>>> +
>>> +    ocfs2_lock_res_init_once(res);
>>> +    ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, 
>>> res->l_name);
>>> +    ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN,
>>> +                   &ocfs2_orphan_scan_lops, osb);
>>> +    lvb = ocfs2_dlm_lvb(&res->l_lksb);
>>> +    lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
>>> +}
>>> +
>>>  void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
>>>                    struct ocfs2_file_private *fp)
>>>  {
>>> @@ -2352,6 +2369,37 @@ void ocfs2_inode_unlock(struct inode *inode,
>>>      mlog_exit_void();
>>>  }
>>>  
>>> +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int ex)
>>> +{
>>> +    struct ocfs2_lock_res *lockres;
>>> +    struct ocfs2_orphan_scan_lvb *lvb;
>>> +    int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
>>> +    int status = 0;
>>> +
>>> +    lockres = &osb->osb_orphan_scan.os_lockres;
>>> +    status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); +    if 
>>> (status < 0)
>>> +        return status;
>>> +
>>> +    lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
>>> +    if (lvb->lvb_version == OCFS2_ORPHAN_LVB_VERSION)
>>> +        *seqno = be32_to_cpu(lvb->lvb_os_seqno); +    return status;
>>> +}
>>> +
>>> +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno, 
>>> int ex)
>>> +{
>>> +    struct ocfs2_lock_res *lockres;
>>> +    struct ocfs2_orphan_scan_lvb *lvb;
>>> +    int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
>>> +
>>> +    lockres = &osb->osb_orphan_scan.os_lockres;
>>> +    lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
>>> +    lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
>>> +    lvb->lvb_os_seqno = cpu_to_be32(seqno);
>>> +    ocfs2_cluster_unlock(osb, lockres, level); +}
>>> +
>>>  int ocfs2_super_lock(struct ocfs2_super *osb,
>>>               int ex)
>>>  {
>>> @@ -2842,6 +2890,7 @@ local:
>>>      ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
>>>      ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
>>>      ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
>>> +    
>>> ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
>>>  
>>>      osb->cconn = conn;
>>>  
>>> @@ -2878,6 +2927,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
>>>      ocfs2_lock_res_free(&osb->osb_super_lockres);
>>>      ocfs2_lock_res_free(&osb->osb_rename_lockres);
>>>      ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
>>> +    ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);
>>>  
>>>      ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
>>>      osb->cconn = NULL;
>>> @@ -3061,6 +3111,7 @@ static void ocfs2_drop_osb_locks(struct 
>>> ocfs2_super *osb)
>>>      ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
>>>      ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
>>>      ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
>>> +    ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres);
>>>  }
>>>  
>>>  int ocfs2_drop_inode_locks(struct inode *inode)
>>> diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
>>> index e1fd572..31b90d7 100644
>>> --- a/fs/ocfs2/dlmglue.h
>>> +++ b/fs/ocfs2/dlmglue.h
>>> @@ -62,6 +62,14 @@ struct ocfs2_qinfo_lvb {
>>>      __be32    lvb_free_entry;
>>>  };
>>>  
>>> +#define OCFS2_ORPHAN_LVB_VERSION 1
>>> +
>>> +struct ocfs2_orphan_scan_lvb {
>>> +    __u8    lvb_version;
>>> +    __u8    lvb_reserved[3];
>>> +    __be32    lvb_os_seqno;
>>> +};
>>> +
>>>  /* ocfs2_inode_lock_full() 'arg_flags' flags */
>>>  /* don't wait on recovery. */
>>>  #define OCFS2_META_LOCK_RECOVERY    (0x01)
>>> @@ -113,6 +121,9 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
>>>               int ex);
>>>  void ocfs2_super_unlock(struct ocfs2_super *osb,
>>>              int ex);
>>> +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int 
>>> ex);
>>> +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno, 
>>> int ex);
>>> +
>>>  int ocfs2_rename_lock(struct ocfs2_super *osb);
>>>  void ocfs2_rename_unlock(struct ocfs2_super *osb);
>>>  int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
>>> diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
>>> index a20a0f1..dc7cea3 100644
>>> --- a/fs/ocfs2/journal.c
>>> +++ b/fs/ocfs2/journal.c
>>> @@ -28,6 +28,8 @@
>>>  #include <linux/slab.h>
>>>  #include <linux/highmem.h>
>>>  #include <linux/kthread.h>
>>> +#include <linux/time.h>
>>> +#include <linux/random.h>
>>>  
>>>  #define MLOG_MASK_PREFIX ML_JOURNAL
>>>  #include <cluster/masklog.h>
>>> @@ -52,6 +54,8 @@
>>>  
>>>  DEFINE_SPINLOCK(trans_inc_lock);
>>>  
>>> +#define ORPHAN_SCAN_SCHEDULE_TIMEOUT 300000
>>> +
>>>  static int ocfs2_force_read_journal(struct inode *inode);
>>>  static int ocfs2_recover_node(struct ocfs2_super *osb,
>>>                    int node_num, int slot_num);
>>> @@ -1841,6 +1845,108 @@ bail:
>>>      return status;
>>>  }
>>>  
>>> +/*
>>> + * Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT. 
>>> Add some
>>> + * randomness to the timeout to minimize multple nodes firing the 
>>> timer at the
>>> + * same time.
>>> + */
>>> +static inline unsigned long ocfs2_orphan_scan_timeout(void)
>>> +{
>>> +    unsigned long time;
>>> +
>>> +    get_random_bytes(&time, sizeof(time));
>>> +    time = ORPHAN_SCAN_SCHEDULE_TIMEOUT + (time % 5000);
>>> +    return msecs_to_jiffies(time);
>>> +}
>>> +
>>> +/*
>>> + * ocfs2_queue_orphan_scan calls ocfs2_queue_recovery_completion for
>>> + * every slot which queues a recovery of slot on ocfs2_wq thread. 
>>> This is done
>>> + * to cleanup any orphans that are left over in orphan slots.
>>> + *
>>> + * ocfs2_queue_orphan_scan gets called every 
>>> ORPHAN_SCAN_SCHEDULE_TIMEOUT seconds
>>> + * It gets an EX lock on os_lockres and checks sequence number 
>>> stored in LVB. If
>>> + * the sequence number is changed it means some node has done the 
>>> scan. Skip the
>>> + * scan and tracks the sequence number. If the sequence number 
>>> didn't change,
>>> + * means a scan didn't happen, so the node queues a scan and 
>>> increments the
>>> + * sequence number in LVB.
>>> + */ +void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
>>> +{
>>> +    struct ocfs2_orphan_scan *os;
>>> +    int status, i;
>>> +    u32 seqno = 0;
>>> +
>>> +    os = &osb->osb_orphan_scan;
>>> +
>>> +    status = ocfs2_orphan_scan_lock(osb, &seqno, DLM_LOCK_EX);
>>> +    if (status < 0) {
>>> +        if (status != -EAGAIN)
>>> +            mlog_errno(status);
>>> +        goto out;
>>> +    }
>>> +
>>> +    if (os->os_seqno != seqno) {
>>> +        os->os_seqno = seqno;
>>> +        goto unlock;
>>> +    }
>>> +
>>> +    for (i = 0; i < osb->max_slots; i++)
>>> +        ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
>>> +                        NULL);
>>> +    /*
>>> +     * We queued a recovery on orphan slots, increment the sequence
>>> +     * number and update LVB so other node will skip the scan for a 
>>> while
>>> +     */
>>> +    seqno++;
>>> +unlock:
>>> +    ocfs2_orphan_scan_unlock(osb, seqno, DLM_LOCK_EX);
>>> +out:
>>> +    return;
>>> +}
>>> +
>>> +/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT 
>>> millsec */
>>> +void ocfs2_orphan_scan_work(struct work_struct *work)
>>> +{
>>> +    struct ocfs2_orphan_scan *os;
>>> +    struct ocfs2_super *osb;
>>> +
>>> +    os = container_of(work, struct ocfs2_orphan_scan,
>>> +              os_orphan_scan_work.work);
>>> +    osb = os->os_osb;
>>> +
>>> +    mutex_lock(&os->os_lock);
>>> +    ocfs2_queue_orphan_scan(osb);
>>> +    schedule_delayed_work(&os->os_orphan_scan_work,
>>> +                  ocfs2_orphan_scan_timeout());
>>> +    mutex_unlock(&os->os_lock);
>>> +}
>>> +
>>> +void ocfs2_orphan_scan_stop(struct ocfs2_super *osb)
>>> +{
>>> +    struct ocfs2_orphan_scan *os;
>>> +
>>> +    os = &osb->osb_orphan_scan;
>>> +    mutex_lock(&os->os_lock);
>>> +    cancel_delayed_work(&os->os_orphan_scan_work);
>>> +    mutex_unlock(&os->os_lock);
>>> +}
>>> +
>>> +int ocfs2_orphan_scan_init(struct ocfs2_super *osb)
>>> +{
>>> +    struct ocfs2_orphan_scan *os;
>>> +
>>> +    os = &osb->osb_orphan_scan;
>>> +    os->os_osb = osb;
>>> +    mutex_init(&os->os_lock);
>>> +
>>> +    INIT_DELAYED_WORK(&os->os_orphan_scan_work,
>>> +              ocfs2_orphan_scan_work);
>>> +    schedule_delayed_work(&os->os_orphan_scan_work,
>>> +                  ocfs2_orphan_scan_timeout());
>>> +    return 0;
>>> +}
>>> +
>>>  struct ocfs2_orphan_filldir_priv {
>>>      struct inode        *head;
>>>      struct ocfs2_super    *osb;
>>> diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
>>> index 619dd7f..3483202 100644
>>> --- a/fs/ocfs2/journal.h
>>> +++ b/fs/ocfs2/journal.h
>>> @@ -144,6 +144,10 @@ static inline void ocfs2_inode_set_new(struct 
>>> ocfs2_super *osb,
>>>  }
>>>  
>>>  /* Exported only for the journal struct init code in super.c. Do not 
>>> call. */
>>> +int ocfs2_orphan_scan_init(struct ocfs2_super *osb);
>>> +void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
>>> +void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
>>> +
>>>  void ocfs2_complete_recovery(struct work_struct *work);
>>>  void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
>>>  
>>> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
>>> index 1386281..373fb1c 100644
>>> --- a/fs/ocfs2/ocfs2.h
>>> +++ b/fs/ocfs2/ocfs2.h
>>> @@ -151,6 +151,14 @@ struct ocfs2_lock_res {
>>>  #endif
>>>  };
>>>  
>>> +struct ocfs2_orphan_scan {
>>> +    struct mutex         os_lock;
>>> +    struct ocfs2_super     *os_osb;
>>> +    struct ocfs2_lock_res     os_lockres;     /* lock to synchronize 
>>> scans */
>>> +    struct delayed_work     os_orphan_scan_work;
>>> +    u32              os_seqno;       /* incremented on every scan */
>>> +};
>>> +
>>>  struct ocfs2_dlm_debug {
>>>      struct kref d_refcnt;
>>>      struct dentry *d_locking_state;
>>> @@ -341,6 +349,8 @@ struct ocfs2_super
>>>      unsigned int            *osb_orphan_wipes;
>>>      wait_queue_head_t        osb_wipe_event;
>>>  
>>> +    struct ocfs2_orphan_scan     osb_orphan_scan; +
>>>      /* used to protect metaecc calculation check of xattr. */
>>>      spinlock_t osb_xattr_lock;
>>>  
>>> diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
>>> index a53ce87..fcdba09 100644
>>> --- a/fs/ocfs2/ocfs2_lockid.h
>>> +++ b/fs/ocfs2/ocfs2_lockid.h
>>> @@ -48,6 +48,7 @@ enum ocfs2_lock_type {
>>>      OCFS2_LOCK_TYPE_FLOCK,
>>>      OCFS2_LOCK_TYPE_QINFO,
>>>      OCFS2_LOCK_TYPE_NFS_SYNC,
>>> +    OCFS2_LOCK_TYPE_ORPHAN_SCAN,
>>>      OCFS2_NUM_LOCK_TYPES
>>>  };
>>>  
>>> @@ -85,6 +86,9 @@ static inline char ocfs2_lock_type_char(enum 
>>> ocfs2_lock_type type)
>>>          case OCFS2_LOCK_TYPE_NFS_SYNC:
>>>              c = 'Y';
>>>              break;
>>> +        case OCFS2_LOCK_TYPE_ORPHAN_SCAN:
>>> +            c = 'P';
>>> +            break;
>>>          default:
>>>              c = '\0';
>>>      }
>>> @@ -104,6 +108,7 @@ static char *ocfs2_lock_type_strings[] = {
>>>      [OCFS2_LOCK_TYPE_OPEN] = "Open",
>>>      [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
>>>      [OCFS2_LOCK_TYPE_QINFO] = "Quota",
>>> +    [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
>>>  };
>>>  
>>>  static inline const char *ocfs2_lock_type_string(enum 
>>> ocfs2_lock_type type)
>>> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
>>> index 79ff8d9..44ac27e 100644
>>> --- a/fs/ocfs2/super.c
>>> +++ b/fs/ocfs2/super.c
>>> @@ -1802,6 +1802,8 @@ static void ocfs2_dismount_volume(struct 
>>> super_block *sb, int mnt_err)
>>>  
>>>      ocfs2_truncate_log_shutdown(osb);
>>>  
>>> +    ocfs2_orphan_scan_stop(osb);
>>> +
>>>      /* This will disable recovery and flush any recovery work. */
>>>      ocfs2_recovery_exit(osb);
>>>  
>>> @@ -1957,6 +1959,13 @@ static int ocfs2_initialize_super(struct 
>>> super_block *sb,
>>>          goto bail;
>>>      }
>>>  
>>> +    status = ocfs2_orphan_scan_init(osb);
>>> +    if (status) {
>>> +        mlog(ML_ERROR, "Unable to initialize delayed orphan scan\n");
>>> +        mlog_errno(status);
>>> +        goto bail;
>>> +    }
>>> +
>>>      init_waitqueue_head(&osb->checkpoint_event);
>>>      atomic_set(&osb->needs_checkpoint, 0);
>>>