All of lore.kernel.org
 help / color / mirror / Atom feed
From: Alex Elder <elder@linaro.org>
To: Ilya Dryomov <idryomov@gmail.com>, ceph-devel@vger.kernel.org
Cc: Sage Weil <sweil@redhat.com>
Subject: Re: [PATCH] libceph: fix double __remove_osd() problem
Date: Wed, 18 Feb 2015 09:43:43 -0600	[thread overview]
Message-ID: <54E4B32F.1020407@linaro.org> (raw)
In-Reply-To: <1424265940-40771-1-git-send-email-idryomov@gmail.com>

On 02/18/2015 07:25 AM, Ilya Dryomov wrote:
> It turns out it's possible to get __remove_osd() called twice on the
> same OSD.  That doesn't sit well with rb_erase() - depending on the
> shape of the tree we can get a NULL dereference, a soft lockup or
> a random crash at some point in the future as we end up touching freed
> memory.  One scenario that I was able to reproduce is as follows:
> 
>             <osd3 is idle, on the osd lru list>
> <con reset - osd3>
> con_fault_finish()
>   osd_reset()
>                               <osdmap - osd3 down>
>                               ceph_osdc_handle_map()
>                                 <takes map_sem>
>                                 kick_requests()
>                                   <takes request_mutex>
>                                   reset_changed_osds()
>                                     __reset_osd()
>                                       __remove_osd()
>                                   <releases request_mutex>
>                                 <releases map_sem>
>     <takes map_sem>
>     <takes request_mutex>
>     __kick_osd_requests()
>       __reset_osd()
>         __remove_osd() <-- !!!
> 
> A case can be made that osd refcounting is imperfect and reworking it
> would be a proper resolution, but for now Sage and I decided to fix
> this by adding a safe guard around __remove_osd().
> 
> Fixes: http://tracker.ceph.com/issues/8087

So now you rely on the RB node in the osd getting cleared
as a signal that it has been removed already.  Yes, that
sounds like refcounting isn't working as desired...

The mutex around all calls to (now) remove_osd() avoids
races.  I like the RB_CLEAR_NODE() call anyway.
OK, enough chit chat.  This looks OK to me.

Reviewed-by: Alex Elder <elder@linaro.org>

> Cc: Sage Weil <sweil@redhat.com>
> Cc: stable@vger.kernel.org # 3.9+: 7c6e6fc53e73: libceph: assert both regular and lingering lists in __remove_osd()
> Cc: stable@vger.kernel.org # 3.9+: cc9f1f518cec: libceph: change from BUG to WARN for __remove_osd() asserts
> Cc: stable@vger.kernel.org # 3.9+
> Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
> ---
>  net/ceph/osd_client.c | 26 ++++++++++++++++++--------
>  1 file changed, 18 insertions(+), 8 deletions(-)
> 
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 53299c7b0ca4..f693a2f8ac86 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -1048,14 +1048,24 @@ static void put_osd(struct ceph_osd *osd)
>   */
>  static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
>  {
> -	dout("__remove_osd %p\n", osd);
> +	dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
>  	WARN_ON(!list_empty(&osd->o_requests));
>  	WARN_ON(!list_empty(&osd->o_linger_requests));
>  
> -	rb_erase(&osd->o_node, &osdc->osds);
>  	list_del_init(&osd->o_osd_lru);
> -	ceph_con_close(&osd->o_con);
> -	put_osd(osd);
> +	rb_erase(&osd->o_node, &osdc->osds);
> +	RB_CLEAR_NODE(&osd->o_node);
> +}
> +
> +static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
> +{
> +	dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
> +
> +	if (!RB_EMPTY_NODE(&osd->o_node)) {
> +		ceph_con_close(&osd->o_con);
> +		__remove_osd(osdc, osd);
> +		put_osd(osd);
> +	}
>  }
>  
>  static void remove_all_osds(struct ceph_osd_client *osdc)
> @@ -1065,7 +1075,7 @@ static void remove_all_osds(struct ceph_osd_client *osdc)
>  	while (!RB_EMPTY_ROOT(&osdc->osds)) {
>  		struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
>  						struct ceph_osd, o_node);
> -		__remove_osd(osdc, osd);
> +		remove_osd(osdc, osd);
>  	}
>  	mutex_unlock(&osdc->request_mutex);
>  }
> @@ -1106,7 +1116,7 @@ static void remove_old_osds(struct ceph_osd_client *osdc)
>  	list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
>  		if (time_before(jiffies, osd->lru_ttl))
>  			break;
> -		__remove_osd(osdc, osd);
> +		remove_osd(osdc, osd);
>  	}
>  	mutex_unlock(&osdc->request_mutex);
>  }
> @@ -1121,8 +1131,7 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
>  	dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
>  	if (list_empty(&osd->o_requests) &&
>  	    list_empty(&osd->o_linger_requests)) {
> -		__remove_osd(osdc, osd);
> -
> +		remove_osd(osdc, osd);
>  		return -ENODEV;
>  	}
>  
> @@ -1926,6 +1935,7 @@ static void reset_changed_osds(struct ceph_osd_client *osdc)
>  {
>  	struct rb_node *p, *n;
>  
> +	dout("%s %p\n", __func__, osdc);
>  	for (p = rb_first(&osdc->osds); p; p = n) {
>  		struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node);
>  
> 


      parent reply	other threads:[~2015-02-18 15:43 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-02-18 13:25 [PATCH] libceph: fix double __remove_osd() problem Ilya Dryomov
2015-02-18 15:18 ` Sage Weil
2015-02-18 15:43 ` Alex Elder [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=54E4B32F.1020407@linaro.org \
    --to=elder@linaro.org \
    --cc=ceph-devel@vger.kernel.org \
    --cc=idryomov@gmail.com \
    --cc=sweil@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.