* [PATCH 1/9] rbd: do not allow remove of mounted-on image
2012-12-13 16:57 [PATCH 0/9] ceph: re-post of bug fixes Alex Elder
@ 2012-12-13 17:01 ` Alex Elder
2012-12-15 1:19 ` Sage Weil
2012-12-13 17:01 ` [PATCH 2/9] ceph: don't reference req after put Alex Elder
` (7 subsequent siblings)
8 siblings, 1 reply; 26+ messages in thread
From: Alex Elder @ 2012-12-13 17:01 UTC (permalink / raw)
To: ceph-devel
There is no check in rbd_remove() to see if anybody holds open the
image being removed. That's not cool.
Add a simple open count that goes up and down with opens and closes
(releases) of the device, and don't allow an rbd image to be removed
if the count is non-zero.
Protect the updates of the open count value with ctl_mutex to ensure
the underlying rbd device doesn't get removed while concurrently
being opened.
Signed-off-by: Alex Elder <elder@inktank.com>
---
drivers/block/rbd.c | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 842caf4..c7bf961 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -235,6 +235,7 @@ struct rbd_device {
/* sysfs related */
struct device dev;
+ unsigned long open_count;
};
static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
@@ -309,8 +310,11 @@ static int rbd_open(struct block_device *bdev,
fmode_t mode)
if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
return -EROFS;
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
rbd_get_dev(rbd_dev);
set_device_ro(bdev, rbd_dev->mapping.read_only);
+ rbd_dev->open_count++;
+ mutex_unlock(&ctl_mutex);
return 0;
}
@@ -319,7 +323,11 @@ static int rbd_release(struct gendisk *disk,
fmode_t mode)
{
struct rbd_device *rbd_dev = disk->private_data;
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+ rbd_assert(rbd_dev->open_count > 0);
+ rbd_dev->open_count--;
rbd_put_dev(rbd_dev);
+ mutex_unlock(&ctl_mutex);
return 0;
}
@@ -3745,6 +3753,11 @@ static ssize_t rbd_remove(struct bus_type *bus,
goto done;
}
+ if (rbd_dev->open_count) {
+ ret = -EBUSY;
+ goto done;
+ }
+
rbd_remove_all_snaps(rbd_dev);
rbd_bus_del_dev(rbd_dev);
--
1.7.9.5
^ permalink raw reply related [flat|nested] 26+ messages in thread* Re: [PATCH 1/9] rbd: do not allow remove of mounted-on image
2012-12-13 17:01 ` [PATCH 1/9] rbd: do not allow remove of mounted-on image Alex Elder
@ 2012-12-15 1:19 ` Sage Weil
0 siblings, 0 replies; 26+ messages in thread
From: Sage Weil @ 2012-12-15 1:19 UTC (permalink / raw)
To: Alex Elder; +Cc: ceph-devel
Reviewed-by: Sage Weil <sage@inktank.com>
On Thu, 13 Dec 2012, Alex Elder wrote:
> There is no check in rbd_remove() to see if anybody holds open the
> image being removed. That's not cool.
>
> Add a simple open count that goes up and down with opens and closes
> (releases) of the device, and don't allow an rbd image to be removed
> if the count is non-zero.
>
> Protect the updates of the open count value with ctl_mutex to ensure
> the underlying rbd device doesn't get removed while concurrently
> being opened.
>
> Signed-off-by: Alex Elder <elder@inktank.com>
> ---
> drivers/block/rbd.c | 13 +++++++++++++
> 1 file changed, 13 insertions(+)
>
> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
> index 842caf4..c7bf961 100644
> --- a/drivers/block/rbd.c
> +++ b/drivers/block/rbd.c
> @@ -235,6 +235,7 @@ struct rbd_device {
>
> /* sysfs related */
> struct device dev;
> + unsigned long open_count;
> };
>
> static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
> @@ -309,8 +310,11 @@ static int rbd_open(struct block_device *bdev,
> fmode_t mode)
> if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
> return -EROFS;
>
> + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
> rbd_get_dev(rbd_dev);
> set_device_ro(bdev, rbd_dev->mapping.read_only);
> + rbd_dev->open_count++;
> + mutex_unlock(&ctl_mutex);
>
> return 0;
> }
> @@ -319,7 +323,11 @@ static int rbd_release(struct gendisk *disk,
> fmode_t mode)
> {
> struct rbd_device *rbd_dev = disk->private_data;
>
> + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
> + rbd_assert(rbd_dev->open_count > 0);
> + rbd_dev->open_count--;
> rbd_put_dev(rbd_dev);
> + mutex_unlock(&ctl_mutex);
>
> return 0;
> }
> @@ -3745,6 +3753,11 @@ static ssize_t rbd_remove(struct bus_type *bus,
> goto done;
> }
>
> + if (rbd_dev->open_count) {
> + ret = -EBUSY;
> + goto done;
> + }
> +
> rbd_remove_all_snaps(rbd_dev);
> rbd_bus_del_dev(rbd_dev);
>
> --
> 1.7.9.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH 2/9] ceph: don't reference req after put
2012-12-13 16:57 [PATCH 0/9] ceph: re-post of bug fixes Alex Elder
2012-12-13 17:01 ` [PATCH 1/9] rbd: do not allow remove of mounted-on image Alex Elder
@ 2012-12-13 17:01 ` Alex Elder
2012-12-13 17:01 ` [PATCH 3/9] libceph: avoid using freed osd in __kick_osd_requests() Alex Elder
` (6 subsequent siblings)
8 siblings, 0 replies; 26+ messages in thread
From: Alex Elder @ 2012-12-13 17:01 UTC (permalink / raw)
To: ceph-devel
In __unregister_request(), there is a call to list_del_init()
referencing a request that was the subject of a call to
ceph_osdc_put_request() on the previous line. This is not
safe, because the request structure could have been freed
by the time we reach the list_del_init().
Fix this by reversing the order of these lines.
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-off-by: Sage Weil <sage@inktank.com>
---
net/ceph/osd_client.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 7ebfe13..ac7be72 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -871,9 +871,9 @@ static void __unregister_request(struct
ceph_osd_client *osdc,
req->r_osd = NULL;
}
+ list_del_init(&req->r_req_lru_item);
ceph_osdc_put_request(req);
- list_del_init(&req->r_req_lru_item);
if (osdc->num_requests == 0) {
dout(" no requests, canceling timeout\n");
__cancel_osd_timeout(osdc);
--
1.7.9.5
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH 3/9] libceph: avoid using freed osd in __kick_osd_requests()
2012-12-13 16:57 [PATCH 0/9] ceph: re-post of bug fixes Alex Elder
2012-12-13 17:01 ` [PATCH 1/9] rbd: do not allow remove of mounted-on image Alex Elder
2012-12-13 17:01 ` [PATCH 2/9] ceph: don't reference req after put Alex Elder
@ 2012-12-13 17:01 ` Alex Elder
2012-12-15 1:23 ` Sage Weil
2012-12-13 17:01 ` [PATCH 4/9] rbd: get rid of RBD_MAX_SEG_NAME_LEN Alex Elder
` (5 subsequent siblings)
8 siblings, 1 reply; 26+ messages in thread
From: Alex Elder @ 2012-12-13 17:01 UTC (permalink / raw)
To: ceph-devel
If an osd has no requests and no linger requests, __reset_osd()
will just remove it with a call to __remove_osd(). That drops
a reference to the osd, and therefore the osd may have been free
by the time __reset_osd() returns. That function offers no
indication this may have occurred, and as a result the osd will
continue to be used even when it's no longer valid.
Change__reset_osd() so it returns an error (ENODEV) when it
deletes the osd being reset. And change __kick_osd_requests() so it
returns immediately (before referencing osd again) if __reset_osd()
returns *any* error.
Signed-off-by: Alex Elder <elder@inktank.com>
---
net/ceph/osd_client.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index ac7be72..60c74c1 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -581,7 +581,7 @@ static void __kick_osd_requests(struct
ceph_osd_client *osdc,
dout("__kick_osd_requests osd%d\n", osd->o_osd);
err = __reset_osd(osdc, osd);
- if (err == -EAGAIN)
+ if (err)
return;
list_for_each_entry(req, &osd->o_requests, r_osd_item) {
@@ -745,6 +745,7 @@ static int __reset_osd(struct ceph_osd_client *osdc,
struct ceph_osd *osd)
if (list_empty(&osd->o_requests) &&
list_empty(&osd->o_linger_requests)) {
__remove_osd(osdc, osd);
+ ret = -ENODEV;
} else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
&osd->o_con.peer_addr,
sizeof(osd->o_con.peer_addr)) == 0 &&
--
1.7.9.5
^ permalink raw reply related [flat|nested] 26+ messages in thread* Re: [PATCH 3/9] libceph: avoid using freed osd in __kick_osd_requests()
2012-12-13 17:01 ` [PATCH 3/9] libceph: avoid using freed osd in __kick_osd_requests() Alex Elder
@ 2012-12-15 1:23 ` Sage Weil
0 siblings, 0 replies; 26+ messages in thread
From: Sage Weil @ 2012-12-15 1:23 UTC (permalink / raw)
To: Alex Elder; +Cc: ceph-devel
Reviewed-by: Sage Weil <sage@inktank.com>
On Thu, 13 Dec 2012, Alex Elder wrote:
> If an osd has no requests and no linger requests, __reset_osd()
> will just remove it with a call to __remove_osd(). That drops
> a reference to the osd, and therefore the osd may have been free
> by the time __reset_osd() returns. That function offers no
> indication this may have occurred, and as a result the osd will
> continue to be used even when it's no longer valid.
>
> Change__reset_osd() so it returns an error (ENODEV) when it
> deletes the osd being reset. And change __kick_osd_requests() so it
> returns immediately (before referencing osd again) if __reset_osd()
> returns *any* error.
>
> Signed-off-by: Alex Elder <elder@inktank.com>
> ---
> net/ceph/osd_client.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index ac7be72..60c74c1 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -581,7 +581,7 @@ static void __kick_osd_requests(struct
> ceph_osd_client *osdc,
>
> dout("__kick_osd_requests osd%d\n", osd->o_osd);
> err = __reset_osd(osdc, osd);
> - if (err == -EAGAIN)
> + if (err)
> return;
>
> list_for_each_entry(req, &osd->o_requests, r_osd_item) {
> @@ -745,6 +745,7 @@ static int __reset_osd(struct ceph_osd_client *osdc,
> struct ceph_osd *osd)
> if (list_empty(&osd->o_requests) &&
> list_empty(&osd->o_linger_requests)) {
> __remove_osd(osdc, osd);
> + ret = -ENODEV;
> } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
> &osd->o_con.peer_addr,
> sizeof(osd->o_con.peer_addr)) == 0 &&
> --
> 1.7.9.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH 4/9] rbd: get rid of RBD_MAX_SEG_NAME_LEN
2012-12-13 16:57 [PATCH 0/9] ceph: re-post of bug fixes Alex Elder
` (2 preceding siblings ...)
2012-12-13 17:01 ` [PATCH 3/9] libceph: avoid using freed osd in __kick_osd_requests() Alex Elder
@ 2012-12-13 17:01 ` Alex Elder
2012-12-15 1:26 ` Sage Weil
2012-12-13 17:02 ` [PATCH 5/9] libceph: init osd->o_node in create_osd() Alex Elder
` (4 subsequent siblings)
8 siblings, 1 reply; 26+ messages in thread
From: Alex Elder @ 2012-12-13 17:01 UTC (permalink / raw)
To: ceph-devel
RBD_MAX_SEG_NAME_LEN represents the maximum length of an rbd object
name (i.e., one of the objects providing storage backing an rbd
image).
Another symbol, MAX_OBJ_NAME_SIZE, is used in the osd client code to
define the maximum length of any object name in an osd request.
Right now they disagree, with RBD_MAX_SEG_NAME_LEN being too big.
There's no real benefit at this point to defining the rbd object
name length limit separate from any other object name, so just
get rid of RBD_MAX_SEG_NAME_LEN and use MAX_OBJ_NAME_SIZE in its
place.
Signed-off-by: Alex Elder <elder@inktank.com>
---
drivers/block/rbd.c | 6 +++---
drivers/block/rbd_types.h | 2 --
2 files changed, 3 insertions(+), 5 deletions(-)
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index c7bf961..ce26b749 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -740,13 +740,13 @@ static char *rbd_segment_name(struct rbd_device
*rbd_dev, u64 offset)
u64 segment;
int ret;
- name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
+ name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
if (!name)
return NULL;
segment = offset >> rbd_dev->header.obj_order;
- ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
+ ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
rbd_dev->header.object_prefix, segment);
- if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
+ if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
pr_err("error formatting segment name for #%llu (%d)\n",
segment, ret);
kfree(name);
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
index cbe77fa..49d77cb 100644
--- a/drivers/block/rbd_types.h
+++ b/drivers/block/rbd_types.h
@@ -46,8 +46,6 @@
#define RBD_MIN_OBJ_ORDER 16
#define RBD_MAX_OBJ_ORDER 30
-#define RBD_MAX_SEG_NAME_LEN 128
-
#define RBD_COMP_NONE 0
#define RBD_CRYPT_NONE 0
--
1.7.9.5
^ permalink raw reply related [flat|nested] 26+ messages in thread* Re: [PATCH 4/9] rbd: get rid of RBD_MAX_SEG_NAME_LEN
2012-12-13 17:01 ` [PATCH 4/9] rbd: get rid of RBD_MAX_SEG_NAME_LEN Alex Elder
@ 2012-12-15 1:26 ` Sage Weil
0 siblings, 0 replies; 26+ messages in thread
From: Sage Weil @ 2012-12-15 1:26 UTC (permalink / raw)
To: Alex Elder; +Cc: ceph-devel
Reviewed-by: Sage Weil <sage@inktank.com>
On Thu, 13 Dec 2012, Alex Elder wrote:
> RBD_MAX_SEG_NAME_LEN represents the maximum length of an rbd object
> name (i.e., one of the objects providing storage backing an rbd
> image).
>
> Another symbol, MAX_OBJ_NAME_SIZE, is used in the osd client code to
> define the maximum length of any object name in an osd request.
>
> Right now they disagree, with RBD_MAX_SEG_NAME_LEN being too big.
>
> There's no real benefit at this point to defining the rbd object
> name length limit separate from any other object name, so just
> get rid of RBD_MAX_SEG_NAME_LEN and use MAX_OBJ_NAME_SIZE in its
> place.
>
> Signed-off-by: Alex Elder <elder@inktank.com>
> ---
> drivers/block/rbd.c | 6 +++---
> drivers/block/rbd_types.h | 2 --
> 2 files changed, 3 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
> index c7bf961..ce26b749 100644
> --- a/drivers/block/rbd.c
> +++ b/drivers/block/rbd.c
> @@ -740,13 +740,13 @@ static char *rbd_segment_name(struct rbd_device
> *rbd_dev, u64 offset)
> u64 segment;
> int ret;
>
> - name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
> + name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
> if (!name)
> return NULL;
> segment = offset >> rbd_dev->header.obj_order;
> - ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
> + ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
> rbd_dev->header.object_prefix, segment);
> - if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
> + if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
> pr_err("error formatting segment name for #%llu (%d)\n",
> segment, ret);
> kfree(name);
> diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
> index cbe77fa..49d77cb 100644
> --- a/drivers/block/rbd_types.h
> +++ b/drivers/block/rbd_types.h
> @@ -46,8 +46,6 @@
> #define RBD_MIN_OBJ_ORDER 16
> #define RBD_MAX_OBJ_ORDER 30
>
> -#define RBD_MAX_SEG_NAME_LEN 128
> -
> #define RBD_COMP_NONE 0
> #define RBD_CRYPT_NONE 0
>
> --
> 1.7.9.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH 5/9] libceph: init osd->o_node in create_osd()
2012-12-13 16:57 [PATCH 0/9] ceph: re-post of bug fixes Alex Elder
` (3 preceding siblings ...)
2012-12-13 17:01 ` [PATCH 4/9] rbd: get rid of RBD_MAX_SEG_NAME_LEN Alex Elder
@ 2012-12-13 17:02 ` Alex Elder
2012-12-15 1:43 ` Sage Weil
2012-12-13 17:02 ` [PATCH 6/9] rbd: remove linger unconditionally Alex Elder
` (3 subsequent siblings)
8 siblings, 1 reply; 26+ messages in thread
From: Alex Elder @ 2012-12-13 17:02 UTC (permalink / raw)
To: ceph-devel
It turns out to be harmless but the red-black node o_node in the
ceph osd structure is not initialized in create_osd(). Add a
call to rb_init_node() initialize it.
Signed-off-by: Alex Elder <elder@inktank.com>
---
net/ceph/osd_client.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 60c74c1..470816c 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -642,6 +642,7 @@ static struct ceph_osd *create_osd(struct
ceph_osd_client *osdc, int onum)
atomic_set(&osd->o_ref, 1);
osd->o_osdc = osdc;
osd->o_osd = onum;
+ rb_init_node(&osd->o_node);
INIT_LIST_HEAD(&osd->o_requests);
INIT_LIST_HEAD(&osd->o_linger_requests);
INIT_LIST_HEAD(&osd->o_osd_lru);
--
1.7.9.5
^ permalink raw reply related [flat|nested] 26+ messages in thread* Re: [PATCH 5/9] libceph: init osd->o_node in create_osd()
2012-12-13 17:02 ` [PATCH 5/9] libceph: init osd->o_node in create_osd() Alex Elder
@ 2012-12-15 1:43 ` Sage Weil
2012-12-17 14:17 ` Alex Elder
0 siblings, 1 reply; 26+ messages in thread
From: Sage Weil @ 2012-12-15 1:43 UTC (permalink / raw)
To: Alex Elder; +Cc: ceph-devel
We should drop this one, I think. See upstream commit
4c199a93a2d36b277a9fd209a0f2793f8460a215. When we added the similar call
on teh request tree it caused some noise in linux-next and then got
removed.
sage
On Thu, 13 Dec 2012, Alex Elder wrote:
> It turns out to be harmless but the red-black node o_node in the
> ceph osd structure is not initialized in create_osd(). Add a
> call to rb_init_node() initialize it.
>
> Signed-off-by: Alex Elder <elder@inktank.com>
> ---
> net/ceph/osd_client.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 60c74c1..470816c 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -642,6 +642,7 @@ static struct ceph_osd *create_osd(struct
> ceph_osd_client *osdc, int onum)
> atomic_set(&osd->o_ref, 1);
> osd->o_osdc = osdc;
> osd->o_osd = onum;
> + rb_init_node(&osd->o_node);
> INIT_LIST_HEAD(&osd->o_requests);
> INIT_LIST_HEAD(&osd->o_linger_requests);
> INIT_LIST_HEAD(&osd->o_osd_lru);
> --
> 1.7.9.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH 5/9] libceph: init osd->o_node in create_osd()
2012-12-15 1:43 ` Sage Weil
@ 2012-12-17 14:17 ` Alex Elder
2012-12-17 16:45 ` Sage Weil
0 siblings, 1 reply; 26+ messages in thread
From: Alex Elder @ 2012-12-17 14:17 UTC (permalink / raw)
To: Sage Weil; +Cc: ceph-devel
On 12/14/2012 07:43 PM, Sage Weil wrote:
> We should drop this one, I think. See upstream commit
> 4c199a93a2d36b277a9fd209a0f2793f8460a215. When we added the similar call
> on teh request tree it caused some noise in linux-next and then got
> removed.
Well, we need to initialize it. In particular, there is a call
to RB_EMPTY_NODE() in __unregister_request() which assumes that
if a node is not in the tree, it has been initialized as it is
in RB_CLEAR_NODE(). Even if that's not a normal path (and even
if no current code path allows that condition), if we're using
RB_EMPTY_NODE() we need to be sure any node we'll pass is properly
set up or it's broken code.
So it's fine with me if we use RB_CLEAR_NODE() instead. Or, we
could stop using RB_EMPTY_NODE() (but that's more work to verify).
The same goes for the event passed to __remove_event(), but that
should be fixed in a separate patch.
Please let me know what you think.
-Alex
> sage
>
> On Thu, 13 Dec 2012, Alex Elder wrote:
>
>> It turns out to be harmless but the red-black node o_node in the
>> ceph osd structure is not initialized in create_osd(). Add a
>> call to rb_init_node() initialize it.
>>
>> Signed-off-by: Alex Elder <elder@inktank.com>
>> ---
>> net/ceph/osd_client.c | 1 +
>> 1 file changed, 1 insertion(+)
>>
>> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
>> index 60c74c1..470816c 100644
>> --- a/net/ceph/osd_client.c
>> +++ b/net/ceph/osd_client.c
>> @@ -642,6 +642,7 @@ static struct ceph_osd *create_osd(struct
>> ceph_osd_client *osdc, int onum)
>> atomic_set(&osd->o_ref, 1);
>> osd->o_osdc = osdc;
>> osd->o_osd = onum;
>> + rb_init_node(&osd->o_node);
>> INIT_LIST_HEAD(&osd->o_requests);
>> INIT_LIST_HEAD(&osd->o_linger_requests);
>> INIT_LIST_HEAD(&osd->o_osd_lru);
>> --
>> 1.7.9.5
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>
>>
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH 5/9] libceph: init osd->o_node in create_osd()
2012-12-17 14:17 ` Alex Elder
@ 2012-12-17 16:45 ` Sage Weil
2012-12-17 16:57 ` Alex Elder
0 siblings, 1 reply; 26+ messages in thread
From: Sage Weil @ 2012-12-17 16:45 UTC (permalink / raw)
To: Alex Elder; +Cc: ceph-devel
On Mon, 17 Dec 2012, Alex Elder wrote:
> On 12/14/2012 07:43 PM, Sage Weil wrote:
> > We should drop this one, I think. See upstream commit
> > 4c199a93a2d36b277a9fd209a0f2793f8460a215. When we added the similar call
> > on teh request tree it caused some noise in linux-next and then got
> > removed.
>
> Well, we need to initialize it. In particular, there is a call
> to RB_EMPTY_NODE() in __unregister_request() which assumes that
> if a node is not in the tree, it has been initialized as it is
> in RB_CLEAR_NODE(). Even if that's not a normal path (and even
> if no current code path allows that condition), if we're using
> RB_EMPTY_NODE() we need to be sure any node we'll pass is properly
> set up or it's broken code.
>
> So it's fine with me if we use RB_CLEAR_NODE() instead. Or, we
> could stop using RB_EMPTY_NODE() (but that's more work to verify).
It sounds like RB_CLEAR_NODE is the right path. From the commit message
it looks like rb_init_node() is broken and/or going away. Let's just
switch to that?
And I guess we probably need to do the same in the request tree, too?
s
> The same goes for the event passed to __remove_event(), but that
> should be fixed in a separate patch.
>
> Please let me know what you think.
>
> -Alex
>
> > sage
> >
> > On Thu, 13 Dec 2012, Alex Elder wrote:
> >
> >> It turns out to be harmless but the red-black node o_node in the
> >> ceph osd structure is not initialized in create_osd(). Add a
> >> call to rb_init_node() initialize it.
> >>
> >> Signed-off-by: Alex Elder <elder@inktank.com>
> >> ---
> >> net/ceph/osd_client.c | 1 +
> >> 1 file changed, 1 insertion(+)
> >>
> >> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> >> index 60c74c1..470816c 100644
> >> --- a/net/ceph/osd_client.c
> >> +++ b/net/ceph/osd_client.c
> >> @@ -642,6 +642,7 @@ static struct ceph_osd *create_osd(struct
> >> ceph_osd_client *osdc, int onum)
> >> atomic_set(&osd->o_ref, 1);
> >> osd->o_osdc = osdc;
> >> osd->o_osd = onum;
> >> + rb_init_node(&osd->o_node);
> >> INIT_LIST_HEAD(&osd->o_requests);
> >> INIT_LIST_HEAD(&osd->o_linger_requests);
> >> INIT_LIST_HEAD(&osd->o_osd_lru);
> >> --
> >> 1.7.9.5
> >>
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> >> the body of a message to majordomo@vger.kernel.org
> >> More majordomo info at http://vger.kernel.org/majordomo-info.html
> >>
> >>
>
>
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH 5/9] libceph: init osd->o_node in create_osd()
2012-12-17 16:45 ` Sage Weil
@ 2012-12-17 16:57 ` Alex Elder
0 siblings, 0 replies; 26+ messages in thread
From: Alex Elder @ 2012-12-17 16:57 UTC (permalink / raw)
To: Sage Weil; +Cc: ceph-devel
On 12/17/2012 10:45 AM, Sage Weil wrote:
> On Mon, 17 Dec 2012, Alex Elder wrote:
>> On 12/14/2012 07:43 PM, Sage Weil wrote:
>>> We should drop this one, I think. See upstream commit
>>> 4c199a93a2d36b277a9fd209a0f2793f8460a215. When we added the similar call
>>> on teh request tree it caused some noise in linux-next and then got
>>> removed.
>>
>> Well, we need to initialize it. In particular, there is a call
>> to RB_EMPTY_NODE() in __unregister_request() which assumes that
>> if a node is not in the tree, it has been initialized as it is
>> in RB_CLEAR_NODE(). Even if that's not a normal path (and even
>> if no current code path allows that condition), if we're using
>> RB_EMPTY_NODE() we need to be sure any node we'll pass is properly
>> set up or it's broken code.
>>
>> So it's fine with me if we use RB_CLEAR_NODE() instead. Or, we
>> could stop using RB_EMPTY_NODE() (but that's more work to verify).
>
> It sounds like RB_CLEAR_NODE is the right path. From the commit message
> it looks like rb_init_node() is broken and/or going away. Let's just
> switch to that?
Until you questioned it I actually hadn't looked deep enough to
know there was this good a reason to do the initialization... It
was more that I prefer to see things initialized explicitly sort
of thing.
> And I guess we probably need to do the same in the request tree, too?
Yes I already have a patch put together for that. I'll post it
shortly, I was waiting to hear back first.
-Alex
> s
>
>> The same goes for the event passed to __remove_event(), but that
>> should be fixed in a separate patch.
>>
>> Please let me know what you think.
>>
>> -Alex
>>
>>> sage
>>>
>>> On Thu, 13 Dec 2012, Alex Elder wrote:
>>>
>>>> It turns out to be harmless but the red-black node o_node in the
>>>> ceph osd structure is not initialized in create_osd(). Add a
>>>> call to rb_init_node() initialize it.
>>>>
>>>> Signed-off-by: Alex Elder <elder@inktank.com>
>>>> ---
>>>> net/ceph/osd_client.c | 1 +
>>>> 1 file changed, 1 insertion(+)
>>>>
>>>> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
>>>> index 60c74c1..470816c 100644
>>>> --- a/net/ceph/osd_client.c
>>>> +++ b/net/ceph/osd_client.c
>>>> @@ -642,6 +642,7 @@ static struct ceph_osd *create_osd(struct
>>>> ceph_osd_client *osdc, int onum)
>>>> atomic_set(&osd->o_ref, 1);
>>>> osd->o_osdc = osdc;
>>>> osd->o_osd = onum;
>>>> + rb_init_node(&osd->o_node);
>>>> INIT_LIST_HEAD(&osd->o_requests);
>>>> INIT_LIST_HEAD(&osd->o_linger_requests);
>>>> INIT_LIST_HEAD(&osd->o_osd_lru);
>>>> --
>>>> 1.7.9.5
>>>>
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>>>
>>>>
>>
>>
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH 6/9] rbd: remove linger unconditionally
2012-12-13 16:57 [PATCH 0/9] ceph: re-post of bug fixes Alex Elder
` (4 preceding siblings ...)
2012-12-13 17:02 ` [PATCH 5/9] libceph: init osd->o_node in create_osd() Alex Elder
@ 2012-12-13 17:02 ` Alex Elder
2012-12-15 5:12 ` Sage Weil
2012-12-13 17:02 ` [PATCH 7/9] rbd: don't use ENOTSUPP Alex Elder
` (2 subsequent siblings)
8 siblings, 1 reply; 26+ messages in thread
From: Alex Elder @ 2012-12-13 17:02 UTC (permalink / raw)
To: ceph-devel
In __unregister_linger_request(), the request is being removed
from the osd client's req_linger list only when the request
has a non-null osd pointer. It should be done whether or not
the request currently has an osd.
This is most likely a non-issue because I believe the request
will always have an osd when this function is called.
Signed-off-by: Alex Elder <elder@inktank.com>
---
net/ceph/osd_client.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 470816c..b15b475 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -907,8 +907,8 @@ static void __unregister_linger_request(struct
ceph_osd_client *osdc,
struct ceph_osd_request *req)
{
dout("__unregister_linger_request %p\n", req);
+ list_del_init(&req->r_linger_item);
if (req->r_osd) {
- list_del_init(&req->r_linger_item);
list_del_init(&req->r_linger_osd);
if (list_empty(&req->r_osd->o_requests) &&
--
1.7.9.5
^ permalink raw reply related [flat|nested] 26+ messages in thread* Re: [PATCH 6/9] rbd: remove linger unconditionally
2012-12-13 17:02 ` [PATCH 6/9] rbd: remove linger unconditionally Alex Elder
@ 2012-12-15 5:12 ` Sage Weil
0 siblings, 0 replies; 26+ messages in thread
From: Sage Weil @ 2012-12-15 5:12 UTC (permalink / raw)
To: Alex Elder; +Cc: ceph-devel
Reviewed-by: Sage Weil <sage@inktank.com>
On Thu, 13 Dec 2012, Alex Elder wrote:
> In __unregister_linger_request(), the request is being removed
> from the osd client's req_linger list only when the request
> has a non-null osd pointer. It should be done whether or not
> the request currently has an osd.
>
> This is most likely a non-issue because I believe the request
> will always have an osd when this function is called.
>
> Signed-off-by: Alex Elder <elder@inktank.com>
> ---
> net/ceph/osd_client.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 470816c..b15b475 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -907,8 +907,8 @@ static void __unregister_linger_request(struct
> ceph_osd_client *osdc,
> struct ceph_osd_request *req)
> {
> dout("__unregister_linger_request %p\n", req);
> + list_del_init(&req->r_linger_item);
> if (req->r_osd) {
> - list_del_init(&req->r_linger_item);
> list_del_init(&req->r_linger_osd);
>
> if (list_empty(&req->r_osd->o_requests) &&
> --
> 1.7.9.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH 7/9] rbd: don't use ENOTSUPP
2012-12-13 16:57 [PATCH 0/9] ceph: re-post of bug fixes Alex Elder
` (5 preceding siblings ...)
2012-12-13 17:02 ` [PATCH 6/9] rbd: remove linger unconditionally Alex Elder
@ 2012-12-13 17:02 ` Alex Elder
2012-12-15 5:12 ` Sage Weil
2012-12-13 17:02 ` [PATCH 8/9] rbd: fix ceph_pg_poolid_by_name() Alex Elder
2012-12-13 17:03 ` [PATCH 9/9] libceph: socket can close in any connection state Alex Elder
8 siblings, 1 reply; 26+ messages in thread
From: Alex Elder @ 2012-12-13 17:02 UTC (permalink / raw)
To: ceph-devel
ENOTSUPP is not a standard errno (it shows up as "Unknown error 524"
in an error message). This is what was getting produced when the
the local rbd code does not implement features required by a
discovered rbd image.
Change the error code returned in this case to ENXIO.
Signed-off-by: Alex Elder <elder@inktank.com>
---
drivers/block/rbd.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index ce26b749..4daa400 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2456,7 +2456,7 @@ static int _rbd_dev_v2_snap_features(struct
rbd_device *rbd_dev, u64 snap_id,
incompat = le64_to_cpu(features_buf.incompat);
if (incompat & ~RBD_FEATURES_ALL)
- return -ENOTSUPP;
+ return -ENXIO;
*snap_features = le64_to_cpu(features_buf.features);
--
1.7.9.5
^ permalink raw reply related [flat|nested] 26+ messages in thread* Re: [PATCH 7/9] rbd: don't use ENOTSUPP
2012-12-13 17:02 ` [PATCH 7/9] rbd: don't use ENOTSUPP Alex Elder
@ 2012-12-15 5:12 ` Sage Weil
0 siblings, 0 replies; 26+ messages in thread
From: Sage Weil @ 2012-12-15 5:12 UTC (permalink / raw)
To: Alex Elder; +Cc: ceph-devel
Reviewed-by: Sage Weil <sage@inktank.com>
On Thu, 13 Dec 2012, Alex Elder wrote:
> ENOTSUPP is not a standard errno (it shows up as "Unknown error 524"
> in an error message). This is what was getting produced when the
> the local rbd code does not implement features required by a
> discovered rbd image.
>
> Change the error code returned in this case to ENXIO.
>
> Signed-off-by: Alex Elder <elder@inktank.com>
> ---
> drivers/block/rbd.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
> index ce26b749..4daa400 100644
> --- a/drivers/block/rbd.c
> +++ b/drivers/block/rbd.c
> @@ -2456,7 +2456,7 @@ static int _rbd_dev_v2_snap_features(struct
> rbd_device *rbd_dev, u64 snap_id,
>
> incompat = le64_to_cpu(features_buf.incompat);
> if (incompat & ~RBD_FEATURES_ALL)
> - return -ENOTSUPP;
> + return -ENXIO;
>
> *snap_features = le64_to_cpu(features_buf.features);
>
> --
> 1.7.9.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH 8/9] rbd: fix ceph_pg_poolid_by_name()
2012-12-13 16:57 [PATCH 0/9] ceph: re-post of bug fixes Alex Elder
` (6 preceding siblings ...)
2012-12-13 17:02 ` [PATCH 7/9] rbd: don't use ENOTSUPP Alex Elder
@ 2012-12-13 17:02 ` Alex Elder
2012-12-15 5:17 ` Sage Weil
2012-12-13 17:03 ` [PATCH 9/9] libceph: socket can close in any connection state Alex Elder
8 siblings, 1 reply; 26+ messages in thread
From: Alex Elder @ 2012-12-13 17:02 UTC (permalink / raw)
To: ceph-devel
Currently ceph_pg_poolid_by_name() returns an int, which is used to
encode a ceph pool id. This could be a problem because a pool id
(at least in some cases) is a 64-bit value. We have a defined pool
id value that represents "no pool," and that's a very sensible
return value here.
This patch changes ceph_pg_poolid_by_name() to return a 64-bit
pool id value, or CEPH_NOPOOL if the named pool is not found.
The patch also gratuitously renames the function, separating "pool"
from "id" in the name by an underscore.
Signed-off-by: Alex Elder <elder@inktank.com>
---
drivers/block/rbd.c | 6 +++---
include/linux/ceph/osdmap.h | 2 +-
net/ceph/osdmap.c | 14 ++++++++------
3 files changed, 12 insertions(+), 10 deletions(-)
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 4daa400..706824b 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3642,11 +3642,11 @@ static ssize_t rbd_add(struct bus_type *bus,
ceph_opts = NULL; /* rbd_dev client now owns this */
/* pick the pool */
+ rc = -ENOENT;
osdc = &rbdc->client->osdc;
- rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
- if (rc < 0)
+ spec->pool_id = ceph_pg_pool_id_by_name(osdc->osdmap, spec->pool_name);
+ if (spec->pool_id == CEPH_NOPOOL)
goto err_out_client;
- spec->pool_id = (u64) rc;
rbd_dev = rbd_dev_create(rbdc, spec);
if (!rbd_dev)
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 5ea57ba..c841396 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -124,6 +124,6 @@ extern int ceph_calc_pg_primary(struct ceph_osdmap
*osdmap,
struct ceph_pg pgid);
extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64
id);
-extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char
*name);
+extern __u64 ceph_pg_pool_id_by_name(struct ceph_osdmap *map, const
char *name);
#endif
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index de73214..27e904e 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -485,19 +485,21 @@ const char *ceph_pg_pool_name_by_id(struct
ceph_osdmap *map, u64 id)
}
EXPORT_SYMBOL(ceph_pg_pool_name_by_id);
-int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
+__u64 ceph_pg_pool_id_by_name(struct ceph_osdmap *map, const char *name)
{
struct rb_node *rbp;
for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
- struct ceph_pg_pool_info *pi =
- rb_entry(rbp, struct ceph_pg_pool_info, node);
+ struct ceph_pg_pool_info *pi;
+
+ pi = rb_entry(rbp, struct ceph_pg_pool_info, node);
if (pi->name && strcmp(pi->name, name) == 0)
- return pi->id;
+ return (__u64) pi->id;
}
- return -ENOENT;
+
+ return CEPH_NOPOOL;
}
-EXPORT_SYMBOL(ceph_pg_poolid_by_name);
+EXPORT_SYMBOL(ceph_pg_pool_id_by_name);
static void __remove_pg_pool(struct rb_root *root, struct
ceph_pg_pool_info *pi)
{
--
1.7.9.5
^ permalink raw reply related [flat|nested] 26+ messages in thread* Re: [PATCH 8/9] rbd: fix ceph_pg_poolid_by_name()
2012-12-13 17:02 ` [PATCH 8/9] rbd: fix ceph_pg_poolid_by_name() Alex Elder
@ 2012-12-15 5:17 ` Sage Weil
2012-12-17 14:36 ` Alex Elder
0 siblings, 1 reply; 26+ messages in thread
From: Sage Weil @ 2012-12-15 5:17 UTC (permalink / raw)
To: Alex Elder; +Cc: ceph-devel
Most of the code uses int64_t/__s64 for the pool id, although in a few
cases we screwed up and limited it to 32 bits. In reality, that's way
overkill anyway; we could have left it at 32 bits to begin with.
My first instinct would be to change the return type to long long or s64
and avoid the use magic #defines...
On Thu, 13 Dec 2012, Alex Elder wrote:
> Currently ceph_pg_poolid_by_name() returns an int, which is used to
> encode a ceph pool id. This could be a problem because a pool id
> (at least in some cases) is a 64-bit value. We have a defined pool
> id value that represents "no pool," and that's a very sensible
> return value here.
>
> This patch changes ceph_pg_poolid_by_name() to return a 64-bit
> pool id value, or CEPH_NOPOOL if the named pool is not found.
>
> The patch also gratuitously renames the function, separating "pool"
> from "id" in the name by an underscore.
>
> Signed-off-by: Alex Elder <elder@inktank.com>
> ---
> drivers/block/rbd.c | 6 +++---
> include/linux/ceph/osdmap.h | 2 +-
> net/ceph/osdmap.c | 14 ++++++++------
> 3 files changed, 12 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
> index 4daa400..706824b 100644
> --- a/drivers/block/rbd.c
> +++ b/drivers/block/rbd.c
> @@ -3642,11 +3642,11 @@ static ssize_t rbd_add(struct bus_type *bus,
> ceph_opts = NULL; /* rbd_dev client now owns this */
>
> /* pick the pool */
> + rc = -ENOENT;
> osdc = &rbdc->client->osdc;
> - rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
> - if (rc < 0)
> + spec->pool_id = ceph_pg_pool_id_by_name(osdc->osdmap, spec->pool_name);
> + if (spec->pool_id == CEPH_NOPOOL)
> goto err_out_client;
> - spec->pool_id = (u64) rc;
>
> rbd_dev = rbd_dev_create(rbdc, spec);
> if (!rbd_dev)
> diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
> index 5ea57ba..c841396 100644
> --- a/include/linux/ceph/osdmap.h
> +++ b/include/linux/ceph/osdmap.h
> @@ -124,6 +124,6 @@ extern int ceph_calc_pg_primary(struct ceph_osdmap
> *osdmap,
> struct ceph_pg pgid);
>
> extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64
> id);
> -extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char
> *name);
> +extern __u64 ceph_pg_pool_id_by_name(struct ceph_osdmap *map, const
> char *name);
>
> #endif
> diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
> index de73214..27e904e 100644
> --- a/net/ceph/osdmap.c
> +++ b/net/ceph/osdmap.c
> @@ -485,19 +485,21 @@ const char *ceph_pg_pool_name_by_id(struct
> ceph_osdmap *map, u64 id)
> }
> EXPORT_SYMBOL(ceph_pg_pool_name_by_id);
>
> -int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
> +__u64 ceph_pg_pool_id_by_name(struct ceph_osdmap *map, const char *name)
> {
> struct rb_node *rbp;
>
> for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
> - struct ceph_pg_pool_info *pi =
> - rb_entry(rbp, struct ceph_pg_pool_info, node);
> + struct ceph_pg_pool_info *pi;
> +
> + pi = rb_entry(rbp, struct ceph_pg_pool_info, node);
> if (pi->name && strcmp(pi->name, name) == 0)
> - return pi->id;
> + return (__u64) pi->id;
> }
> - return -ENOENT;
> +
> + return CEPH_NOPOOL;
> }
> -EXPORT_SYMBOL(ceph_pg_poolid_by_name);
> +EXPORT_SYMBOL(ceph_pg_pool_id_by_name);
>
> static void __remove_pg_pool(struct rb_root *root, struct
> ceph_pg_pool_info *pi)
> {
> --
> 1.7.9.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 8/9] rbd: fix ceph_pg_poolid_by_name()
2012-12-15 5:17 ` Sage Weil
@ 2012-12-17 14:36 ` Alex Elder
2012-12-17 16:49 ` Sage Weil
0 siblings, 1 reply; 26+ messages in thread
From: Alex Elder @ 2012-12-17 14:36 UTC (permalink / raw)
To: Sage Weil; +Cc: ceph-devel
On 12/14/2012 11:17 PM, Sage Weil wrote:
> Most of the code uses int64_t/__s64 for the pool id, although in a few
> cases we screwed up and limited it to 32 bits. In reality, that's way
> overkill anyway; we could have left it at 32 bits to begin with.
The differing types representing the same abstraction are precisely
the point of making this change. What really needs to happen is we
need to fix *that*; that is, decide whether a pool id is 32 or 64
bits, signed or not, and then make sure it's that and only that
throughout the code.
In the mean time, this change is defensive, making sure there's
no uncertainty in what we're dealing with within rbd. The code
will guarantee some future change won't inadvertently let a
wrong-sized pool id attempt to sneak through an interface
unnoticed. It may seem like overkill but this kind of bug is
really hard to track down, and it's better to simply preclude
it from happening.
> My first instinct would be to change the return type to long long or s64
> and avoid the use magic #defines...
I absolutely like using base types (like long long). But where
those types are used to represent a true abstraction (like a
snapshot id, or a pool id), it is the one place I think the use
of typedefs and "magic #defines" is actually a real help because
it makes explicit you're working with something more than an (e.g.)
an int. A typedef makes obviously to the reader that it's restricted
a bit (so, for example, it isn't meaningful to do math on it).
And symbolic constants make it a lot easier to search through
code for special situations like this.
This stuff is all sort of philosophical rather than technical.
The code before works, and the code as I've changed it works.
Anybody else have thoughts?
-Alex
>
>
> On Thu, 13 Dec 2012, Alex Elder wrote:
>
>> Currently ceph_pg_poolid_by_name() returns an int, which is used to
>> encode a ceph pool id. This could be a problem because a pool id
>> (at least in some cases) is a 64-bit value. We have a defined pool
>> id value that represents "no pool," and that's a very sensible
>> return value here.
>>
>> This patch changes ceph_pg_poolid_by_name() to return a 64-bit
>> pool id value, or CEPH_NOPOOL if the named pool is not found.
>>
>> The patch also gratuitously renames the function, separating "pool"
>> from "id" in the name by an underscore.
>>
>> Signed-off-by: Alex Elder <elder@inktank.com>
>> ---
>> drivers/block/rbd.c | 6 +++---
>> include/linux/ceph/osdmap.h | 2 +-
>> net/ceph/osdmap.c | 14 ++++++++------
>> 3 files changed, 12 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
>> index 4daa400..706824b 100644
>> --- a/drivers/block/rbd.c
>> +++ b/drivers/block/rbd.c
>> @@ -3642,11 +3642,11 @@ static ssize_t rbd_add(struct bus_type *bus,
>> ceph_opts = NULL; /* rbd_dev client now owns this */
>>
>> /* pick the pool */
>> + rc = -ENOENT;
>> osdc = &rbdc->client->osdc;
>> - rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
>> - if (rc < 0)
>> + spec->pool_id = ceph_pg_pool_id_by_name(osdc->osdmap, spec->pool_name);
>> + if (spec->pool_id == CEPH_NOPOOL)
>> goto err_out_client;
>> - spec->pool_id = (u64) rc;
>>
>> rbd_dev = rbd_dev_create(rbdc, spec);
>> if (!rbd_dev)
>> diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
>> index 5ea57ba..c841396 100644
>> --- a/include/linux/ceph/osdmap.h
>> +++ b/include/linux/ceph/osdmap.h
>> @@ -124,6 +124,6 @@ extern int ceph_calc_pg_primary(struct ceph_osdmap
>> *osdmap,
>> struct ceph_pg pgid);
>>
>> extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64
>> id);
>> -extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char
>> *name);
>> +extern __u64 ceph_pg_pool_id_by_name(struct ceph_osdmap *map, const
>> char *name);
>>
>> #endif
>> diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
>> index de73214..27e904e 100644
>> --- a/net/ceph/osdmap.c
>> +++ b/net/ceph/osdmap.c
>> @@ -485,19 +485,21 @@ const char *ceph_pg_pool_name_by_id(struct
>> ceph_osdmap *map, u64 id)
>> }
>> EXPORT_SYMBOL(ceph_pg_pool_name_by_id);
>>
>> -int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
>> +__u64 ceph_pg_pool_id_by_name(struct ceph_osdmap *map, const char *name)
>> {
>> struct rb_node *rbp;
>>
>> for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
>> - struct ceph_pg_pool_info *pi =
>> - rb_entry(rbp, struct ceph_pg_pool_info, node);
>> + struct ceph_pg_pool_info *pi;
>> +
>> + pi = rb_entry(rbp, struct ceph_pg_pool_info, node);
>> if (pi->name && strcmp(pi->name, name) == 0)
>> - return pi->id;
>> + return (__u64) pi->id;
>> }
>> - return -ENOENT;
>> +
>> + return CEPH_NOPOOL;
>> }
>> -EXPORT_SYMBOL(ceph_pg_poolid_by_name);
>> +EXPORT_SYMBOL(ceph_pg_pool_id_by_name);
>>
>> static void __remove_pg_pool(struct rb_root *root, struct
>> ceph_pg_pool_info *pi)
>> {
>> --
>> 1.7.9.5
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>
>>
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 8/9] rbd: fix ceph_pg_poolid_by_name()
2012-12-17 14:36 ` Alex Elder
@ 2012-12-17 16:49 ` Sage Weil
2012-12-17 17:09 ` Alex Elder
0 siblings, 1 reply; 26+ messages in thread
From: Sage Weil @ 2012-12-17 16:49 UTC (permalink / raw)
To: Alex Elder; +Cc: ceph-devel
On Mon, 17 Dec 2012, Alex Elder wrote:
> On 12/14/2012 11:17 PM, Sage Weil wrote:
> > Most of the code uses int64_t/__s64 for the pool id, although in a few
> > cases we screwed up and limited it to 32 bits. In reality, that's way
> > overkill anyway; we could have left it at 32 bits to begin with.
>
> The differing types representing the same abstraction are precisely
> the point of making this change. What really needs to happen is we
> need to fix *that*; that is, decide whether a pool id is 32 or 64
> bits, signed or not, and then make sure it's that and only that
> throughout the code.
>
> In the mean time, this change is defensive, making sure there's
> no uncertainty in what we're dealing with within rbd. The code
> will guarantee some future change won't inadvertently let a
> wrong-sized pool id attempt to sneak through an interface
> unnoticed. It may seem like overkill but this kind of bug is
> really hard to track down, and it's better to simply preclude
> it from happening.
>
> > My first instinct would be to change the return type to long long or s64
> > and avoid the use magic #defines...
>
> I absolutely like using base types (like long long). But where
> those types are used to represent a true abstraction (like a
> snapshot id, or a pool id), it is the one place I think the use
> of typedefs and "magic #defines" is actually a real help because
> it makes explicit you're working with something more than an (e.g.)
> an int. A typedef makes obviously to the reader that it's restricted
> a bit (so, for example, it isn't meaningful to do math on it).
Completely agreed.
> And symbolic constants make it a lot easier to search through
> code for special situations like this.
Okay with me. Just keep in mind that most of the other code looks for a
negative int64_t return value (i.e., the pool id is 63 bits).
The reason there is a mismatch: it used to be a 32-bit value, and at one
point we thought we'd do a pool per radosgw bucket and did a huge
conversion to 64-bit. And missed a few places. The whole transition was
ill-conceived and generally a bad idea, though; we should never have that
many pools. So it's not clear it's worth the effort to spend another
feature bit to fix it up.
sage
> This stuff is all sort of philosophical rather than technical.
> The code before works, and the code as I've changed it works.
>
> Anybody else have thoughts?
>
> -Alex
> >
> >
> > On Thu, 13 Dec 2012, Alex Elder wrote:
> >
> >> Currently ceph_pg_poolid_by_name() returns an int, which is used to
> >> encode a ceph pool id. This could be a problem because a pool id
> >> (at least in some cases) is a 64-bit value. We have a defined pool
> >> id value that represents "no pool," and that's a very sensible
> >> return value here.
> >>
> >> This patch changes ceph_pg_poolid_by_name() to return a 64-bit
> >> pool id value, or CEPH_NOPOOL if the named pool is not found.
> >>
> >> The patch also gratuitously renames the function, separating "pool"
> >> from "id" in the name by an underscore.
> >>
> >> Signed-off-by: Alex Elder <elder@inktank.com>
> >> ---
> >> drivers/block/rbd.c | 6 +++---
> >> include/linux/ceph/osdmap.h | 2 +-
> >> net/ceph/osdmap.c | 14 ++++++++------
> >> 3 files changed, 12 insertions(+), 10 deletions(-)
> >>
> >> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
> >> index 4daa400..706824b 100644
> >> --- a/drivers/block/rbd.c
> >> +++ b/drivers/block/rbd.c
> >> @@ -3642,11 +3642,11 @@ static ssize_t rbd_add(struct bus_type *bus,
> >> ceph_opts = NULL; /* rbd_dev client now owns this */
> >>
> >> /* pick the pool */
> >> + rc = -ENOENT;
> >> osdc = &rbdc->client->osdc;
> >> - rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
> >> - if (rc < 0)
> >> + spec->pool_id = ceph_pg_pool_id_by_name(osdc->osdmap, spec->pool_name);
> >> + if (spec->pool_id == CEPH_NOPOOL)
> >> goto err_out_client;
> >> - spec->pool_id = (u64) rc;
> >>
> >> rbd_dev = rbd_dev_create(rbdc, spec);
> >> if (!rbd_dev)
> >> diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
> >> index 5ea57ba..c841396 100644
> >> --- a/include/linux/ceph/osdmap.h
> >> +++ b/include/linux/ceph/osdmap.h
> >> @@ -124,6 +124,6 @@ extern int ceph_calc_pg_primary(struct ceph_osdmap
> >> *osdmap,
> >> struct ceph_pg pgid);
> >>
> >> extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64
> >> id);
> >> -extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char
> >> *name);
> >> +extern __u64 ceph_pg_pool_id_by_name(struct ceph_osdmap *map, const
> >> char *name);
> >>
> >> #endif
> >> diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
> >> index de73214..27e904e 100644
> >> --- a/net/ceph/osdmap.c
> >> +++ b/net/ceph/osdmap.c
> >> @@ -485,19 +485,21 @@ const char *ceph_pg_pool_name_by_id(struct
> >> ceph_osdmap *map, u64 id)
> >> }
> >> EXPORT_SYMBOL(ceph_pg_pool_name_by_id);
> >>
> >> -int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
> >> +__u64 ceph_pg_pool_id_by_name(struct ceph_osdmap *map, const char *name)
> >> {
> >> struct rb_node *rbp;
> >>
> >> for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
> >> - struct ceph_pg_pool_info *pi =
> >> - rb_entry(rbp, struct ceph_pg_pool_info, node);
> >> + struct ceph_pg_pool_info *pi;
> >> +
> >> + pi = rb_entry(rbp, struct ceph_pg_pool_info, node);
> >> if (pi->name && strcmp(pi->name, name) == 0)
> >> - return pi->id;
> >> + return (__u64) pi->id;
> >> }
> >> - return -ENOENT;
> >> +
> >> + return CEPH_NOPOOL;
> >> }
> >> -EXPORT_SYMBOL(ceph_pg_poolid_by_name);
> >> +EXPORT_SYMBOL(ceph_pg_pool_id_by_name);
> >>
> >> static void __remove_pg_pool(struct rb_root *root, struct
> >> ceph_pg_pool_info *pi)
> >> {
> >> --
> >> 1.7.9.5
> >>
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> >> the body of a message to majordomo@vger.kernel.org
> >> More majordomo info at http://vger.kernel.org/majordomo-info.html
> >>
> >>
>
>
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 8/9] rbd: fix ceph_pg_poolid_by_name()
2012-12-17 16:49 ` Sage Weil
@ 2012-12-17 17:09 ` Alex Elder
2012-12-17 21:28 ` Alex Elder
0 siblings, 1 reply; 26+ messages in thread
From: Alex Elder @ 2012-12-17 17:09 UTC (permalink / raw)
To: Sage Weil; +Cc: ceph-devel
On 12/17/2012 10:49 AM, Sage Weil wrote:
> On Mon, 17 Dec 2012, Alex Elder wrote:
>> On 12/14/2012 11:17 PM, Sage Weil wrote:
>>> Most of the code uses int64_t/__s64 for the pool id, although in a few
>>> cases we screwed up and limited it to 32 bits. In reality, that's way
>>> overkill anyway; we could have left it at 32 bits to begin with.
>>
>> The differing types representing the same abstraction are precisely
>> the point of making this change. What really needs to happen is we
>> need to fix *that*; that is, decide whether a pool id is 32 or 64
>> bits, signed or not, and then make sure it's that and only that
>> throughout the code.
>>
>> In the mean time, this change is defensive, making sure there's
>> no uncertainty in what we're dealing with within rbd. The code
>> will guarantee some future change won't inadvertently let a
>> wrong-sized pool id attempt to sneak through an interface
>> unnoticed. It may seem like overkill but this kind of bug is
>> really hard to track down, and it's better to simply preclude
>> it from happening.
>>
>>> My first instinct would be to change the return type to long long or s64
>>> and avoid the use magic #defines...
>>
>> I absolutely like using base types (like long long). But where
>> those types are used to represent a true abstraction (like a
>> snapshot id, or a pool id), it is the one place I think the use
>> of typedefs and "magic #defines" is actually a real help because
>> it makes explicit you're working with something more than an (e.g.)
>> an int. A typedef makes obviously to the reader that it's restricted
>> a bit (so, for example, it isn't meaningful to do math on it).
>
> Completely agreed.
>
>> And symbolic constants make it a lot easier to search through
>> code for special situations like this.
>
> Okay with me. Just keep in mind that most of the other code looks for a
> negative int64_t return value (i.e., the pool id is 63 bits).
I.e., if I do this here but not elsewhere we're subject to
the same kind of "someday" problems... In fact, it's just
a different form of mismatched type--here returning an unsigned
when elsewhere a signed value is assumed.
I still like the symbolic values, or in this case, maybe
a macro ceph_pool_id_valid() or something. It just makes
it easier to make other changes later, because you can
easily (or maybe more precisely) search for the effects
of a proposed change.
I find the time spent searching through code is large
enough that I tend to do things in a way that facilitates
that.
Let's talk about this today and come to an agreement
about the best way to resolve this.
Thanks.
-Alex
> The reason there is a mismatch: it used to be a 32-bit value, and at one
> point we thought we'd do a pool per radosgw bucket and did a huge
> conversion to 64-bit. And missed a few places. The whole transition was
> ill-conceived and generally a bad idea, though; we should never have that
> many pools. So it's not clear it's worth the effort to spend another
> feature bit to fix it up.
>
> sage
>
>> This stuff is all sort of philosophical rather than technical.
>> The code before works, and the code as I've changed it works.
>>
>> Anybody else have thoughts?
>>
>> -Alex
>>>
>>>
>>> On Thu, 13 Dec 2012, Alex Elder wrote:
>>>
>>>> Currently ceph_pg_poolid_by_name() returns an int, which is used to
>>>> encode a ceph pool id. This could be a problem because a pool id
>>>> (at least in some cases) is a 64-bit value. We have a defined pool
>>>> id value that represents "no pool," and that's a very sensible
>>>> return value here.
>>>>
>>>> This patch changes ceph_pg_poolid_by_name() to return a 64-bit
>>>> pool id value, or CEPH_NOPOOL if the named pool is not found.
>>>>
>>>> The patch also gratuitously renames the function, separating "pool"
>>>> from "id" in the name by an underscore.
>>>>
>>>> Signed-off-by: Alex Elder <elder@inktank.com>
>>>> ---
>>>> drivers/block/rbd.c | 6 +++---
>>>> include/linux/ceph/osdmap.h | 2 +-
>>>> net/ceph/osdmap.c | 14 ++++++++------
>>>> 3 files changed, 12 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
>>>> index 4daa400..706824b 100644
>>>> --- a/drivers/block/rbd.c
>>>> +++ b/drivers/block/rbd.c
>>>> @@ -3642,11 +3642,11 @@ static ssize_t rbd_add(struct bus_type *bus,
>>>> ceph_opts = NULL; /* rbd_dev client now owns this */
>>>>
>>>> /* pick the pool */
>>>> + rc = -ENOENT;
>>>> osdc = &rbdc->client->osdc;
>>>> - rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
>>>> - if (rc < 0)
>>>> + spec->pool_id = ceph_pg_pool_id_by_name(osdc->osdmap, spec->pool_name);
>>>> + if (spec->pool_id == CEPH_NOPOOL)
>>>> goto err_out_client;
>>>> - spec->pool_id = (u64) rc;
>>>>
>>>> rbd_dev = rbd_dev_create(rbdc, spec);
>>>> if (!rbd_dev)
>>>> diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
>>>> index 5ea57ba..c841396 100644
>>>> --- a/include/linux/ceph/osdmap.h
>>>> +++ b/include/linux/ceph/osdmap.h
>>>> @@ -124,6 +124,6 @@ extern int ceph_calc_pg_primary(struct ceph_osdmap
>>>> *osdmap,
>>>> struct ceph_pg pgid);
>>>>
>>>> extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64
>>>> id);
>>>> -extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char
>>>> *name);
>>>> +extern __u64 ceph_pg_pool_id_by_name(struct ceph_osdmap *map, const
>>>> char *name);
>>>>
>>>> #endif
>>>> diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
>>>> index de73214..27e904e 100644
>>>> --- a/net/ceph/osdmap.c
>>>> +++ b/net/ceph/osdmap.c
>>>> @@ -485,19 +485,21 @@ const char *ceph_pg_pool_name_by_id(struct
>>>> ceph_osdmap *map, u64 id)
>>>> }
>>>> EXPORT_SYMBOL(ceph_pg_pool_name_by_id);
>>>>
>>>> -int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
>>>> +__u64 ceph_pg_pool_id_by_name(struct ceph_osdmap *map, const char *name)
>>>> {
>>>> struct rb_node *rbp;
>>>>
>>>> for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
>>>> - struct ceph_pg_pool_info *pi =
>>>> - rb_entry(rbp, struct ceph_pg_pool_info, node);
>>>> + struct ceph_pg_pool_info *pi;
>>>> +
>>>> + pi = rb_entry(rbp, struct ceph_pg_pool_info, node);
>>>> if (pi->name && strcmp(pi->name, name) == 0)
>>>> - return pi->id;
>>>> + return (__u64) pi->id;
>>>> }
>>>> - return -ENOENT;
>>>> +
>>>> + return CEPH_NOPOOL;
>>>> }
>>>> -EXPORT_SYMBOL(ceph_pg_poolid_by_name);
>>>> +EXPORT_SYMBOL(ceph_pg_pool_id_by_name);
>>>>
>>>> static void __remove_pg_pool(struct rb_root *root, struct
>>>> ceph_pg_pool_info *pi)
>>>> {
>>>> --
>>>> 1.7.9.5
>>>>
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>>>
>>>>
>>
>>
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 8/9] rbd: fix ceph_pg_poolid_by_name()
2012-12-17 17:09 ` Alex Elder
@ 2012-12-17 21:28 ` Alex Elder
2012-12-17 22:31 ` Alex Elder
0 siblings, 1 reply; 26+ messages in thread
From: Alex Elder @ 2012-12-17 21:28 UTC (permalink / raw)
To: Sage Weil; +Cc: ceph-devel
On 12/17/2012 11:09 AM, Alex Elder wrote:
> On 12/17/2012 10:49 AM, Sage Weil wrote:
>> On Mon, 17 Dec 2012, Alex Elder wrote:
>>> On 12/14/2012 11:17 PM, Sage Weil wrote:
>>>> Most of the code uses int64_t/__s64 for the pool id, although in a few
>>>> cases we screwed up and limited it to 32 bits. In reality, that's way
>>>> overkill anyway; we could have left it at 32 bits to begin with.
I just wanted to follow up on this. Sage and I discussed
this today. He said that most of the other code involving
pool id's assume they are signed, so a negative pool id
indicates an error. Rather than introduce another new
way pool id's could be (mis)interpreted I agreed to rework
this patch so that it follows that basic pattern (i.e.,
considering pool id's signed).
I haven't looked at it closely yet but I still expect to
have a revised patch, though it's possible I'll conclude
leaving things as-is is best.
Some future cleanup task can take care of unifying everything
to 64 bits (or possibly 32 bits), but that's another day.
-Alex
>>> The differing types representing the same abstraction are precisely
>>> the point of making this change. What really needs to happen is we
>>> need to fix *that*; that is, decide whether a pool id is 32 or 64
>>> bits, signed or not, and then make sure it's that and only that
>>> throughout the code.
>>>
>>> In the mean time, this change is defensive, making sure there's
>>> no uncertainty in what we're dealing with within rbd. The code
>>> will guarantee some future change won't inadvertently let a
>>> wrong-sized pool id attempt to sneak through an interface
>>> unnoticed. It may seem like overkill but this kind of bug is
>>> really hard to track down, and it's better to simply preclude
>>> it from happening.
>>>
>>>> My first instinct would be to change the return type to long long or s64
>>>> and avoid the use magic #defines...
>>>
>>> I absolutely like using base types (like long long). But where
>>> those types are used to represent a true abstraction (like a
>>> snapshot id, or a pool id), it is the one place I think the use
>>> of typedefs and "magic #defines" is actually a real help because
>>> it makes explicit you're working with something more than an (e.g.)
>>> an int. A typedef makes obviously to the reader that it's restricted
>>> a bit (so, for example, it isn't meaningful to do math on it).
>>
>> Completely agreed.
>>
>>> And symbolic constants make it a lot easier to search through
>>> code for special situations like this.
>>
>> Okay with me. Just keep in mind that most of the other code looks for a
>> negative int64_t return value (i.e., the pool id is 63 bits).
>
> I.e., if I do this here but not elsewhere we're subject to
> the same kind of "someday" problems... In fact, it's just
> a different form of mismatched type--here returning an unsigned
> when elsewhere a signed value is assumed.
>
> I still like the symbolic values, or in this case, maybe
> a macro ceph_pool_id_valid() or something. It just makes
> it easier to make other changes later, because you can
> easily (or maybe more precisely) search for the effects
> of a proposed change.
>
> I find the time spent searching through code is large
> enough that I tend to do things in a way that facilitates
> that.
>
> Let's talk about this today and come to an agreement
> about the best way to resolve this.
>
> Thanks.
>
> -Alex
>
>> The reason there is a mismatch: it used to be a 32-bit value, and at one
>> point we thought we'd do a pool per radosgw bucket and did a huge
>> conversion to 64-bit. And missed a few places. The whole transition was
>> ill-conceived and generally a bad idea, though; we should never have that
>> many pools. So it's not clear it's worth the effort to spend another
>> feature bit to fix it up.
>>
>> sage
>>
>>> This stuff is all sort of philosophical rather than technical.
>>> The code before works, and the code as I've changed it works.
>>>
>>> Anybody else have thoughts?
>>>
>>> -Alex
>>>>
>>>>
>>>> On Thu, 13 Dec 2012, Alex Elder wrote:
>>>>
>>>>> Currently ceph_pg_poolid_by_name() returns an int, which is used to
>>>>> encode a ceph pool id. This could be a problem because a pool id
>>>>> (at least in some cases) is a 64-bit value. We have a defined pool
>>>>> id value that represents "no pool," and that's a very sensible
>>>>> return value here.
>>>>>
>>>>> This patch changes ceph_pg_poolid_by_name() to return a 64-bit
>>>>> pool id value, or CEPH_NOPOOL if the named pool is not found.
>>>>>
>>>>> The patch also gratuitously renames the function, separating "pool"
>>>>> from "id" in the name by an underscore.
>>>>>
>>>>> Signed-off-by: Alex Elder <elder@inktank.com>
>>>>> ---
>>>>> drivers/block/rbd.c | 6 +++---
>>>>> include/linux/ceph/osdmap.h | 2 +-
>>>>> net/ceph/osdmap.c | 14 ++++++++------
>>>>> 3 files changed, 12 insertions(+), 10 deletions(-)
>>>>>
>>>>> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
>>>>> index 4daa400..706824b 100644
>>>>> --- a/drivers/block/rbd.c
>>>>> +++ b/drivers/block/rbd.c
>>>>> @@ -3642,11 +3642,11 @@ static ssize_t rbd_add(struct bus_type *bus,
>>>>> ceph_opts = NULL; /* rbd_dev client now owns this */
>>>>>
>>>>> /* pick the pool */
>>>>> + rc = -ENOENT;
>>>>> osdc = &rbdc->client->osdc;
>>>>> - rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
>>>>> - if (rc < 0)
>>>>> + spec->pool_id = ceph_pg_pool_id_by_name(osdc->osdmap, spec->pool_name);
>>>>> + if (spec->pool_id == CEPH_NOPOOL)
>>>>> goto err_out_client;
>>>>> - spec->pool_id = (u64) rc;
>>>>>
>>>>> rbd_dev = rbd_dev_create(rbdc, spec);
>>>>> if (!rbd_dev)
>>>>> diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
>>>>> index 5ea57ba..c841396 100644
>>>>> --- a/include/linux/ceph/osdmap.h
>>>>> +++ b/include/linux/ceph/osdmap.h
>>>>> @@ -124,6 +124,6 @@ extern int ceph_calc_pg_primary(struct ceph_osdmap
>>>>> *osdmap,
>>>>> struct ceph_pg pgid);
>>>>>
>>>>> extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64
>>>>> id);
>>>>> -extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char
>>>>> *name);
>>>>> +extern __u64 ceph_pg_pool_id_by_name(struct ceph_osdmap *map, const
>>>>> char *name);
>>>>>
>>>>> #endif
>>>>> diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
>>>>> index de73214..27e904e 100644
>>>>> --- a/net/ceph/osdmap.c
>>>>> +++ b/net/ceph/osdmap.c
>>>>> @@ -485,19 +485,21 @@ const char *ceph_pg_pool_name_by_id(struct
>>>>> ceph_osdmap *map, u64 id)
>>>>> }
>>>>> EXPORT_SYMBOL(ceph_pg_pool_name_by_id);
>>>>>
>>>>> -int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
>>>>> +__u64 ceph_pg_pool_id_by_name(struct ceph_osdmap *map, const char *name)
>>>>> {
>>>>> struct rb_node *rbp;
>>>>>
>>>>> for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
>>>>> - struct ceph_pg_pool_info *pi =
>>>>> - rb_entry(rbp, struct ceph_pg_pool_info, node);
>>>>> + struct ceph_pg_pool_info *pi;
>>>>> +
>>>>> + pi = rb_entry(rbp, struct ceph_pg_pool_info, node);
>>>>> if (pi->name && strcmp(pi->name, name) == 0)
>>>>> - return pi->id;
>>>>> + return (__u64) pi->id;
>>>>> }
>>>>> - return -ENOENT;
>>>>> +
>>>>> + return CEPH_NOPOOL;
>>>>> }
>>>>> -EXPORT_SYMBOL(ceph_pg_poolid_by_name);
>>>>> +EXPORT_SYMBOL(ceph_pg_pool_id_by_name);
>>>>>
>>>>> static void __remove_pg_pool(struct rb_root *root, struct
>>>>> ceph_pg_pool_info *pi)
>>>>> {
>>>>> --
>>>>> 1.7.9.5
>>>>>
>>>>> --
>>>>> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
>>>>> the body of a message to majordomo@vger.kernel.org
>>>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>>>>
>>>>>
>>>
>>>
>
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 8/9] rbd: fix ceph_pg_poolid_by_name()
2012-12-17 21:28 ` Alex Elder
@ 2012-12-17 22:31 ` Alex Elder
0 siblings, 0 replies; 26+ messages in thread
From: Alex Elder @ 2012-12-17 22:31 UTC (permalink / raw)
To: Sage Weil; +Cc: ceph-devel
On 12/17/2012 03:28 PM, Alex Elder wrote:
> On 12/17/2012 11:09 AM, Alex Elder wrote:
>> On 12/17/2012 10:49 AM, Sage Weil wrote:
>>> On Mon, 17 Dec 2012, Alex Elder wrote:
>>>> On 12/14/2012 11:17 PM, Sage Weil wrote:
>>>>> Most of the code uses int64_t/__s64 for the pool id, although in a few
>>>>> cases we screwed up and limited it to 32 bits. In reality, that's way
>>>>> overkill anyway; we could have left it at 32 bits to begin with.
>
> I just wanted to follow up on this. Sage and I discussed
> this today. He said that most of the other code involving
> pool id's assume they are signed, so a negative pool id
> indicates an error. Rather than introduce another new
> way pool id's could be (mis)interpreted I agreed to rework
> this patch so that it follows that basic pattern (i.e.,
> considering pool id's signed).
>
> I haven't looked at it closely yet but I still expect to
> have a revised patch, though it's possible I'll conclude
> leaving things as-is is best.
OK, I've looked at it now. If we're not going to make
the change I proposed, we might as well not make a change
at all.
This was trying to make an incomplete fix to a much larger
problem. We'll try to get the bigger problem fixed another
day.
I'm retracting this patch from consideration.
-Alex
> Some future cleanup task can take care of unifying everything
> to 64 bits (or possibly 32 bits), but that's another day.
>
> -Alex
>
>
>>>> The differing types representing the same abstraction are precisely
>>>> the point of making this change. What really needs to happen is we
>>>> need to fix *that*; that is, decide whether a pool id is 32 or 64
>>>> bits, signed or not, and then make sure it's that and only that
>>>> throughout the code.
>>>>
>>>> In the mean time, this change is defensive, making sure there's
>>>> no uncertainty in what we're dealing with within rbd. The code
>>>> will guarantee some future change won't inadvertently let a
>>>> wrong-sized pool id attempt to sneak through an interface
>>>> unnoticed. It may seem like overkill but this kind of bug is
>>>> really hard to track down, and it's better to simply preclude
>>>> it from happening.
>>>>
>>>>> My first instinct would be to change the return type to long long or s64
>>>>> and avoid the use magic #defines...
>>>>
>>>> I absolutely like using base types (like long long). But where
>>>> those types are used to represent a true abstraction (like a
>>>> snapshot id, or a pool id), it is the one place I think the use
>>>> of typedefs and "magic #defines" is actually a real help because
>>>> it makes explicit you're working with something more than an (e.g.)
>>>> an int. A typedef makes obviously to the reader that it's restricted
>>>> a bit (so, for example, it isn't meaningful to do math on it).
>>>
>>> Completely agreed.
>>>
>>>> And symbolic constants make it a lot easier to search through
>>>> code for special situations like this.
>>>
>>> Okay with me. Just keep in mind that most of the other code looks for a
>>> negative int64_t return value (i.e., the pool id is 63 bits).
>>
>> I.e., if I do this here but not elsewhere we're subject to
>> the same kind of "someday" problems... In fact, it's just
>> a different form of mismatched type--here returning an unsigned
>> when elsewhere a signed value is assumed.
>>
>> I still like the symbolic values, or in this case, maybe
>> a macro ceph_pool_id_valid() or something. It just makes
>> it easier to make other changes later, because you can
>> easily (or maybe more precisely) search for the effects
>> of a proposed change.
>>
>> I find the time spent searching through code is large
>> enough that I tend to do things in a way that facilitates
>> that.
>>
>> Let's talk about this today and come to an agreement
>> about the best way to resolve this.
>>
>> Thanks.
>>
>> -Alex
>>
>>> The reason there is a mismatch: it used to be a 32-bit value, and at one
>>> point we thought we'd do a pool per radosgw bucket and did a huge
>>> conversion to 64-bit. And missed a few places. The whole transition was
>>> ill-conceived and generally a bad idea, though; we should never have that
>>> many pools. So it's not clear it's worth the effort to spend another
>>> feature bit to fix it up.
>>>
>>> sage
>>>
>>>> This stuff is all sort of philosophical rather than technical.
>>>> The code before works, and the code as I've changed it works.
>>>>
>>>> Anybody else have thoughts?
>>>>
>>>> -Alex
>>>>>
>>>>>
>>>>> On Thu, 13 Dec 2012, Alex Elder wrote:
>>>>>
>>>>>> Currently ceph_pg_poolid_by_name() returns an int, which is used to
>>>>>> encode a ceph pool id. This could be a problem because a pool id
>>>>>> (at least in some cases) is a 64-bit value. We have a defined pool
>>>>>> id value that represents "no pool," and that's a very sensible
>>>>>> return value here.
>>>>>>
>>>>>> This patch changes ceph_pg_poolid_by_name() to return a 64-bit
>>>>>> pool id value, or CEPH_NOPOOL if the named pool is not found.
>>>>>>
>>>>>> The patch also gratuitously renames the function, separating "pool"
>>>>>> from "id" in the name by an underscore.
>>>>>>
>>>>>> Signed-off-by: Alex Elder <elder@inktank.com>
>>>>>> ---
>>>>>> drivers/block/rbd.c | 6 +++---
>>>>>> include/linux/ceph/osdmap.h | 2 +-
>>>>>> net/ceph/osdmap.c | 14 ++++++++------
>>>>>> 3 files changed, 12 insertions(+), 10 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
>>>>>> index 4daa400..706824b 100644
>>>>>> --- a/drivers/block/rbd.c
>>>>>> +++ b/drivers/block/rbd.c
>>>>>> @@ -3642,11 +3642,11 @@ static ssize_t rbd_add(struct bus_type *bus,
>>>>>> ceph_opts = NULL; /* rbd_dev client now owns this */
>>>>>>
>>>>>> /* pick the pool */
>>>>>> + rc = -ENOENT;
>>>>>> osdc = &rbdc->client->osdc;
>>>>>> - rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
>>>>>> - if (rc < 0)
>>>>>> + spec->pool_id = ceph_pg_pool_id_by_name(osdc->osdmap, spec->pool_name);
>>>>>> + if (spec->pool_id == CEPH_NOPOOL)
>>>>>> goto err_out_client;
>>>>>> - spec->pool_id = (u64) rc;
>>>>>>
>>>>>> rbd_dev = rbd_dev_create(rbdc, spec);
>>>>>> if (!rbd_dev)
>>>>>> diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
>>>>>> index 5ea57ba..c841396 100644
>>>>>> --- a/include/linux/ceph/osdmap.h
>>>>>> +++ b/include/linux/ceph/osdmap.h
>>>>>> @@ -124,6 +124,6 @@ extern int ceph_calc_pg_primary(struct ceph_osdmap
>>>>>> *osdmap,
>>>>>> struct ceph_pg pgid);
>>>>>>
>>>>>> extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64
>>>>>> id);
>>>>>> -extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char
>>>>>> *name);
>>>>>> +extern __u64 ceph_pg_pool_id_by_name(struct ceph_osdmap *map, const
>>>>>> char *name);
>>>>>>
>>>>>> #endif
>>>>>> diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
>>>>>> index de73214..27e904e 100644
>>>>>> --- a/net/ceph/osdmap.c
>>>>>> +++ b/net/ceph/osdmap.c
>>>>>> @@ -485,19 +485,21 @@ const char *ceph_pg_pool_name_by_id(struct
>>>>>> ceph_osdmap *map, u64 id)
>>>>>> }
>>>>>> EXPORT_SYMBOL(ceph_pg_pool_name_by_id);
>>>>>>
>>>>>> -int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
>>>>>> +__u64 ceph_pg_pool_id_by_name(struct ceph_osdmap *map, const char *name)
>>>>>> {
>>>>>> struct rb_node *rbp;
>>>>>>
>>>>>> for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
>>>>>> - struct ceph_pg_pool_info *pi =
>>>>>> - rb_entry(rbp, struct ceph_pg_pool_info, node);
>>>>>> + struct ceph_pg_pool_info *pi;
>>>>>> +
>>>>>> + pi = rb_entry(rbp, struct ceph_pg_pool_info, node);
>>>>>> if (pi->name && strcmp(pi->name, name) == 0)
>>>>>> - return pi->id;
>>>>>> + return (__u64) pi->id;
>>>>>> }
>>>>>> - return -ENOENT;
>>>>>> +
>>>>>> + return CEPH_NOPOOL;
>>>>>> }
>>>>>> -EXPORT_SYMBOL(ceph_pg_poolid_by_name);
>>>>>> +EXPORT_SYMBOL(ceph_pg_pool_id_by_name);
>>>>>>
>>>>>> static void __remove_pg_pool(struct rb_root *root, struct
>>>>>> ceph_pg_pool_info *pi)
>>>>>> {
>>>>>> --
>>>>>> 1.7.9.5
>>>>>>
>>>>>> --
>>>>>> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
>>>>>> the body of a message to majordomo@vger.kernel.org
>>>>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>>>>>
>>>>>>
>>>>
>>>>
>>
>
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH 9/9] libceph: socket can close in any connection state
2012-12-13 16:57 [PATCH 0/9] ceph: re-post of bug fixes Alex Elder
` (7 preceding siblings ...)
2012-12-13 17:02 ` [PATCH 8/9] rbd: fix ceph_pg_poolid_by_name() Alex Elder
@ 2012-12-13 17:03 ` Alex Elder
2012-12-15 5:18 ` Sage Weil
8 siblings, 1 reply; 26+ messages in thread
From: Alex Elder @ 2012-12-13 17:03 UTC (permalink / raw)
To: ceph-devel
A connection's socket can close for any reason, independent of the
state of the connection (and without irrespective of the connection
mutex). As a result, the connectino can be in pretty much any state
at the time its socket is closed.
Handle those other cases at the top of con_work(). Pull this whole
block of code into a separate function to reduce the clutter.
Signed-off-by: Alex Elder <elder@inktank.com>
---
net/ceph/messenger.c | 47 ++++++++++++++++++++++++++++++-----------------
1 file changed, 30 insertions(+), 17 deletions(-)
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 1041114..4b04ccc 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2273,6 +2273,35 @@ static void queue_con(struct ceph_connection *con)
(void) queue_con_delay(con, 0);
}
+static bool con_sock_closed(struct ceph_connection *con)
+{
+ if (!test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags))
+ return false;
+
+#define CASE(x) \
+ case CON_STATE_ ## x: \
+ con->error_msg = "socket closed (con state " #x ")"; \
+ break;
+
+ switch (con->state) {
+ CASE(CLOSED);
+ CASE(PREOPEN);
+ CASE(CONNECTING);
+ CASE(NEGOTIATING);
+ CASE(OPEN);
+ CASE(STANDBY);
+ default:
+ pr_warning("%s con %p unrecognized state %lu\n",
+ __func__, con, con->state);
+ con->error_msg = "unrecognized con state";
+ BUG();
+ break;
+ }
+#undef CASE
+
+ return true;
+}
+
/*
* Do some work on a connection. Drop a connection ref when we're done.
*/
@@ -2284,24 +2313,8 @@ static void con_work(struct work_struct *work)
mutex_lock(&con->mutex);
restart:
- if (test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags)) {
- switch (con->state) {
- case CON_STATE_CONNECTING:
- con->error_msg = "connection failed";
- break;
- case CON_STATE_NEGOTIATING:
- con->error_msg = "negotiation failed";
- break;
- case CON_STATE_OPEN:
- con->error_msg = "socket closed";
- break;
- default:
- dout("unrecognized con state %d\n", (int)con->state);
- con->error_msg = "unrecognized con state";
- BUG();
- }
+ if (con_sock_closed(con))
goto fault;
- }
if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) {
dout("con_work %p backing off\n", con);
--
1.7.9.5
^ permalink raw reply related [flat|nested] 26+ messages in thread* Re: [PATCH 9/9] libceph: socket can close in any connection state
2012-12-13 17:03 ` [PATCH 9/9] libceph: socket can close in any connection state Alex Elder
@ 2012-12-15 5:18 ` Sage Weil
0 siblings, 0 replies; 26+ messages in thread
From: Sage Weil @ 2012-12-15 5:18 UTC (permalink / raw)
To: Alex Elder; +Cc: ceph-devel
Reviewed-by: Sage Weil <sage@inktank.com>
On Thu, 13 Dec 2012, Alex Elder wrote:
> A connection's socket can close for any reason, independent of the
> state of the connection (and without irrespective of the connection
> mutex). As a result, the connectino can be in pretty much any state
> at the time its socket is closed.
>
> Handle those other cases at the top of con_work(). Pull this whole
> block of code into a separate function to reduce the clutter.
>
> Signed-off-by: Alex Elder <elder@inktank.com>
> ---
> net/ceph/messenger.c | 47 ++++++++++++++++++++++++++++++-----------------
> 1 file changed, 30 insertions(+), 17 deletions(-)
>
> diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
> index 1041114..4b04ccc 100644
> --- a/net/ceph/messenger.c
> +++ b/net/ceph/messenger.c
> @@ -2273,6 +2273,35 @@ static void queue_con(struct ceph_connection *con)
> (void) queue_con_delay(con, 0);
> }
>
> +static bool con_sock_closed(struct ceph_connection *con)
> +{
> + if (!test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags))
> + return false;
> +
> +#define CASE(x) \
> + case CON_STATE_ ## x: \
> + con->error_msg = "socket closed (con state " #x ")"; \
> + break;
> +
> + switch (con->state) {
> + CASE(CLOSED);
> + CASE(PREOPEN);
> + CASE(CONNECTING);
> + CASE(NEGOTIATING);
> + CASE(OPEN);
> + CASE(STANDBY);
> + default:
> + pr_warning("%s con %p unrecognized state %lu\n",
> + __func__, con, con->state);
> + con->error_msg = "unrecognized con state";
> + BUG();
> + break;
> + }
> +#undef CASE
> +
> + return true;
> +}
> +
> /*
> * Do some work on a connection. Drop a connection ref when we're done.
> */
> @@ -2284,24 +2313,8 @@ static void con_work(struct work_struct *work)
>
> mutex_lock(&con->mutex);
> restart:
> - if (test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags)) {
> - switch (con->state) {
> - case CON_STATE_CONNECTING:
> - con->error_msg = "connection failed";
> - break;
> - case CON_STATE_NEGOTIATING:
> - con->error_msg = "negotiation failed";
> - break;
> - case CON_STATE_OPEN:
> - con->error_msg = "socket closed";
> - break;
> - default:
> - dout("unrecognized con state %d\n", (int)con->state);
> - con->error_msg = "unrecognized con state";
> - BUG();
> - }
> + if (con_sock_closed(con))
> goto fault;
> - }
>
> if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) {
> dout("con_work %p backing off\n", con);
> --
> 1.7.9.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
^ permalink raw reply [flat|nested] 26+ messages in thread