[PATCH 0/9] messenger fixups, batch #1

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 0/9] messenger fixups, batch #1
@ 2012-07-21  0:41 Sage Weil
  2012-07-21  0:41 ` [PATCH 1/9] libceph: move feature bits to separate header Sage Weil
                   ` (8 more replies)
  0 siblings, 9 replies; 30+ messages in thread
From: Sage Weil @ 2012-07-21  0:41 UTC (permalink / raw)
  To: ceph-devel; +Cc: Sage Weil

This is the series I elbowed into testing today.  There is the CRUSH
support, and some incremental cleanups and fixes to the messenger that my
testing with socket failure injection turned up.  The basic strategy here
is to move more things under the con->mutex and drop useless checks/calls
where possible.

Sage Weil (8):
  libceph: move feature bits to separate header
  libceph: report socket read/write error message
  libceph: fix mutex coverage for ceph_con_close
  libceph: resubmit linger ops when pg mapping changes
  libceph: (re)initialize bio_iter on start of message receive
  ceph: close old con before reopening on mds reconnect
  libceph: protect ceph_con_open() with mutex
  libceph: reset connection retry on successfully negotiation

caleb miles (1):
  libceph: support crush tunables

 fs/ceph/mds_client.c               |    2 +
 fs/ceph/super.c                    |    1 +
 include/linux/ceph/ceph_features.h |   26 ++++++++++++++++++++++++
 include/linux/ceph/ceph_fs.h       |   14 ------------
 include/linux/ceph/libceph.h       |    6 -----
 include/linux/crush/crush.h        |    8 +++++++
 net/ceph/ceph_common.c             |    5 ++-
 net/ceph/crush/mapper.c            |   13 ++++++-----
 net/ceph/messenger.c               |   31 +++++++++++++++++++++-------
 net/ceph/osd_client.c              |   26 +++++++++++++++++++----
 net/ceph/osdmap.c                  |   39 ++++++++++++++++++++++++++++++++++++
 11 files changed, 130 insertions(+), 41 deletions(-)
 create mode 100644 include/linux/ceph/ceph_features.h

-- 
1.7.9


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 1/9] libceph: move feature bits to separate header
  2012-07-21  0:41 [PATCH 0/9] messenger fixups, batch #1 Sage Weil
@ 2012-07-21  0:41 ` Sage Weil
  2012-07-24 22:14   ` Yehuda Sadeh
  2012-07-30 18:29   ` Alex Elder
  2012-07-21  0:41 ` [PATCH 2/9] libceph: support crush tunables Sage Weil
                   ` (7 subsequent siblings)
  8 siblings, 2 replies; 30+ messages in thread
From: Sage Weil @ 2012-07-21  0:41 UTC (permalink / raw)
  To: ceph-devel; +Cc: Sage Weil

This is simply cleanup that will keep things more closely synced with the
userland code.

Signed-off-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/mds_client.c               |    1 +
 fs/ceph/super.c                    |    1 +
 include/linux/ceph/ceph_features.h |   24 ++++++++++++++++++++++++
 include/linux/ceph/ceph_fs.h       |   14 --------------
 include/linux/ceph/libceph.h       |    6 ------
 net/ceph/ceph_common.c             |    5 +++--
 6 files changed, 29 insertions(+), 22 deletions(-)
 create mode 100644 include/linux/ceph/ceph_features.h

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 418f6a8..39b76d6 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -10,6 +10,7 @@
 #include "super.h"
 #include "mds_client.h"
 
+#include <linux/ceph/ceph_features.h>
 #include <linux/ceph/messenger.h>
 #include <linux/ceph/decode.h>
 #include <linux/ceph/pagelist.h>
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 1e67dd7..2c47ecf 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -18,6 +18,7 @@
 #include "super.h"
 #include "mds_client.h"
 
+#include <linux/ceph/ceph_features.h>
 #include <linux/ceph/decode.h>
 #include <linux/ceph/mon_client.h>
 #include <linux/ceph/auth.h>
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
new file mode 100644
index 0000000..342f93d
--- /dev/null
+++ b/include/linux/ceph/ceph_features.h
@@ -0,0 +1,24 @@
+#ifndef __CEPH_FEATURES
+#define __CEPH_FEATURES
+
+/*
+ * feature bits
+ */
+#define CEPH_FEATURE_UID            (1<<0)
+#define CEPH_FEATURE_NOSRCADDR      (1<<1)
+#define CEPH_FEATURE_MONCLOCKCHECK  (1<<2)
+#define CEPH_FEATURE_FLOCK          (1<<3)
+#define CEPH_FEATURE_SUBSCRIBE2     (1<<4)
+#define CEPH_FEATURE_MONNAMES       (1<<5)
+#define CEPH_FEATURE_RECONNECT_SEQ  (1<<6)
+#define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
+
+/*
+ * Features supported.
+ */
+#define CEPH_FEATURES_SUPPORTED_DEFAULT  \
+	(CEPH_FEATURE_NOSRCADDR)
+
+#define CEPH_FEATURES_REQUIRED_DEFAULT   \
+	(CEPH_FEATURE_NOSRCADDR)
+#endif
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index e81ab30..d021610 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -35,20 +35,6 @@
 /* arbitrary limit on max # of monitors (cluster of 3 is typical) */
 #define CEPH_MAX_MON   31
 
-
-/*
- * feature bits
- */
-#define CEPH_FEATURE_UID            (1<<0)
-#define CEPH_FEATURE_NOSRCADDR      (1<<1)
-#define CEPH_FEATURE_MONCLOCKCHECK  (1<<2)
-#define CEPH_FEATURE_FLOCK          (1<<3)
-#define CEPH_FEATURE_SUBSCRIBE2     (1<<4)
-#define CEPH_FEATURE_MONNAMES       (1<<5)
-#define CEPH_FEATURE_RECONNECT_SEQ  (1<<6)
-#define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
-
-
 /*
  * ceph_file_layout - describe data layout for a file/inode
  */
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 98ec36a..ea072e1 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -23,12 +23,6 @@
 #include "ceph_fs.h"
 
 /*
- * Supported features
- */
-#define CEPH_FEATURE_SUPPORTED_DEFAULT CEPH_FEATURE_NOSRCADDR
-#define CEPH_FEATURE_REQUIRED_DEFAULT  CEPH_FEATURE_NOSRCADDR
-
-/*
  * mount options
  */
 #define CEPH_OPT_FSID             (1<<0)
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 3b45e01..69e38db 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -17,6 +17,7 @@
 #include <linux/string.h>
 
 
+#include <linux/ceph/ceph_features.h>
 #include <linux/ceph/libceph.h>
 #include <linux/ceph/debugfs.h>
 #include <linux/ceph/decode.h>
@@ -460,9 +461,9 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
 	client->auth_err = 0;
 
 	client->extra_mon_dispatch = NULL;
-	client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT |
+	client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT |
 		supported_features;
-	client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT |
+	client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT |
 		required_features;
 
 	/* msgr */
-- 
1.7.9


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 2/9] libceph: support crush tunables
  2012-07-21  0:41 [PATCH 0/9] messenger fixups, batch #1 Sage Weil
  2012-07-21  0:41 ` [PATCH 1/9] libceph: move feature bits to separate header Sage Weil
@ 2012-07-21  0:41 ` Sage Weil
  2012-07-24 22:24   ` Yehuda Sadeh
  2012-07-30 18:36   ` Alex Elder
  2012-07-21  0:41 ` [PATCH 3/9] libceph: report socket read/write error message Sage Weil
                   ` (6 subsequent siblings)
  8 siblings, 2 replies; 30+ messages in thread
From: Sage Weil @ 2012-07-21  0:41 UTC (permalink / raw)
  To: ceph-devel; +Cc: caleb miles

From: caleb miles <caleb.miles@inktank.com>

The server side recently added support for tuning some magic
crush variables. Decode these variables if they are present, or use the
default values if they are not present.

Corresponds to ceph.git commit 89af369c25f274fe62ef730e5e8aad0c54f1e5a5.

Signed-off-by: caleb miles <caleb.miles@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 include/linux/ceph/ceph_features.h |    4 ++-
 include/linux/crush/crush.h        |    8 +++++++
 net/ceph/crush/mapper.c            |   13 ++++++-----
 net/ceph/osdmap.c                  |   39 ++++++++++++++++++++++++++++++++++++
 4 files changed, 57 insertions(+), 7 deletions(-)

diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index 342f93d..df25dcf 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -12,12 +12,14 @@
 #define CEPH_FEATURE_MONNAMES       (1<<5)
 #define CEPH_FEATURE_RECONNECT_SEQ  (1<<6)
 #define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
+#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
 
 /*
  * Features supported.
  */
 #define CEPH_FEATURES_SUPPORTED_DEFAULT  \
-	(CEPH_FEATURE_NOSRCADDR)
+	(CEPH_FEATURE_NOSRCADDR |	 \
+	 CEPH_FEATURE_CRUSH_TUNABLES)
 
 #define CEPH_FEATURES_REQUIRED_DEFAULT   \
 	(CEPH_FEATURE_NOSRCADDR)
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index 7c47508..25baa28 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -154,6 +154,14 @@ struct crush_map {
 	__s32 max_buckets;
 	__u32 max_rules;
 	__s32 max_devices;
+
+	/* choose local retries before re-descent */
+	__u32 choose_local_tries;
+	/* choose local attempts using a fallback permutation before
+	 * re-descent */
+	__u32 choose_local_fallback_tries;
+	/* choose attempts before giving up */ 
+	__u32 choose_total_tries;
 };
 
 
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index d7edc24..35fce75 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -306,7 +306,6 @@ static int crush_choose(const struct crush_map *map,
 	int item = 0;
 	int itemtype;
 	int collide, reject;
-	const unsigned int orig_tries = 5; /* attempts before we fall back to search */
 
 	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
 		bucket->id, x, outpos, numrep);
@@ -351,8 +350,9 @@ static int crush_choose(const struct crush_map *map,
 					reject = 1;
 					goto reject;
 				}
-				if (flocal >= (in->size>>1) &&
-				    flocal > orig_tries)
+				if (map->choose_local_fallback_tries > 0 &&
+				    flocal >= (in->size>>1) &&
+				    flocal > map->choose_local_fallback_tries)
 					item = bucket_perm_choose(in, x, r);
 				else
 					item = crush_bucket_choose(in, x, r);
@@ -422,13 +422,14 @@ reject:
 					ftotal++;
 					flocal++;
 
-					if (collide && flocal < 3)
+					if (collide && flocal <= map->choose_local_tries)
 						/* retry locally a few times */
 						retry_bucket = 1;
-					else if (flocal <= in->size + orig_tries)
+					else if (map->choose_local_fallback_tries > 0 &&
+						 flocal <= in->size + map->choose_local_fallback_tries)
 						/* exhaustive bucket search */
 						retry_bucket = 1;
-					else if (ftotal < 20)
+					else if (ftotal <= map->choose_total_tries)
 						/* then retry descent */
 						retry_descent = 1;
 					else
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 9600674..3124b71 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -135,6 +135,21 @@ bad:
 	return -EINVAL;
 }
 
+static int skip_name_map(void **p, void *end)
+{
+        int len;
+        ceph_decode_32_safe(p, end, len ,bad);
+        while (len--) {
+                int strlen;
+                *p += sizeof(u32);
+                ceph_decode_32_safe(p, end, strlen, bad);
+                *p += strlen;
+}
+        return 0;
+bad:
+        return -EINVAL;
+}
+
 static struct crush_map *crush_decode(void *pbyval, void *end)
 {
 	struct crush_map *c;
@@ -143,6 +158,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 	void **p = &pbyval;
 	void *start = pbyval;
 	u32 magic;
+	u32 num_name_maps;
 
 	dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
 
@@ -150,6 +166,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 	if (c == NULL)
 		return ERR_PTR(-ENOMEM);
 
+        /* set tunables to default values */
+        c->choose_local_tries = 2;
+        c->choose_local_fallback_tries = 5;
+        c->choose_total_tries = 19;
+
 	ceph_decode_need(p, end, 4*sizeof(u32), bad);
 	magic = ceph_decode_32(p);
 	if (magic != CRUSH_MAGIC) {
@@ -297,7 +318,25 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 	}
 
 	/* ignore trailing name maps. */
+        for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
+                err = skip_name_map(p, end);
+                if (err < 0)
+                        goto done;
+        }
+
+        /* tunables */
+        ceph_decode_need(p, end, 3*sizeof(u32), done);
+        c->choose_local_tries = ceph_decode_32(p);
+        c->choose_local_fallback_tries =  ceph_decode_32(p);
+        c->choose_total_tries = ceph_decode_32(p);
+        dout("crush decode tunable choose_local_tries = %d",
+             c->choose_local_tries);
+        dout("crush decode tunable choose_local_fallback_tries = %d",
+             c->choose_local_fallback_tries);
+        dout("crush decode tunable choose_total_tries = %d",
+             c->choose_total_tries);
 
+done:
 	dout("crush_decode success\n");
 	return c;
 
-- 
1.7.9


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 3/9] libceph: report socket read/write error message
  2012-07-21  0:41 [PATCH 0/9] messenger fixups, batch #1 Sage Weil
  2012-07-21  0:41 ` [PATCH 1/9] libceph: move feature bits to separate header Sage Weil
  2012-07-21  0:41 ` [PATCH 2/9] libceph: support crush tunables Sage Weil
@ 2012-07-21  0:41 ` Sage Weil
  2012-07-24 22:26   ` Yehuda Sadeh
  2012-07-30 18:37   ` Alex Elder
  2012-07-21  0:41 ` [PATCH 4/9] libceph: fix mutex coverage for ceph_con_close Sage Weil
                   ` (5 subsequent siblings)
  8 siblings, 2 replies; 30+ messages in thread
From: Sage Weil @ 2012-07-21  0:41 UTC (permalink / raw)
  To: ceph-devel; +Cc: Sage Weil

We need to set error_msg to something useful before calling ceph_fault();
do so here for try_{read,write}().  This is more informative than

libceph: osd0 192.168.106.220:6801 (null)

Signed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c |    8 ++++++--
 1 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 12419a0..7105908 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2291,14 +2291,18 @@ restart:
 	ret = try_read(con);
 	if (ret == -EAGAIN)
 		goto restart;
-	if (ret < 0)
+	if (ret < 0) {
+		con->error_msg = "socket error on read";
 		goto fault;
+	}
 
 	ret = try_write(con);
 	if (ret == -EAGAIN)
 		goto restart;
-	if (ret < 0)
+	if (ret < 0) {
+		con->error_msg = "socket error on write";
 		goto fault;
+	}
 
 done:
 	mutex_unlock(&con->mutex);
-- 
1.7.9


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 4/9] libceph: fix mutex coverage for ceph_con_close
  2012-07-21  0:41 [PATCH 0/9] messenger fixups, batch #1 Sage Weil
                   ` (2 preceding siblings ...)
  2012-07-21  0:41 ` [PATCH 3/9] libceph: report socket read/write error message Sage Weil
@ 2012-07-21  0:41 ` Sage Weil
  2012-07-24 22:29   ` Yehuda Sadeh
  2012-07-30 18:43   ` Alex Elder
  2012-07-21  0:41 ` [PATCH 5/9] libceph: resubmit linger ops when pg mapping changes Sage Weil
                   ` (4 subsequent siblings)
  8 siblings, 2 replies; 30+ messages in thread
From: Sage Weil @ 2012-07-21  0:41 UTC (permalink / raw)
  To: ceph-devel; +Cc: Sage Weil

Hold the mutex while twiddling all of the state bits to avoid possible
races.  While we're here, make not of why we cannot close the socket
directly.

Signed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c |    8 +++++++-
 1 files changed, 7 insertions(+), 1 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 7105908..e24310e 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -503,6 +503,7 @@ static void reset_connection(struct ceph_connection *con)
  */
 void ceph_con_close(struct ceph_connection *con)
 {
+	mutex_lock(&con->mutex);
 	dout("con_close %p peer %s\n", con,
 	     ceph_pr_addr(&con->peer_addr.in_addr));
 	clear_bit(NEGOTIATING, &con->state);
@@ -515,11 +516,16 @@ void ceph_con_close(struct ceph_connection *con)
 	clear_bit(KEEPALIVE_PENDING, &con->flags);
 	clear_bit(WRITE_PENDING, &con->flags);
 
-	mutex_lock(&con->mutex);
 	reset_connection(con);
 	con->peer_global_seq = 0;
 	cancel_delayed_work(&con->work);
 	mutex_unlock(&con->mutex);
+
+	/*
+	 * We cannot close the socket directly from here because the
+	 * work threads use it without holding the mutex.  Instead, let
+	 * con_work() do it.
+	 */
 	queue_con(con);
 }
 EXPORT_SYMBOL(ceph_con_close);
-- 
1.7.9


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 5/9] libceph: resubmit linger ops when pg mapping changes
  2012-07-21  0:41 [PATCH 0/9] messenger fixups, batch #1 Sage Weil
                   ` (3 preceding siblings ...)
  2012-07-21  0:41 ` [PATCH 4/9] libceph: fix mutex coverage for ceph_con_close Sage Weil
@ 2012-07-21  0:41 ` Sage Weil
  2012-07-24 22:51   ` Yehuda Sadeh
  2012-07-30 22:40   ` Alex Elder
  2012-07-21  0:41 ` [PATCH 6/9] libceph: (re)initialize bio_iter on start of message receive Sage Weil
                   ` (3 subsequent siblings)
  8 siblings, 2 replies; 30+ messages in thread
From: Sage Weil @ 2012-07-21  0:41 UTC (permalink / raw)
  To: ceph-devel; +Cc: Sage Weil

The linger op registration (i.e., watch) modifies the object state.  As
such, the OSD will reply with success if it has already applied without
doing the associated side-effects (setting up the watch session state).
If we lose the ACK and resubmit, we will see success but the watch will not
be correctly registered and we won't get notifies.

To fix this, always resubmit the linger op with a new tid.  We accomplish
this by re-registering as a linger (i.e., 'registered') if we are not yet
registered.  Then the second loop will treat this just like a normal
case of re-registering.

This mirrors a similar fix on the userland ceph.git, commit 5dd68b95, and
ceph bug #2796.

Signed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/osd_client.c |   26 +++++++++++++++++++++-----
 1 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 07920ca..c605705 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -891,7 +891,9 @@ static void __register_linger_request(struct ceph_osd_client *osdc,
 {
 	dout("__register_linger_request %p\n", req);
 	list_add_tail(&req->r_linger_item, &osdc->req_linger);
-	list_add_tail(&req->r_linger_osd, &req->r_osd->o_linger_requests);
+	if (req->r_osd)
+		list_add_tail(&req->r_linger_osd,
+			      &req->r_osd->o_linger_requests);
 }
 
 static void __unregister_linger_request(struct ceph_osd_client *osdc,
@@ -1305,8 +1307,9 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
 
 	dout("kick_requests %s\n", force_resend ? " (force resend)" : "");
 	mutex_lock(&osdc->request_mutex);
-	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+	for (p = rb_first(&osdc->requests); p; ) {
 		req = rb_entry(p, struct ceph_osd_request, r_node);
+		p = rb_next(p);
 		err = __map_request(osdc, req, force_resend);
 		if (err < 0)
 			continue;  /* error */
@@ -1314,10 +1317,23 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
 			dout("%p tid %llu maps to no osd\n", req, req->r_tid);
 			needmap++;  /* request a newer map */
 		} else if (err > 0) {
-			dout("%p tid %llu requeued on osd%d\n", req, req->r_tid,
-			     req->r_osd ? req->r_osd->o_osd : -1);
-			if (!req->r_linger)
+			if (!req->r_linger) {
+				dout("%p tid %llu requeued on osd%d\n", req,
+				     req->r_tid,
+				     req->r_osd ? req->r_osd->o_osd : -1);
 				req->r_flags |= CEPH_OSD_FLAG_RETRY;
+			}
+		}
+		if (req->r_linger && list_empty(&req->r_linger_item)) {
+			/*
+			 * register as a linger so that we will
+			 * re-submit below and get a new tid
+			 */
+			dout("%p tid %llu restart on osd%d\n",
+			     req, req->r_tid,
+			     req->r_osd ? req->r_osd->o_osd : -1);
+			__register_linger_request(osdc, req);
+			__unregister_request(osdc, req);
 		}
 	}
 
-- 
1.7.9


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 6/9] libceph: (re)initialize bio_iter on start of message receive
  2012-07-21  0:41 [PATCH 0/9] messenger fixups, batch #1 Sage Weil
                   ` (4 preceding siblings ...)
  2012-07-21  0:41 ` [PATCH 5/9] libceph: resubmit linger ops when pg mapping changes Sage Weil
@ 2012-07-21  0:41 ` Sage Weil
  2012-07-24 22:55   ` Yehuda Sadeh
  2012-07-30 19:04   ` Alex Elder
  2012-07-21  0:41 ` [PATCH 7/9] ceph: close old con before reopening on mds reconnect Sage Weil
                   ` (2 subsequent siblings)
  8 siblings, 2 replies; 30+ messages in thread
From: Sage Weil @ 2012-07-21  0:41 UTC (permalink / raw)
  To: ceph-devel; +Cc: Sage Weil

Previously, we were opportunistically initializing the bio_iter if it
appeared to be uninitialized in the middle of the read path.  The problem
is that a sequence like:

 - start reading message
 - initialize bio_iter
 - read half a message
 - messenger fault, reconnect
 - restart reading message
 - ** bio_iter now non-NULL, not reinitialized **
 - read past end of bio, crash

Instead, initialize the bio_iter unconditionally when we allocate/claim
the message for read.

Signed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c |   11 ++++++-----
 1 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index e24310e..efa369f 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1876,6 +1876,11 @@ static int read_partial_message(struct ceph_connection *con)
 		else
 			con->in_msg_pos.page_pos = 0;
 		con->in_msg_pos.data_pos = 0;
+
+#ifdef CONFIG_BLOCK
+		if (m->bio)
+			init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
+#endif
 	}
 
 	/* front */
@@ -1892,10 +1897,6 @@ static int read_partial_message(struct ceph_connection *con)
 		if (ret <= 0)
 			return ret;
 	}
-#ifdef CONFIG_BLOCK
-	if (m->bio && !m->bio_iter)
-		init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
-#endif
 
 	/* (page) data */
 	while (con->in_msg_pos.data_pos < data_len) {
@@ -1906,7 +1907,7 @@ static int read_partial_message(struct ceph_connection *con)
 				return ret;
 #ifdef CONFIG_BLOCK
 		} else if (m->bio) {
-
+			BUG_ON(!m->bio_iter);
 			ret = read_partial_message_bio(con,
 						 &m->bio_iter, &m->bio_seg,
 						 data_len, do_datacrc);
-- 
1.7.9


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 7/9] ceph: close old con before reopening on mds reconnect
  2012-07-21  0:41 [PATCH 0/9] messenger fixups, batch #1 Sage Weil
                   ` (5 preceding siblings ...)
  2012-07-21  0:41 ` [PATCH 6/9] libceph: (re)initialize bio_iter on start of message receive Sage Weil
@ 2012-07-21  0:41 ` Sage Weil
  2012-07-24 22:56   ` Yehuda Sadeh
  2012-07-21  0:41 ` [PATCH 8/9] libceph: protect ceph_con_open() with mutex Sage Weil
  2012-07-21  0:41 ` [PATCH 9/9] libceph: reset connection retry on successfully negotiation Sage Weil
  8 siblings, 1 reply; 30+ messages in thread
From: Sage Weil @ 2012-07-21  0:41 UTC (permalink / raw)
  To: ceph-devel; +Cc: Sage Weil

When we detect a mds session reset, close the old ceph_connection before
reopening it.  This ensures we clean up the old socket properly and keep
the ceph_connection state correct.

Signed-off-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/mds_client.c |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 39b76d6..a5a7354 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2518,6 +2518,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
 	session->s_state = CEPH_MDS_SESSION_RECONNECTING;
 	session->s_seq = 0;
 
+	ceph_con_close(&session->s_con);
 	ceph_con_open(&session->s_con,
 		      CEPH_ENTITY_TYPE_MDS, mds,
 		      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
-- 
1.7.9


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 8/9] libceph: protect ceph_con_open() with mutex
  2012-07-21  0:41 [PATCH 0/9] messenger fixups, batch #1 Sage Weil
                   ` (6 preceding siblings ...)
  2012-07-21  0:41 ` [PATCH 7/9] ceph: close old con before reopening on mds reconnect Sage Weil
@ 2012-07-21  0:41 ` Sage Weil
  2012-07-24 22:58   ` Yehuda Sadeh
  2012-07-30 19:06   ` Alex Elder
  2012-07-21  0:41 ` [PATCH 9/9] libceph: reset connection retry on successfully negotiation Sage Weil
  8 siblings, 2 replies; 30+ messages in thread
From: Sage Weil @ 2012-07-21  0:41 UTC (permalink / raw)
  To: ceph-devel; +Cc: Sage Weil

Take the con mutex while we are initiating a ceph open.  This is necessary
because the may have previously been in use and then closed, which could
result in a racing workqueue running con_work().

Signed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index efa369f..65964c2 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -537,6 +537,7 @@ void ceph_con_open(struct ceph_connection *con,
 		   __u8 entity_type, __u64 entity_num,
 		   struct ceph_entity_addr *addr)
 {
+	mutex_lock(&con->mutex);
 	dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
 	set_bit(OPENING, &con->state);
 	WARN_ON(!test_and_clear_bit(CLOSED, &con->state));
@@ -546,6 +547,7 @@ void ceph_con_open(struct ceph_connection *con,
 
 	memcpy(&con->peer_addr, addr, sizeof(*addr));
 	con->delay = 0;      /* reset backoff memory */
+	mutex_unlock(&con->mutex);
 	queue_con(con);
 }
 EXPORT_SYMBOL(ceph_con_open);
-- 
1.7.9


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 9/9] libceph: reset connection retry on successfully negotiation
  2012-07-21  0:41 [PATCH 0/9] messenger fixups, batch #1 Sage Weil
                   ` (7 preceding siblings ...)
  2012-07-21  0:41 ` [PATCH 8/9] libceph: protect ceph_con_open() with mutex Sage Weil
@ 2012-07-21  0:41 ` Sage Weil
  2012-07-24 23:00   ` Yehuda Sadeh
  8 siblings, 1 reply; 30+ messages in thread
From: Sage Weil @ 2012-07-21  0:41 UTC (permalink / raw)
  To: ceph-devel; +Cc: Sage Weil

We exponentially back off when we encounter connection errors.  If several
errors accumulate, we will eventually wait ages before even trying to
reconnect.

Fix this by resetting the backoff counter after a successful negotiation/
connection with the remote node.  Fixes ceph issue #2802.

Signed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 65964c2..28896eb 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1633,6 +1633,8 @@ static int process_connect(struct ceph_connection *con)
 		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
 			set_bit(LOSSYTX, &con->flags);
 
+		con->delay = 0;      /* reset backoff memory */
+
 		prepare_read_tag(con);
 		break;
 
-- 
1.7.9


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH 1/9] libceph: move feature bits to separate header
  2012-07-21  0:41 ` [PATCH 1/9] libceph: move feature bits to separate header Sage Weil
@ 2012-07-24 22:14   ` Yehuda Sadeh
  2012-07-30 18:29   ` Alex Elder
  1 sibling, 0 replies; 30+ messages in thread
From: Yehuda Sadeh @ 2012-07-24 22:14 UTC (permalink / raw)
  To: Sage Weil; +Cc: ceph-devel

Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>

On Fri, Jul 20, 2012 at 5:41 PM, Sage Weil <sage@inktank.com> wrote:
> This is simply cleanup that will keep things more closely synced with the
> userland code.
>
> Signed-off-by: Sage Weil <sage@inktank.com>
> ---
>  fs/ceph/mds_client.c               |    1 +
>  fs/ceph/super.c                    |    1 +
>  include/linux/ceph/ceph_features.h |   24 ++++++++++++++++++++++++
>  include/linux/ceph/ceph_fs.h       |   14 --------------
>  include/linux/ceph/libceph.h       |    6 ------
>  net/ceph/ceph_common.c             |    5 +++--
>  6 files changed, 29 insertions(+), 22 deletions(-)
>  create mode 100644 include/linux/ceph/ceph_features.h
>
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 418f6a8..39b76d6 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -10,6 +10,7 @@
>  #include "super.h"
>  #include "mds_client.h"
>
> +#include <linux/ceph/ceph_features.h>
>  #include <linux/ceph/messenger.h>
>  #include <linux/ceph/decode.h>
>  #include <linux/ceph/pagelist.h>
> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> index 1e67dd7..2c47ecf 100644
> --- a/fs/ceph/super.c
> +++ b/fs/ceph/super.c
> @@ -18,6 +18,7 @@
>  #include "super.h"
>  #include "mds_client.h"
>
> +#include <linux/ceph/ceph_features.h>
>  #include <linux/ceph/decode.h>
>  #include <linux/ceph/mon_client.h>
>  #include <linux/ceph/auth.h>
> diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
> new file mode 100644
> index 0000000..342f93d
> --- /dev/null
> +++ b/include/linux/ceph/ceph_features.h
> @@ -0,0 +1,24 @@
> +#ifndef __CEPH_FEATURES
> +#define __CEPH_FEATURES
> +
> +/*
> + * feature bits
> + */
> +#define CEPH_FEATURE_UID            (1<<0)
> +#define CEPH_FEATURE_NOSRCADDR      (1<<1)
> +#define CEPH_FEATURE_MONCLOCKCHECK  (1<<2)
> +#define CEPH_FEATURE_FLOCK          (1<<3)
> +#define CEPH_FEATURE_SUBSCRIBE2     (1<<4)
> +#define CEPH_FEATURE_MONNAMES       (1<<5)
> +#define CEPH_FEATURE_RECONNECT_SEQ  (1<<6)
> +#define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
> +
> +/*
> + * Features supported.
> + */
> +#define CEPH_FEATURES_SUPPORTED_DEFAULT  \
> +       (CEPH_FEATURE_NOSRCADDR)
> +
> +#define CEPH_FEATURES_REQUIRED_DEFAULT   \
> +       (CEPH_FEATURE_NOSRCADDR)
> +#endif
> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> index e81ab30..d021610 100644
> --- a/include/linux/ceph/ceph_fs.h
> +++ b/include/linux/ceph/ceph_fs.h
> @@ -35,20 +35,6 @@
>  /* arbitrary limit on max # of monitors (cluster of 3 is typical) */
>  #define CEPH_MAX_MON   31
>
> -
> -/*
> - * feature bits
> - */
> -#define CEPH_FEATURE_UID            (1<<0)
> -#define CEPH_FEATURE_NOSRCADDR      (1<<1)
> -#define CEPH_FEATURE_MONCLOCKCHECK  (1<<2)
> -#define CEPH_FEATURE_FLOCK          (1<<3)
> -#define CEPH_FEATURE_SUBSCRIBE2     (1<<4)
> -#define CEPH_FEATURE_MONNAMES       (1<<5)
> -#define CEPH_FEATURE_RECONNECT_SEQ  (1<<6)
> -#define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
> -
> -
>  /*
>   * ceph_file_layout - describe data layout for a file/inode
>   */
> diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
> index 98ec36a..ea072e1 100644
> --- a/include/linux/ceph/libceph.h
> +++ b/include/linux/ceph/libceph.h
> @@ -23,12 +23,6 @@
>  #include "ceph_fs.h"
>
>  /*
> - * Supported features
> - */
> -#define CEPH_FEATURE_SUPPORTED_DEFAULT CEPH_FEATURE_NOSRCADDR
> -#define CEPH_FEATURE_REQUIRED_DEFAULT  CEPH_FEATURE_NOSRCADDR
> -
> -/*
>   * mount options
>   */
>  #define CEPH_OPT_FSID             (1<<0)
> diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
> index 3b45e01..69e38db 100644
> --- a/net/ceph/ceph_common.c
> +++ b/net/ceph/ceph_common.c
> @@ -17,6 +17,7 @@
>  #include <linux/string.h>
>
>
> +#include <linux/ceph/ceph_features.h>
>  #include <linux/ceph/libceph.h>
>  #include <linux/ceph/debugfs.h>
>  #include <linux/ceph/decode.h>
> @@ -460,9 +461,9 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
>         client->auth_err = 0;
>
>         client->extra_mon_dispatch = NULL;
> -       client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT |
> +       client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT |
>                 supported_features;
> -       client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT |
> +       client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT |
>                 required_features;
>
>         /* msgr */
> --
> 1.7.9
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 2/9] libceph: support crush tunables
  2012-07-21  0:41 ` [PATCH 2/9] libceph: support crush tunables Sage Weil
@ 2012-07-24 22:24   ` Yehuda Sadeh
  2012-07-30 23:14     ` Sage Weil
  2012-07-30 18:36   ` Alex Elder
  1 sibling, 1 reply; 30+ messages in thread
From: Yehuda Sadeh @ 2012-07-24 22:24 UTC (permalink / raw)
  To: Sage Weil; +Cc: ceph-devel, caleb miles

On Fri, Jul 20, 2012 at 5:41 PM, Sage Weil <sage@inktank.com> wrote:
> From: caleb miles <caleb.miles@inktank.com>
>
> The server side recently added support for tuning some magic
> crush variables. Decode these variables if they are present, or use the
> default values if they are not present.
>
> Corresponds to ceph.git commit 89af369c25f274fe62ef730e5e8aad0c54f1e5a5.
>
> Signed-off-by: caleb miles <caleb.miles@inktank.com>
> Reviewed-by: Sage Weil <sage@inktank.com>
> ---
>  include/linux/ceph/ceph_features.h |    4 ++-
>  include/linux/crush/crush.h        |    8 +++++++
>  net/ceph/crush/mapper.c            |   13 ++++++-----
>  net/ceph/osdmap.c                  |   39 ++++++++++++++++++++++++++++++++++++
>  4 files changed, 57 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
> index 342f93d..df25dcf 100644
> --- a/include/linux/ceph/ceph_features.h
> +++ b/include/linux/ceph/ceph_features.h
> @@ -12,12 +12,14 @@
>  #define CEPH_FEATURE_MONNAMES       (1<<5)
>  #define CEPH_FEATURE_RECONNECT_SEQ  (1<<6)
>  #define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
> +#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)

any reason why this is 18 and not 8?

>
>  /*
>   * Features supported.
>   */
>  #define CEPH_FEATURES_SUPPORTED_DEFAULT  \
> -       (CEPH_FEATURE_NOSRCADDR)
> +       (CEPH_FEATURE_NOSRCADDR |        \
> +        CEPH_FEATURE_CRUSH_TUNABLES)
>
>  #define CEPH_FEATURES_REQUIRED_DEFAULT   \
>         (CEPH_FEATURE_NOSRCADDR)
> diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
> index 7c47508..25baa28 100644
> --- a/include/linux/crush/crush.h
> +++ b/include/linux/crush/crush.h
> @@ -154,6 +154,14 @@ struct crush_map {
>         __s32 max_buckets;
>         __u32 max_rules;
>         __s32 max_devices;
> +
> +       /* choose local retries before re-descent */
> +       __u32 choose_local_tries;
> +       /* choose local attempts using a fallback permutation before
> +        * re-descent */
> +       __u32 choose_local_fallback_tries;
> +       /* choose attempts before giving up */
> +       __u32 choose_total_tries;
>  };
>
>
> diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
> index d7edc24..35fce75 100644
> --- a/net/ceph/crush/mapper.c
> +++ b/net/ceph/crush/mapper.c
> @@ -306,7 +306,6 @@ static int crush_choose(const struct crush_map *map,
>         int item = 0;
>         int itemtype;
>         int collide, reject;
> -       const unsigned int orig_tries = 5; /* attempts before we fall back to search */
>
>         dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
>                 bucket->id, x, outpos, numrep);
> @@ -351,8 +350,9 @@ static int crush_choose(const struct crush_map *map,
>                                         reject = 1;
>                                         goto reject;
>                                 }
> -                               if (flocal >= (in->size>>1) &&
> -                                   flocal > orig_tries)
> +                               if (map->choose_local_fallback_tries > 0 &&
> +                                   flocal >= (in->size>>1) &&
> +                                   flocal > map->choose_local_fallback_tries)

is flocal right here or should it be ftotal?

>                                         item = bucket_perm_choose(in, x, r);
>                                 else
>                                         item = crush_bucket_choose(in, x, r);
> @@ -422,13 +422,14 @@ reject:
>                                         ftotal++;
>                                         flocal++;
>
> -                                       if (collide && flocal < 3)
> +                                       if (collide && flocal <= map->choose_local_tries)
>                                                 /* retry locally a few times */
>                                                 retry_bucket = 1;
> -                                       else if (flocal <= in->size + orig_tries)
> +                                       else if (map->choose_local_fallback_tries > 0 &&
> +                                                flocal <= in->size + map->choose_local_fallback_tries)
>                                                 /* exhaustive bucket search */
>                                                 retry_bucket = 1;
> -                                       else if (ftotal < 20)
> +                                       else if (ftotal <= map->choose_total_tries)
>                                                 /* then retry descent */
>                                                 retry_descent = 1;
>                                         else
> diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
> index 9600674..3124b71 100644
> --- a/net/ceph/osdmap.c
> +++ b/net/ceph/osdmap.c
> @@ -135,6 +135,21 @@ bad:
>         return -EINVAL;
>  }
>
> +static int skip_name_map(void **p, void *end)
> +{
> +        int len;
> +        ceph_decode_32_safe(p, end, len ,bad);
> +        while (len--) {
> +                int strlen;
use u32 for strlen

> +                *p += sizeof(u32);
> +                ceph_decode_32_safe(p, end, strlen, bad);
> +                *p += strlen;
> +}
> +        return 0;
> +bad:
> +        return -EINVAL;
> +}
> +
>  static struct crush_map *crush_decode(void *pbyval, void *end)
>  {
>         struct crush_map *c;
> @@ -143,6 +158,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
>         void **p = &pbyval;
>         void *start = pbyval;
>         u32 magic;
> +       u32 num_name_maps;
>
>         dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
>
> @@ -150,6 +166,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
>         if (c == NULL)
>                 return ERR_PTR(-ENOMEM);
>
> +        /* set tunables to default values */
> +        c->choose_local_tries = 2;
> +        c->choose_local_fallback_tries = 5;
> +        c->choose_total_tries = 19;
> +
>         ceph_decode_need(p, end, 4*sizeof(u32), bad);
>         magic = ceph_decode_32(p);
>         if (magic != CRUSH_MAGIC) {
> @@ -297,7 +318,25 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
>         }
>
>         /* ignore trailing name maps. */
> +        for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
> +                err = skip_name_map(p, end);
> +                if (err < 0)
> +                        goto done;
> +        }
> +
> +        /* tunables */
> +        ceph_decode_need(p, end, 3*sizeof(u32), done);
> +        c->choose_local_tries = ceph_decode_32(p);
> +        c->choose_local_fallback_tries =  ceph_decode_32(p);
> +        c->choose_total_tries = ceph_decode_32(p);
> +        dout("crush decode tunable choose_local_tries = %d",
> +             c->choose_local_tries);
> +        dout("crush decode tunable choose_local_fallback_tries = %d",
> +             c->choose_local_fallback_tries);
> +        dout("crush decode tunable choose_total_tries = %d",
> +             c->choose_total_tries);
>
> +done:
>         dout("crush_decode success\n");
>         return c;
>
> --
> 1.7.9
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/9] libceph: report socket read/write error message
  2012-07-21  0:41 ` [PATCH 3/9] libceph: report socket read/write error message Sage Weil
@ 2012-07-24 22:26   ` Yehuda Sadeh
  2012-07-30 18:37   ` Alex Elder
  1 sibling, 0 replies; 30+ messages in thread
From: Yehuda Sadeh @ 2012-07-24 22:26 UTC (permalink / raw)
  To: Sage Weil; +Cc: ceph-devel

Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>

On Fri, Jul 20, 2012 at 5:41 PM, Sage Weil <sage@inktank.com> wrote:
> We need to set error_msg to something useful before calling ceph_fault();
> do so here for try_{read,write}().  This is more informative than
>
> libceph: osd0 192.168.106.220:6801 (null)
>
> Signed-off-by: Sage Weil <sage@inktank.com>
> ---
>  net/ceph/messenger.c |    8 ++++++--
>  1 files changed, 6 insertions(+), 2 deletions(-)
>
> diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
> index 12419a0..7105908 100644
> --- a/net/ceph/messenger.c
> +++ b/net/ceph/messenger.c
> @@ -2291,14 +2291,18 @@ restart:
>         ret = try_read(con);
>         if (ret == -EAGAIN)
>                 goto restart;
> -       if (ret < 0)
> +       if (ret < 0) {
> +               con->error_msg = "socket error on read";
>                 goto fault;
> +       }
>
>         ret = try_write(con);
>         if (ret == -EAGAIN)
>                 goto restart;
> -       if (ret < 0)
> +       if (ret < 0) {
> +               con->error_msg = "socket error on write";
>                 goto fault;
> +       }
>
>  done:
>         mutex_unlock(&con->mutex);
> --
> 1.7.9
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 4/9] libceph: fix mutex coverage for ceph_con_close
  2012-07-21  0:41 ` [PATCH 4/9] libceph: fix mutex coverage for ceph_con_close Sage Weil
@ 2012-07-24 22:29   ` Yehuda Sadeh
  2012-07-30 18:43   ` Alex Elder
  1 sibling, 0 replies; 30+ messages in thread
From: Yehuda Sadeh @ 2012-07-24 22:29 UTC (permalink / raw)
  To: Sage Weil; +Cc: ceph-devel

Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>

On Fri, Jul 20, 2012 at 5:41 PM, Sage Weil <sage@inktank.com> wrote:
> Hold the mutex while twiddling all of the state bits to avoid possible
> races.  While we're here, make not of why we cannot close the socket
> directly.
>
> Signed-off-by: Sage Weil <sage@inktank.com>
> ---
>  net/ceph/messenger.c |    8 +++++++-
>  1 files changed, 7 insertions(+), 1 deletions(-)
>
> diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
> index 7105908..e24310e 100644
> --- a/net/ceph/messenger.c
> +++ b/net/ceph/messenger.c
> @@ -503,6 +503,7 @@ static void reset_connection(struct ceph_connection *con)
>   */
>  void ceph_con_close(struct ceph_connection *con)
>  {
> +       mutex_lock(&con->mutex);
>         dout("con_close %p peer %s\n", con,
>              ceph_pr_addr(&con->peer_addr.in_addr));
>         clear_bit(NEGOTIATING, &con->state);
> @@ -515,11 +516,16 @@ void ceph_con_close(struct ceph_connection *con)
>         clear_bit(KEEPALIVE_PENDING, &con->flags);
>         clear_bit(WRITE_PENDING, &con->flags);
>
> -       mutex_lock(&con->mutex);
>         reset_connection(con);
>         con->peer_global_seq = 0;
>         cancel_delayed_work(&con->work);
>         mutex_unlock(&con->mutex);
> +
> +       /*
> +        * We cannot close the socket directly from here because the
> +        * work threads use it without holding the mutex.  Instead, let
> +        * con_work() do it.
> +        */
>         queue_con(con);
>  }
>  EXPORT_SYMBOL(ceph_con_close);
> --
> 1.7.9
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 5/9] libceph: resubmit linger ops when pg mapping changes
  2012-07-21  0:41 ` [PATCH 5/9] libceph: resubmit linger ops when pg mapping changes Sage Weil
@ 2012-07-24 22:51   ` Yehuda Sadeh
  2012-07-30 22:40   ` Alex Elder
  1 sibling, 0 replies; 30+ messages in thread
From: Yehuda Sadeh @ 2012-07-24 22:51 UTC (permalink / raw)
  To: Sage Weil; +Cc: ceph-devel

On Fri, Jul 20, 2012 at 5:41 PM, Sage Weil <sage@inktank.com> wrote:
> The linger op registration (i.e., watch) modifies the object state.  As
> such, the OSD will reply with success if it has already applied without
> doing the associated side-effects (setting up the watch session state).
> If we lose the ACK and resubmit, we will see success but the watch will not
> be correctly registered and we won't get notifies.
>
> To fix this, always resubmit the linger op with a new tid.  We accomplish
> this by re-registering as a linger (i.e., 'registered') if we are not yet
> registered.  Then the second loop will treat this just like a normal
> case of re-registering.
>
> This mirrors a similar fix on the userland ceph.git, commit 5dd68b95, and
> ceph bug #2796.
>
> Signed-off-by: Sage Weil <sage@inktank.com>
> ---
>  net/ceph/osd_client.c |   26 +++++++++++++++++++++-----
>  1 files changed, 21 insertions(+), 5 deletions(-)
>
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 07920ca..c605705 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -891,7 +891,9 @@ static void __register_linger_request(struct ceph_osd_client *osdc,
>  {
>         dout("__register_linger_request %p\n", req);
>         list_add_tail(&req->r_linger_item, &osdc->req_linger);
> -       list_add_tail(&req->r_linger_osd, &req->r_osd->o_linger_requests);
> +       if (req->r_osd)
> +               list_add_tail(&req->r_linger_osd,
> +                             &req->r_osd->o_linger_requests);
>  }
>
>  static void __unregister_linger_request(struct ceph_osd_client *osdc,
> @@ -1305,8 +1307,9 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
>
>         dout("kick_requests %s\n", force_resend ? " (force resend)" : "");
>         mutex_lock(&osdc->request_mutex);
> -       for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
> +       for (p = rb_first(&osdc->requests); p; ) {
>                 req = rb_entry(p, struct ceph_osd_request, r_node);
> +               p = rb_next(p);

It is not clear why you moved p = rb_next(p) from up there to here.

>                 err = __map_request(osdc, req, force_resend);
>                 if (err < 0)
>                         continue;  /* error */
> @@ -1314,10 +1317,23 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
>                         dout("%p tid %llu maps to no osd\n", req, req->r_tid);
>                         needmap++;  /* request a newer map */
>                 } else if (err > 0) {
> -                       dout("%p tid %llu requeued on osd%d\n", req, req->r_tid,
> -                            req->r_osd ? req->r_osd->o_osd : -1);
> -                       if (!req->r_linger)
> +                       if (!req->r_linger) {
> +                               dout("%p tid %llu requeued on osd%d\n", req,
> +                                    req->r_tid,
> +                                    req->r_osd ? req->r_osd->o_osd : -1);

I think we should create a helper for this, the code is cluttered with
all these tests:

static inline int osd_num(struct ceph_osd *osd)
{
  return (osd ? osd->o_osd : -1);
}

We can do it later.

>                                 req->r_flags |= CEPH_OSD_FLAG_RETRY;
> +                       }
> +               }
> +               if (req->r_linger && list_empty(&req->r_linger_item)) {
> +                       /*
> +                        * register as a linger so that we will
> +                        * re-submit below and get a new tid
> +                        */
> +                       dout("%p tid %llu restart on osd%d\n",
> +                            req, req->r_tid,
> +                            req->r_osd ? req->r_osd->o_osd : -1);
> +                       __register_linger_request(osdc, req);
> +                       __unregister_request(osdc, req);
>                 }
>         }
>
> --
> 1.7.9
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 6/9] libceph: (re)initialize bio_iter on start of message receive
  2012-07-21  0:41 ` [PATCH 6/9] libceph: (re)initialize bio_iter on start of message receive Sage Weil
@ 2012-07-24 22:55   ` Yehuda Sadeh
  2012-07-30 19:04   ` Alex Elder
  1 sibling, 0 replies; 30+ messages in thread
From: Yehuda Sadeh @ 2012-07-24 22:55 UTC (permalink / raw)
  To: Sage Weil; +Cc: ceph-devel

Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>

On Fri, Jul 20, 2012 at 5:41 PM, Sage Weil <sage@inktank.com> wrote:
> Previously, we were opportunistically initializing the bio_iter if it
> appeared to be uninitialized in the middle of the read path.  The problem
> is that a sequence like:
>
>  - start reading message
>  - initialize bio_iter
>  - read half a message
>  - messenger fault, reconnect
>  - restart reading message
>  - ** bio_iter now non-NULL, not reinitialized **
>  - read past end of bio, crash
>
> Instead, initialize the bio_iter unconditionally when we allocate/claim
> the message for read.
>
> Signed-off-by: Sage Weil <sage@inktank.com>
> ---
>  net/ceph/messenger.c |   11 ++++++-----
>  1 files changed, 6 insertions(+), 5 deletions(-)
>
> diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
> index e24310e..efa369f 100644
> --- a/net/ceph/messenger.c
> +++ b/net/ceph/messenger.c
> @@ -1876,6 +1876,11 @@ static int read_partial_message(struct ceph_connection *con)
>                 else
>                         con->in_msg_pos.page_pos = 0;
>                 con->in_msg_pos.data_pos = 0;
> +
> +#ifdef CONFIG_BLOCK
> +               if (m->bio)
> +                       init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
> +#endif
>         }
>
>         /* front */
> @@ -1892,10 +1897,6 @@ static int read_partial_message(struct ceph_connection *con)
>                 if (ret <= 0)
>                         return ret;
>         }
> -#ifdef CONFIG_BLOCK
> -       if (m->bio && !m->bio_iter)
> -               init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
> -#endif
>
>         /* (page) data */
>         while (con->in_msg_pos.data_pos < data_len) {
> @@ -1906,7 +1907,7 @@ static int read_partial_message(struct ceph_connection *con)
>                                 return ret;
>  #ifdef CONFIG_BLOCK
>                 } else if (m->bio) {
> -
> +                       BUG_ON(!m->bio_iter);
>                         ret = read_partial_message_bio(con,
>                                                  &m->bio_iter, &m->bio_seg,
>                                                  data_len, do_datacrc);
> --
> 1.7.9
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 7/9] ceph: close old con before reopening on mds reconnect
  2012-07-21  0:41 ` [PATCH 7/9] ceph: close old con before reopening on mds reconnect Sage Weil
@ 2012-07-24 22:56   ` Yehuda Sadeh
  2012-07-30 23:11     ` Sage Weil
  0 siblings, 1 reply; 30+ messages in thread
From: Yehuda Sadeh @ 2012-07-24 22:56 UTC (permalink / raw)
  To: Sage Weil; +Cc: ceph-devel

On Fri, Jul 20, 2012 at 5:41 PM, Sage Weil <sage@inktank.com> wrote:
> When we detect a mds session reset, close the old ceph_connection before
> reopening it.  This ensures we clean up the old socket properly and keep
> the ceph_connection state correct.
>
> Signed-off-by: Sage Weil <sage@inktank.com>
> ---
>  fs/ceph/mds_client.c |    1 +
>  1 files changed, 1 insertions(+), 0 deletions(-)
>
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 39b76d6..a5a7354 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -2518,6 +2518,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
>         session->s_state = CEPH_MDS_SESSION_RECONNECTING;
>         session->s_seq = 0;
>
> +       ceph_con_close(&session->s_con);
>         ceph_con_open(&session->s_con,

Should we also BUG_ON in ceph_con_open if connection is not closed?

>                       CEPH_ENTITY_TYPE_MDS, mds,
>                       ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
> --
> 1.7.9
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 8/9] libceph: protect ceph_con_open() with mutex
  2012-07-21  0:41 ` [PATCH 8/9] libceph: protect ceph_con_open() with mutex Sage Weil
@ 2012-07-24 22:58   ` Yehuda Sadeh
  2012-07-30 19:06   ` Alex Elder
  1 sibling, 0 replies; 30+ messages in thread
From: Yehuda Sadeh @ 2012-07-24 22:58 UTC (permalink / raw)
  To: Sage Weil; +Cc: ceph-devel

Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>

On Fri, Jul 20, 2012 at 5:41 PM, Sage Weil <sage@inktank.com> wrote:
> Take the con mutex while we are initiating a ceph open.  This is necessary
> because the may have previously been in use and then closed, which could
> result in a racing workqueue running con_work().
>
> Signed-off-by: Sage Weil <sage@inktank.com>
> ---
>  net/ceph/messenger.c |    2 ++
>  1 files changed, 2 insertions(+), 0 deletions(-)
>
> diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
> index efa369f..65964c2 100644
> --- a/net/ceph/messenger.c
> +++ b/net/ceph/messenger.c
> @@ -537,6 +537,7 @@ void ceph_con_open(struct ceph_connection *con,
>                    __u8 entity_type, __u64 entity_num,
>                    struct ceph_entity_addr *addr)
>  {
> +       mutex_lock(&con->mutex);
>         dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
>         set_bit(OPENING, &con->state);
>         WARN_ON(!test_and_clear_bit(CLOSED, &con->state));
> @@ -546,6 +547,7 @@ void ceph_con_open(struct ceph_connection *con,
>
>         memcpy(&con->peer_addr, addr, sizeof(*addr));
>         con->delay = 0;      /* reset backoff memory */
> +       mutex_unlock(&con->mutex);
>         queue_con(con);
>  }
>  EXPORT_SYMBOL(ceph_con_open);
> --
> 1.7.9
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 9/9] libceph: reset connection retry on successfully negotiation
  2012-07-21  0:41 ` [PATCH 9/9] libceph: reset connection retry on successfully negotiation Sage Weil
@ 2012-07-24 23:00   ` Yehuda Sadeh
  0 siblings, 0 replies; 30+ messages in thread
From: Yehuda Sadeh @ 2012-07-24 23:00 UTC (permalink / raw)
  To: Sage Weil; +Cc: ceph-devel

Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>

On Fri, Jul 20, 2012 at 5:41 PM, Sage Weil <sage@inktank.com> wrote:
> We exponentially back off when we encounter connection errors.  If several
> errors accumulate, we will eventually wait ages before even trying to
> reconnect.
>
> Fix this by resetting the backoff counter after a successful negotiation/
> connection with the remote node.  Fixes ceph issue #2802.
>
> Signed-off-by: Sage Weil <sage@inktank.com>
> ---
>  net/ceph/messenger.c |    2 ++
>  1 files changed, 2 insertions(+), 0 deletions(-)
>
> diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
> index 65964c2..28896eb 100644
> --- a/net/ceph/messenger.c
> +++ b/net/ceph/messenger.c
> @@ -1633,6 +1633,8 @@ static int process_connect(struct ceph_connection *con)
>                 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
>                         set_bit(LOSSYTX, &con->flags);
>
> +               con->delay = 0;      /* reset backoff memory */
> +
>                 prepare_read_tag(con);
>                 break;
>
> --
> 1.7.9
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 1/9] libceph: move feature bits to separate header
  2012-07-21  0:41 ` [PATCH 1/9] libceph: move feature bits to separate header Sage Weil
  2012-07-24 22:14   ` Yehuda Sadeh
@ 2012-07-30 18:29   ` Alex Elder
  1 sibling, 0 replies; 30+ messages in thread
From: Alex Elder @ 2012-07-30 18:29 UTC (permalink / raw)
  To: Sage Weil; +Cc: ceph-devel

On 07/20/2012 07:41 PM, Sage Weil wrote:
> This is simply cleanup that will keep things more closely synced with the
> userland code.
> 
> Signed-off-by: Sage Weil <sage@inktank.com>

Looks good.

Reviewed-by: Alex Elder <elder@inktank.com>

> ---
>  fs/ceph/mds_client.c               |    1 +
>  fs/ceph/super.c                    |    1 +
>  include/linux/ceph/ceph_features.h |   24 ++++++++++++++++++++++++
>  include/linux/ceph/ceph_fs.h       |   14 --------------
>  include/linux/ceph/libceph.h       |    6 ------
>  net/ceph/ceph_common.c             |    5 +++--
>  6 files changed, 29 insertions(+), 22 deletions(-)
>  create mode 100644 include/linux/ceph/ceph_features.h
> 
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 418f6a8..39b76d6 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -10,6 +10,7 @@
>  #include "super.h"
>  #include "mds_client.h"
>  
> +#include <linux/ceph/ceph_features.h>
>  #include <linux/ceph/messenger.h>
>  #include <linux/ceph/decode.h>
>  #include <linux/ceph/pagelist.h>
> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> index 1e67dd7..2c47ecf 100644
> --- a/fs/ceph/super.c
> +++ b/fs/ceph/super.c
> @@ -18,6 +18,7 @@
>  #include "super.h"
>  #include "mds_client.h"
>  
> +#include <linux/ceph/ceph_features.h>
>  #include <linux/ceph/decode.h>
>  #include <linux/ceph/mon_client.h>
>  #include <linux/ceph/auth.h>
> diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
> new file mode 100644
> index 0000000..342f93d
> --- /dev/null
> +++ b/include/linux/ceph/ceph_features.h
> @@ -0,0 +1,24 @@
> +#ifndef __CEPH_FEATURES
> +#define __CEPH_FEATURES
> +
> +/*
> + * feature bits
> + */
> +#define CEPH_FEATURE_UID            (1<<0)
> +#define CEPH_FEATURE_NOSRCADDR      (1<<1)
> +#define CEPH_FEATURE_MONCLOCKCHECK  (1<<2)
> +#define CEPH_FEATURE_FLOCK          (1<<3)
> +#define CEPH_FEATURE_SUBSCRIBE2     (1<<4)
> +#define CEPH_FEATURE_MONNAMES       (1<<5)
> +#define CEPH_FEATURE_RECONNECT_SEQ  (1<<6)
> +#define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
> +
> +/*
> + * Features supported.
> + */
> +#define CEPH_FEATURES_SUPPORTED_DEFAULT  \
> +	(CEPH_FEATURE_NOSRCADDR)
> +
> +#define CEPH_FEATURES_REQUIRED_DEFAULT   \
> +	(CEPH_FEATURE_NOSRCADDR)
> +#endif
> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> index e81ab30..d021610 100644
> --- a/include/linux/ceph/ceph_fs.h
> +++ b/include/linux/ceph/ceph_fs.h
> @@ -35,20 +35,6 @@
>  /* arbitrary limit on max # of monitors (cluster of 3 is typical) */
>  #define CEPH_MAX_MON   31
>  
> -
> -/*
> - * feature bits
> - */
> -#define CEPH_FEATURE_UID            (1<<0)
> -#define CEPH_FEATURE_NOSRCADDR      (1<<1)
> -#define CEPH_FEATURE_MONCLOCKCHECK  (1<<2)
> -#define CEPH_FEATURE_FLOCK          (1<<3)
> -#define CEPH_FEATURE_SUBSCRIBE2     (1<<4)
> -#define CEPH_FEATURE_MONNAMES       (1<<5)
> -#define CEPH_FEATURE_RECONNECT_SEQ  (1<<6)
> -#define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
> -
> -
>  /*
>   * ceph_file_layout - describe data layout for a file/inode
>   */
> diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
> index 98ec36a..ea072e1 100644
> --- a/include/linux/ceph/libceph.h
> +++ b/include/linux/ceph/libceph.h
> @@ -23,12 +23,6 @@
>  #include "ceph_fs.h"
>  
>  /*
> - * Supported features
> - */
> -#define CEPH_FEATURE_SUPPORTED_DEFAULT CEPH_FEATURE_NOSRCADDR
> -#define CEPH_FEATURE_REQUIRED_DEFAULT  CEPH_FEATURE_NOSRCADDR
> -
> -/*
>   * mount options
>   */
>  #define CEPH_OPT_FSID             (1<<0)
> diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
> index 3b45e01..69e38db 100644
> --- a/net/ceph/ceph_common.c
> +++ b/net/ceph/ceph_common.c
> @@ -17,6 +17,7 @@
>  #include <linux/string.h>
>  
>  
> +#include <linux/ceph/ceph_features.h>
>  #include <linux/ceph/libceph.h>
>  #include <linux/ceph/debugfs.h>
>  #include <linux/ceph/decode.h>
> @@ -460,9 +461,9 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
>  	client->auth_err = 0;
>  
>  	client->extra_mon_dispatch = NULL;
> -	client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT |
> +	client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT |
>  		supported_features;
> -	client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT |
> +	client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT |
>  		required_features;
>  
>  	/* msgr */
> 


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 2/9] libceph: support crush tunables
  2012-07-21  0:41 ` [PATCH 2/9] libceph: support crush tunables Sage Weil
  2012-07-24 22:24   ` Yehuda Sadeh
@ 2012-07-30 18:36   ` Alex Elder
  1 sibling, 0 replies; 30+ messages in thread
From: Alex Elder @ 2012-07-30 18:36 UTC (permalink / raw)
  To: Sage Weil; +Cc: ceph-devel, caleb miles

On 07/20/2012 07:41 PM, Sage Weil wrote:
> From: caleb miles <caleb.miles@inktank.com>
> 
> The server side recently added support for tuning some magic
> crush variables. Decode these variables if they are present, or use the
> default values if they are not present.
> 
> Corresponds to ceph.git commit 89af369c25f274fe62ef730e5e8aad0c54f1e5a5.
> 
> Signed-off-by: caleb miles <caleb.miles@inktank.com>
> Reviewed-by: Sage Weil <sage@inktank.com>

You know I prefer symbolic constant definitions instead of
bald numeric constants in the middle of code.  I.e.:

    #define CRUSH_LOCAL_TRIES_DEFAULT		2
    #define CRUSH_LOCAL_FALLBACK_TRIES_DEFAULT	5
    #define CRUSH_TOTAL_TRIES_DEFAULT		19

But it's OK with me if this gets committed as-is.

Reviewed-by: Alex Elder <elder@inktank.com>

> ---
>  include/linux/ceph/ceph_features.h |    4 ++-
>  include/linux/crush/crush.h        |    8 +++++++
>  net/ceph/crush/mapper.c            |   13 ++++++-----
>  net/ceph/osdmap.c                  |   39 ++++++++++++++++++++++++++++++++++++
>  4 files changed, 57 insertions(+), 7 deletions(-)
> 
> diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
> index 342f93d..df25dcf 100644
> --- a/include/linux/ceph/ceph_features.h
> +++ b/include/linux/ceph/ceph_features.h
> @@ -12,12 +12,14 @@
>  #define CEPH_FEATURE_MONNAMES       (1<<5)
>  #define CEPH_FEATURE_RECONNECT_SEQ  (1<<6)
>  #define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
> +#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
>  
>  /*
>   * Features supported.
>   */
>  #define CEPH_FEATURES_SUPPORTED_DEFAULT  \
> -	(CEPH_FEATURE_NOSRCADDR)
> +	(CEPH_FEATURE_NOSRCADDR |	 \
> +	 CEPH_FEATURE_CRUSH_TUNABLES)
>  
>  #define CEPH_FEATURES_REQUIRED_DEFAULT   \
>  	(CEPH_FEATURE_NOSRCADDR)
> diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
> index 7c47508..25baa28 100644
> --- a/include/linux/crush/crush.h
> +++ b/include/linux/crush/crush.h
> @@ -154,6 +154,14 @@ struct crush_map {
>  	__s32 max_buckets;
>  	__u32 max_rules;
>  	__s32 max_devices;
> +
> +	/* choose local retries before re-descent */
> +	__u32 choose_local_tries;
> +	/* choose local attempts using a fallback permutation before
> +	 * re-descent */
> +	__u32 choose_local_fallback_tries;
> +	/* choose attempts before giving up */ 
> +	__u32 choose_total_tries;
>  };
>  
>  
> diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
> index d7edc24..35fce75 100644
> --- a/net/ceph/crush/mapper.c
> +++ b/net/ceph/crush/mapper.c
> @@ -306,7 +306,6 @@ static int crush_choose(const struct crush_map *map,
>  	int item = 0;
>  	int itemtype;
>  	int collide, reject;
> -	const unsigned int orig_tries = 5; /* attempts before we fall back to search */
>  
>  	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
>  		bucket->id, x, outpos, numrep);
> @@ -351,8 +350,9 @@ static int crush_choose(const struct crush_map *map,
>  					reject = 1;
>  					goto reject;
>  				}
> -				if (flocal >= (in->size>>1) &&
> -				    flocal > orig_tries)
> +				if (map->choose_local_fallback_tries > 0 &&
> +				    flocal >= (in->size>>1) &&
> +				    flocal > map->choose_local_fallback_tries)
>  					item = bucket_perm_choose(in, x, r);
>  				else
>  					item = crush_bucket_choose(in, x, r);
> @@ -422,13 +422,14 @@ reject:
>  					ftotal++;
>  					flocal++;
>  
> -					if (collide && flocal < 3)
> +					if (collide && flocal <= map->choose_local_tries)
>  						/* retry locally a few times */
>  						retry_bucket = 1;
> -					else if (flocal <= in->size + orig_tries)
> +					else if (map->choose_local_fallback_tries > 0 &&
> +						 flocal <= in->size + map->choose_local_fallback_tries)
>  						/* exhaustive bucket search */
>  						retry_bucket = 1;
> -					else if (ftotal < 20)
> +					else if (ftotal <= map->choose_total_tries)
>  						/* then retry descent */
>  						retry_descent = 1;
>  					else
> diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
> index 9600674..3124b71 100644
> --- a/net/ceph/osdmap.c
> +++ b/net/ceph/osdmap.c
> @@ -135,6 +135,21 @@ bad:
>  	return -EINVAL;
>  }
>  
> +static int skip_name_map(void **p, void *end)
> +{
> +        int len;
> +        ceph_decode_32_safe(p, end, len ,bad);
> +        while (len--) {
> +                int strlen;
> +                *p += sizeof(u32);
> +                ceph_decode_32_safe(p, end, strlen, bad);
> +                *p += strlen;
> +}
> +        return 0;
> +bad:
> +        return -EINVAL;
> +}
> +
>  static struct crush_map *crush_decode(void *pbyval, void *end)
>  {
>  	struct crush_map *c;
> @@ -143,6 +158,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
>  	void **p = &pbyval;
>  	void *start = pbyval;
>  	u32 magic;
> +	u32 num_name_maps;
>  
>  	dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
>  
> @@ -150,6 +166,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
>  	if (c == NULL)
>  		return ERR_PTR(-ENOMEM);
>  
> +        /* set tunables to default values */
> +        c->choose_local_tries = 2;
> +        c->choose_local_fallback_tries = 5;
> +        c->choose_total_tries = 19;
> +
>  	ceph_decode_need(p, end, 4*sizeof(u32), bad);
>  	magic = ceph_decode_32(p);
>  	if (magic != CRUSH_MAGIC) {
> @@ -297,7 +318,25 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
>  	}
>  
>  	/* ignore trailing name maps. */
> +        for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
> +                err = skip_name_map(p, end);
> +                if (err < 0)
> +                        goto done;
> +        }
> +
> +        /* tunables */
> +        ceph_decode_need(p, end, 3*sizeof(u32), done);
> +        c->choose_local_tries = ceph_decode_32(p);
> +        c->choose_local_fallback_tries =  ceph_decode_32(p);
> +        c->choose_total_tries = ceph_decode_32(p);
> +        dout("crush decode tunable choose_local_tries = %d",
> +             c->choose_local_tries);
> +        dout("crush decode tunable choose_local_fallback_tries = %d",
> +             c->choose_local_fallback_tries);
> +        dout("crush decode tunable choose_total_tries = %d",
> +             c->choose_total_tries);
>  
> +done:
>  	dout("crush_decode success\n");
>  	return c;
>  
> 


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/9] libceph: report socket read/write error message
  2012-07-21  0:41 ` [PATCH 3/9] libceph: report socket read/write error message Sage Weil
  2012-07-24 22:26   ` Yehuda Sadeh
@ 2012-07-30 18:37   ` Alex Elder
  1 sibling, 0 replies; 30+ messages in thread
From: Alex Elder @ 2012-07-30 18:37 UTC (permalink / raw)
  To: Sage Weil; +Cc: ceph-devel

On 07/20/2012 07:41 PM, Sage Weil wrote:
> We need to set error_msg to something useful before calling ceph_fault();
> do so here for try_{read,write}().  This is more informative than
> 
> libceph: osd0 192.168.106.220:6801 (null)
> 
> Signed-off-by: Sage Weil <sage@inktank.com>

Looks good.

Reviewed-by: Alex Elder <elder@inktank.com>

> ---
>  net/ceph/messenger.c |    8 ++++++--
>  1 files changed, 6 insertions(+), 2 deletions(-)
> 
> diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
> index 12419a0..7105908 100644
> --- a/net/ceph/messenger.c
> +++ b/net/ceph/messenger.c
> @@ -2291,14 +2291,18 @@ restart:
>  	ret = try_read(con);
>  	if (ret == -EAGAIN)
>  		goto restart;
> -	if (ret < 0)
> +	if (ret < 0) {
> +		con->error_msg = "socket error on read";
>  		goto fault;
> +	}
>  
>  	ret = try_write(con);
>  	if (ret == -EAGAIN)
>  		goto restart;
> -	if (ret < 0)
> +	if (ret < 0) {
> +		con->error_msg = "socket error on write";
>  		goto fault;
> +	}
>  
>  done:
>  	mutex_unlock(&con->mutex);
> 


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 4/9] libceph: fix mutex coverage for ceph_con_close
  2012-07-21  0:41 ` [PATCH 4/9] libceph: fix mutex coverage for ceph_con_close Sage Weil
  2012-07-24 22:29   ` Yehuda Sadeh
@ 2012-07-30 18:43   ` Alex Elder
  1 sibling, 0 replies; 30+ messages in thread
From: Alex Elder @ 2012-07-30 18:43 UTC (permalink / raw)
  To: Sage Weil; +Cc: ceph-devel

On 07/20/2012 07:41 PM, Sage Weil wrote:
> Hold the mutex while twiddling all of the state bits to avoid possible
> races.  While we're here, make not of why we cannot close the socket
> directly.
> 
> Signed-off-by: Sage Weil <sage@inktank.com>

Looks OK to me.  A quick scan seems to show that the
state and flag bits are *almost* always set while the
mutex is held.  The one counterexample I found was for
the STANDBY state bit in clear_standby() (but I really
didn't look very closely).

Anyway, I think this looks fine--it makes things safer
even if there could be another imperfection somewhere.

Reviewed-by: Alex Elder <elder@inktank.com>

> ---
>  net/ceph/messenger.c |    8 +++++++-
>  1 files changed, 7 insertions(+), 1 deletions(-)
> 
> diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
> index 7105908..e24310e 100644
> --- a/net/ceph/messenger.c
> +++ b/net/ceph/messenger.c
> @@ -503,6 +503,7 @@ static void reset_connection(struct ceph_connection *con)
>   */
>  void ceph_con_close(struct ceph_connection *con)
>  {
> +	mutex_lock(&con->mutex);
>  	dout("con_close %p peer %s\n", con,
>  	     ceph_pr_addr(&con->peer_addr.in_addr));
>  	clear_bit(NEGOTIATING, &con->state);
> @@ -515,11 +516,16 @@ void ceph_con_close(struct ceph_connection *con)
>  	clear_bit(KEEPALIVE_PENDING, &con->flags);
>  	clear_bit(WRITE_PENDING, &con->flags);
>  
> -	mutex_lock(&con->mutex);
>  	reset_connection(con);
>  	con->peer_global_seq = 0;
>  	cancel_delayed_work(&con->work);
>  	mutex_unlock(&con->mutex);
> +
> +	/*
> +	 * We cannot close the socket directly from here because the
> +	 * work threads use it without holding the mutex.  Instead, let
> +	 * con_work() do it.
> +	 */
>  	queue_con(con);
>  }
>  EXPORT_SYMBOL(ceph_con_close);
> 


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 6/9] libceph: (re)initialize bio_iter on start of message receive
  2012-07-21  0:41 ` [PATCH 6/9] libceph: (re)initialize bio_iter on start of message receive Sage Weil
  2012-07-24 22:55   ` Yehuda Sadeh
@ 2012-07-30 19:04   ` Alex Elder
  1 sibling, 0 replies; 30+ messages in thread
From: Alex Elder @ 2012-07-30 19:04 UTC (permalink / raw)
  To: Sage Weil; +Cc: ceph-devel

On 07/20/2012 07:41 PM, Sage Weil wrote:
> Previously, we were opportunistically initializing the bio_iter if it
> appeared to be uninitialized in the middle of the read path.  The problem
> is that a sequence like:
> 
>  - start reading message
>  - initialize bio_iter
>  - read half a message
>  - messenger fault, reconnect
>  - restart reading message
>  - ** bio_iter now non-NULL, not reinitialized **
>  - read past end of bio, crash
> 
> Instead, initialize the bio_iter unconditionally when we allocate/claim
> the message for read.
> 
> Signed-off-by: Sage Weil <sage@inktank.com>

This seems to be similar to this recent change to the writing side
of things:
    commit 43643528cce60ca184fe8197efa8e8da7c89a037
    Author: Yan, Zheng <zheng.z.yan@intel.com>
    rbd: Clear ceph_msg->bio_iter for retransmitted message

In any case, this is the place where a new message is being
initialized, so it makes sense that this is where you re-init
the bio_iter pointer.

Reviewed-by: Alex Elder <elder@inktank.com>


> ---
>  net/ceph/messenger.c |   11 ++++++-----
>  1 files changed, 6 insertions(+), 5 deletions(-)
> 
> diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
> index e24310e..efa369f 100644
> --- a/net/ceph/messenger.c
> +++ b/net/ceph/messenger.c
> @@ -1876,6 +1876,11 @@ static int read_partial_message(struct ceph_connection *con)
>  		else
>  			con->in_msg_pos.page_pos = 0;
>  		con->in_msg_pos.data_pos = 0;
> +
> +#ifdef CONFIG_BLOCK
> +		if (m->bio)
> +			init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
> +#endif
>  	}
>  
>  	/* front */
> @@ -1892,10 +1897,6 @@ static int read_partial_message(struct ceph_connection *con)
>  		if (ret <= 0)
>  			return ret;
>  	}
> -#ifdef CONFIG_BLOCK
> -	if (m->bio && !m->bio_iter)
> -		init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
> -#endif
>  
>  	/* (page) data */
>  	while (con->in_msg_pos.data_pos < data_len) {
> @@ -1906,7 +1907,7 @@ static int read_partial_message(struct ceph_connection *con)
>  				return ret;
>  #ifdef CONFIG_BLOCK
>  		} else if (m->bio) {
> -
> +			BUG_ON(!m->bio_iter);
>  			ret = read_partial_message_bio(con,
>  						 &m->bio_iter, &m->bio_seg,
>  						 data_len, do_datacrc);
> 


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 8/9] libceph: protect ceph_con_open() with mutex
  2012-07-21  0:41 ` [PATCH 8/9] libceph: protect ceph_con_open() with mutex Sage Weil
  2012-07-24 22:58   ` Yehuda Sadeh
@ 2012-07-30 19:06   ` Alex Elder
  1 sibling, 0 replies; 30+ messages in thread
From: Alex Elder @ 2012-07-30 19:06 UTC (permalink / raw)
  To: Sage Weil; +Cc: ceph-devel

On 07/20/2012 07:41 PM, Sage Weil wrote:
> Take the con mutex while we are initiating a ceph open.  This is necessary
> because the may have previously been in use and then closed, which could
> result in a racing workqueue running con_work().
> 
> Signed-off-by: Sage Weil <sage@inktank.com>

Well there you go, another place where a state bit is
changed without holding the mutex.

Looks good.

Reviewed-by: Alex Elder <elder@inktank.com>

> ---
>  net/ceph/messenger.c |    2 ++
>  1 files changed, 2 insertions(+), 0 deletions(-)
> 
> diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
> index efa369f..65964c2 100644
> --- a/net/ceph/messenger.c
> +++ b/net/ceph/messenger.c
> @@ -537,6 +537,7 @@ void ceph_con_open(struct ceph_connection *con,
>  		   __u8 entity_type, __u64 entity_num,
>  		   struct ceph_entity_addr *addr)
>  {
> +	mutex_lock(&con->mutex);
>  	dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
>  	set_bit(OPENING, &con->state);
>  	WARN_ON(!test_and_clear_bit(CLOSED, &con->state));
> @@ -546,6 +547,7 @@ void ceph_con_open(struct ceph_connection *con,
>  
>  	memcpy(&con->peer_addr, addr, sizeof(*addr));
>  	con->delay = 0;      /* reset backoff memory */
> +	mutex_unlock(&con->mutex);
>  	queue_con(con);
>  }
>  EXPORT_SYMBOL(ceph_con_open);
> 


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 5/9] libceph: resubmit linger ops when pg mapping changes
  2012-07-21  0:41 ` [PATCH 5/9] libceph: resubmit linger ops when pg mapping changes Sage Weil
  2012-07-24 22:51   ` Yehuda Sadeh
@ 2012-07-30 22:40   ` Alex Elder
  2012-07-30 23:03     ` Sage Weil
  1 sibling, 1 reply; 30+ messages in thread
From: Alex Elder @ 2012-07-30 22:40 UTC (permalink / raw)
  To: Sage Weil; +Cc: ceph-devel

On 07/20/2012 07:41 PM, Sage Weil wrote:
> The linger op registration (i.e., watch) modifies the object state.  As
> such, the OSD will reply with success if it has already applied without
> doing the associated side-effects (setting up the watch session state).
> If we lose the ACK and resubmit, we will see success but the watch will not
> be correctly registered and we won't get notifies.
> 
> To fix this, always resubmit the linger op with a new tid.  We accomplish
> this by re-registering as a linger (i.e., 'registered') if we are not yet
> registered.  Then the second loop will treat this just like a normal
> case of re-registering.
> 
> This mirrors a similar fix on the userland ceph.git, commit 5dd68b95, and
> ceph bug #2796.
> 
> Signed-off-by: Sage Weil <sage@inktank.com>

I have two minor comments below.  I confess I don't enough about what's
going on conceptually here to offer a high quality review.  However the
change seems be doing what you say is needed, so I guess I'll say it
looks OK to me.  Please try to get another reviewer to sign off though.

Reviewed-by: Alex Elder <elder@inktank.com>)

> ---
>  net/ceph/osd_client.c |   26 +++++++++++++++++++++-----
>  1 files changed, 21 insertions(+), 5 deletions(-)
> 
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 07920ca..c605705 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -891,7 +891,9 @@ static void __register_linger_request(struct ceph_osd_client *osdc,
>  {
>  	dout("__register_linger_request %p\n", req);
>  	list_add_tail(&req->r_linger_item, &osdc->req_linger);
> -	list_add_tail(&req->r_linger_osd, &req->r_osd->o_linger_requests);
> +	if (req->r_osd)
> +		list_add_tail(&req->r_linger_osd,
> +			      &req->r_osd->o_linger_requests);
>  }
>  
>  static void __unregister_linger_request(struct ceph_osd_client *osdc,
> @@ -1305,8 +1307,9 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
>  
>  	dout("kick_requests %s\n", force_resend ? " (force resend)" : "");
>  	mutex_lock(&osdc->request_mutex);
> -	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
> +	for (p = rb_first(&osdc->requests); p; ) {
>  		req = rb_entry(p, struct ceph_osd_request, r_node);
> +		p = rb_next(p);

Why is this hunk necessary?  Can the request's rb pointer(s)
get updated somehow via __map_request() or something?

>  		err = __map_request(osdc, req, force_resend);
>  		if (err < 0)
>  			continue;  /* error */
> @@ -1314,10 +1317,23 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
>  			dout("%p tid %llu maps to no osd\n", req, req->r_tid);
>  			needmap++;  /* request a newer map */
>  		} else if (err > 0) {
> -			dout("%p tid %llu requeued on osd%d\n", req, req->r_tid,
> -			     req->r_osd ? req->r_osd->o_osd : -1);
> -			if (!req->r_linger)
> +			if (!req->r_linger) {
> +				dout("%p tid %llu requeued on osd%d\n", req,
> +				     req->r_tid,
> +				     req->r_osd ? req->r_osd->o_osd : -1);
>  				req->r_flags |= CEPH_OSD_FLAG_RETRY;
> +			}
> +		}
> +		if (req->r_linger && list_empty(&req->r_linger_item)) {
> +			/*
> +			 * register as a lingkker so that we will
                                             ^^
Is this the Swedish spelling?

> +			 * re-submit below and get a new tid
> +			 */
> +			dout("%p tid %llu restart on osd%d\n",
> +			     req, req->r_tid,
> +			     req->r_osd ? req->r_osd->o_osd : -1);
> +			__register_linger_request(osdc, req);
> +			__unregister_request(osdc, req);
>  		}
>  	}
>  
> 


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 5/9] libceph: resubmit linger ops when pg mapping changes
  2012-07-30 22:40   ` Alex Elder
@ 2012-07-30 23:03     ` Sage Weil
  0 siblings, 0 replies; 30+ messages in thread
From: Sage Weil @ 2012-07-30 23:03 UTC (permalink / raw)
  To: Alex Elder; +Cc: ceph-devel

On Mon, 30 Jul 2012, Alex Elder wrote:
> On 07/20/2012 07:41 PM, Sage Weil wrote:
> > The linger op registration (i.e., watch) modifies the object state.  As
> > such, the OSD will reply with success if it has already applied without
> > doing the associated side-effects (setting up the watch session state).
> > If we lose the ACK and resubmit, we will see success but the watch will not
> > be correctly registered and we won't get notifies.
> > 
> > To fix this, always resubmit the linger op with a new tid.  We accomplish
> > this by re-registering as a linger (i.e., 'registered') if we are not yet
> > registered.  Then the second loop will treat this just like a normal
> > case of re-registering.
> > 
> > This mirrors a similar fix on the userland ceph.git, commit 5dd68b95, and
> > ceph bug #2796.
> > 
> > Signed-off-by: Sage Weil <sage@inktank.com>
> 
> I have two minor comments below.  I confess I don't enough about what's
> going on conceptually here to offer a high quality review.  However the
> change seems be doing what you say is needed, so I guess I'll say it
> looks OK to me.  Please try to get another reviewer to sign off though.
> 
> Reviewed-by: Alex Elder <elder@inktank.com>)
> 
> > ---
> >  net/ceph/osd_client.c |   26 +++++++++++++++++++++-----
> >  1 files changed, 21 insertions(+), 5 deletions(-)
> > 
> > diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> > index 07920ca..c605705 100644
> > --- a/net/ceph/osd_client.c
> > +++ b/net/ceph/osd_client.c
> > @@ -891,7 +891,9 @@ static void __register_linger_request(struct ceph_osd_client *osdc,
> >  {
> >  	dout("__register_linger_request %p\n", req);
> >  	list_add_tail(&req->r_linger_item, &osdc->req_linger);
> > -	list_add_tail(&req->r_linger_osd, &req->r_osd->o_linger_requests);
> > +	if (req->r_osd)
> > +		list_add_tail(&req->r_linger_osd,
> > +			      &req->r_osd->o_linger_requests);
> >  }
> >  
> >  static void __unregister_linger_request(struct ceph_osd_client *osdc,
> > @@ -1305,8 +1307,9 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
> >  
> >  	dout("kick_requests %s\n", force_resend ? " (force resend)" : "");
> >  	mutex_lock(&osdc->request_mutex);
> > -	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
> > +	for (p = rb_first(&osdc->requests); p; ) {
> >  		req = rb_entry(p, struct ceph_osd_request, r_node);
> > +		p = rb_next(p);
> 
> Why is this hunk necessary?  Can the request's rb pointer(s)
> get updated somehow via __map_request() or something?

Exactly.

> 
> >  		err = __map_request(osdc, req, force_resend);
> >  		if (err < 0)
> >  			continue;  /* error */
> > @@ -1314,10 +1317,23 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
> >  			dout("%p tid %llu maps to no osd\n", req, req->r_tid);
> >  			needmap++;  /* request a newer map */
> >  		} else if (err > 0) {
> > -			dout("%p tid %llu requeued on osd%d\n", req, req->r_tid,
> > -			     req->r_osd ? req->r_osd->o_osd : -1);
> > -			if (!req->r_linger)
> > +			if (!req->r_linger) {
> > +				dout("%p tid %llu requeued on osd%d\n", req,
> > +				     req->r_tid,
> > +				     req->r_osd ? req->r_osd->o_osd : -1);
> >  				req->r_flags |= CEPH_OSD_FLAG_RETRY;
> > +			}
> > +		}
> > +		if (req->r_linger && list_empty(&req->r_linger_item)) {
> > +			/*
> > +			 * register as a lingkker so that we will
>                                              ^^
> Is this the Swedish spelling?

Sigh, will fix!

> 
> > +			 * re-submit below and get a new tid
> > +			 */
> > +			dout("%p tid %llu restart on osd%d\n",
> > +			     req, req->r_tid,
> > +			     req->r_osd ? req->r_osd->o_osd : -1);
> > +			__register_linger_request(osdc, req);
> > +			__unregister_request(osdc, req);
> >  		}
> >  	}
> >  
> > 
> 
> 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 7/9] ceph: close old con before reopening on mds reconnect
  2012-07-24 22:56   ` Yehuda Sadeh
@ 2012-07-30 23:11     ` Sage Weil
  0 siblings, 0 replies; 30+ messages in thread
From: Sage Weil @ 2012-07-30 23:11 UTC (permalink / raw)
  To: Yehuda Sadeh; +Cc: ceph-devel

On Tue, 24 Jul 2012, Yehuda Sadeh wrote:
> On Fri, Jul 20, 2012 at 5:41 PM, Sage Weil <sage@inktank.com> wrote:
> > When we detect a mds session reset, close the old ceph_connection before
> > reopening it.  This ensures we clean up the old socket properly and keep
> > the ceph_connection state correct.
> >
> > Signed-off-by: Sage Weil <sage@inktank.com>
> > ---
> >  fs/ceph/mds_client.c |    1 +
> >  1 files changed, 1 insertions(+), 0 deletions(-)
> >
> > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > index 39b76d6..a5a7354 100644
> > --- a/fs/ceph/mds_client.c
> > +++ b/fs/ceph/mds_client.c
> > @@ -2518,6 +2518,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
> >         session->s_state = CEPH_MDS_SESSION_RECONNECTING;
> >         session->s_seq = 0;
> >
> > +       ceph_con_close(&session->s_con);
> >         ceph_con_open(&session->s_con,
> 
> Should we also BUG_ON in ceph_con_open if connection is not closed?

Yeah, adding that.

> 
> >                       CEPH_ENTITY_TYPE_MDS, mds,
> >                       ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
> > --
> > 1.7.9
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 2/9] libceph: support crush tunables
  2012-07-24 22:24   ` Yehuda Sadeh
@ 2012-07-30 23:14     ` Sage Weil
  2012-07-30 23:45       ` Yehuda Sadeh
  0 siblings, 1 reply; 30+ messages in thread
From: Sage Weil @ 2012-07-30 23:14 UTC (permalink / raw)
  To: Yehuda Sadeh; +Cc: ceph-devel, caleb miles

On Tue, 24 Jul 2012, Yehuda Sadeh wrote:
> On Fri, Jul 20, 2012 at 5:41 PM, Sage Weil <sage@inktank.com> wrote:
> > From: caleb miles <caleb.miles@inktank.com>
> >
> > The server side recently added support for tuning some magic
> > crush variables. Decode these variables if they are present, or use the
> > default values if they are not present.
> >
> > Corresponds to ceph.git commit 89af369c25f274fe62ef730e5e8aad0c54f1e5a5.
> >
> > Signed-off-by: caleb miles <caleb.miles@inktank.com>
> > Reviewed-by: Sage Weil <sage@inktank.com>
> > ---
> >  include/linux/ceph/ceph_features.h |    4 ++-
> >  include/linux/crush/crush.h        |    8 +++++++
> >  net/ceph/crush/mapper.c            |   13 ++++++-----
> >  net/ceph/osdmap.c                  |   39 ++++++++++++++++++++++++++++++++++++
> >  4 files changed, 57 insertions(+), 7 deletions(-)
> >
> > diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
> > index 342f93d..df25dcf 100644
> > --- a/include/linux/ceph/ceph_features.h
> > +++ b/include/linux/ceph/ceph_features.h
> > @@ -12,12 +12,14 @@
> >  #define CEPH_FEATURE_MONNAMES       (1<<5)
> >  #define CEPH_FEATURE_RECONNECT_SEQ  (1<<6)
> >  #define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
> > +#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
> 
> any reason why this is 18 and not 8?

9-17 are used.. just not implemented/used by the kernel code.

> 
> >
> >  /*
> >   * Features supported.
> >   */
> >  #define CEPH_FEATURES_SUPPORTED_DEFAULT  \
> > -       (CEPH_FEATURE_NOSRCADDR)
> > +       (CEPH_FEATURE_NOSRCADDR |        \
> > +        CEPH_FEATURE_CRUSH_TUNABLES)
> >
> >  #define CEPH_FEATURES_REQUIRED_DEFAULT   \
> >         (CEPH_FEATURE_NOSRCADDR)
> > diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
> > index 7c47508..25baa28 100644
> > --- a/include/linux/crush/crush.h
> > +++ b/include/linux/crush/crush.h
> > @@ -154,6 +154,14 @@ struct crush_map {
> >         __s32 max_buckets;
> >         __u32 max_rules;
> >         __s32 max_devices;
> > +
> > +       /* choose local retries before re-descent */
> > +       __u32 choose_local_tries;
> > +       /* choose local attempts using a fallback permutation before
> > +        * re-descent */
> > +       __u32 choose_local_fallback_tries;
> > +       /* choose attempts before giving up */
> > +       __u32 choose_total_tries;
> >  };
> >
> >
> > diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
> > index d7edc24..35fce75 100644
> > --- a/net/ceph/crush/mapper.c
> > +++ b/net/ceph/crush/mapper.c
> > @@ -306,7 +306,6 @@ static int crush_choose(const struct crush_map *map,
> >         int item = 0;
> >         int itemtype;
> >         int collide, reject;
> > -       const unsigned int orig_tries = 5; /* attempts before we fall back to search */
> >
> >         dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
> >                 bucket->id, x, outpos, numrep);
> > @@ -351,8 +350,9 @@ static int crush_choose(const struct crush_map *map,
> >                                         reject = 1;
> >                                         goto reject;
> >                                 }
> > -                               if (flocal >= (in->size>>1) &&
> > -                                   flocal > orig_tries)
> > +                               if (map->choose_local_fallback_tries > 0 &&
> > +                                   flocal >= (in->size>>1) &&
> > +                                   flocal > map->choose_local_fallback_tries)
> 
> is flocal right here or should it be ftotal?
> 
> >                                         item = bucket_perm_choose(in, x, r);
> >                                 else
> >                                         item = crush_bucket_choose(in, x, r);
> > @@ -422,13 +422,14 @@ reject:
> >                                         ftotal++;
> >                                         flocal++;
> >
> > -                                       if (collide && flocal < 3)
> > +                                       if (collide && flocal <= map->choose_local_tries)
> >                                                 /* retry locally a few times */
> >                                                 retry_bucket = 1;
> > -                                       else if (flocal <= in->size + orig_tries)
> > +                                       else if (map->choose_local_fallback_tries > 0 &&
> > +                                                flocal <= in->size + map->choose_local_fallback_tries)
> >                                                 /* exhaustive bucket search */
> >                                                 retry_bucket = 1;
> > -                                       else if (ftotal < 20)
> > +                                       else if (ftotal <= map->choose_total_tries)
> >                                                 /* then retry descent */
> >                                                 retry_descent = 1;
> >                                         else
> > diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
> > index 9600674..3124b71 100644
> > --- a/net/ceph/osdmap.c
> > +++ b/net/ceph/osdmap.c
> > @@ -135,6 +135,21 @@ bad:
> >         return -EINVAL;
> >  }
> >
> > +static int skip_name_map(void **p, void *end)
> > +{
> > +        int len;
> > +        ceph_decode_32_safe(p, end, len ,bad);
> > +        while (len--) {
> > +                int strlen;
> use u32 for strlen
> 
> > +                *p += sizeof(u32);
> > +                ceph_decode_32_safe(p, end, strlen, bad);
> > +                *p += strlen;
> > +}
> > +        return 0;
> > +bad:
> > +        return -EINVAL;
> > +}
> > +
> >  static struct crush_map *crush_decode(void *pbyval, void *end)
> >  {
> >         struct crush_map *c;
> > @@ -143,6 +158,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
> >         void **p = &pbyval;
> >         void *start = pbyval;
> >         u32 magic;
> > +       u32 num_name_maps;
> >
> >         dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
> >
> > @@ -150,6 +166,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
> >         if (c == NULL)
> >                 return ERR_PTR(-ENOMEM);
> >
> > +        /* set tunables to default values */
> > +        c->choose_local_tries = 2;
> > +        c->choose_local_fallback_tries = 5;
> > +        c->choose_total_tries = 19;
> > +
> >         ceph_decode_need(p, end, 4*sizeof(u32), bad);
> >         magic = ceph_decode_32(p);
> >         if (magic != CRUSH_MAGIC) {
> > @@ -297,7 +318,25 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
> >         }
> >
> >         /* ignore trailing name maps. */
> > +        for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
> > +                err = skip_name_map(p, end);
> > +                if (err < 0)
> > +                        goto done;
> > +        }
> > +
> > +        /* tunables */
> > +        ceph_decode_need(p, end, 3*sizeof(u32), done);
> > +        c->choose_local_tries = ceph_decode_32(p);
> > +        c->choose_local_fallback_tries =  ceph_decode_32(p);
> > +        c->choose_total_tries = ceph_decode_32(p);
> > +        dout("crush decode tunable choose_local_tries = %d",
> > +             c->choose_local_tries);
> > +        dout("crush decode tunable choose_local_fallback_tries = %d",
> > +             c->choose_local_fallback_tries);
> > +        dout("crush decode tunable choose_total_tries = %d",
> > +             c->choose_total_tries);
> >
> > +done:
> >         dout("crush_decode success\n");
> >         return c;
> >
> > --
> > 1.7.9
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 2/9] libceph: support crush tunables
  2012-07-30 23:14     ` Sage Weil
@ 2012-07-30 23:45       ` Yehuda Sadeh
  0 siblings, 0 replies; 30+ messages in thread
From: Yehuda Sadeh @ 2012-07-30 23:45 UTC (permalink / raw)
  To: Sage Weil; +Cc: ceph-devel, caleb miles

On Mon, Jul 30, 2012 at 4:14 PM, Sage Weil <sage@inktank.com> wrote:
>
> On Tue, 24 Jul 2012, Yehuda Sadeh wrote:
> > On Fri, Jul 20, 2012 at 5:41 PM, Sage Weil <sage@inktank.com> wrote:
> > > From: caleb miles <caleb.miles@inktank.com>
> > >
> > > The server side recently added support for tuning some magic
> > > crush variables. Decode these variables if they are present, or use
> > > the
> > > default values if they are not present.
> > >
> > > Corresponds to ceph.git commit
> > > 89af369c25f274fe62ef730e5e8aad0c54f1e5a5.
> > >
> > > Signed-off-by: caleb miles <caleb.miles@inktank.com>
> > > Reviewed-by: Sage Weil <sage@inktank.com>
> > > ---
> > >  include/linux/ceph/ceph_features.h |    4 ++-
> > >  include/linux/crush/crush.h        |    8 +++++++
> > >  net/ceph/crush/mapper.c            |   13 ++++++-----
> > >  net/ceph/osdmap.c                  |   39
> > > ++++++++++++++++++++++++++++++++++++
> > >  4 files changed, 57 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/include/linux/ceph/ceph_features.h
> > > b/include/linux/ceph/ceph_features.h
> > > index 342f93d..df25dcf 100644
> > > --- a/include/linux/ceph/ceph_features.h
> > > +++ b/include/linux/ceph/ceph_features.h
> > > @@ -12,12 +12,14 @@
> > >  #define CEPH_FEATURE_MONNAMES       (1<<5)
> > >  #define CEPH_FEATURE_RECONNECT_SEQ  (1<<6)
> > >  #define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
> > > +#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
> >
> > any reason why this is 18 and not 8?
>
> 9-17 are used.. just not implemented/used by the kernel code.
>

Maybe mention it somewhere in a comment, to avoid future confusion?

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2012-07-30 23:45 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-07-21  0:41 [PATCH 0/9] messenger fixups, batch #1 Sage Weil
2012-07-21  0:41 ` [PATCH 1/9] libceph: move feature bits to separate header Sage Weil
2012-07-24 22:14   ` Yehuda Sadeh
2012-07-30 18:29   ` Alex Elder
2012-07-21  0:41 ` [PATCH 2/9] libceph: support crush tunables Sage Weil
2012-07-24 22:24   ` Yehuda Sadeh
2012-07-30 23:14     ` Sage Weil
2012-07-30 23:45       ` Yehuda Sadeh
2012-07-30 18:36   ` Alex Elder
2012-07-21  0:41 ` [PATCH 3/9] libceph: report socket read/write error message Sage Weil
2012-07-24 22:26   ` Yehuda Sadeh
2012-07-30 18:37   ` Alex Elder
2012-07-21  0:41 ` [PATCH 4/9] libceph: fix mutex coverage for ceph_con_close Sage Weil
2012-07-24 22:29   ` Yehuda Sadeh
2012-07-30 18:43   ` Alex Elder
2012-07-21  0:41 ` [PATCH 5/9] libceph: resubmit linger ops when pg mapping changes Sage Weil
2012-07-24 22:51   ` Yehuda Sadeh
2012-07-30 22:40   ` Alex Elder
2012-07-30 23:03     ` Sage Weil
2012-07-21  0:41 ` [PATCH 6/9] libceph: (re)initialize bio_iter on start of message receive Sage Weil
2012-07-24 22:55   ` Yehuda Sadeh
2012-07-30 19:04   ` Alex Elder
2012-07-21  0:41 ` [PATCH 7/9] ceph: close old con before reopening on mds reconnect Sage Weil
2012-07-24 22:56   ` Yehuda Sadeh
2012-07-30 23:11     ` Sage Weil
2012-07-21  0:41 ` [PATCH 8/9] libceph: protect ceph_con_open() with mutex Sage Weil
2012-07-24 22:58   ` Yehuda Sadeh
2012-07-30 19:06   ` Alex Elder
2012-07-21  0:41 ` [PATCH 9/9] libceph: reset connection retry on successfully negotiation Sage Weil
2012-07-24 23:00   ` Yehuda Sadeh

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.