From ab60b16d3c31b9bd9fd5b39f97dc42c52a50b67d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 19 Dec 2012 15:52:36 -0600 Subject: [PATCH 1/4] libceph: move linger requests sooner in kick_requests() The kick_requests() function is called by ceph_osdc_handle_map() when an osd map change has been indicated. Its purpose is to re-queue any request whose target osd is different from what it was when it was originally sent. It is structured as two loops, one for incomplete but registered requests, and a second for handling completed linger requests. As a special case, in the first loop if a request marked to linger has not yet completed, it is moved from the request list to the linger list. This is as a quick and dirty way to have the second loop handle sending the request along with all the other linger requests. Because of the way it's done now, however, this quick and dirty solution can result in these incomplete linger requests never getting re-sent as desired. The problem lies in the fact that the second loop only arranges for a linger request to be sent if it appears its target osd has changed. This is the proper handling for *completed* linger requests (it avoids issuing the same linger request twice to the same osd). But although the linger requests added to the list in the first loop may have been sent, they have not yet completed, so they need to be re-sent regardless of whether their target osd has changed. The first required fix is we need to avoid calling __map_request() on any incomplete linger request. Otherwise the subsequent __map_request() call in the second loop will find the target osd has not changed and will therefore not re-send the request. Second, we need to be sure that a sent but incomplete linger request gets re-sent. If the target osd is the same with the new osd map as it was when the request was originally sent, this won't happen. This can be fixed through careful handling when we move these requests from the request list to the linger list, by unregistering the request *before* it is registered as a linger request. This works because a side-effect of unregistering the request is to make the request's r_osd pointer be NULL, and *that* will ensure the second loop actually re-sends the linger request. Processing of such a request is done at that point, so continue with the next one once it's been moved. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 780caf6b049..0174c04edac 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1284,6 +1284,24 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) for (p = rb_first(&osdc->requests); p; ) { req = rb_entry(p, struct ceph_osd_request, r_node); p = rb_next(p); + + /* + * For linger requests that have not yet been + * registered, move them to the linger list; they'll + * be sent to the osd in the loop below. Unregister + * the request before re-registering it as a linger + * request to ensure the __map_request() below + * will decide it needs to be sent. + */ + if (req->r_linger && list_empty(&req->r_linger_item)) { + dout("%p tid %llu restart on osd%d\n", + req, req->r_tid, + req->r_osd ? req->r_osd->o_osd : -1); + __unregister_request(osdc, req); + __register_linger_request(osdc, req); + continue; + } + err = __map_request(osdc, req, force_resend); if (err < 0) continue; /* error */ @@ -1298,17 +1316,6 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) req->r_flags |= CEPH_OSD_FLAG_RETRY; } } - if (req->r_linger && list_empty(&req->r_linger_item)) { - /* - * register as a linger so that we will - * re-submit below and get a new tid - */ - dout("%p tid %llu restart on osd%d\n", - req, req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1); - __register_linger_request(osdc, req); - __unregister_request(osdc, req); - } } list_for_each_entry_safe(req, nreq, &osdc->req_linger, @@ -1316,6 +1323,7 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); err = __map_request(osdc, req, force_resend); + dout("__map_request returned %d\n", err); if (err == 0) continue; /* no change and no osd was specified */ if (err < 0) From e6d50f67a6b1a6252a616e6e629473b5c4277218 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 26 Dec 2012 14:31:40 -0600 Subject: [PATCH 2/4] libceph: always reset osds when kicking When ceph_osdc_handle_map() is called to process a new osd map, kick_requests() is called to ensure all affected requests are updated if necessary to reflect changes in the osd map. This happens in two cases: whenever an incremental map update is processed; and when a full map update (or the last one if there is more than one) gets processed. In the former case, the kick_requests() call is followed immediately by a call to reset_changed_osds() to ensure any connections to osds affected by the map change are reset. But for full map updates this isn't done. Both cases should be doing this osd reset. Rather than duplicating the reset_changed_osds() call, move it into the end of kick_requests(). Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 0174c04edac..eb9a4447876 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1270,7 +1270,7 @@ static void reset_changed_osds(struct ceph_osd_client *osdc) * Requeue requests whose mapping to an OSD has changed. If requests map to * no osd, request a new map. * - * Caller should hold map_sem for read and request_mutex. + * Caller should hold map_sem for read. */ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) { @@ -1345,6 +1345,7 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) dout("%d requests for down osds, need new map\n", needmap); ceph_monc_request_next_osdmap(&osdc->client->monc); } + reset_changed_osds(osdc); } @@ -1401,7 +1402,6 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) osdc->osdmap = newmap; } kick_requests(osdc, 0); - reset_changed_osds(osdc); } else { dout("ignoring incremental map %u len %d\n", epoch, maplen); From 122070a2ffc91f87fe8e8493eb0ac61986c5557c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 26 Dec 2012 10:43:57 -0600 Subject: [PATCH 3/4] libceph: WARN, don't BUG on unexpected connection states A number of assertions in the ceph messenger are implemented with BUG_ON(), killing the system if connection's state doesn't match what's expected. At this point our state model is (evidently) not well understood enough for these assertions to trigger a BUG(). Convert all BUG_ON(con->state...) calls to be WARN_ON(con->state...) so we learn about these issues without killing the machine. We now recognize that a connection fault can occur due to a socket closure at any time, regardless of the state of the connection. So there is really nothing we can assert about the state of the connection at that point so eliminate that assertion. Reported-by: Ugis Tested-by: Ugis Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 4d111fd2b49..3b386674e34 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -561,7 +561,7 @@ void ceph_con_open(struct ceph_connection *con, mutex_lock(&con->mutex); dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr)); - BUG_ON(con->state != CON_STATE_CLOSED); + WARN_ON(con->state != CON_STATE_CLOSED); con->state = CON_STATE_PREOPEN; con->peer_name.type = (__u8) entity_type; @@ -1509,7 +1509,7 @@ static int process_banner(struct ceph_connection *con) static void fail_protocol(struct ceph_connection *con) { reset_connection(con); - BUG_ON(con->state != CON_STATE_NEGOTIATING); + WARN_ON(con->state != CON_STATE_NEGOTIATING); con->state = CON_STATE_CLOSED; } @@ -1635,7 +1635,7 @@ static int process_connect(struct ceph_connection *con) return -1; } - BUG_ON(con->state != CON_STATE_NEGOTIATING); + WARN_ON(con->state != CON_STATE_NEGOTIATING); con->state = CON_STATE_OPEN; con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); @@ -2132,7 +2132,6 @@ more: if (ret < 0) goto out; - BUG_ON(con->state != CON_STATE_CONNECTING); con->state = CON_STATE_NEGOTIATING; /* @@ -2160,7 +2159,7 @@ more: goto more; } - BUG_ON(con->state != CON_STATE_OPEN); + WARN_ON(con->state != CON_STATE_OPEN); if (con->in_base_pos < 0) { /* @@ -2382,7 +2381,7 @@ static void ceph_fault(struct ceph_connection *con) dout("fault %p state %lu to peer %s\n", con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); - BUG_ON(con->state != CON_STATE_CONNECTING && + WARN_ON(con->state != CON_STATE_CONNECTING && con->state != CON_STATE_NEGOTIATING && con->state != CON_STATE_OPEN); From 0fa6ebc600bc8e830551aee47a0e929e818a1868 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 27 Dec 2012 20:27:04 -0600 Subject: [PATCH 4/4] libceph: fix protocol feature mismatch failure path We should not set con->state to CLOSED here; that happens in ceph_fault() in the caller, where it first asserts that the state is not yet CLOSED. Avoids a BUG when the features don't match. Since the fail_protocol() has become a trivial wrapper, replace calls to it with direct calls to reset_connection(). Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- net/ceph/messenger.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 3b386674e34..5ccf87ed8d6 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -506,6 +506,7 @@ static void reset_connection(struct ceph_connection *con) { /* reset connection, out_queue, msg_ and connect_seq */ /* discard existing out_queue and msg_seq */ + dout("reset_connection %p\n", con); ceph_msg_remove_list(&con->out_queue); ceph_msg_remove_list(&con->out_sent); @@ -1506,13 +1507,6 @@ static int process_banner(struct ceph_connection *con) return 0; } -static void fail_protocol(struct ceph_connection *con) -{ - reset_connection(con); - WARN_ON(con->state != CON_STATE_NEGOTIATING); - con->state = CON_STATE_CLOSED; -} - static int process_connect(struct ceph_connection *con) { u64 sup_feat = con->msgr->supported_features; @@ -1530,7 +1524,7 @@ static int process_connect(struct ceph_connection *con) ceph_pr_addr(&con->peer_addr.in_addr), sup_feat, server_feat, server_feat & ~sup_feat); con->error_msg = "missing required protocol features"; - fail_protocol(con); + reset_connection(con); return -1; case CEPH_MSGR_TAG_BADPROTOVER: @@ -1541,7 +1535,7 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->out_connect.protocol_version), le32_to_cpu(con->in_reply.protocol_version)); con->error_msg = "protocol version mismatch"; - fail_protocol(con); + reset_connection(con); return -1; case CEPH_MSGR_TAG_BADAUTHORIZER: @@ -1631,7 +1625,7 @@ static int process_connect(struct ceph_connection *con) ceph_pr_addr(&con->peer_addr.in_addr), req_feat, server_feat, req_feat & ~server_feat); con->error_msg = "missing required protocol features"; - fail_protocol(con); + reset_connection(con); return -1; }