From 8618e30bc14b06bfafa0f164cca7b0e06451f88a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 8 Oct 2012 20:37:30 -0700 Subject: [PATCH 01/68] rbd: let con_work() handle backoff Both ceph_fault() and con_work() include handling for imposing a delay before doing further processing on a faulted connection. The latter is used only if ceph_fault() is unable to. Instead, just let con_work() always be responsible for implementing the delay. After setting up the delay value, set the BACKOFF flag on the connection unconditionally and call queue_con() to ensure con_work() will get called to handle it. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index cad0d17ec45..973c16c20c4 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2398,24 +2398,8 @@ static void ceph_fault(struct ceph_connection *con) con->delay = BASE_DELAY_INTERVAL; else if (con->delay < MAX_DELAY_INTERVAL) con->delay *= 2; - con->ops->get(con); - if (queue_delayed_work(ceph_msgr_wq, &con->work, - round_jiffies_relative(con->delay))) { - dout("fault queued %p delay %lu\n", con, con->delay); - } else { - con->ops->put(con); - dout("fault failed to queue %p delay %lu, backoff\n", - con, con->delay); - /* - * In many cases we see a socket state change - * while con_work is running and end up - * queuing (non-delayed) work, such that we - * can't backoff with a delay. Set a flag so - * that when con_work restarts we schedule the - * delay then. - */ - set_bit(CON_FLAG_BACKOFF, &con->flags); - } + set_bit(CON_FLAG_BACKOFF, &con->flags); + queue_con(con); } out_unlock: From 802c6d967fbdcd2cbc91b917425661bb8bbfaade Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 8 Oct 2012 20:37:30 -0700 Subject: [PATCH 02/68] rbd: define common queue_con_delay() This patch defines a single function, queue_con_delay() to call queue_delayed_work() for a connection. It basically generalizes what was previously queue_con() by adding the delay argument. queue_con() is now a simple helper that passes 0 for its delay. queue_con_delay() returns 0 if it queued work or an errno if it did not for some reason. If con_work() finds the BACKOFF flag set for a connection, it now calls queue_con_delay() to handle arranging to start again after a delay. Note about connection reference counts: con_work() only ever gets called as a work item function. At the time that work is scheduled, a reference to the connection is acquired, and the corresponding con_work() call is then responsible for dropping that reference before it returns. Previously, the backoff handling inside con_work() silently handed off its reference to delayed work it scheduled. Now that queue_con_delay() is used, a new reference is acquired for the newly-scheduled work, and the original reference is dropped by the con->ops->put() call at the end of the function. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 973c16c20c4..66f6f56bcb2 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2244,22 +2244,33 @@ bad_tag: /* - * Atomically queue work on a connection. Bump @con reference to - * avoid races with connection teardown. + * Atomically queue work on a connection after the specified delay. + * Bump @con reference to avoid races with connection teardown. + * Returns 0 if work was queued, or an error code otherwise. */ -static void queue_con(struct ceph_connection *con) +static int queue_con_delay(struct ceph_connection *con, unsigned long delay) { if (!con->ops->get(con)) { - dout("queue_con %p ref count 0\n", con); - return; + dout("%s %p ref count 0\n", __func__, con); + + return -ENOENT; } - if (!queue_delayed_work(ceph_msgr_wq, &con->work, 0)) { - dout("queue_con %p - already queued\n", con); + if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) { + dout("%s %p - already queued\n", __func__, con); con->ops->put(con); - } else { - dout("queue_con %p\n", con); + + return -EBUSY; } + + dout("%s %p %lu\n", __func__, con, delay); + + return 0; +} + +static void queue_con(struct ceph_connection *con) +{ + (void) queue_con_delay(con, 0); } /* @@ -2294,14 +2305,11 @@ restart: if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) { dout("con_work %p backing off\n", con); - if (queue_delayed_work(ceph_msgr_wq, &con->work, - round_jiffies_relative(con->delay))) { - dout("con_work %p backoff %lu\n", con, con->delay); - mutex_unlock(&con->mutex); - return; - } else { + ret = queue_con_delay(con, round_jiffies_relative(con->delay)); + if (ret) { dout("con_work %p FAILED to back off %lu\n", con, con->delay); + BUG_ON(ret == -ENOENT); set_bit(CON_FLAG_BACKOFF, &con->flags); } goto done; From 9478554ae5d21d65e948a3eff4ee2a8ad30d70e9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 9 Oct 2012 13:50:17 -0700 Subject: [PATCH 03/68] rbd: define rbd_update_mapping_size() Encapsulate the code that handles updating the size of a mapping after an rbd image has been refreshed. This is done in anticipation of the next patch, which will make this common code for format 1 and 2 images. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index bb3d9be3b1b..b64125d1d7b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1716,6 +1716,19 @@ static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev) __rbd_remove_snap_dev(snap); } +static void rbd_update_mapping_size(struct rbd_device *rbd_dev) +{ + sector_t size; + + if (rbd_dev->mapping.snap_id != CEPH_NOSNAP) + return; + + size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE; + dout("setting size to %llu sectors", (unsigned long long) size); + rbd_dev->mapping.size = (u64) size; + set_capacity(rbd_dev->disk, size); +} + /* * only read the first part of the ondisk header, without the snaps info */ @@ -1730,17 +1743,9 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) down_write(&rbd_dev->header_rwsem); - /* resized? */ - if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) { - sector_t size = (sector_t) h.image_size / SECTOR_SIZE; - - if (size != (sector_t) rbd_dev->mapping.size) { - dout("setting size to %llu sectors", - (unsigned long long) size); - rbd_dev->mapping.size = (u64) size; - set_capacity(rbd_dev->disk, size); - } - } + /* Update image size, and check for resize of mapped image */ + rbd_dev->header.image_size = h.image_size; + rbd_update_mapping_size(rbd_dev); /* rbd_dev->header.object_prefix shouldn't change */ kfree(rbd_dev->header.snap_sizes); From 117973fb4c91f3fd913127577e9f71b3aa6cb556 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 31 Aug 2012 17:29:55 -0500 Subject: [PATCH 04/68] rbd: define rbd_dev_v2_refresh() Define a new function rbd_dev_v2_refresh() to update/refresh the snapshot context for a format version 2 rbd image. This function will update anything that is not fixed for the life of an rbd image--at the moment this is mainly the snapshot context and (for a base mapping) the size. Update rbd_refresh_header() so it selects which function to use based on the image format. Rename __rbd_refresh_header() to be rbd_dev_v1_refresh() to be consistent with the naming of its version 2 counterpart. Similarly rename rbd_refresh_header() to be rbd_dev_refresh(). Unrelated--we use rbd_image_format_valid() here. Delete the other use of it, which was primarily put in place to ensure that function was referenced at the time it was defined. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 55 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 8 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b64125d1d7b..f11b839166e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -268,7 +268,8 @@ static void rbd_put_dev(struct rbd_device *rbd_dev) put_device(&rbd_dev->dev); } -static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver); +static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver); +static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); static int rbd_open(struct block_device *bdev, fmode_t mode) { @@ -1304,7 +1305,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n", rbd_dev->header_name, (unsigned long long) notify_id, (unsigned int) opcode); - rc = rbd_refresh_header(rbd_dev, &hver); + rc = rbd_dev_refresh(rbd_dev, &hver); if (rc) pr_warning(RBD_DRV_NAME "%d got notification but failed to " " update snaps: %d\n", rbd_dev->major, rc); @@ -1732,7 +1733,7 @@ static void rbd_update_mapping_size(struct rbd_device *rbd_dev) /* * only read the first part of the ondisk header, without the snaps info */ -static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) +static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) { int ret; struct rbd_image_header h; @@ -1773,12 +1774,16 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) return ret; } -static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) +static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver) { int ret; + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - ret = __rbd_refresh_header(rbd_dev, hver); + if (rbd_dev->image_format == 1) + ret = rbd_dev_v1_refresh(rbd_dev, hver); + else + ret = rbd_dev_v2_refresh(rbd_dev, hver); mutex_unlock(&ctl_mutex); return ret; @@ -1938,7 +1943,7 @@ static ssize_t rbd_image_refresh(struct device *dev, struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); int ret; - ret = rbd_refresh_header(rbd_dev, NULL); + ret = rbd_dev_refresh(rbd_dev, NULL); return ret < 0 ? ret : size; } @@ -2402,6 +2407,41 @@ static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which, return ERR_PTR(-EINVAL); } +static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver) +{ + int ret; + __u8 obj_order; + + down_write(&rbd_dev->header_rwsem); + + /* Grab old order first, to see if it changes */ + + obj_order = rbd_dev->header.obj_order, + ret = rbd_dev_v2_image_size(rbd_dev); + if (ret) + goto out; + if (rbd_dev->header.obj_order != obj_order) { + ret = -EIO; + goto out; + } + rbd_update_mapping_size(rbd_dev); + + ret = rbd_dev_v2_snap_context(rbd_dev, hver); + dout("rbd_dev_v2_snap_context returned %d\n", ret); + if (ret) + goto out; + ret = rbd_dev_snaps_update(rbd_dev); + dout("rbd_dev_snaps_update returned %d\n", ret); + if (ret) + goto out; + ret = rbd_dev_snaps_register(rbd_dev); + dout("rbd_dev_snaps_register returned %d\n", ret); +out: + up_write(&rbd_dev->header_rwsem); + + return ret; +} + /* * Scan the rbd device's current snapshot list and compare it to the * newly-received snapshot context. Remove any existing snapshots @@ -2564,7 +2604,7 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev) do { ret = rbd_req_sync_watch(rbd_dev); if (ret == -ERANGE) { - rc = rbd_refresh_header(rbd_dev, NULL); + rc = rbd_dev_refresh(rbd_dev, NULL); if (rc < 0) return rc; } @@ -3045,7 +3085,6 @@ static ssize_t rbd_add(struct bus_type *bus, rc = rbd_dev_probe(rbd_dev); if (rc < 0) goto err_out_client; - rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); /* no need to lock here, as rbd_dev is not registered yet */ rc = rbd_dev_snaps_update(rbd_dev); From d889140c4a1c5edb6a7bd90392b9d878bfaccfb6 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 9 Oct 2012 13:50:17 -0700 Subject: [PATCH 05/68] rbd: implement feature checks Version 2 images have two sets of feature bit fields. The first indicates features possibly used by the image. The second indicates features that the client *must* support in order to use the image. When an image (or snapshot) is first examined, we need to make sure that the local implementation supports the image's required features. If not, fail the probe for the image. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index f11b839166e..0f260a6e97c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -70,6 +70,14 @@ #define RBD_IMAGE_ID_LEN_MAX 64 #define RBD_OBJ_PREFIX_LEN_MAX 64 +/* Feature bits */ + +#define RBD_FEATURE_LAYERING 1 + +/* Features supported by this (client software) implementation. */ + +#define RBD_FEATURES_ALL (0) + /* * An RBD device name will be "rbd#", where the "rbd" comes from * RBD_DRV_NAME above, and # is a unique integer identifier. @@ -2226,6 +2234,7 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, __le64 features; __le64 incompat; } features_buf = { 0 }; + u64 incompat; int ret; ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, @@ -2236,6 +2245,11 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); if (ret < 0) return ret; + + incompat = le64_to_cpu(features_buf.incompat); + if (incompat & ~RBD_FEATURES_ALL) + return -ENOTSUPP; + *snap_features = le64_to_cpu(features_buf.features); dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", @@ -2977,7 +2991,7 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) if (ret < 0) goto out_err; - /* Get the features for the image */ + /* Get the and check features for the image */ ret = rbd_dev_v2_features(rbd_dev); if (ret < 0) From 35152979e6181b1fbb4b61c3ff641c14df53ad66 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 31 Aug 2012 17:29:55 -0500 Subject: [PATCH 06/68] rbd: activate v2 image support Now that v2 images support is fully implemented, have rbd_dev_v2_probe() return 0 to indicate a successful probe. (Note that an image that implements layering will fail the probe early because of the feature chekc.) Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0f260a6e97c..8f56d37637a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3014,7 +3014,7 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) dout("discovered version 2 image, header name is %s\n", rbd_dev->header_name); - return -ENOTSUPP; + return 0; out_err: kfree(rbd_dev->header_name); rbd_dev->header_name = NULL; From 0f9831a89310cebba52d3f526e6cc5c2e403e6f1 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 18 Oct 2012 14:01:43 -0700 Subject: [PATCH 07/68] ceph: fix dentry reference leak in encode_fh() Call to d_find_alias() needs a corresponding dput() This fixes http://tracker.newdream.net/issues/3271 Signed-off-by: David Zafman Reviewed-by: Sage Weil --- fs/ceph/export.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 8e1b60e557b..862887004d2 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -90,6 +90,8 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, *max_len = handle_length; type = 255; } + if (dentry) + dput(dentry); return type; } From 7246240c7c186542f73af4fadc744d66440f616f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 25 Oct 2012 08:49:41 -0700 Subject: [PATCH 08/68] libceph: avoid NULL kref_put from NULL alloc_msg return The ceph_on_in_msg_alloc() method calls the ->alloc_msg() helper which may return NULL. It also drops con->mutex while it allocates a message, which means that the connection state may change (e.g., get closed). If that happens, we clean up and bail out. Avoid calling ceph_msg_put() on a NULL return value and triggering a crash. This was observed when an ->alloc_msg() call races with a timeout that resends a zillion messages and resets the connection, and ->alloc_msg() returns NULL (because the request was resent to another target). Fixes http://tracker.newdream.net/issues/3342 Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- net/ceph/messenger.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 66f6f56bcb2..1041114453d 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2742,7 +2742,8 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) msg = con->ops->alloc_msg(con, hdr, skip); mutex_lock(&con->mutex); if (con->state != CON_STATE_OPEN) { - ceph_msg_put(msg); + if (msg) + ceph_msg_put(msg); return -EAGAIN; } con->in_msg = msg; From b000056a5a8d3f5a4a9fb80184a7ec14f86a43d4 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 25 Oct 2012 10:23:46 -0700 Subject: [PATCH 09/68] ceph: Fix NULL ptr crash in strlen() set_request_path_attr() checks for NULL ptr before calling strlen() This fixes http://tracker.newdream.net/issues/3404 Signed-off-by: David Zafman Reviewed-by: Sage Weil --- fs/ceph/mds_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 1bcf712655d..62d2342eb26 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1590,7 +1590,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, } else if (rpath || rino) { *ino = rino; *ppath = rpath; - *pathlen = strlen(rpath); + *pathlen = rpath ? strlen(rpath) : 0; dout(" path %.*s\n", *pathlen, rpath); } From b213e0b1a62637b2a9395a34349b13d73ca2b90a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 10 Oct 2012 21:19:13 -0700 Subject: [PATCH 10/68] rbd: fix bug in rbd_dev_id_put() In rbd_dev_id_put(), there's a loop that's intended to determine the maximum device id in use. But it isn't doing that at all, the effect of how it's written is to simply use the just-put id number, which ignores whole purpose of this function. Fix the bug. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8f56d37637a..4a16464ad5b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2680,8 +2680,8 @@ static void rbd_dev_id_put(struct rbd_device *rbd_dev) struct rbd_device *rbd_dev; rbd_dev = list_entry(tmp, struct rbd_device, node); - if (rbd_id > max_id) - max_id = rbd_id; + if (rbd_dev->dev_id > max_id) + max_id = rbd_dev->dev_id; } spin_unlock(&rbd_dev_list_lock); From a0ea3a40fd20b8c66381f747c454f89d6d1f50d4 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 10 Oct 2012 21:19:13 -0700 Subject: [PATCH 11/68] rbd: zero return code in rbd_dev_image_id() When rbd_dev_probe() calls rbd_dev_image_id() it expects to get a 0 return code if successful, but it is getting a positive value. The reason is that rbd_dev_image_id() returns the value it gets from rbd_req_sync_exec(), which returns the number of bytes read in as a result of the request. (This ultimately comes from ceph_copy_from_page_vector() in rbd_req_sync_op()). Force the return value to 0 when successful in rbd_dev_image_id(). Do the same in rbd_dev_v2_object_prefix(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin Reviewed-by: Dan Mick --- drivers/block/rbd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4a16464ad5b..94d613208c4 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2207,6 +2207,7 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); if (ret < 0) goto out; + ret = 0; /* rbd_req_sync_exec() can return positive */ p = reply_buf; rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, @@ -2900,6 +2901,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); if (ret < 0) goto out; + ret = 0; /* rbd_req_sync_exec() can return positive */ p = response; rbd_dev->image_id = ceph_extract_encoded_string(&p, From be466c1cc36621590ef17b05a6d342dfd33f7280 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 22 Oct 2012 11:31:26 -0500 Subject: [PATCH 12/68] rbd: fix read-only option name The name of the "read-only" mapping option was inadvertently changed in this commit: f84344f3 rbd: separate mapping info in rbd_dev Revert that hunk to return it to what it should be. Signed-off-by: Alex Elder Reviewed-by: Dan Mick Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 94d613208c4..5d9e2cc5c51 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -397,7 +397,7 @@ enum { static match_table_t rbd_opts_tokens = { /* int args above */ /* string args above */ - {Opt_read_only, "mapping.read_only"}, + {Opt_read_only, "read_only"}, {Opt_read_only, "ro"}, /* Alternate spelling */ {Opt_read_write, "read_write"}, {Opt_read_write, "rw"}, /* Alternate spelling */ From 13f4042c05b6a1a638ccab3f0cabdb84993803a2 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 10 Oct 2012 18:59:29 -0700 Subject: [PATCH 13/68] rbd: kill rbd_req_{read,write}() Both rbd_req_read() and rbd_req_write() are simple wrapper routines for rbd_do_op(), and each is only called once. Replace each wrapper call with a direct call to rbd_do_op(), and get rid of the wrapper functions. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 60 ++++++++++----------------------------------- 1 file changed, 13 insertions(+), 47 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 5d9e2cc5c51..8f2c39ad3bb 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1210,41 +1210,6 @@ done: return ret; } -/* - * Request async osd write - */ -static int rbd_req_write(struct request *rq, - struct rbd_device *rbd_dev, - struct ceph_snap_context *snapc, - u64 ofs, u64 len, - struct bio *bio, - struct rbd_req_coll *coll, - int coll_index) -{ - return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP, - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - ofs, len, bio, coll, coll_index); -} - -/* - * Request async osd read - */ -static int rbd_req_read(struct request *rq, - struct rbd_device *rbd_dev, - u64 snapid, - u64 ofs, u64 len, - struct bio *bio, - struct rbd_req_coll *coll, - int coll_index) -{ - return rbd_do_op(rq, rbd_dev, NULL, - snapid, - CEPH_OSD_OP_READ, - CEPH_OSD_FLAG_READ, - ofs, len, bio, coll, coll_index); -} - /* * Request sync osd read */ @@ -1550,21 +1515,22 @@ static void rbd_rq_fn(struct request_queue *q) goto next_seg; } - /* init OSD command: write or read */ if (do_write) - rbd_req_write(rq, rbd_dev, - snapc, - ofs, - op_size, bio, - coll, cur_seg); + (void) rbd_do_op(rq, rbd_dev, + snapc, CEPH_NOSNAP, + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ONDISK, + ofs, op_size, bio, + coll, cur_seg); else - rbd_req_read(rq, rbd_dev, - rbd_dev->mapping.snap_id, - ofs, - op_size, bio, - coll, cur_seg); - + (void) rbd_do_op(rq, rbd_dev, + NULL, rbd_dev->mapping.snap_id, + CEPH_OSD_OP_READ, + CEPH_OSD_FLAG_READ, + ofs, op_size, bio, + coll, cur_seg); next_seg: size -= op_size; ofs += op_size; From ff2e4bb5b32f89c455979a4222a9b78007cde254 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 10 Oct 2012 18:59:29 -0700 Subject: [PATCH 14/68] rbd: drop rbd_do_op() opcode and flags The only callers of rbd_do_op() are in rbd_rq_fn(), where call one is used for writes and the other used for reads. The request passed to rbd_do_op() already encodes the I/O direction, and that information can be used inside the function to set the opcode and flags value (rather than passing them in as arguments). So get rid of the opcode and flags arguments to rbd_do_op(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8f2c39ad3bb..a29c6d2a49a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1164,7 +1164,6 @@ static int rbd_do_op(struct request *rq, struct rbd_device *rbd_dev, struct ceph_snap_context *snapc, u64 snapid, - int opcode, int flags, u64 ofs, u64 len, struct bio *bio, struct rbd_req_coll *coll, @@ -1176,6 +1175,8 @@ static int rbd_do_op(struct request *rq, int ret; struct ceph_osd_req_op *ops; u32 payload_len; + int opcode; + int flags; seg_name = rbd_segment_name(rbd_dev, ofs); if (!seg_name) @@ -1183,7 +1184,15 @@ static int rbd_do_op(struct request *rq, seg_len = rbd_segment_length(rbd_dev, ofs, len); seg_ofs = rbd_segment_offset(rbd_dev, ofs); - payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0); + if (rq_data_dir(rq) == WRITE) { + opcode = CEPH_OSD_OP_WRITE; + flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK; + payload_len = seg_len; + } else { + opcode = CEPH_OSD_OP_READ; + flags = CEPH_OSD_FLAG_READ; + payload_len = 0; + } ret = -ENOMEM; ops = rbd_create_rw_ops(1, opcode, payload_len); @@ -1519,16 +1528,11 @@ static void rbd_rq_fn(struct request_queue *q) if (do_write) (void) rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP, - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ONDISK, ofs, op_size, bio, coll, cur_seg); else (void) rbd_do_op(rq, rbd_dev, NULL, rbd_dev->mapping.snap_id, - CEPH_OSD_OP_READ, - CEPH_OSD_FLAG_READ, ofs, op_size, bio, coll, cur_seg); next_seg: From 4634246db8cb2e5117ef7c682efcc383fa3354f8 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 10 Oct 2012 18:59:29 -0700 Subject: [PATCH 15/68] rbd: consolidate rbd_do_op() calls The two calls to rbd_do_op() from rbd_rq_fn() differ only in the value passed for the snapshot id and the snapshot context. For reads the snapshot always comes from the mapping, and for writes the snapshot id is always CEPH_NOSNAP. The snapshot context is always null for reads. For writes, the snapshot context always comes from the rbd header, but it is acquired under protection of header semaphore and could change thereafter, so we can't simply use what's available inside rbd_do_op(). Eliminate the snapid parameter from rbd_do_op(), and set it based on the I/O direction inside that function instead. Always pass the snapshot context acquired in the caller, but reset it to a null pointer inside rbd_do_op() if the operation is a read. As a result, there is no difference in the read and write calls to rbd_do_op() made in rbd_rq_fn(), so just call it unconditionally. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a29c6d2a49a..d0328835bbd 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1163,7 +1163,6 @@ done: static int rbd_do_op(struct request *rq, struct rbd_device *rbd_dev, struct ceph_snap_context *snapc, - u64 snapid, u64 ofs, u64 len, struct bio *bio, struct rbd_req_coll *coll, @@ -1177,6 +1176,7 @@ static int rbd_do_op(struct request *rq, u32 payload_len; int opcode; int flags; + u64 snapid; seg_name = rbd_segment_name(rbd_dev, ofs); if (!seg_name) @@ -1187,10 +1187,13 @@ static int rbd_do_op(struct request *rq, if (rq_data_dir(rq) == WRITE) { opcode = CEPH_OSD_OP_WRITE; flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK; + snapid = CEPH_NOSNAP; payload_len = seg_len; } else { opcode = CEPH_OSD_OP_READ; flags = CEPH_OSD_FLAG_READ; + snapc = NULL; + snapid = rbd_dev->mapping.snap_id; payload_len = 0; } @@ -1518,24 +1521,13 @@ static void rbd_rq_fn(struct request_queue *q) kref_get(&coll->kref); bio = bio_chain_clone(&rq_bio, &next_bio, &bp, op_size, GFP_ATOMIC); - if (!bio) { + if (bio) + (void) rbd_do_op(rq, rbd_dev, snapc, + ofs, op_size, + bio, coll, cur_seg); + else rbd_coll_end_req_index(rq, coll, cur_seg, -ENOMEM, op_size); - goto next_seg; - } - - /* init OSD command: write or read */ - if (do_write) - (void) rbd_do_op(rq, rbd_dev, - snapc, CEPH_NOSNAP, - ofs, op_size, bio, - coll, cur_seg); - else - (void) rbd_do_op(rq, rbd_dev, - NULL, rbd_dev->mapping.snap_id, - ofs, op_size, bio, - coll, cur_seg); -next_seg: size -= op_size; ofs += op_size; From db2388b6ee40a949084e4cdddc3b0a4357068a62 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sat, 20 Oct 2012 22:17:27 -0500 Subject: [PATCH 16/68] rbd: verify rbd image order value This adds a verification that an rbd image's object order is within the upper and lower bounds supported by this implementation. It must be at least 9 (SECTOR_SHIFT), because the Linux bio system assumes that minimum granularity. It also must be less than 32 (at the moment anyway) because there exist spots in the code that store the size of a "segment" (object backing an rbd image) in a signed int variable, which can be 32 bits including the sign. We should be able to relax this limit once we've verified the code uses 64-bit types where needed. Note that the CLI tool already limits the order to the range 12-25. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d0328835bbd..4734446c3b5 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -533,6 +533,16 @@ static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) return false; + /* The bio layer requires at least sector-sized I/O */ + + if (ondisk->options.order < SECTOR_SHIFT) + return false; + + /* If we use u64 in a few spots we may be able to loosen this */ + + if (ondisk->options.order > 8 * sizeof (int) - 1) + return false; + /* * The size of a snapshot header has to fit in a size_t, and * that limits the number of snapshots. From d4b125e9eb43babd14538ba61718e3db71a98d29 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 3 Jul 2012 16:01:19 -0500 Subject: [PATCH 17/68] rbd: increase maximum snapshot name length Change RBD_MAX_SNAP_NAME_LEN to be based on NAME_MAX. That is a practical limit for the length of a snapshot name (based on the presence of a directory using the name under /sys/bus/rbd to represent the snapshot). The /sys entry is created by prefixing it with "snap_"; define that prefix symbolically, and take its length into account in defining the snapshot name length limit. Enforce the limit in rbd_add_parse_args(). Also delete a dout() call in that function that was not meant to be committed. Signed-off-by: Alex Elder Reviewed-by: Dan Mick Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4734446c3b5..4858d925b95 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -61,7 +61,10 @@ #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ -#define RBD_MAX_SNAP_NAME_LEN 32 +#define RBD_SNAP_DEV_NAME_PREFIX "snap_" +#define RBD_MAX_SNAP_NAME_LEN \ + (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) + #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ #define RBD_MAX_OPT_LEN 1024 @@ -2063,7 +2066,7 @@ static int rbd_register_snap_dev(struct rbd_snap *snap, dev->type = &rbd_snap_device_type; dev->parent = parent; dev->release = rbd_snap_dev_release; - dev_set_name(dev, "snap_%s", snap->name); + dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name); dout("%s: registering device for snapshot %s\n", __func__, snap->name); ret = device_register(dev); @@ -2797,8 +2800,13 @@ static char *rbd_add_parse_args(struct rbd_device *rbd_dev, if (!rbd_dev->image_name) goto out_err; - /* Snapshot name is optional */ + /* Snapshot name is optional; default is to use "head" */ + len = next_token(&buf); + if (len > RBD_MAX_SNAP_NAME_LEN) { + err_ptr = ERR_PTR(-ENAMETOOLONG); + goto out_err; + } if (!len) { buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ len = sizeof (RBD_SNAP_HEAD_NAME) - 1; @@ -2809,8 +2817,6 @@ static char *rbd_add_parse_args(struct rbd_device *rbd_dev, memcpy(snap_name, buf, len); *(snap_name + len) = '\0'; -dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len); - return snap_name; out_err: From e5cfeed281a842a37c9da84bad2911c9b470347e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sat, 20 Oct 2012 22:17:27 -0500 Subject: [PATCH 18/68] rbd: simplify rbd_merge_bvec() The aim of this patch is to make what's going on rbd_merge_bvec() a bit more obvious than it was before. This was an issue when a recent btrfs bug led us to question whether the merge function was working correctly. Use "obj" rather than "chunk" to indicate the units whose boundaries we care about we call (rados) "objects". Signed-off-by: Alex Elder Reviewed-by: Dan Mick --- drivers/block/rbd.c | 47 +++++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4858d925b95..76fbfa12006 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1566,22 +1566,41 @@ static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, struct bio_vec *bvec) { struct rbd_device *rbd_dev = q->queuedata; - unsigned int chunk_sectors; - sector_t sector; - unsigned int bio_sectors; - int max; + sector_t sector_offset; + sector_t sectors_per_obj; + sector_t obj_sector_offset; + int ret; - chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); - sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev); - bio_sectors = bmd->bi_size >> SECTOR_SHIFT; + /* + * Find how far into its rbd object the partition-relative + * bio start sector is to offset relative to the enclosing + * device. + */ + sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector; + sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); + obj_sector_offset = sector_offset & (sectors_per_obj - 1); - max = (chunk_sectors - ((sector & (chunk_sectors - 1)) - + bio_sectors)) << SECTOR_SHIFT; - if (max < 0) - max = 0; /* bio_add cannot handle a negative return */ - if (max <= bvec->bv_len && bio_sectors == 0) - return bvec->bv_len; - return max; + /* + * Compute the number of bytes from that offset to the end + * of the object. Account for what's already used by the bio. + */ + ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT; + if (ret > bmd->bi_size) + ret -= bmd->bi_size; + else + ret = 0; + + /* + * Don't send back more than was asked for. And if the bio + * was empty, let the whole thing through because: "Note + * that a block device *must* allow a single page to be + * added to an empty bio." + */ + rbd_assert(bvec->bv_len <= PAGE_SIZE); + if (ret > (int) bvec->bv_len || !bmd->bi_size) + ret = (int) bvec->bv_len; + + return ret; } static void rbd_free_disk(struct rbd_device *rbd_dev) From 069a4b5690a952e74157fd489833c71c73f012b3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 22 Oct 2012 11:31:27 -0500 Subject: [PATCH 19/68] rbd: kill rbd_device->rbd_opts The rbd_device structure has an embedded rbd_options structure. Such a structure is needed to work with the generic ceph argument parsing code, but there's no need to keep it around once argument parsing is done. Use a local variable to hold the rbd options used in parsing in rbd_get_client(), and just transfer its content (it's just a read_only flag) into the field in the rbd_mapping sub-structure that requires that information. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin Reviewed-by: Dan Mick --- drivers/block/rbd.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 76fbfa12006..c800047f583 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -184,7 +184,6 @@ struct rbd_device { struct gendisk *disk; /* blkdev's gendisk and rq */ u32 image_format; /* Either 1 or 2 */ - struct rbd_options rbd_opts; struct rbd_client *rbd_client; char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ @@ -456,18 +455,24 @@ static int parse_rbd_opts_token(char *c, void *private) static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, size_t mon_addr_len, char *options) { - struct rbd_options *rbd_opts = &rbd_dev->rbd_opts; + struct rbd_options rbd_opts; struct ceph_options *ceph_opts; struct rbd_client *rbdc; - rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; + /* Initialize all rbd options to the defaults */ + + rbd_opts.read_only = RBD_READ_ONLY_DEFAULT; ceph_opts = ceph_parse_options(options, mon_addr, mon_addr + mon_addr_len, - parse_rbd_opts_token, rbd_opts); + parse_rbd_opts_token, &rbd_opts); if (IS_ERR(ceph_opts)) return PTR_ERR(ceph_opts); + /* Record the parsed rbd options */ + + rbd_dev->mapping.read_only = rbd_opts.read_only; + rbdc = rbd_client_find(ceph_opts); if (rbdc) { /* using an existing client */ @@ -685,7 +690,6 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name) rbd_dev->mapping.size = rbd_dev->header.image_size; rbd_dev->mapping.features = rbd_dev->header.features; rbd_dev->mapping.snap_exists = false; - rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only; ret = 0; } else { ret = snap_by_name(rbd_dev, snap_name); From 0ed7285e0001b960c888e5455ae982025210ed3d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 29 Oct 2012 11:01:42 -0700 Subject: [PATCH 20/68] libceph: fix osdmap decode error paths Ensure that we set the err value correctly so that we do not pass a 0 value to ERR_PTR and confuse the calling code. (In particular, osd_client.c handle_map() will BUG(!newmap)). Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 5433fb0eb3c..f552aa48fd9 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -645,10 +645,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) ceph_decode_32_safe(p, end, max, bad); while (max--) { ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); + err = -ENOMEM; pi = kzalloc(sizeof(*pi), GFP_NOFS); if (!pi) goto bad; pi->id = ceph_decode_32(p); + err = -EINVAL; ev = ceph_decode_8(p); /* encoding version */ if (ev > CEPH_PG_POOL_VERSION) { pr_warning("got unknown v %d > %d of ceph_pg_pool\n", @@ -664,8 +666,13 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) __insert_pg_pool(&map->pg_pools, pi); } - if (version >= 5 && __decode_pool_names(p, end, map) < 0) - goto bad; + if (version >= 5) { + err = __decode_pool_names(p, end, map); + if (err < 0) { + dout("fail to decode pool names"); + goto bad; + } + } ceph_decode_32_safe(p, end, map->pool_max, bad); @@ -745,7 +752,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) return map; bad: - dout("osdmap_decode fail\n"); + dout("osdmap_decode fail err %d\n", err); ceph_osdmap_destroy(map); return ERR_PTR(err); } @@ -839,6 +846,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (ev > CEPH_PG_POOL_VERSION) { pr_warning("got unknown v %d > %d of ceph_pg_pool\n", ev, CEPH_PG_POOL_VERSION); + err = -EINVAL; goto bad; } pi = __lookup_pg_pool(&map->pg_pools, pool); @@ -855,8 +863,11 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (err < 0) goto bad; } - if (version >= 5 && __decode_pool_names(p, end, map) < 0) - goto bad; + if (version >= 5) { + err = __decode_pool_names(p, end, map); + if (err < 0) + goto bad; + } /* old_pool */ ceph_decode_32_safe(p, end, len, bad); @@ -932,15 +943,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, (void) __remove_pg_mapping(&map->pg_temp, pgid); /* insert */ - if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) { - err = -EINVAL; + err = -EINVAL; + if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) goto bad; - } + err = -ENOMEM; pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS); - if (!pg) { - err = -ENOMEM; + if (!pg) goto bad; - } pg->pgid = pgid; pg->len = pglen; for (j = 0; j < pglen; j++) From f7760dad286829682a8d36f4563ab20a65732414 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sat, 20 Oct 2012 22:17:27 -0500 Subject: [PATCH 21/68] rbd: simplify rbd_rq_fn() When processing a request, rbd_rq_fn() makes clones of the bio's in the request's bio chain and submits the results to osd's to be satisfied. If a request bio straddles the boundary between objects backing the rbd image, it must be represented by two cloned bio's, one for the first part (at the end of one object) and one for the second (at the beginning of the next object). This has been handled by a function bio_chain_clone(), which includes an interface only a mother could love, and which has been found to have other problems. This patch defines two new fairly generic bio functions (one which replaces bio_chain_clone()) to help out the situation, and then revises rbd_rq_fn() to make use of them. First, bio_clone_range() clones a portion of a single bio, starting at a given offset within the bio and including only as many bytes as requested. As a convenience, a request to clone the entire bio is passed directly to bio_clone(). Second, bio_chain_clone_range() performs a similar function, producing a chain of cloned bio's covering a sub-range of the source chain. No bio_pair structures are used, and if successful the result will represent exactly the specified range. Using bio_chain_clone_range() makes bio_rq_fn() a little easier to understand, because it avoids the need to pass very much state information between consecutive calls. By avoiding the need to track a bio_pair structure, it also eliminates the problem described here: http://tracker.newdream.net/issues/2933 Note that a block request (and therefore the complete length of a bio chain processed in rbd_rq_fn()) is an unsigned int, while the result of rbd_segment_length() is u64. This change makes this range trunctation explicit, and trips a bug if the the segment boundary is too far off. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 231 +++++++++++++++++++++++++++++--------------- 1 file changed, 152 insertions(+), 79 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c800047f583..cc06c55875b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -826,77 +826,144 @@ static void zero_bio_chain(struct bio *chain, int start_ofs) } /* - * bio_chain_clone - clone a chain of bios up to a certain length. - * might return a bio_pair that will need to be released. + * Clone a portion of a bio, starting at the given byte offset + * and continuing for the number of bytes indicated. */ -static struct bio *bio_chain_clone(struct bio **old, struct bio **next, - struct bio_pair **bp, - int len, gfp_t gfpmask) +static struct bio *bio_clone_range(struct bio *bio_src, + unsigned int offset, + unsigned int len, + gfp_t gfpmask) { - struct bio *old_chain = *old; - struct bio *new_chain = NULL; - struct bio *tail; - int total = 0; + struct bio_vec *bv; + unsigned int resid; + unsigned short idx; + unsigned int voff; + unsigned short end_idx; + unsigned short vcnt; + struct bio *bio; - if (*bp) { - bio_pair_release(*bp); - *bp = NULL; + /* Handle the easy case for the caller */ + + if (!offset && len == bio_src->bi_size) + return bio_clone(bio_src, gfpmask); + + if (WARN_ON_ONCE(!len)) + return NULL; + if (WARN_ON_ONCE(len > bio_src->bi_size)) + return NULL; + if (WARN_ON_ONCE(offset > bio_src->bi_size - len)) + return NULL; + + /* Find first affected segment... */ + + resid = offset; + __bio_for_each_segment(bv, bio_src, idx, 0) { + if (resid < bv->bv_len) + break; + resid -= bv->bv_len; + } + voff = resid; + + /* ...and the last affected segment */ + + resid += len; + __bio_for_each_segment(bv, bio_src, end_idx, idx) { + if (resid <= bv->bv_len) + break; + resid -= bv->bv_len; + } + vcnt = end_idx - idx + 1; + + /* Build the clone */ + + bio = bio_alloc(gfpmask, (unsigned int) vcnt); + if (!bio) + return NULL; /* ENOMEM */ + + bio->bi_bdev = bio_src->bi_bdev; + bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT); + bio->bi_rw = bio_src->bi_rw; + bio->bi_flags |= 1 << BIO_CLONED; + + /* + * Copy over our part of the bio_vec, then update the first + * and last (or only) entries. + */ + memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx], + vcnt * sizeof (struct bio_vec)); + bio->bi_io_vec[0].bv_offset += voff; + if (vcnt > 1) { + bio->bi_io_vec[0].bv_len -= voff; + bio->bi_io_vec[vcnt - 1].bv_len = resid; + } else { + bio->bi_io_vec[0].bv_len = len; } - while (old_chain && (total < len)) { - struct bio *tmp; + bio->bi_vcnt = vcnt; + bio->bi_size = len; + bio->bi_idx = 0; - tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs); - if (!tmp) - goto err_out; - gfpmask &= ~__GFP_WAIT; /* can't wait after the first */ + return bio; +} - if (total + old_chain->bi_size > len) { - struct bio_pair *bp; +/* + * Clone a portion of a bio chain, starting at the given byte offset + * into the first bio in the source chain and continuing for the + * number of bytes indicated. The result is another bio chain of + * exactly the given length, or a null pointer on error. + * + * The bio_src and offset parameters are both in-out. On entry they + * refer to the first source bio and the offset into that bio where + * the start of data to be cloned is located. + * + * On return, bio_src is updated to refer to the bio in the source + * chain that contains first un-cloned byte, and *offset will + * contain the offset of that byte within that bio. + */ +static struct bio *bio_chain_clone_range(struct bio **bio_src, + unsigned int *offset, + unsigned int len, + gfp_t gfpmask) +{ + struct bio *bi = *bio_src; + unsigned int off = *offset; + struct bio *chain = NULL; + struct bio **end; - /* - * this split can only happen with a single paged bio, - * split_bio will BUG_ON if this is not the case - */ - dout("bio_chain_clone split! total=%d remaining=%d" - "bi_size=%u\n", - total, len - total, old_chain->bi_size); + /* Build up a chain of clone bios up to the limit */ - /* split the bio. We'll release it either in the next - call, or it will have to be released outside */ - bp = bio_split(old_chain, (len - total) / SECTOR_SIZE); - if (!bp) - goto err_out; + if (!bi || off >= bi->bi_size || !len) + return NULL; /* Nothing to clone */ - __bio_clone(tmp, &bp->bio1); + end = &chain; + while (len) { + unsigned int bi_size; + struct bio *bio; - *next = &bp->bio2; - } else { - __bio_clone(tmp, old_chain); - *next = old_chain->bi_next; + if (!bi) + goto out_err; /* EINVAL; ran out of bio's */ + bi_size = min_t(unsigned int, bi->bi_size - off, len); + bio = bio_clone_range(bi, off, bi_size, gfpmask); + if (!bio) + goto out_err; /* ENOMEM */ + + *end = bio; + end = &bio->bi_next; + + off += bi_size; + if (off == bi->bi_size) { + bi = bi->bi_next; + off = 0; } - - tmp->bi_bdev = NULL; - tmp->bi_next = NULL; - if (new_chain) - tail->bi_next = tmp; - else - new_chain = tmp; - tail = tmp; - old_chain = old_chain->bi_next; - - total += tmp->bi_size; + len -= bi_size; } + *bio_src = bi; + *offset = off; - rbd_assert(total == len); + return chain; +out_err: + bio_chain_put(chain); - *old = old_chain; - - return new_chain; - -err_out: - dout("bio_chain_clone with err\n"); - bio_chain_put(new_chain); return NULL; } @@ -1014,8 +1081,9 @@ static int rbd_do_request(struct request *rq, req_data->coll_index = coll_index; } - dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name, - (unsigned long long) ofs, (unsigned long long) len); + dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n", + object_name, (unsigned long long) ofs, + (unsigned long long) len, coll, coll_index); osdc = &rbd_dev->rbd_client->client->osdc; req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, @@ -1463,18 +1531,16 @@ static void rbd_rq_fn(struct request_queue *q) { struct rbd_device *rbd_dev = q->queuedata; struct request *rq; - struct bio_pair *bp = NULL; while ((rq = blk_fetch_request(q))) { struct bio *bio; - struct bio *rq_bio, *next_bio = NULL; bool do_write; unsigned int size; - u64 op_size = 0; u64 ofs; int num_segs, cur_seg = 0; struct rbd_req_coll *coll; struct ceph_snap_context *snapc; + unsigned int bio_offset; dout("fetched request\n"); @@ -1486,10 +1552,6 @@ static void rbd_rq_fn(struct request_queue *q) /* deduce our operation (read, write) */ do_write = (rq_data_dir(rq) == WRITE); - - size = blk_rq_bytes(rq); - ofs = blk_rq_pos(rq) * SECTOR_SIZE; - rq_bio = rq->bio; if (do_write && rbd_dev->mapping.read_only) { __blk_end_request_all(rq, -EROFS); continue; @@ -1512,6 +1574,10 @@ static void rbd_rq_fn(struct request_queue *q) up_read(&rbd_dev->header_rwsem); + size = blk_rq_bytes(rq); + ofs = blk_rq_pos(rq) * SECTOR_SIZE; + bio = rq->bio; + dout("%s 0x%x bytes at 0x%llx\n", do_write ? "write" : "read", size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); @@ -1531,30 +1597,37 @@ static void rbd_rq_fn(struct request_queue *q) continue; } + bio_offset = 0; do { - /* a bio clone to be passed down to OSD req */ + u64 limit = rbd_segment_length(rbd_dev, ofs, size); + unsigned int chain_size; + struct bio *bio_chain; + + BUG_ON(limit > (u64) UINT_MAX); + chain_size = (unsigned int) limit; dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt); - op_size = rbd_segment_length(rbd_dev, ofs, size); + kref_get(&coll->kref); - bio = bio_chain_clone(&rq_bio, &next_bio, &bp, - op_size, GFP_ATOMIC); - if (bio) + + /* Pass a cloned bio chain via an osd request */ + + bio_chain = bio_chain_clone_range(&bio, + &bio_offset, chain_size, + GFP_ATOMIC); + if (bio_chain) (void) rbd_do_op(rq, rbd_dev, snapc, - ofs, op_size, - bio, coll, cur_seg); + ofs, chain_size, + bio_chain, coll, cur_seg); else rbd_coll_end_req_index(rq, coll, cur_seg, - -ENOMEM, op_size); - size -= op_size; - ofs += op_size; + -ENOMEM, chain_size); + size -= chain_size; + ofs += chain_size; cur_seg++; - rq_bio = next_bio; } while (size > 0); kref_put(&coll->kref, rbd_coll_release); - if (bp) - bio_pair_release(bp); spin_lock_irq(q->queue_lock); ceph_put_snap_context(snapc); @@ -1564,7 +1637,7 @@ static void rbd_rq_fn(struct request_queue *q) /* * a queue callback. Makes sure that we don't create a bio that spans across * multiple osd objects. One exception would be with a single page bios, - * which we handle later at bio_chain_clone + * which we handle later at bio_chain_clone_range() */ static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, struct bio_vec *bvec) From 41f38c2b2f8b66b176a0e548ef06294343a7bfa2 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Oct 2012 23:34:40 -0500 Subject: [PATCH 22/68] rbd: remove snapshots on error in rbd_add() If rbd_dev_snaps_update() has ever been called for an rbd device structure there could be snapshot structures on its snaps list. In rbd_add(), this function is called but a subsequent error path neglected to clean up any of these snapshots. Add a call to rbd_remove_all_snaps() in the appropriate spot to remedy this. Change a couple of error labels to be a little clearer while there. Drop the leading underscores from the function name; there's nothing special about that function that they might signify. As suggested in review, the leading underscores in __rbd_remove_snap_dev() have been removed as well. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index cc06c55875b..c7681d46cf8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -228,7 +228,7 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); static int rbd_dev_snaps_register(struct rbd_device *rbd_dev); static void rbd_dev_release(struct device *dev); -static void __rbd_remove_snap_dev(struct rbd_snap *snap); +static void rbd_remove_snap_dev(struct rbd_snap *snap); static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count); @@ -1787,13 +1787,13 @@ static int rbd_read_header(struct rbd_device *rbd_dev, return ret; } -static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev) +static void rbd_remove_all_snaps(struct rbd_device *rbd_dev) { struct rbd_snap *snap; struct rbd_snap *next; list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) - __rbd_remove_snap_dev(snap); + rbd_remove_snap_dev(snap); } static void rbd_update_mapping_size(struct rbd_device *rbd_dev) @@ -2146,7 +2146,7 @@ static bool rbd_snap_registered(struct rbd_snap *snap) return ret; } -static void __rbd_remove_snap_dev(struct rbd_snap *snap) +static void rbd_remove_snap_dev(struct rbd_snap *snap) { list_del(&snap->node); if (device_is_registered(&snap->dev)) @@ -2569,7 +2569,7 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) if (rbd_dev->mapping.snap_id == snap->id) rbd_dev->mapping.snap_exists = false; - __rbd_remove_snap_dev(snap); + rbd_remove_snap_dev(snap); dout("%ssnap id %llu has been removed\n", rbd_dev->mapping.snap_id == snap->id ? "mapped " : "", @@ -3179,11 +3179,11 @@ static ssize_t rbd_add(struct bus_type *bus, /* no need to lock here, as rbd_dev is not registered yet */ rc = rbd_dev_snaps_update(rbd_dev); if (rc) - goto err_out_header; + goto err_out_probe; rc = rbd_dev_set_mapping(rbd_dev, snap_name); if (rc) - goto err_out_header; + goto err_out_snaps; /* generate unique id: find highest unique id, add one */ rbd_dev_id_get(rbd_dev); @@ -3247,7 +3247,9 @@ err_out_blkdev: unregister_blkdev(rbd_dev->major, rbd_dev->name); err_out_id: rbd_dev_id_put(rbd_dev); -err_out_header: +err_out_snaps: + rbd_remove_all_snaps(rbd_dev); +err_out_probe: rbd_header_free(&rbd_dev->header); err_out_client: kfree(rbd_dev->header_name); @@ -3345,7 +3347,7 @@ static ssize_t rbd_remove(struct bus_type *bus, goto done; } - __rbd_remove_all_snaps(rbd_dev); + rbd_remove_all_snaps(rbd_dev); rbd_bus_del_dev(rbd_dev); done: From 86992098e7fdb97d01feb51495a952b264a55b7c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Oct 2012 23:34:41 -0500 Subject: [PATCH 23/68] rbd: make pool_id a 64 bit value If a format 2 image has a parent, its pool id will be specified using a 64-bit value. Change the pool id we save for an image to match that. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c7681d46cf8..ef82e9091f4 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -197,7 +197,7 @@ struct rbd_device { size_t image_name_len; char *header_name; char *pool_name; - int pool_id; + u64 pool_id; struct ceph_osd_event *watch_event; struct ceph_osd_request *watch_request; @@ -1113,7 +1113,7 @@ static int rbd_do_request(struct request *rq, layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); layout->fl_stripe_count = cpu_to_le32(1); layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); - layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id); + layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->pool_id); ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, req, ops); rbd_assert(ret == 0); @@ -1982,7 +1982,7 @@ static ssize_t rbd_pool_id_show(struct device *dev, { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - return sprintf(buf, "%d\n", rbd_dev->pool_id); + return sprintf(buf, "%llu\n", (unsigned long long) rbd_dev->pool_id); } static ssize_t rbd_name_show(struct device *dev, @@ -3170,7 +3170,7 @@ static ssize_t rbd_add(struct bus_type *bus, rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name); if (rc < 0) goto err_out_client; - rbd_dev->pool_id = rc; + rbd_dev->pool_id = (u64) rc; rc = rbd_dev_probe(rbd_dev); if (rc < 0) From 971f839a7670197366c04e99472943532caeb0dc Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Oct 2012 23:34:41 -0500 Subject: [PATCH 24/68] rbd: move snap info out of rbd_mapping struct Moving the snap_id and snap_name fields into the separate rbd_mapping structure was misguided. (And in time, perhaps we'll do away with that structure altogether...) Move these fields back into struct rbd_device. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ef82e9091f4..7d28ce33056 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -166,8 +166,6 @@ struct rbd_snap { }; struct rbd_mapping { - char *snap_name; - u64 snap_id; u64 size; u64 features; bool snap_exists; @@ -199,6 +197,9 @@ struct rbd_device { char *pool_name; u64 pool_id; + char *snap_name; + u64 snap_id; + struct ceph_osd_event *watch_event; struct ceph_osd_request *watch_request; @@ -669,7 +670,7 @@ static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) list_for_each_entry(snap, &rbd_dev->snaps, node) { if (!strcmp(snap_name, snap->name)) { - rbd_dev->mapping.snap_id = snap->id; + rbd_dev->snap_id = snap->id; rbd_dev->mapping.size = snap->size; rbd_dev->mapping.features = snap->features; @@ -686,7 +687,7 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name) if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME, sizeof (RBD_SNAP_HEAD_NAME))) { - rbd_dev->mapping.snap_id = CEPH_NOSNAP; + rbd_dev->snap_id = CEPH_NOSNAP; rbd_dev->mapping.size = rbd_dev->header.image_size; rbd_dev->mapping.features = rbd_dev->header.features; rbd_dev->mapping.snap_exists = false; @@ -698,7 +699,7 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name) rbd_dev->mapping.snap_exists = true; rbd_dev->mapping.read_only = true; } - rbd_dev->mapping.snap_name = snap_name; + rbd_dev->snap_name = snap_name; done: return ret; } @@ -1278,7 +1279,7 @@ static int rbd_do_op(struct request *rq, opcode = CEPH_OSD_OP_READ; flags = CEPH_OSD_FLAG_READ; snapc = NULL; - snapid = rbd_dev->mapping.snap_id; + snapid = rbd_dev->snap_id; payload_len = 0; } @@ -1561,7 +1562,7 @@ static void rbd_rq_fn(struct request_queue *q) down_read(&rbd_dev->header_rwsem); - if (rbd_dev->mapping.snap_id != CEPH_NOSNAP && + if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->mapping.snap_exists) { up_read(&rbd_dev->header_rwsem); dout("request for non-existent snapshot"); @@ -1800,7 +1801,7 @@ static void rbd_update_mapping_size(struct rbd_device *rbd_dev) { sector_t size; - if (rbd_dev->mapping.snap_id != CEPH_NOSNAP) + if (rbd_dev->snap_id != CEPH_NOSNAP) return; size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE; @@ -2011,7 +2012,7 @@ static ssize_t rbd_snap_show(struct device *dev, { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name); + return sprintf(buf, "%s\n", rbd_dev->snap_name); } static ssize_t rbd_image_refresh(struct device *dev, @@ -2567,12 +2568,11 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) /* Existing snapshot not in the new snap context */ - if (rbd_dev->mapping.snap_id == snap->id) + if (rbd_dev->snap_id == snap->id) rbd_dev->mapping.snap_exists = false; rbd_remove_snap_dev(snap); dout("%ssnap id %llu has been removed\n", - rbd_dev->mapping.snap_id == snap->id ? - "mapped " : "", + rbd_dev->snap_id == snap->id ? "mapped " : "", (unsigned long long) snap->id); /* Done with this list entry; advance */ @@ -3256,7 +3256,7 @@ err_out_client: rbd_put_client(rbd_dev); kfree(rbd_dev->image_id); err_out_args: - kfree(rbd_dev->mapping.snap_name); + kfree(rbd_dev->snap_name); kfree(rbd_dev->image_name); kfree(rbd_dev->pool_name); err_out_mem: @@ -3309,7 +3309,7 @@ static void rbd_dev_release(struct device *dev) rbd_header_free(&rbd_dev->header); /* done with the id, and with the rbd_dev */ - kfree(rbd_dev->mapping.snap_name); + kfree(rbd_dev->snap_name); kfree(rbd_dev->image_id); kfree(rbd_dev->header_name); kfree(rbd_dev->pool_name); From daba5fdb4c469838dcee4b8dd4fecf7be69fa218 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 26 Oct 2012 17:25:23 -0500 Subject: [PATCH 25/68] rbd: rename snap_exists field A Boolean field "snap_exists" in an rbd mapping is used to indicate whether a mapped snapshot has been removed from an image's snapshot context, to stop sending requests for that snapshot as soon as we know it's gone. Generalize the interpretation of this field so it applies to non-snapshot (i.e. "head") mappings. That is, define its value to be false until the mapping has been set, and then define it to be true for both snapshot mappings or head mappings. Rename the field "exists" to reflect the broader interpretation. The rbd_mapping structure is on its way out, so move the field back into the rbd_device structure. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 7d28ce33056..589f56542df 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -168,7 +168,6 @@ struct rbd_snap { struct rbd_mapping { u64 size; u64 features; - bool snap_exists; bool read_only; }; @@ -189,6 +188,7 @@ struct rbd_device { spinlock_t lock; /* queue lock */ struct rbd_image_header header; + bool exists; char *image_id; size_t image_id_len; char *image_name; @@ -690,16 +690,15 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name) rbd_dev->snap_id = CEPH_NOSNAP; rbd_dev->mapping.size = rbd_dev->header.image_size; rbd_dev->mapping.features = rbd_dev->header.features; - rbd_dev->mapping.snap_exists = false; ret = 0; } else { ret = snap_by_name(rbd_dev, snap_name); if (ret < 0) goto done; - rbd_dev->mapping.snap_exists = true; rbd_dev->mapping.read_only = true; } rbd_dev->snap_name = snap_name; + rbd_dev->exists = true; done: return ret; } @@ -1562,8 +1561,8 @@ static void rbd_rq_fn(struct request_queue *q) down_read(&rbd_dev->header_rwsem); - if (rbd_dev->snap_id != CEPH_NOSNAP && - !rbd_dev->mapping.snap_exists) { + if (!rbd_dev->exists) { + rbd_assert(rbd_dev->snap_id != CEPH_NOSNAP); up_read(&rbd_dev->header_rwsem); dout("request for non-existent snapshot"); spin_lock_irq(q->queue_lock); @@ -2569,7 +2568,7 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) /* Existing snapshot not in the new snap context */ if (rbd_dev->snap_id == snap->id) - rbd_dev->mapping.snap_exists = false; + rbd_dev->exists = false; rbd_remove_snap_dev(snap); dout("%ssnap id %llu has been removed\n", rbd_dev->snap_id == snap->id ? "mapped " : "", From 78cea76e0580befaf561c6989f4fc985bc66c8f7 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Oct 2012 23:34:41 -0500 Subject: [PATCH 26/68] rbd: move ceph_parse_options() call up Move option parsing out of rbd_get_client() and into its caller. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 48 +++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 589f56542df..e83bddcca34 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -453,27 +453,11 @@ static int parse_rbd_opts_token(char *c, void *private) * Get a ceph client with specific addr and configuration, if one does * not exist create it. */ -static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, - size_t mon_addr_len, char *options) +static int rbd_get_client(struct rbd_device *rbd_dev, + struct ceph_options *ceph_opts) { - struct rbd_options rbd_opts; - struct ceph_options *ceph_opts; struct rbd_client *rbdc; - /* Initialize all rbd options to the defaults */ - - rbd_opts.read_only = RBD_READ_ONLY_DEFAULT; - - ceph_opts = ceph_parse_options(options, mon_addr, - mon_addr + mon_addr_len, - parse_rbd_opts_token, &rbd_opts); - if (IS_ERR(ceph_opts)) - return PTR_ERR(ceph_opts); - - /* Record the parsed rbd options */ - - rbd_dev->mapping.read_only = rbd_opts.read_only; - rbdc = rbd_client_find(ceph_opts); if (rbdc) { /* using an existing client */ @@ -3132,9 +3116,11 @@ static ssize_t rbd_add(struct bus_type *bus, struct rbd_device *rbd_dev = NULL; const char *mon_addrs = NULL; size_t mon_addrs_size = 0; + char *snap_name; + struct rbd_options rbd_opts; + struct ceph_options *ceph_opts; struct ceph_osd_client *osdc; int rc = -ENOMEM; - char *snap_name; if (!try_module_get(THIS_MODULE)) return -ENODEV; @@ -3160,9 +3146,26 @@ static ssize_t rbd_add(struct bus_type *bus, goto err_out_mem; } - rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options); - if (rc < 0) + /* Initialize all rbd options to the defaults */ + + rbd_opts.read_only = RBD_READ_ONLY_DEFAULT; + + ceph_opts = ceph_parse_options(options, mon_addrs, + mon_addrs + mon_addrs_size - 1, + parse_rbd_opts_token, &rbd_opts); + if (IS_ERR(ceph_opts)) { + rc = PTR_ERR(ceph_opts); goto err_out_args; + } + + /* Record the parsed rbd options */ + + rbd_dev->mapping.read_only = rbd_opts.read_only; + + rc = rbd_get_client(rbd_dev, ceph_opts); + if (rc < 0) + goto err_out_opts; + ceph_opts = NULL; /* ceph_opts now owned by rbd_dev client */ /* pick the pool */ osdc = &rbd_dev->rbd_client->client->osdc; @@ -3254,6 +3257,9 @@ err_out_client: kfree(rbd_dev->header_name); rbd_put_client(rbd_dev); kfree(rbd_dev->image_id); +err_out_opts: + if (ceph_opts) + ceph_destroy_options(ceph_opts); err_out_args: kfree(rbd_dev->snap_name); kfree(rbd_dev->image_name); From 0ddebc0c6c518ae42c376151e34d9d4b84443ba5 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Oct 2012 23:34:41 -0500 Subject: [PATCH 27/68] rbd: do all argument parsing in one place This patch makes rbd_add_parse_args() be the single place all argument parsing occurs for an image map request: - Move the ceph_parse_options() call into that function - Use local variables rather than parameters to hold the list of monitor addresses supplied - Rather than returning it, pass the snapshot name (and its length) back via parameters - Have the function return a ceph_options structure pointer Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 79 ++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 41 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e83bddcca34..68447d83288 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2845,24 +2845,27 @@ static inline char *dup_token(const char **buf, size_t *lenp) * * Note: rbd_dev is assumed to have been initially zero-filled. */ -static char *rbd_add_parse_args(struct rbd_device *rbd_dev, - const char *buf, - const char **mon_addrs, - size_t *mon_addrs_size, - char *options, - size_t options_size) +static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev, + const char *buf, + char *options, + size_t options_size, + char **snap_name, + size_t *snap_name_len) { size_t len; - char *err_ptr = ERR_PTR(-EINVAL); - char *snap_name; + const char *mon_addrs; + size_t mon_addrs_size; + struct rbd_options rbd_opts; + struct ceph_options *ceph_opts; + struct ceph_options *err_ptr = ERR_PTR(-EINVAL); /* The first four tokens are required */ len = next_token(&buf); if (!len) return err_ptr; - *mon_addrs_size = len + 1; - *mon_addrs = buf; + mon_addrs_size = len + 1; + mon_addrs = buf; buf += len; @@ -2890,14 +2893,27 @@ static char *rbd_add_parse_args(struct rbd_device *rbd_dev, buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ len = sizeof (RBD_SNAP_HEAD_NAME) - 1; } - snap_name = kmalloc(len + 1, GFP_KERNEL); - if (!snap_name) + *snap_name = kmalloc(len + 1, GFP_KERNEL); + if (!*snap_name) goto out_err; - memcpy(snap_name, buf, len); - *(snap_name + len) = '\0'; + memcpy(*snap_name, buf, len); + *(*snap_name + len) = '\0'; + *snap_name_len = len; + /* Initialize all rbd options to the defaults */ - return snap_name; + rbd_opts.read_only = RBD_READ_ONLY_DEFAULT; + ceph_opts = ceph_parse_options(options, mon_addrs, + mon_addrs + mon_addrs_size - 1, + parse_rbd_opts_token, &rbd_opts); + + /* Record the parsed rbd options */ + + if (!IS_ERR(ceph_opts)) { + rbd_dev->mapping.read_only = rbd_opts.read_only; + } + + return ceph_opts; out_err: kfree(rbd_dev->image_name); rbd_dev->image_name = NULL; @@ -3114,10 +3130,8 @@ static ssize_t rbd_add(struct bus_type *bus, { char *options; struct rbd_device *rbd_dev = NULL; - const char *mon_addrs = NULL; - size_t mon_addrs_size = 0; char *snap_name; - struct rbd_options rbd_opts; + size_t snap_name_len = 0; struct ceph_options *ceph_opts; struct ceph_osd_client *osdc; int rc = -ENOMEM; @@ -3139,32 +3153,16 @@ static ssize_t rbd_add(struct bus_type *bus, init_rwsem(&rbd_dev->header_rwsem); /* parse add command */ - snap_name = rbd_add_parse_args(rbd_dev, buf, - &mon_addrs, &mon_addrs_size, options, count); - if (IS_ERR(snap_name)) { - rc = PTR_ERR(snap_name); + ceph_opts = rbd_add_parse_args(rbd_dev, buf, options, count, + &snap_name, &snap_name_len); + if (IS_ERR(ceph_opts)) { + rc = PTR_ERR(ceph_opts); goto err_out_mem; } - /* Initialize all rbd options to the defaults */ - - rbd_opts.read_only = RBD_READ_ONLY_DEFAULT; - - ceph_opts = ceph_parse_options(options, mon_addrs, - mon_addrs + mon_addrs_size - 1, - parse_rbd_opts_token, &rbd_opts); - if (IS_ERR(ceph_opts)) { - rc = PTR_ERR(ceph_opts); - goto err_out_args; - } - - /* Record the parsed rbd options */ - - rbd_dev->mapping.read_only = rbd_opts.read_only; - rc = rbd_get_client(rbd_dev, ceph_opts); if (rc < 0) - goto err_out_opts; + goto err_out_args; ceph_opts = NULL; /* ceph_opts now owned by rbd_dev client */ /* pick the pool */ @@ -3257,10 +3255,9 @@ err_out_client: kfree(rbd_dev->header_name); rbd_put_client(rbd_dev); kfree(rbd_dev->image_id); -err_out_opts: +err_out_args: if (ceph_opts) ceph_destroy_options(ceph_opts); -err_out_args: kfree(rbd_dev->snap_name); kfree(rbd_dev->image_name); kfree(rbd_dev->pool_name); From e5c35534042f4b5957a32bba651222c91247beba Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Oct 2012 23:34:41 -0500 Subject: [PATCH 28/68] rbd: get rid of snap_name_len The value returned in the "snap_name_len" argument to rbd_add_parse_args() is never actually used, so get rid of it. The snap_name_len recorded in rbd_dev_v2_snap_name() is not useful either, so get rid of that too. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 68447d83288..7bd23139b94 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2408,7 +2408,6 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) int ret; void *p; void *end; - size_t snap_name_len; char *snap_name; size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; @@ -2428,9 +2427,7 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) p = reply_buf; end = (char *) reply_buf + size; - snap_name_len = 0; - snap_name = ceph_extract_encoded_string(&p, end, &snap_name_len, - GFP_KERNEL); + snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); if (IS_ERR(snap_name)) { ret = PTR_ERR(snap_name); goto out; @@ -2849,8 +2846,7 @@ static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev, const char *buf, char *options, size_t options_size, - char **snap_name, - size_t *snap_name_len) + char **snap_name) { size_t len; const char *mon_addrs; @@ -2898,7 +2894,7 @@ static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev, goto out_err; memcpy(*snap_name, buf, len); *(*snap_name + len) = '\0'; - *snap_name_len = len; + /* Initialize all rbd options to the defaults */ rbd_opts.read_only = RBD_READ_ONLY_DEFAULT; @@ -3131,7 +3127,6 @@ static ssize_t rbd_add(struct bus_type *bus, char *options; struct rbd_device *rbd_dev = NULL; char *snap_name; - size_t snap_name_len = 0; struct ceph_options *ceph_opts; struct ceph_osd_client *osdc; int rc = -ENOMEM; @@ -3154,7 +3149,7 @@ static ssize_t rbd_add(struct bus_type *bus, /* parse add command */ ceph_opts = rbd_add_parse_args(rbd_dev, buf, options, count, - &snap_name, &snap_name_len); + &snap_name); if (IS_ERR(ceph_opts)) { rc = PTR_ERR(ceph_opts); goto err_out_mem; From f28e565a1b15eef62618db4011d9e320089a4214 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Oct 2012 23:34:41 -0500 Subject: [PATCH 29/68] rbd: remove options args from rbd_add_parse_args() They "options" argument to rbd_add_parse_args() (and it's partner options_size) is now only needed within the function, so there's no need to have the caller allocate and pass the options buffer. Just allocate the options buffer within the function using dup_token(). Also distinguish between failures due to failed memory allocation and failing because a required argument was missing. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 58 ++++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 7bd23139b94..62df67a1132 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2844,54 +2844,58 @@ static inline char *dup_token(const char **buf, size_t *lenp) */ static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev, const char *buf, - char *options, - size_t options_size, char **snap_name) { size_t len; const char *mon_addrs; size_t mon_addrs_size; + char *options; + struct ceph_options *err_ptr = ERR_PTR(-EINVAL); struct rbd_options rbd_opts; struct ceph_options *ceph_opts; - struct ceph_options *err_ptr = ERR_PTR(-EINVAL); /* The first four tokens are required */ len = next_token(&buf); if (!len) - return err_ptr; - mon_addrs_size = len + 1; + return err_ptr; /* Missing monitor address(es) */ mon_addrs = buf; - + mon_addrs_size = len + 1; buf += len; - len = copy_token(&buf, options, options_size); - if (!len || len >= options_size) - return err_ptr; + options = dup_token(&buf, NULL); + if (!options) + goto out_mem; + if (!*options) + goto out_err; /* Missing options */ - err_ptr = ERR_PTR(-ENOMEM); rbd_dev->pool_name = dup_token(&buf, NULL); if (!rbd_dev->pool_name) - goto out_err; + goto out_mem; + if (!*rbd_dev->pool_name) + goto out_err; /* Missing pool name */ rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len); if (!rbd_dev->image_name) - goto out_err; - - /* Snapshot name is optional; default is to use "head" */ + goto out_mem; + if (!*rbd_dev->image_name) + goto out_err; /* Missing image name */ + /* + * Snapshot name is optional; default is to use "-" + * (indicating the head/no snapshot). + */ len = next_token(&buf); - if (len > RBD_MAX_SNAP_NAME_LEN) { - err_ptr = ERR_PTR(-ENAMETOOLONG); - goto out_err; - } if (!len) { buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ len = sizeof (RBD_SNAP_HEAD_NAME) - 1; + } else if (len > RBD_MAX_SNAP_NAME_LEN) { + err_ptr = ERR_PTR(-ENAMETOOLONG); + goto out_err; } *snap_name = kmalloc(len + 1, GFP_KERNEL); if (!*snap_name) - goto out_err; + goto out_mem; memcpy(*snap_name, buf, len); *(*snap_name + len) = '\0'; @@ -2902,20 +2906,23 @@ static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev, ceph_opts = ceph_parse_options(options, mon_addrs, mon_addrs + mon_addrs_size - 1, parse_rbd_opts_token, &rbd_opts); + kfree(options); /* Record the parsed rbd options */ - if (!IS_ERR(ceph_opts)) { + if (!IS_ERR(ceph_opts)) rbd_dev->mapping.read_only = rbd_opts.read_only; - } return ceph_opts; +out_mem: + err_ptr = ERR_PTR(-ENOMEM); out_err: kfree(rbd_dev->image_name); rbd_dev->image_name = NULL; rbd_dev->image_name_len = 0; kfree(rbd_dev->pool_name); rbd_dev->pool_name = NULL; + kfree(options); return err_ptr; } @@ -3124,7 +3131,6 @@ static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count) { - char *options; struct rbd_device *rbd_dev = NULL; char *snap_name; struct ceph_options *ceph_opts; @@ -3134,9 +3140,6 @@ static ssize_t rbd_add(struct bus_type *bus, if (!try_module_get(THIS_MODULE)) return -ENODEV; - options = kmalloc(count, GFP_KERNEL); - if (!options) - goto err_out_mem; rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); if (!rbd_dev) goto err_out_mem; @@ -3148,8 +3151,7 @@ static ssize_t rbd_add(struct bus_type *bus, init_rwsem(&rbd_dev->header_rwsem); /* parse add command */ - ceph_opts = rbd_add_parse_args(rbd_dev, buf, options, count, - &snap_name); + ceph_opts = rbd_add_parse_args(rbd_dev, buf, &snap_name); if (IS_ERR(ceph_opts)) { rc = PTR_ERR(ceph_opts); goto err_out_mem; @@ -3233,7 +3235,6 @@ err_out_bus: /* this will also clean up rest of rbd_dev stuff */ rbd_bus_del_dev(rbd_dev); - kfree(options); return rc; err_out_disk: @@ -3258,7 +3259,6 @@ err_out_args: kfree(rbd_dev->pool_name); err_out_mem: kfree(rbd_dev); - kfree(options); dout("Error adding device %s\n", buf); module_put(THIS_MODULE); From 819d52bf72b61a8455024ff7863eed5d681e73c7 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Oct 2012 23:34:41 -0500 Subject: [PATCH 30/68] rbd: remove snap_name arg from rbd_add_parse_args() The snapshot name returned by rbd_add_parse_args() just gets saved in the rbd_dev eventually. So just do that inside that function and do away with the snap_name argument, both in rbd_add_parse_args() and rbd_dev_set_mapping(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 62df67a1132..ae16cf615f0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -665,23 +665,22 @@ static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) return -ENOENT; } -static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name) +static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) { int ret; - if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME, + if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME, sizeof (RBD_SNAP_HEAD_NAME))) { rbd_dev->snap_id = CEPH_NOSNAP; rbd_dev->mapping.size = rbd_dev->header.image_size; rbd_dev->mapping.features = rbd_dev->header.features; ret = 0; } else { - ret = snap_by_name(rbd_dev, snap_name); + ret = snap_by_name(rbd_dev, rbd_dev->snap_name); if (ret < 0) goto done; rbd_dev->mapping.read_only = true; } - rbd_dev->snap_name = snap_name; rbd_dev->exists = true; done: return ret; @@ -2843,8 +2842,7 @@ static inline char *dup_token(const char **buf, size_t *lenp) * Note: rbd_dev is assumed to have been initially zero-filled. */ static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev, - const char *buf, - char **snap_name) + const char *buf) { size_t len; const char *mon_addrs; @@ -2893,11 +2891,11 @@ static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev, err_ptr = ERR_PTR(-ENAMETOOLONG); goto out_err; } - *snap_name = kmalloc(len + 1, GFP_KERNEL); - if (!*snap_name) + rbd_dev->snap_name = kmalloc(len + 1, GFP_KERNEL); + if (!rbd_dev->snap_name) goto out_mem; - memcpy(*snap_name, buf, len); - *(*snap_name + len) = '\0'; + memcpy(rbd_dev->snap_name, buf, len); + *(rbd_dev->snap_name + len) = '\0'; /* Initialize all rbd options to the defaults */ @@ -3132,7 +3130,6 @@ static ssize_t rbd_add(struct bus_type *bus, size_t count) { struct rbd_device *rbd_dev = NULL; - char *snap_name; struct ceph_options *ceph_opts; struct ceph_osd_client *osdc; int rc = -ENOMEM; @@ -3151,7 +3148,7 @@ static ssize_t rbd_add(struct bus_type *bus, init_rwsem(&rbd_dev->header_rwsem); /* parse add command */ - ceph_opts = rbd_add_parse_args(rbd_dev, buf, &snap_name); + ceph_opts = rbd_add_parse_args(rbd_dev, buf); if (IS_ERR(ceph_opts)) { rc = PTR_ERR(ceph_opts); goto err_out_mem; @@ -3178,7 +3175,7 @@ static ssize_t rbd_add(struct bus_type *bus, if (rc) goto err_out_probe; - rc = rbd_dev_set_mapping(rbd_dev, snap_name); + rc = rbd_dev_set_mapping(rbd_dev); if (rc) goto err_out_snaps; From 4e9afeba7aa9ca925a79d9ce82f1a8e79e14c291 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Oct 2012 23:34:41 -0500 Subject: [PATCH 31/68] rbd: pass and populate rbd_options structure Have the caller pass the address of an rbd_options structure to rbd_add_parse_args(), to be initialized with the information gleaned as a result of the parse. I know, this is another near-reversal of a recent change... Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ae16cf615f0..338cfadad63 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2842,15 +2842,16 @@ static inline char *dup_token(const char **buf, size_t *lenp) * Note: rbd_dev is assumed to have been initially zero-filled. */ static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev, - const char *buf) + const char *buf, + struct rbd_options **opts) { size_t len; const char *mon_addrs; size_t mon_addrs_size; char *options; struct ceph_options *err_ptr = ERR_PTR(-EINVAL); - struct rbd_options rbd_opts; struct ceph_options *ceph_opts; + struct rbd_options *rbd_opts = NULL; /* The first four tokens are required */ @@ -2899,17 +2900,17 @@ static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev, /* Initialize all rbd options to the defaults */ - rbd_opts.read_only = RBD_READ_ONLY_DEFAULT; + rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); + if (!rbd_opts) + goto out_mem; + + rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; ceph_opts = ceph_parse_options(options, mon_addrs, mon_addrs + mon_addrs_size - 1, - parse_rbd_opts_token, &rbd_opts); + parse_rbd_opts_token, rbd_opts); kfree(options); - - /* Record the parsed rbd options */ - - if (!IS_ERR(ceph_opts)) - rbd_dev->mapping.read_only = rbd_opts.read_only; + *opts = rbd_opts; return ceph_opts; out_mem: @@ -3131,6 +3132,7 @@ static ssize_t rbd_add(struct bus_type *bus, { struct rbd_device *rbd_dev = NULL; struct ceph_options *ceph_opts; + struct rbd_options *rbd_opts = NULL; struct ceph_osd_client *osdc; int rc = -ENOMEM; @@ -3139,7 +3141,7 @@ static ssize_t rbd_add(struct bus_type *bus, rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); if (!rbd_dev) - goto err_out_mem; + return -ENOMEM; /* static rbd_device initialization */ spin_lock_init(&rbd_dev->lock); @@ -3148,11 +3150,12 @@ static ssize_t rbd_add(struct bus_type *bus, init_rwsem(&rbd_dev->header_rwsem); /* parse add command */ - ceph_opts = rbd_add_parse_args(rbd_dev, buf); + ceph_opts = rbd_add_parse_args(rbd_dev, buf, &rbd_opts); if (IS_ERR(ceph_opts)) { rc = PTR_ERR(ceph_opts); goto err_out_mem; } + rbd_dev->mapping.read_only = rbd_opts->read_only; rc = rbd_get_client(rbd_dev, ceph_opts); if (rc < 0) @@ -3219,6 +3222,8 @@ static ssize_t rbd_add(struct bus_type *bus, if (rc) goto err_out_bus; + kfree(rbd_opts); + /* Everything's ready. Announce the disk to the world. */ add_disk(rbd_dev->disk); @@ -3232,6 +3237,8 @@ err_out_bus: /* this will also clean up rest of rbd_dev stuff */ rbd_bus_del_dev(rbd_dev); + kfree(rbd_opts); + return rc; err_out_disk: @@ -3254,6 +3261,7 @@ err_out_args: kfree(rbd_dev->snap_name); kfree(rbd_dev->image_name); kfree(rbd_dev->pool_name); + kfree(rbd_opts); err_out_mem: kfree(rbd_dev); From dc79b113d6db48ee6ee7decf0216feee48757f01 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Oct 2012 23:34:41 -0500 Subject: [PATCH 32/68] rbd: have rbd_add_parse_args() return error Change the interface to rbd_add_parse_args() so it returns an error code rather than a pointer. Return the ceph_options result via a pointer whose address is passed as an argument. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 338cfadad63..ab6e6d8364e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2841,30 +2841,31 @@ static inline char *dup_token(const char **buf, size_t *lenp) * * Note: rbd_dev is assumed to have been initially zero-filled. */ -static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev, - const char *buf, - struct rbd_options **opts) +static int rbd_add_parse_args(struct rbd_device *rbd_dev, + const char *buf, + struct ceph_options **ceph_opts, + struct rbd_options **opts) { size_t len; const char *mon_addrs; size_t mon_addrs_size; char *options; - struct ceph_options *err_ptr = ERR_PTR(-EINVAL); - struct ceph_options *ceph_opts; struct rbd_options *rbd_opts = NULL; + int ret; /* The first four tokens are required */ len = next_token(&buf); if (!len) - return err_ptr; /* Missing monitor address(es) */ + return -EINVAL; /* Missing monitor address(es) */ mon_addrs = buf; mon_addrs_size = len + 1; buf += len; + ret = -EINVAL; options = dup_token(&buf, NULL); if (!options) - goto out_mem; + return -ENOMEM; if (!*options) goto out_err; /* Missing options */ @@ -2889,7 +2890,7 @@ static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev, buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ len = sizeof (RBD_SNAP_HEAD_NAME) - 1; } else if (len > RBD_MAX_SNAP_NAME_LEN) { - err_ptr = ERR_PTR(-ENAMETOOLONG); + ret = -ENAMETOOLONG; goto out_err; } rbd_dev->snap_name = kmalloc(len + 1, GFP_KERNEL); @@ -2906,15 +2907,19 @@ static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev, rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; - ceph_opts = ceph_parse_options(options, mon_addrs, + *ceph_opts = ceph_parse_options(options, mon_addrs, mon_addrs + mon_addrs_size - 1, parse_rbd_opts_token, rbd_opts); kfree(options); + if (IS_ERR(*ceph_opts)) { + ret = PTR_ERR(*ceph_opts); + goto out_err; + } *opts = rbd_opts; - return ceph_opts; + return 0; out_mem: - err_ptr = ERR_PTR(-ENOMEM); + ret = -ENOMEM; out_err: kfree(rbd_dev->image_name); rbd_dev->image_name = NULL; @@ -2923,7 +2928,7 @@ out_err: rbd_dev->pool_name = NULL; kfree(options); - return err_ptr; + return ret; } /* @@ -3131,7 +3136,7 @@ static ssize_t rbd_add(struct bus_type *bus, size_t count) { struct rbd_device *rbd_dev = NULL; - struct ceph_options *ceph_opts; + struct ceph_options *ceph_opts = NULL; struct rbd_options *rbd_opts = NULL; struct ceph_osd_client *osdc; int rc = -ENOMEM; @@ -3150,11 +3155,9 @@ static ssize_t rbd_add(struct bus_type *bus, init_rwsem(&rbd_dev->header_rwsem); /* parse add command */ - ceph_opts = rbd_add_parse_args(rbd_dev, buf, &rbd_opts); - if (IS_ERR(ceph_opts)) { - rc = PTR_ERR(ceph_opts); + rc = rbd_add_parse_args(rbd_dev, buf, &ceph_opts, &rbd_opts); + if (rc < 0) goto err_out_mem; - } rbd_dev->mapping.read_only = rbd_opts->read_only; rc = rbd_get_client(rbd_dev, ceph_opts); From 0d7dbfce9d6e3a57a6946fadf7f92b1792b8acc0 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Oct 2012 23:34:41 -0500 Subject: [PATCH 33/68] rbd: define image specification structure Group the fields that uniquely specify an rbd image into a new reference-counted rbd_spec structure. This structure will be used to describe the desired image when mapping an image, and when probing parent images in layered rbd devices. Replace the set of fields in the rbd device structure with a pointer to a dynamically allocated rbd_spec. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 158 +++++++++++++++++++++++++------------------- 1 file changed, 90 insertions(+), 68 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ab6e6d8364e..2049810fcdc 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -112,6 +112,27 @@ struct rbd_image_header { u64 obj_version; }; +/* + * An rbd image specification. + * + * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely + * identify an image. + */ +struct rbd_spec { + u64 pool_id; + char *pool_name; + + char *image_id; + size_t image_id_len; + char *image_name; + size_t image_name_len; + + u64 snap_id; + char *snap_name; + + struct kref kref; +}; + struct rbd_options { bool read_only; }; @@ -189,16 +210,9 @@ struct rbd_device { struct rbd_image_header header; bool exists; - char *image_id; - size_t image_id_len; - char *image_name; - size_t image_name_len; - char *header_name; - char *pool_name; - u64 pool_id; + struct rbd_spec *spec; - char *snap_name; - u64 snap_id; + char *header_name; struct ceph_osd_event *watch_event; struct ceph_osd_request *watch_request; @@ -654,7 +668,7 @@ static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) list_for_each_entry(snap, &rbd_dev->snaps, node) { if (!strcmp(snap_name, snap->name)) { - rbd_dev->snap_id = snap->id; + rbd_dev->spec->snap_id = snap->id; rbd_dev->mapping.size = snap->size; rbd_dev->mapping.features = snap->features; @@ -669,14 +683,14 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) { int ret; - if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME, + if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME, sizeof (RBD_SNAP_HEAD_NAME))) { - rbd_dev->snap_id = CEPH_NOSNAP; + rbd_dev->spec->snap_id = CEPH_NOSNAP; rbd_dev->mapping.size = rbd_dev->header.image_size; rbd_dev->mapping.features = rbd_dev->header.features; ret = 0; } else { - ret = snap_by_name(rbd_dev, rbd_dev->snap_name); + ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name); if (ret < 0) goto done; rbd_dev->mapping.read_only = true; @@ -1096,7 +1110,7 @@ static int rbd_do_request(struct request *rq, layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); layout->fl_stripe_count = cpu_to_le32(1); layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); - layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->pool_id); + layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id); ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, req, ops); rbd_assert(ret == 0); @@ -1261,7 +1275,7 @@ static int rbd_do_op(struct request *rq, opcode = CEPH_OSD_OP_READ; flags = CEPH_OSD_FLAG_READ; snapc = NULL; - snapid = rbd_dev->snap_id; + snapid = rbd_dev->spec->snap_id; payload_len = 0; } @@ -1545,7 +1559,7 @@ static void rbd_rq_fn(struct request_queue *q) down_read(&rbd_dev->header_rwsem); if (!rbd_dev->exists) { - rbd_assert(rbd_dev->snap_id != CEPH_NOSNAP); + rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); up_read(&rbd_dev->header_rwsem); dout("request for non-existent snapshot"); spin_lock_irq(q->queue_lock); @@ -1726,13 +1740,13 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) ret = -ENXIO; pr_warning("short header read for image %s" " (want %zd got %d)\n", - rbd_dev->image_name, size, ret); + rbd_dev->spec->image_name, size, ret); goto out_err; } if (!rbd_dev_ondisk_valid(ondisk)) { ret = -ENXIO; pr_warning("invalid header for image %s\n", - rbd_dev->image_name); + rbd_dev->spec->image_name); goto out_err; } @@ -1783,7 +1797,7 @@ static void rbd_update_mapping_size(struct rbd_device *rbd_dev) { sector_t size; - if (rbd_dev->snap_id != CEPH_NOSNAP) + if (rbd_dev->spec->snap_id != CEPH_NOSNAP) return; size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE; @@ -1957,7 +1971,7 @@ static ssize_t rbd_pool_show(struct device *dev, { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - return sprintf(buf, "%s\n", rbd_dev->pool_name); + return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); } static ssize_t rbd_pool_id_show(struct device *dev, @@ -1965,7 +1979,8 @@ static ssize_t rbd_pool_id_show(struct device *dev, { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - return sprintf(buf, "%llu\n", (unsigned long long) rbd_dev->pool_id); + return sprintf(buf, "%llu\n", + (unsigned long long) rbd_dev->spec->pool_id); } static ssize_t rbd_name_show(struct device *dev, @@ -1973,7 +1988,7 @@ static ssize_t rbd_name_show(struct device *dev, { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - return sprintf(buf, "%s\n", rbd_dev->image_name); + return sprintf(buf, "%s\n", rbd_dev->spec->image_name); } static ssize_t rbd_image_id_show(struct device *dev, @@ -1981,7 +1996,7 @@ static ssize_t rbd_image_id_show(struct device *dev, { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - return sprintf(buf, "%s\n", rbd_dev->image_id); + return sprintf(buf, "%s\n", rbd_dev->spec->image_id); } /* @@ -1994,7 +2009,7 @@ static ssize_t rbd_snap_show(struct device *dev, { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - return sprintf(buf, "%s\n", rbd_dev->snap_name); + return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); } static ssize_t rbd_image_refresh(struct device *dev, @@ -2547,11 +2562,12 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) /* Existing snapshot not in the new snap context */ - if (rbd_dev->snap_id == snap->id) + if (rbd_dev->spec->snap_id == snap->id) rbd_dev->exists = false; rbd_remove_snap_dev(snap); dout("%ssnap id %llu has been removed\n", - rbd_dev->snap_id == snap->id ? "mapped " : "", + rbd_dev->spec->snap_id == snap->id ? + "mapped " : "", (unsigned long long) snap->id); /* Done with this list entry; advance */ @@ -2869,16 +2885,17 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev, if (!*options) goto out_err; /* Missing options */ - rbd_dev->pool_name = dup_token(&buf, NULL); - if (!rbd_dev->pool_name) + rbd_dev->spec->pool_name = dup_token(&buf, NULL); + if (!rbd_dev->spec->pool_name) goto out_mem; - if (!*rbd_dev->pool_name) + if (!*rbd_dev->spec->pool_name) goto out_err; /* Missing pool name */ - rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len); - if (!rbd_dev->image_name) + rbd_dev->spec->image_name = + dup_token(&buf, &rbd_dev->spec->image_name_len); + if (!rbd_dev->spec->image_name) goto out_mem; - if (!*rbd_dev->image_name) + if (!*rbd_dev->spec->image_name) goto out_err; /* Missing image name */ /* @@ -2893,11 +2910,11 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev, ret = -ENAMETOOLONG; goto out_err; } - rbd_dev->snap_name = kmalloc(len + 1, GFP_KERNEL); - if (!rbd_dev->snap_name) + rbd_dev->spec->snap_name = kmalloc(len + 1, GFP_KERNEL); + if (!rbd_dev->spec->snap_name) goto out_mem; - memcpy(rbd_dev->snap_name, buf, len); - *(rbd_dev->snap_name + len) = '\0'; + memcpy(rbd_dev->spec->snap_name, buf, len); + *(rbd_dev->spec->snap_name + len) = '\0'; /* Initialize all rbd options to the defaults */ @@ -2921,11 +2938,11 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev, out_mem: ret = -ENOMEM; out_err: - kfree(rbd_dev->image_name); - rbd_dev->image_name = NULL; - rbd_dev->image_name_len = 0; - kfree(rbd_dev->pool_name); - rbd_dev->pool_name = NULL; + kfree(rbd_dev->spec->image_name); + rbd_dev->spec->image_name = NULL; + rbd_dev->spec->image_name_len = 0; + kfree(rbd_dev->spec->pool_name); + rbd_dev->spec->pool_name = NULL; kfree(options); return ret; @@ -2957,11 +2974,11 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) * First, see if the format 2 image id file exists, and if * so, get the image's persistent id from it. */ - size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len; + size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len; object_name = kmalloc(size, GFP_NOIO); if (!object_name) return -ENOMEM; - sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name); + sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); dout("rbd id object name is %s\n", object_name); /* Response will be an encoded string, which includes a length */ @@ -2984,15 +3001,15 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) ret = 0; /* rbd_req_sync_exec() can return positive */ p = response; - rbd_dev->image_id = ceph_extract_encoded_string(&p, + rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, p + RBD_IMAGE_ID_LEN_MAX, - &rbd_dev->image_id_len, + &rbd_dev->spec->image_id_len, GFP_NOIO); - if (IS_ERR(rbd_dev->image_id)) { - ret = PTR_ERR(rbd_dev->image_id); - rbd_dev->image_id = NULL; + if (IS_ERR(rbd_dev->spec->image_id)) { + ret = PTR_ERR(rbd_dev->spec->image_id); + rbd_dev->spec->image_id = NULL; } else { - dout("image_id is %s\n", rbd_dev->image_id); + dout("image_id is %s\n", rbd_dev->spec->image_id); } out: kfree(response); @@ -3008,20 +3025,21 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) /* Version 1 images have no id; empty string is used */ - rbd_dev->image_id = kstrdup("", GFP_KERNEL); - if (!rbd_dev->image_id) + rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL); + if (!rbd_dev->spec->image_id) return -ENOMEM; - rbd_dev->image_id_len = 0; + rbd_dev->spec->image_id_len = 0; /* Record the header object name for this rbd image. */ - size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX); + size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX); rbd_dev->header_name = kmalloc(size, GFP_KERNEL); if (!rbd_dev->header_name) { ret = -ENOMEM; goto out_err; } - sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX); + sprintf(rbd_dev->header_name, "%s%s", + rbd_dev->spec->image_name, RBD_SUFFIX); /* Populate rbd image metadata */ @@ -3038,8 +3056,8 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) out_err: kfree(rbd_dev->header_name); rbd_dev->header_name = NULL; - kfree(rbd_dev->image_id); - rbd_dev->image_id = NULL; + kfree(rbd_dev->spec->image_id); + rbd_dev->spec->image_id = NULL; return ret; } @@ -3054,12 +3072,12 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) * Image id was filled in by the caller. Record the header * object name for this rbd image. */ - size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len; + size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len; rbd_dev->header_name = kmalloc(size, GFP_KERNEL); if (!rbd_dev->header_name) return -ENOMEM; sprintf(rbd_dev->header_name, "%s%s", - RBD_HEADER_PREFIX, rbd_dev->image_id); + RBD_HEADER_PREFIX, rbd_dev->spec->image_id); /* Get the size and object order for the image */ @@ -3147,6 +3165,9 @@ static ssize_t rbd_add(struct bus_type *bus, rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); if (!rbd_dev) return -ENOMEM; + rbd_dev->spec = kzalloc(sizeof (*rbd_dev->spec), GFP_KERNEL); + if (!rbd_dev->spec) + goto err_out_mem; /* static rbd_device initialization */ spin_lock_init(&rbd_dev->lock); @@ -3167,10 +3188,10 @@ static ssize_t rbd_add(struct bus_type *bus, /* pick the pool */ osdc = &rbd_dev->rbd_client->client->osdc; - rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name); + rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->spec->pool_name); if (rc < 0) goto err_out_client; - rbd_dev->pool_id = (u64) rc; + rbd_dev->spec->pool_id = (u64) rc; rc = rbd_dev_probe(rbd_dev); if (rc < 0) @@ -3257,15 +3278,16 @@ err_out_probe: err_out_client: kfree(rbd_dev->header_name); rbd_put_client(rbd_dev); - kfree(rbd_dev->image_id); + kfree(rbd_dev->spec->image_id); err_out_args: if (ceph_opts) ceph_destroy_options(ceph_opts); - kfree(rbd_dev->snap_name); - kfree(rbd_dev->image_name); - kfree(rbd_dev->pool_name); + kfree(rbd_dev->spec->snap_name); + kfree(rbd_dev->spec->image_name); + kfree(rbd_dev->spec->pool_name); kfree(rbd_opts); err_out_mem: + kfree(rbd_dev->spec); kfree(rbd_dev); dout("Error adding device %s\n", buf); @@ -3314,11 +3336,11 @@ static void rbd_dev_release(struct device *dev) rbd_header_free(&rbd_dev->header); /* done with the id, and with the rbd_dev */ - kfree(rbd_dev->snap_name); - kfree(rbd_dev->image_id); + kfree(rbd_dev->spec->snap_name); + kfree(rbd_dev->spec->image_id); kfree(rbd_dev->header_name); - kfree(rbd_dev->pool_name); - kfree(rbd_dev->image_name); + kfree(rbd_dev->spec->pool_name); + kfree(rbd_dev->spec->image_name); rbd_dev_id_put(rbd_dev); kfree(rbd_dev); From 8b8fb99c5c93a0bdfe7b0c0c9fd2d41a3244555e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 26 Oct 2012 17:25:24 -0500 Subject: [PATCH 34/68] rbd: add reference counting to rbd_spec With layered images we'll share rbd_spec structures, so add a reference count to it. It neatens up some code also. A silly get/put pair is added to the alloc routine just to avoid "defined but not used" warnings. It will go away soon. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 52 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 2049810fcdc..86206a75017 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2134,6 +2134,45 @@ static struct device_type rbd_snap_device_type = { .release = rbd_snap_dev_release, }; +static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) +{ + kref_get(&spec->kref); + + return spec; +} + +static void rbd_spec_free(struct kref *kref); +static void rbd_spec_put(struct rbd_spec *spec) +{ + if (spec) + kref_put(&spec->kref, rbd_spec_free); +} + +static struct rbd_spec *rbd_spec_alloc(void) +{ + struct rbd_spec *spec; + + spec = kzalloc(sizeof (*spec), GFP_KERNEL); + if (!spec) + return NULL; + kref_init(&spec->kref); + + rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */ + + return spec; +} + +static void rbd_spec_free(struct kref *kref) +{ + struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); + + kfree(spec->pool_name); + kfree(spec->image_id); + kfree(spec->image_name); + kfree(spec->snap_name); + kfree(spec); +} + static bool rbd_snap_registered(struct rbd_snap *snap) { bool ret = snap->dev.type == &rbd_snap_device_type; @@ -3165,7 +3204,7 @@ static ssize_t rbd_add(struct bus_type *bus, rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); if (!rbd_dev) return -ENOMEM; - rbd_dev->spec = kzalloc(sizeof (*rbd_dev->spec), GFP_KERNEL); + rbd_dev->spec = rbd_spec_alloc(); if (!rbd_dev->spec) goto err_out_mem; @@ -3278,16 +3317,12 @@ err_out_probe: err_out_client: kfree(rbd_dev->header_name); rbd_put_client(rbd_dev); - kfree(rbd_dev->spec->image_id); err_out_args: if (ceph_opts) ceph_destroy_options(ceph_opts); - kfree(rbd_dev->spec->snap_name); - kfree(rbd_dev->spec->image_name); - kfree(rbd_dev->spec->pool_name); kfree(rbd_opts); err_out_mem: - kfree(rbd_dev->spec); + rbd_spec_put(rbd_dev->spec); kfree(rbd_dev); dout("Error adding device %s\n", buf); @@ -3336,12 +3371,9 @@ static void rbd_dev_release(struct device *dev) rbd_header_free(&rbd_dev->header); /* done with the id, and with the rbd_dev */ - kfree(rbd_dev->spec->snap_name); - kfree(rbd_dev->spec->image_id); kfree(rbd_dev->header_name); - kfree(rbd_dev->spec->pool_name); - kfree(rbd_dev->spec->image_name); rbd_dev_id_put(rbd_dev); + rbd_spec_put(rbd_dev->spec); kfree(rbd_dev); /* release module ref */ From 859c31df9cee9d1e1308b3b024b61355e6a629a5 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Oct 2012 23:34:42 -0500 Subject: [PATCH 35/68] rbd: fill rbd_spec in rbd_add_parse_args() Pass the address of an rbd_spec structure to rbd_add_parse_args(). Use it to hold the information defining the rbd image to be mapped in an rbd_add() call. Use the result in the caller to initialize the rbd_dev->id field. This means rbd_dev is no longer needed in rbd_add_parse_args(), so get rid of it. Now that this transformation of rbd_add_parse_args() is complete, correct and expand on the its header documentation to reflect the new reality. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 113 +++++++++++++++++++++++++++++--------------- 1 file changed, 75 insertions(+), 38 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 86206a75017..be85d925dfd 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2887,25 +2887,58 @@ static inline char *dup_token(const char **buf, size_t *lenp) } /* - * This fills in the pool_name, image_name, image_name_len, rbd_dev, - * rbd_md_name, and name fields of the given rbd_dev, based on the - * list of monitor addresses and other options provided via - * /sys/bus/rbd/add. Returns a pointer to a dynamically-allocated - * copy of the snapshot name to map if successful, or a - * pointer-coded error otherwise. + * Parse the options provided for an "rbd add" (i.e., rbd image + * mapping) request. These arrive via a write to /sys/bus/rbd/add, + * and the data written is passed here via a NUL-terminated buffer. + * Returns 0 if successful or an error code otherwise. * - * Note: rbd_dev is assumed to have been initially zero-filled. + * The information extracted from these options is recorded in + * the other parameters which return dynamically-allocated + * structures: + * ceph_opts + * The address of a pointer that will refer to a ceph options + * structure. Caller must release the returned pointer using + * ceph_destroy_options() when it is no longer needed. + * rbd_opts + * Address of an rbd options pointer. Fully initialized by + * this function; caller must release with kfree(). + * spec + * Address of an rbd image specification pointer. Fully + * initialized by this function based on parsed options. + * Caller must release with rbd_spec_put(). + * + * The options passed take this form: + * [] + * where: + * + * A comma-separated list of one or more monitor addresses. + * A monitor address is an ip address, optionally followed + * by a port number (separated by a colon). + * I.e.: ip1[:port1][,ip2[:port2]...] + * + * A comma-separated list of ceph and/or rbd options. + * + * The name of the rados pool containing the rbd image. + * + * The name of the image in that pool to map. + * + * An optional snapshot id. If provided, the mapping will + * present data from the image at the time that snapshot was + * created. The image head is used if no snapshot id is + * provided. Snapshot mappings are always read-only. */ -static int rbd_add_parse_args(struct rbd_device *rbd_dev, - const char *buf, +static int rbd_add_parse_args(const char *buf, struct ceph_options **ceph_opts, - struct rbd_options **opts) + struct rbd_options **opts, + struct rbd_spec **rbd_spec) { size_t len; + char *options; const char *mon_addrs; size_t mon_addrs_size; - char *options; + struct rbd_spec *spec = NULL; struct rbd_options *rbd_opts = NULL; + struct ceph_options *copts; int ret; /* The first four tokens are required */ @@ -2924,17 +2957,20 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev, if (!*options) goto out_err; /* Missing options */ - rbd_dev->spec->pool_name = dup_token(&buf, NULL); - if (!rbd_dev->spec->pool_name) + spec = rbd_spec_alloc(); + if (!spec) goto out_mem; - if (!*rbd_dev->spec->pool_name) + + spec->pool_name = dup_token(&buf, NULL); + if (!spec->pool_name) + goto out_mem; + if (!*spec->pool_name) goto out_err; /* Missing pool name */ - rbd_dev->spec->image_name = - dup_token(&buf, &rbd_dev->spec->image_name_len); - if (!rbd_dev->spec->image_name) + spec->image_name = dup_token(&buf, &spec->image_name_len); + if (!spec->image_name) goto out_mem; - if (!*rbd_dev->spec->image_name) + if (!*spec->image_name) goto out_err; /* Missing image name */ /* @@ -2949,11 +2985,11 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev, ret = -ENAMETOOLONG; goto out_err; } - rbd_dev->spec->snap_name = kmalloc(len + 1, GFP_KERNEL); - if (!rbd_dev->spec->snap_name) + spec->snap_name = kmalloc(len + 1, GFP_KERNEL); + if (!spec->snap_name) goto out_mem; - memcpy(rbd_dev->spec->snap_name, buf, len); - *(rbd_dev->spec->snap_name + len) = '\0'; + memcpy(spec->snap_name, buf, len); + *(spec->snap_name + len) = '\0'; /* Initialize all rbd options to the defaults */ @@ -2963,25 +2999,25 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev, rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; - *ceph_opts = ceph_parse_options(options, mon_addrs, + copts = ceph_parse_options(options, mon_addrs, mon_addrs + mon_addrs_size - 1, parse_rbd_opts_token, rbd_opts); - kfree(options); - if (IS_ERR(*ceph_opts)) { - ret = PTR_ERR(*ceph_opts); + if (IS_ERR(copts)) { + ret = PTR_ERR(copts); goto out_err; } + kfree(options); + + *ceph_opts = copts; *opts = rbd_opts; + *rbd_spec = spec; return 0; out_mem: ret = -ENOMEM; out_err: - kfree(rbd_dev->spec->image_name); - rbd_dev->spec->image_name = NULL; - rbd_dev->spec->image_name_len = 0; - kfree(rbd_dev->spec->pool_name); - rbd_dev->spec->pool_name = NULL; + kfree(rbd_opts); + rbd_spec_put(spec); kfree(options); return ret; @@ -3195,6 +3231,7 @@ static ssize_t rbd_add(struct bus_type *bus, struct rbd_device *rbd_dev = NULL; struct ceph_options *ceph_opts = NULL; struct rbd_options *rbd_opts = NULL; + struct rbd_spec *spec = NULL; struct ceph_osd_client *osdc; int rc = -ENOMEM; @@ -3204,9 +3241,6 @@ static ssize_t rbd_add(struct bus_type *bus, rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); if (!rbd_dev) return -ENOMEM; - rbd_dev->spec = rbd_spec_alloc(); - if (!rbd_dev->spec) - goto err_out_mem; /* static rbd_device initialization */ spin_lock_init(&rbd_dev->lock); @@ -3215,9 +3249,10 @@ static ssize_t rbd_add(struct bus_type *bus, init_rwsem(&rbd_dev->header_rwsem); /* parse add command */ - rc = rbd_add_parse_args(rbd_dev, buf, &ceph_opts, &rbd_opts); + rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); if (rc < 0) goto err_out_mem; + rbd_dev->mapping.read_only = rbd_opts->read_only; rc = rbd_get_client(rbd_dev, ceph_opts); @@ -3227,10 +3262,12 @@ static ssize_t rbd_add(struct bus_type *bus, /* pick the pool */ osdc = &rbd_dev->rbd_client->client->osdc; - rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->spec->pool_name); + rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name); if (rc < 0) goto err_out_client; - rbd_dev->spec->pool_id = (u64) rc; + spec->pool_id = (u64) rc; + + rbd_dev->spec = spec; rc = rbd_dev_probe(rbd_dev); if (rc < 0) @@ -3321,8 +3358,8 @@ err_out_args: if (ceph_opts) ceph_destroy_options(ceph_opts); kfree(rbd_opts); + rbd_spec_put(spec); err_out_mem: - rbd_spec_put(rbd_dev->spec); kfree(rbd_dev); dout("Error adding device %s\n", buf); From 9d3997fdf4c82adfb37a4886a21eaa513ee071b6 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Oct 2012 23:34:42 -0500 Subject: [PATCH 36/68] rbd: don't pass rbd_dev to rbd_get_client() The only reason rbd_dev is passed to rbd_get_client() is so its rbd_client field can get assigned. Instead, just return the rbd_client pointer as a result and have the caller do the assignment. Change rbd_put_client() so it takes an rbd_client structure, so follows the more typical symmetry with rbd_get_client(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index be85d925dfd..a528d4ca7a6 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -467,23 +467,17 @@ static int parse_rbd_opts_token(char *c, void *private) * Get a ceph client with specific addr and configuration, if one does * not exist create it. */ -static int rbd_get_client(struct rbd_device *rbd_dev, - struct ceph_options *ceph_opts) +static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) { struct rbd_client *rbdc; rbdc = rbd_client_find(ceph_opts); - if (rbdc) { - /* using an existing client */ + if (rbdc) /* using an existing client */ ceph_destroy_options(ceph_opts); - } else { + else rbdc = rbd_client_create(ceph_opts); - if (IS_ERR(rbdc)) - return PTR_ERR(rbdc); - } - rbd_dev->rbd_client = rbdc; - return 0; + return rbdc; } /* @@ -508,10 +502,9 @@ static void rbd_client_release(struct kref *kref) * Drop reference to ceph client node. If it's not referenced anymore, release * it. */ -static void rbd_put_client(struct rbd_device *rbd_dev) +static void rbd_put_client(struct rbd_client *rbdc) { - kref_put(&rbd_dev->rbd_client->kref, rbd_client_release); - rbd_dev->rbd_client = NULL; + kref_put(&rbdc->kref, rbd_client_release); } /* @@ -3232,6 +3225,7 @@ static ssize_t rbd_add(struct bus_type *bus, struct ceph_options *ceph_opts = NULL; struct rbd_options *rbd_opts = NULL; struct rbd_spec *spec = NULL; + struct rbd_client *rbdc; struct ceph_osd_client *osdc; int rc = -ENOMEM; @@ -3255,13 +3249,16 @@ static ssize_t rbd_add(struct bus_type *bus, rbd_dev->mapping.read_only = rbd_opts->read_only; - rc = rbd_get_client(rbd_dev, ceph_opts); - if (rc < 0) + rbdc = rbd_get_client(ceph_opts); + if (IS_ERR(rbdc)) { + rc = PTR_ERR(rbdc); goto err_out_args; + } + rbd_dev->rbd_client = rbdc; ceph_opts = NULL; /* ceph_opts now owned by rbd_dev client */ /* pick the pool */ - osdc = &rbd_dev->rbd_client->client->osdc; + osdc = &rbdc->client->osdc; rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name); if (rc < 0) goto err_out_client; @@ -3353,7 +3350,7 @@ err_out_probe: rbd_header_free(&rbd_dev->header); err_out_client: kfree(rbd_dev->header_name); - rbd_put_client(rbd_dev); + rbd_put_client(rbdc); err_out_args: if (ceph_opts) ceph_destroy_options(ceph_opts); @@ -3398,7 +3395,7 @@ static void rbd_dev_release(struct device *dev) if (rbd_dev->watch_event) rbd_req_sync_unwatch(rbd_dev); - rbd_put_client(rbd_dev); + rbd_put_client(rbd_dev->rbd_client); /* clean up and free blkdev */ rbd_free_disk(rbd_dev); From bd4ba6554dcbae652b8b27a44f5a7795c9f3178a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Oct 2012 23:34:42 -0500 Subject: [PATCH 37/68] rbd: consolidate rbd_dev init in rbd_add() Group the allocation and initialization of fields of the rbd device structure created in rbd_add(). Move the grouped code down later in the function, just prior to the call to rbd_dev_probe(). This is for the most part simple code movement. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a528d4ca7a6..4771de2fba8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3232,29 +3232,16 @@ static ssize_t rbd_add(struct bus_type *bus, if (!try_module_get(THIS_MODULE)) return -ENODEV; - rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); - if (!rbd_dev) - return -ENOMEM; - - /* static rbd_device initialization */ - spin_lock_init(&rbd_dev->lock); - INIT_LIST_HEAD(&rbd_dev->node); - INIT_LIST_HEAD(&rbd_dev->snaps); - init_rwsem(&rbd_dev->header_rwsem); - /* parse add command */ rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); if (rc < 0) - goto err_out_mem; - - rbd_dev->mapping.read_only = rbd_opts->read_only; + goto err_out_module; rbdc = rbd_get_client(ceph_opts); if (IS_ERR(rbdc)) { rc = PTR_ERR(rbdc); goto err_out_args; } - rbd_dev->rbd_client = rbdc; ceph_opts = NULL; /* ceph_opts now owned by rbd_dev client */ /* pick the pool */ @@ -3264,11 +3251,22 @@ static ssize_t rbd_add(struct bus_type *bus, goto err_out_client; spec->pool_id = (u64) rc; + rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL); + if (!rbd_dev) + goto err_out_client; + + spin_lock_init(&rbd_dev->lock); + INIT_LIST_HEAD(&rbd_dev->node); + INIT_LIST_HEAD(&rbd_dev->snaps); + init_rwsem(&rbd_dev->header_rwsem); + rbd_dev->rbd_client = rbdc; rbd_dev->spec = spec; + rbd_dev->mapping.read_only = rbd_opts->read_only; + rc = rbd_dev_probe(rbd_dev); if (rc < 0) - goto err_out_client; + goto err_out_mem; /* no need to lock here, as rbd_dev is not registered yet */ rc = rbd_dev_snaps_update(rbd_dev); @@ -3348,19 +3346,20 @@ err_out_snaps: rbd_remove_all_snaps(rbd_dev); err_out_probe: rbd_header_free(&rbd_dev->header); -err_out_client: kfree(rbd_dev->header_name); +err_out_mem: + kfree(rbd_dev); +err_out_client: rbd_put_client(rbdc); err_out_args: if (ceph_opts) ceph_destroy_options(ceph_opts); kfree(rbd_opts); rbd_spec_put(spec); -err_out_mem: - kfree(rbd_dev); +err_out_module: + module_put(THIS_MODULE); dout("Error adding device %s\n", buf); - module_put(THIS_MODULE); return (ssize_t) rc; } From c53d589337e9a211413484a604c76072e8474dc0 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Oct 2012 23:34:42 -0500 Subject: [PATCH 38/68] rbd: define rbd_dev_{create,destroy}() helpers Encapsulate the creation/initialization and destruction of rbd device structures. The rbd_client and the rbd_spec structures provided on creation hold references whose ownership is transferred to the new rbd_device structure. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 62 ++++++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4771de2fba8..a8ad8f8370b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -504,7 +504,8 @@ static void rbd_client_release(struct kref *kref) */ static void rbd_put_client(struct rbd_client *rbdc) { - kref_put(&rbdc->kref, rbd_client_release); + if (rbdc) + kref_put(&rbdc->kref, rbd_client_release); } /* @@ -2166,6 +2167,34 @@ static void rbd_spec_free(struct kref *kref) kfree(spec); } +struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, + struct rbd_spec *spec) +{ + struct rbd_device *rbd_dev; + + rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL); + if (!rbd_dev) + return NULL; + + spin_lock_init(&rbd_dev->lock); + INIT_LIST_HEAD(&rbd_dev->node); + INIT_LIST_HEAD(&rbd_dev->snaps); + init_rwsem(&rbd_dev->header_rwsem); + + rbd_dev->spec = spec; + rbd_dev->rbd_client = rbdc; + + return rbd_dev; +} + +static void rbd_dev_destroy(struct rbd_device *rbd_dev) +{ + kfree(rbd_dev->header_name); + rbd_put_client(rbd_dev->rbd_client); + rbd_spec_put(rbd_dev->spec); + kfree(rbd_dev); +} + static bool rbd_snap_registered(struct rbd_snap *snap) { bool ret = snap->dev.type == &rbd_snap_device_type; @@ -3242,7 +3271,7 @@ static ssize_t rbd_add(struct bus_type *bus, rc = PTR_ERR(rbdc); goto err_out_args; } - ceph_opts = NULL; /* ceph_opts now owned by rbd_dev client */ + ceph_opts = NULL; /* rbd_dev client now owns this */ /* pick the pool */ osdc = &rbdc->client->osdc; @@ -3251,22 +3280,19 @@ static ssize_t rbd_add(struct bus_type *bus, goto err_out_client; spec->pool_id = (u64) rc; - rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL); + rbd_dev = rbd_dev_create(rbdc, spec); if (!rbd_dev) goto err_out_client; - - spin_lock_init(&rbd_dev->lock); - INIT_LIST_HEAD(&rbd_dev->node); - INIT_LIST_HEAD(&rbd_dev->snaps); - init_rwsem(&rbd_dev->header_rwsem); - rbd_dev->rbd_client = rbdc; - rbd_dev->spec = spec; + rbdc = NULL; /* rbd_dev now owns this */ + spec = NULL; /* rbd_dev now owns this */ rbd_dev->mapping.read_only = rbd_opts->read_only; + kfree(rbd_opts); + rbd_opts = NULL; /* done with this */ rc = rbd_dev_probe(rbd_dev); if (rc < 0) - goto err_out_mem; + goto err_out_rbd_dev; /* no need to lock here, as rbd_dev is not registered yet */ rc = rbd_dev_snaps_update(rbd_dev); @@ -3317,8 +3343,6 @@ static ssize_t rbd_add(struct bus_type *bus, if (rc) goto err_out_bus; - kfree(rbd_opts); - /* Everything's ready. Announce the disk to the world. */ add_disk(rbd_dev->disk); @@ -3332,7 +3356,6 @@ err_out_bus: /* this will also clean up rest of rbd_dev stuff */ rbd_bus_del_dev(rbd_dev); - kfree(rbd_opts); return rc; @@ -3346,9 +3369,8 @@ err_out_snaps: rbd_remove_all_snaps(rbd_dev); err_out_probe: rbd_header_free(&rbd_dev->header); - kfree(rbd_dev->header_name); -err_out_mem: - kfree(rbd_dev); +err_out_rbd_dev: + rbd_dev_destroy(rbd_dev); err_out_client: rbd_put_client(rbdc); err_out_args: @@ -3394,7 +3416,6 @@ static void rbd_dev_release(struct device *dev) if (rbd_dev->watch_event) rbd_req_sync_unwatch(rbd_dev); - rbd_put_client(rbd_dev->rbd_client); /* clean up and free blkdev */ rbd_free_disk(rbd_dev); @@ -3404,10 +3425,9 @@ static void rbd_dev_release(struct device *dev) rbd_header_free(&rbd_dev->header); /* done with the id, and with the rbd_dev */ - kfree(rbd_dev->header_name); rbd_dev_id_put(rbd_dev); - rbd_spec_put(rbd_dev->spec); - kfree(rbd_dev); + rbd_assert(rbd_dev->rbd_client != NULL); + rbd_dev_destroy(rbd_dev); /* release module ref */ module_put(THIS_MODULE); From 83a06263625b823afa3a842ddbf53473c22f24b2 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 30 Oct 2012 15:47:17 -0500 Subject: [PATCH 39/68] rbd: encapsulate last part of probe Group the activities that now take place after an rbd_dev_probe() call into a single function, and move the call to that function into rbd_dev_probe() itself. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 161 +++++++++++++++++++++++--------------------- 1 file changed, 86 insertions(+), 75 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a8ad8f8370b..8d26c0f2be1 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3221,6 +3221,84 @@ out_err: return ret; } +static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) +{ + int ret; + + /* no need to lock here, as rbd_dev is not registered yet */ + ret = rbd_dev_snaps_update(rbd_dev); + if (ret) + return ret; + + ret = rbd_dev_set_mapping(rbd_dev); + if (ret) + goto err_out_snaps; + + /* generate unique id: find highest unique id, add one */ + rbd_dev_id_get(rbd_dev); + + /* Fill in the device name, now that we have its id. */ + BUILD_BUG_ON(DEV_NAME_LEN + < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); + sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); + + /* Get our block major device number. */ + + ret = register_blkdev(0, rbd_dev->name); + if (ret < 0) + goto err_out_id; + rbd_dev->major = ret; + + /* Set up the blkdev mapping. */ + + ret = rbd_init_disk(rbd_dev); + if (ret) + goto err_out_blkdev; + + ret = rbd_bus_add_dev(rbd_dev); + if (ret) + goto err_out_disk; + + /* + * At this point cleanup in the event of an error is the job + * of the sysfs code (initiated by rbd_bus_del_dev()). + */ + down_write(&rbd_dev->header_rwsem); + ret = rbd_dev_snaps_register(rbd_dev); + up_write(&rbd_dev->header_rwsem); + if (ret) + goto err_out_bus; + + ret = rbd_init_watch_dev(rbd_dev); + if (ret) + goto err_out_bus; + + /* Everything's ready. Announce the disk to the world. */ + + add_disk(rbd_dev->disk); + + pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, + (unsigned long long) rbd_dev->mapping.size); + + return ret; +err_out_bus: + /* this will also clean up rest of rbd_dev stuff */ + + rbd_bus_del_dev(rbd_dev); + + return ret; +err_out_disk: + rbd_free_disk(rbd_dev); +err_out_blkdev: + unregister_blkdev(rbd_dev->major, rbd_dev->name); +err_out_id: + rbd_dev_id_put(rbd_dev); +err_out_snaps: + rbd_remove_all_snaps(rbd_dev); + + return ret; +} + /* * Probe for the existence of the header object for the given rbd * device. For format 2 images this includes determining the image @@ -3240,9 +3318,16 @@ static int rbd_dev_probe(struct rbd_device *rbd_dev) ret = rbd_dev_v1_probe(rbd_dev); else ret = rbd_dev_v2_probe(rbd_dev); - if (ret) + if (ret) { dout("probe failed, returning %d\n", ret); + return ret; + } + + ret = rbd_dev_probe_finish(rbd_dev); + if (ret) + rbd_header_free(&rbd_dev->header); + return ret; } @@ -3294,81 +3379,7 @@ static ssize_t rbd_add(struct bus_type *bus, if (rc < 0) goto err_out_rbd_dev; - /* no need to lock here, as rbd_dev is not registered yet */ - rc = rbd_dev_snaps_update(rbd_dev); - if (rc) - goto err_out_probe; - - rc = rbd_dev_set_mapping(rbd_dev); - if (rc) - goto err_out_snaps; - - /* generate unique id: find highest unique id, add one */ - rbd_dev_id_get(rbd_dev); - - /* Fill in the device name, now that we have its id. */ - BUILD_BUG_ON(DEV_NAME_LEN - < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); - sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); - - /* Get our block major device number. */ - - rc = register_blkdev(0, rbd_dev->name); - if (rc < 0) - goto err_out_id; - rbd_dev->major = rc; - - /* Set up the blkdev mapping. */ - - rc = rbd_init_disk(rbd_dev); - if (rc) - goto err_out_blkdev; - - rc = rbd_bus_add_dev(rbd_dev); - if (rc) - goto err_out_disk; - - /* - * At this point cleanup in the event of an error is the job - * of the sysfs code (initiated by rbd_bus_del_dev()). - */ - - down_write(&rbd_dev->header_rwsem); - rc = rbd_dev_snaps_register(rbd_dev); - up_write(&rbd_dev->header_rwsem); - if (rc) - goto err_out_bus; - - rc = rbd_init_watch_dev(rbd_dev); - if (rc) - goto err_out_bus; - - /* Everything's ready. Announce the disk to the world. */ - - add_disk(rbd_dev->disk); - - pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, - (unsigned long long) rbd_dev->mapping.size); - return count; - -err_out_bus: - /* this will also clean up rest of rbd_dev stuff */ - - rbd_bus_del_dev(rbd_dev); - - return rc; - -err_out_disk: - rbd_free_disk(rbd_dev); -err_out_blkdev: - unregister_blkdev(rbd_dev->major, rbd_dev->name); -err_out_id: - rbd_dev_id_put(rbd_dev); -err_out_snaps: - rbd_remove_all_snaps(rbd_dev); -err_out_probe: - rbd_header_free(&rbd_dev->header); err_out_rbd_dev: rbd_dev_destroy(rbd_dev); err_out_client: From 2c0d0a10ea89456781218f458f6bf72e99d87d2a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 30 Oct 2012 19:40:33 -0500 Subject: [PATCH 40/68] rbd: allow null image name We will know the image id for format 2 parent images, but won't initially know its image name. Avoid making the query for an image id in rbd_dev_image_id() if it's already known. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8d26c0f2be1..a8521338bf4 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3067,6 +3067,14 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) void *response; void *p; + /* + * When probing a parent image, the image id is already + * known (and the image name likely is not). There's no + * need to fetch the image id again in this case. + */ + if (rbd_dev->spec->image_id) + return 0; + /* * First, see if the format 2 image id file exists, and if * so, get the image's persistent id from it. From a92ffdf8a9b09f8fae9a8f418f87f30a5e459570 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 30 Oct 2012 19:40:33 -0500 Subject: [PATCH 41/68] rbd: allow null image name Format 2 parent images are partially identified by their image id, but it may not be possible to determine their image name. The name is not strictly needed for correct operation, so we won't be treating it as an error if we don't know it. Handle this case gracefully in rbd_name_show(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a8521338bf4..28052ff679c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1982,7 +1982,10 @@ static ssize_t rbd_name_show(struct device *dev, { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - return sprintf(buf, "%s\n", rbd_dev->spec->image_name); + if (rbd_dev->spec->image_name) + return sprintf(buf, "%s\n", rbd_dev->spec->image_name); + + return sprintf(buf, "(unknown)\n"); } static ssize_t rbd_image_id_show(struct device *dev, From 86b00e0da6be7bbc16412f126c5b548ac5d91d50 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Oct 2012 23:34:42 -0500 Subject: [PATCH 42/68] rbd: get parent spec for version 2 images Add support for getting the the information identifying the parent image for rbd images that have them. The child image holds a reference to its parent image specification structure. Create a new entry "parent" in /sys/bus/rbd/image/N/ to report the identifying information for the parent image, if any. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- Documentation/ABI/testing/sysfs-bus-rbd | 4 + drivers/block/rbd.c | 131 ++++++++++++++++++++++++ include/linux/ceph/rados.h | 2 + 3 files changed, 137 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-bus-rbd b/Documentation/ABI/testing/sysfs-bus-rbd index 1cf2adf46b1..cd9213ccf3d 100644 --- a/Documentation/ABI/testing/sysfs-bus-rbd +++ b/Documentation/ABI/testing/sysfs-bus-rbd @@ -70,6 +70,10 @@ snap_* A directory per each snapshot +parent + + Information identifying the pool, image, and snapshot id for + the parent image in a layered rbd image (format 2 only). Entries under /sys/bus/rbd/devices//snap_ ------------------------------------------------------------- diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 28052ff679c..bce1fcfb518 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -217,6 +217,9 @@ struct rbd_device { struct ceph_osd_event *watch_event; struct ceph_osd_request *watch_request; + struct rbd_spec *parent_spec; + u64 parent_overlap; + /* protects updating the header */ struct rw_semaphore header_rwsem; @@ -2009,6 +2012,49 @@ static ssize_t rbd_snap_show(struct device *dev, return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); } +/* + * For an rbd v2 image, shows the pool id, image id, and snapshot id + * for the parent image. If there is no parent, simply shows + * "(no parent image)". + */ +static ssize_t rbd_parent_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + struct rbd_spec *spec = rbd_dev->parent_spec; + int count; + char *bufp = buf; + + if (!spec) + return sprintf(buf, "(no parent image)\n"); + + count = sprintf(bufp, "pool_id %llu\npool_name %s\n", + (unsigned long long) spec->pool_id, spec->pool_name); + if (count < 0) + return count; + bufp += count; + + count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id, + spec->image_name ? spec->image_name : "(unknown)"); + if (count < 0) + return count; + bufp += count; + + count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n", + (unsigned long long) spec->snap_id, spec->snap_name); + if (count < 0) + return count; + bufp += count; + + count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap); + if (count < 0) + return count; + bufp += count; + + return (ssize_t) (bufp - buf); +} + static ssize_t rbd_image_refresh(struct device *dev, struct device_attribute *attr, const char *buf, @@ -2032,6 +2078,7 @@ static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); +static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); static struct attribute *rbd_attrs[] = { &dev_attr_size.attr, @@ -2043,6 +2090,7 @@ static struct attribute *rbd_attrs[] = { &dev_attr_name.attr, &dev_attr_image_id.attr, &dev_attr_current_snap.attr, + &dev_attr_parent.attr, &dev_attr_refresh.attr, NULL }; @@ -2192,6 +2240,7 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, static void rbd_dev_destroy(struct rbd_device *rbd_dev) { + rbd_spec_put(rbd_dev->parent_spec); kfree(rbd_dev->header_name); rbd_put_client(rbd_dev->rbd_client); rbd_spec_put(rbd_dev->spec); @@ -2400,6 +2449,71 @@ static int rbd_dev_v2_features(struct rbd_device *rbd_dev) &rbd_dev->header.features); } +static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) +{ + struct rbd_spec *parent_spec; + size_t size; + void *reply_buf = NULL; + __le64 snapid; + void *p; + void *end; + char *image_id; + u64 overlap; + size_t len = 0; + int ret; + + parent_spec = rbd_spec_alloc(); + if (!parent_spec) + return -ENOMEM; + + size = sizeof (__le64) + /* pool_id */ + sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ + sizeof (__le64) + /* snap_id */ + sizeof (__le64); /* overlap */ + reply_buf = kmalloc(size, GFP_KERNEL); + if (!reply_buf) { + ret = -ENOMEM; + goto out_err; + } + + snapid = cpu_to_le64(CEPH_NOSNAP); + ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + "rbd", "get_parent", + (char *) &snapid, sizeof (snapid), + (char *) reply_buf, size, + CEPH_OSD_FLAG_READ, NULL); + dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + if (ret < 0) + goto out_err; + + ret = -ERANGE; + p = reply_buf; + end = (char *) reply_buf + size; + ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err); + if (parent_spec->pool_id == CEPH_NOPOOL) + goto out; /* No parent? No problem. */ + + image_id = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); + if (IS_ERR(image_id)) { + ret = PTR_ERR(image_id); + goto out_err; + } + parent_spec->image_id = image_id; + ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); + ceph_decode_64_safe(&p, end, overlap, out_err); + + rbd_dev->parent_overlap = overlap; + rbd_dev->parent_spec = parent_spec; + parent_spec = NULL; /* rbd_dev now owns this */ +out: + ret = 0; +out_err: + kfree(reply_buf); + rbd_spec_put(parent_spec); + + return ret; +} + static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) { size_t size; @@ -3154,6 +3268,12 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) ret = rbd_read_header(rbd_dev, &rbd_dev->header); if (ret < 0) goto out_err; + + /* Version 1 images have no parent (no layering) */ + + rbd_dev->parent_spec = NULL; + rbd_dev->parent_overlap = 0; + rbd_dev->image_format = 1; dout("discovered version 1 image, header name is %s\n", @@ -3205,6 +3325,14 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) if (ret < 0) goto out_err; + /* If the image supports layering, get the parent info */ + + if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { + ret = rbd_dev_v2_parent_info(rbd_dev); + if (ret < 0) + goto out_err; + } + /* crypto and compression type aren't (yet) supported for v2 images */ rbd_dev->header.crypt_type = 0; @@ -3224,6 +3352,9 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) return 0; out_err: + rbd_dev->parent_overlap = 0; + rbd_spec_put(rbd_dev->parent_spec); + rbd_dev->parent_spec = NULL; kfree(rbd_dev->header_name); rbd_dev->header_name = NULL; kfree(rbd_dev->header.object_prefix); diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 0a99099801a..15077db662e 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -87,6 +87,8 @@ struct ceph_pg { * * lpgp_num -- as above. */ +#define CEPH_NOPOOL ((__u64) (-1)) /* pool id not defined */ + #define CEPH_PG_TYPE_REP 1 #define CEPH_PG_TYPE_RAID4 2 #define CEPH_PG_POOL_VERSION 2 From 72afc71ffca0f444ee0e1ef8c7e34ab209bb48b3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 30 Oct 2012 19:40:33 -0500 Subject: [PATCH 43/68] libceph: define ceph_pg_pool_name_by_id() Define and export function ceph_pg_pool_name_by_id() to supply the name of a pg pool whose id is given. This will be used by the next patch. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osdmap.h | 1 + net/ceph/osdmap.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index e88a620b9f8..5ea57ba6932 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -123,6 +123,7 @@ extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid); +extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id); extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name); #endif diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index f552aa48fd9..de73214b5d2 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -469,6 +469,22 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) return NULL; } +const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id) +{ + struct ceph_pg_pool_info *pi; + + if (id == CEPH_NOPOOL) + return NULL; + + if (WARN_ON_ONCE(id > (u64) INT_MAX)) + return NULL; + + pi = __lookup_pg_pool(&map->pg_pools, (int) id); + + return pi ? pi->name : NULL; +} +EXPORT_SYMBOL(ceph_pg_pool_name_by_id); + int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name) { struct rb_node *rbp; From 9e15b77d9af3b63dfbff14e695336dfca88c22b2 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 30 Oct 2012 19:40:33 -0500 Subject: [PATCH 44/68] rbd: get additional info in parent spec When a layered rbd image has a parent, that parent is identified only by its pool id, image id, and snapshot id. Images that have been mapped also record *names* for those three id's. Add code to look up these names for parent images so they match mapped images more closely. Skip doing this for an image if it already has its pool name defined (this will be the case for images mapped by the user). It is possible that an the name of a parent image can't be determined, even if the image id is valid. If this occurs it does not preclude correct operation, so don't treat this as an error. On the other hand, defined pools will always have both an id and a name. And any snapshot of an image identified as a parent for a clone image will exist, and will have a name (if not it indicates some other internal error). So treat failure to get these bits of information as errors. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 133 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index bce1fcfb518..842caf4aab4 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -70,7 +70,10 @@ #define RBD_SNAP_HEAD_NAME "-" +/* This allows a single page to hold an image name sent by OSD */ +#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) #define RBD_IMAGE_ID_LEN_MAX 64 + #define RBD_OBJ_PREFIX_LEN_MAX 64 /* Feature bits */ @@ -658,6 +661,20 @@ out_err: return -ENOMEM; } +static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) +{ + struct rbd_snap *snap; + + if (snap_id == CEPH_NOSNAP) + return RBD_SNAP_HEAD_NAME; + + list_for_each_entry(snap, &rbd_dev->snaps, node) + if (snap_id == snap->id) + return snap->name; + + return NULL; +} + static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) { @@ -2499,6 +2516,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) goto out_err; } parent_spec->image_id = image_id; + parent_spec->image_id_len = len; ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); ceph_decode_64_safe(&p, end, overlap, out_err); @@ -2514,6 +2532,117 @@ out_err: return ret; } +static char *rbd_dev_image_name(struct rbd_device *rbd_dev) +{ + size_t image_id_size; + char *image_id; + void *p; + void *end; + size_t size; + void *reply_buf = NULL; + size_t len = 0; + char *image_name = NULL; + int ret; + + rbd_assert(!rbd_dev->spec->image_name); + + image_id_size = sizeof (__le32) + rbd_dev->spec->image_id_len; + image_id = kmalloc(image_id_size, GFP_KERNEL); + if (!image_id) + return NULL; + + p = image_id; + end = (char *) image_id + image_id_size; + ceph_encode_string(&p, end, rbd_dev->spec->image_id, + (u32) rbd_dev->spec->image_id_len); + + size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; + reply_buf = kmalloc(size, GFP_KERNEL); + if (!reply_buf) + goto out; + + ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY, + "rbd", "dir_get_name", + image_id, image_id_size, + (char *) reply_buf, size, + CEPH_OSD_FLAG_READ, NULL); + if (ret < 0) + goto out; + p = reply_buf; + end = (char *) reply_buf + size; + image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); + if (IS_ERR(image_name)) + image_name = NULL; + else + dout("%s: name is %s len is %zd\n", __func__, image_name, len); +out: + kfree(reply_buf); + kfree(image_id); + + return image_name; +} + +/* + * When a parent image gets probed, we only have the pool, image, + * and snapshot ids but not the names of any of them. This call + * is made later to fill in those names. It has to be done after + * rbd_dev_snaps_update() has completed because some of the + * information (in particular, snapshot name) is not available + * until then. + */ +static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) +{ + struct ceph_osd_client *osdc; + const char *name; + void *reply_buf = NULL; + int ret; + + if (rbd_dev->spec->pool_name) + return 0; /* Already have the names */ + + /* Look up the pool name */ + + osdc = &rbd_dev->rbd_client->client->osdc; + name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id); + if (!name) + return -EIO; /* pool id too large (>= 2^31) */ + + rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL); + if (!rbd_dev->spec->pool_name) + return -ENOMEM; + + /* Fetch the image name; tolerate failure here */ + + name = rbd_dev_image_name(rbd_dev); + if (name) { + rbd_dev->spec->image_name_len = strlen(name); + rbd_dev->spec->image_name = (char *) name; + } else { + pr_warning(RBD_DRV_NAME "%d " + "unable to get image name for image id %s\n", + rbd_dev->major, rbd_dev->spec->image_id); + } + + /* Look up the snapshot name. */ + + name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id); + if (!name) { + ret = -EIO; + goto out_err; + } + rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL); + if(!rbd_dev->spec->snap_name) + goto out_err; + + return 0; +out_err: + kfree(reply_buf); + kfree(rbd_dev->spec->pool_name); + rbd_dev->spec->pool_name = NULL; + + return ret; +} + static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) { size_t size; @@ -3372,6 +3501,10 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) if (ret) return ret; + ret = rbd_dev_probe_update_spec(rbd_dev); + if (ret) + goto err_out_snaps; + ret = rbd_dev_set_mapping(rbd_dev); if (ret) goto err_out_snaps; From 4d1d0534f53863108fdea496288cb3310f88118d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sat, 3 Nov 2012 10:32:37 +0800 Subject: [PATCH 45/68] ceph: Hold caps_list_lock when adjusting caps_{use, total}_count Signed-off-by: Yan, Zheng Signed-off-by: Sage Weil --- fs/ceph/caps.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 3251e9cc640..2d0141e95c8 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -236,8 +236,10 @@ static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc, if (!ctx) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); if (cap) { + spin_lock(&mdsc->caps_list_lock); mdsc->caps_use_count++; mdsc->caps_total_count++; + spin_unlock(&mdsc->caps_list_lock); } return cap; } From 22cddde104d715600a4c218bf9224923208afe90 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 5 Nov 2012 11:07:23 -0800 Subject: [PATCH 46/68] ceph: Fix i_size update race ceph_aio_write() has an optimization that marks cap EPH_CAP_FILE_WR dirty before data is copied to page cache and inode size is updated. If ceph_check_caps() flushes the dirty cap before the inode size is updated, MDS can miss the new inode size. The fix is move ceph_{get,put}_cap_refs() into ceph_write_{begin,end}() and call __ceph_mark_dirty_caps() after inode size is updated. Signed-off-by: Yan, Zheng Signed-off-by: Sage Weil --- fs/ceph/addr.c | 51 +++++++++++++++++++++++++++++++--- fs/ceph/file.c | 75 +++++++++++++++++++++----------------------------- 2 files changed, 78 insertions(+), 48 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 22b6e4583fa..21a07187df0 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1078,23 +1078,51 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_info *fi = file->private_data; struct page *page; pgoff_t index = pos >> PAGE_CACHE_SHIFT; - int r; + int r, want, got = 0; + + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + + dout("write_begin %p %llx.%llx %llu~%u getting caps. i_size %llu\n", + inode, ceph_vinop(inode), pos, len, inode->i_size); + r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len); + if (r < 0) + return r; + dout("write_begin %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), pos, len, ceph_cap_string(got)); + if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) { + ceph_put_cap_refs(ci, got); + return -EAGAIN; + } do { /* get a page */ page = grab_cache_page_write_begin(mapping, index, 0); - if (!page) - return -ENOMEM; - *pagep = page; + if (!page) { + r = -ENOMEM; + break; + } dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len); r = ceph_update_writeable_page(file, pos, len, page); + if (r) + page_cache_release(page); } while (r == -EAGAIN); + if (r) { + ceph_put_cap_refs(ci, got); + } else { + *pagep = page; + *(int *)fsdata = got; + } return r; } @@ -1108,10 +1136,12 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; unsigned from = pos & (PAGE_CACHE_SIZE - 1); int check_cap = 0; + int got = (unsigned long)fsdata; dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, inode, page, (int)pos, (int)copied, (int)len); @@ -1134,6 +1164,19 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, up_read(&mdsc->snap_rwsem); page_cache_release(page); + if (copied > 0) { + int dirty; + spin_lock(&ci->i_ceph_lock); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + } + + dout("write_end %p %llx.%llx %llu~%u dropping cap refs on %s\n", + inode, ceph_vinop(inode), pos, len, ceph_cap_string(got)); + ceph_put_cap_refs(ci, got); + if (check_cap) ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 5840d2aaed1..d415096800a 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -712,63 +712,53 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; loff_t endoff = pos + iov->iov_len; - int want, got = 0; - int ret, err; + int got = 0; + int ret, err, written; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; retry_snap: + written = 0; if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) return -ENOSPC; __ceph_do_pending_vmtruncate(inode); - dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", - inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, - inode->i_size); - if (fi->fmode & CEPH_FILE_MODE_LAZY) - want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; - else - want = CEPH_CAP_FILE_BUFFER; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); - if (ret < 0) - goto out_put; - - dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", - inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, - ceph_cap_string(got)); - - if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || - (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS) || - (fi->flags & CEPH_F_SYNC)) { - ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, - &iocb->ki_pos); - } else { - /* - * buffered write; drop Fw early to avoid slow - * revocation if we get stuck on balance_dirty_pages - */ - int dirty; - - spin_lock(&ci->i_ceph_lock); - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); - spin_unlock(&ci->i_ceph_lock); - ceph_put_cap_refs(ci, got); + /* + * try to do a buffered write. if we don't have sufficient + * caps, we'll get -EAGAIN from generic_file_aio_write, or a + * short write if we only get caps for some pages. + */ + if (!(iocb->ki_filp->f_flags & O_DIRECT) && + !(inode->i_sb->s_flags & MS_SYNCHRONOUS) && + !(fi->flags & CEPH_F_SYNC)) { ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + if (ret >= 0) + written = ret; + if ((ret >= 0 || ret == -EIOCBQUEUED) && ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { - err = vfs_fsync_range(file, pos, pos + ret - 1, 1); + err = vfs_fsync_range(file, pos, pos + written - 1, 1); if (err < 0) ret = err; } - - if (dirty) - __mark_inode_dirty(inode, dirty); - goto out; + if ((ret < 0 && ret != -EAGAIN) || pos + written >= endoff) + goto out; } + dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", + inode, ceph_vinop(inode), pos + written, + (unsigned)iov->iov_len - written, inode->i_size); + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, 0, &got, endoff); + if (ret < 0) + goto out; + + dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), pos + written, + (unsigned)iov->iov_len - written, ceph_cap_string(got)); + ret = ceph_sync_write(file, iov->iov_base + written, + iov->iov_len - written, &iocb->ki_pos); if (ret >= 0) { int dirty; spin_lock(&ci->i_ceph_lock); @@ -777,13 +767,10 @@ retry_snap: if (dirty) __mark_inode_dirty(inode, dirty); } - -out_put: dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", - inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, - ceph_cap_string(got)); + inode, ceph_vinop(inode), pos + written, + (unsigned)iov->iov_len - written, ceph_cap_string(got)); ceph_put_cap_refs(ci, got); - out: if (ret == -EOLDSNAPC) { dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", From cfc84c9f73ab8a6933bd4f36efac1196cddad581 Mon Sep 17 00:00:00 2001 From: Cyril Roelandt Date: Tue, 20 Nov 2012 10:23:07 -0600 Subject: [PATCH 47/68] ceph: fix dentry reference leak in ceph_encode_fh(). dput() was not called in the error path. Signed-off-by: Cyril Roelandt Reviewed-by: Alex Elder --- fs/ceph/export.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 862887004d2..f350be34601 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -56,13 +56,15 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, struct ceph_nfs_confh *cfh = (void *)rawfh; int connected_handle_length = sizeof(*cfh)/4; int handle_length = sizeof(*fh)/4; - struct dentry *dentry = d_find_alias(inode); + struct dentry *dentry; struct dentry *parent; /* don't re-export snaps */ if (ceph_snap(inode) != CEPH_NOSNAP) return -EINVAL; + dentry = d_find_alias(inode); + /* if we found an alias, generate a connectable fh */ if (*max_len >= connected_handle_length && dentry) { dout("encode_fh %p connectable\n", dentry); From 83aff95eb9d60aff5497e9f44a2ae906b86d8e88 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 28 Nov 2012 12:28:24 -0800 Subject: [PATCH 48/68] libceph: remove 'osdtimeout' option This would reset a connection with any OSD that had an outstanding request that was taking more than N seconds. The idea was that if the OSD was buggy, the client could compensate by resending the request. In reality, this only served to hide server bugs, and we haven't actually seen such a bug in quite a while. Moreover, the userspace client code never did this. More importantly, often the request is taking a long time because the OSD is trying to recover, or overloaded, and killing the connection and retrying would only make the situation worse by giving the OSD more work to do. Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- fs/ceph/super.c | 2 -- include/linux/ceph/libceph.h | 2 -- net/ceph/ceph_common.c | 3 +-- net/ceph/osd_client.c | 47 +++--------------------------------- 4 files changed, 5 insertions(+), 49 deletions(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 2f586b0e5e0..fcda1c73a1e 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -403,8 +403,6 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",mount_timeout=%d", opt->mount_timeout); if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl); - if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT) - seq_printf(m, ",osdtimeout=%d", opt->osd_timeout); if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) seq_printf(m, ",osdkeepalivetimeout=%d", opt->osd_keepalive_timeout); diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 42624789b06..317aff8feb0 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -43,7 +43,6 @@ struct ceph_options { struct ceph_entity_addr my_addr; int mount_timeout; int osd_idle_ttl; - int osd_timeout; int osd_keepalive_timeout; /* @@ -63,7 +62,6 @@ struct ceph_options { * defaults */ #define CEPH_MOUNT_TIMEOUT_DEFAULT 60 -#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */ #define CEPH_OSD_KEEPALIVE_DEFAULT 5 #define CEPH_OSD_IDLE_TTL_DEFAULT 60 diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index a8020293f34..ee71ea26777 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -305,7 +305,6 @@ ceph_parse_options(char *options, const char *dev_name, /* start with defaults */ opt->flags = CEPH_OPT_DEFAULT; - opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT; opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ @@ -391,7 +390,7 @@ ceph_parse_options(char *options, const char *dev_name, /* misc */ case Opt_osdtimeout: - opt->osd_timeout = intval; + pr_warning("ignoring deprecated osdtimeout option\n"); break; case Opt_osdkeepalivetimeout: opt->osd_keepalive_timeout = intval; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ccbdfbba9e5..7ebfe13267e 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -608,14 +608,6 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc, } } -static void kick_osd_requests(struct ceph_osd_client *osdc, - struct ceph_osd *kickosd) -{ - mutex_lock(&osdc->request_mutex); - __kick_osd_requests(osdc, kickosd); - mutex_unlock(&osdc->request_mutex); -} - /* * If the osd connection drops, we need to resubmit all requests. */ @@ -629,7 +621,9 @@ static void osd_reset(struct ceph_connection *con) dout("osd_reset osd%d\n", osd->o_osd); osdc = osd->o_osdc; down_read(&osdc->map_sem); - kick_osd_requests(osdc, osd); + mutex_lock(&osdc->request_mutex); + __kick_osd_requests(osdc, osd); + mutex_unlock(&osdc->request_mutex); send_queued(osdc); up_read(&osdc->map_sem); } @@ -1091,12 +1085,10 @@ static void handle_timeout(struct work_struct *work) { struct ceph_osd_client *osdc = container_of(work, struct ceph_osd_client, timeout_work.work); - struct ceph_osd_request *req, *last_req = NULL; + struct ceph_osd_request *req; struct ceph_osd *osd; - unsigned long timeout = osdc->client->options->osd_timeout * HZ; unsigned long keepalive = osdc->client->options->osd_keepalive_timeout * HZ; - unsigned long last_stamp = 0; struct list_head slow_osds; dout("timeout\n"); down_read(&osdc->map_sem); @@ -1105,37 +1097,6 @@ static void handle_timeout(struct work_struct *work) mutex_lock(&osdc->request_mutex); - /* - * reset osds that appear to be _really_ unresponsive. this - * is a failsafe measure.. we really shouldn't be getting to - * this point if the system is working properly. the monitors - * should mark the osd as failed and we should find out about - * it from an updated osd map. - */ - while (timeout && !list_empty(&osdc->req_lru)) { - req = list_entry(osdc->req_lru.next, struct ceph_osd_request, - r_req_lru_item); - - /* hasn't been long enough since we sent it? */ - if (time_before(jiffies, req->r_stamp + timeout)) - break; - - /* hasn't been long enough since it was acked? */ - if (req->r_request->ack_stamp == 0 || - time_before(jiffies, req->r_request->ack_stamp + timeout)) - break; - - BUG_ON(req == last_req && req->r_stamp == last_stamp); - last_req = req; - last_stamp = req->r_stamp; - - osd = req->r_osd; - BUG_ON(!osd); - pr_warning(" tid %llu timed out on osd%d, will reset osd\n", - req->r_tid, osd->o_osd); - __kick_osd_requests(osdc, osd); - } - /* * ping osds that are a bit slow. this ensures that if there * is a break in the TCP connection we will notice, and reopen From d2cc4dde9206aa2c7fb237aa689d3277cc070547 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 29 Nov 2012 08:37:03 -0600 Subject: [PATCH 49/68] bdi_register: add __printf verification, fix arg mismatch __printf is useful to verify format and arguments. Signed-off-by: Joe Perches Reviewed-by: Alex Elder --- fs/ceph/super.c | 2 +- include/linux/backing-dev.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index fcda1c73a1e..1a144001b2e 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -842,7 +842,7 @@ static int ceph_register_bdi(struct super_block *sb, fsc->backing_dev_info.ra_pages = default_backing_dev_info.ra_pages; - err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d", + err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", atomic_long_inc_return(&bdi_seq)); if (!err) sb->s_bdi = &fsc->backing_dev_info; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 2a9a9abc912..12731a19ef0 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -114,6 +114,7 @@ struct backing_dev_info { int bdi_init(struct backing_dev_info *bdi); void bdi_destroy(struct backing_dev_info *bdi); +__printf(3, 4) int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...); int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); From 5e62ad30157d0da04cf40c6d1a2f4bc840948b9c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 19 Nov 2012 10:49:04 +0800 Subject: [PATCH 50/68] ceph: Don't update i_max_size when handling non-auth cap The cap from non-auth mds doesn't have a meaningful max_size value. Signed-off-by: Yan, Zheng Signed-off-by: Sage Weil --- fs/ceph/caps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2d0141e95c8..8072aefc427 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2390,7 +2390,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, &atime); /* max size increase? */ - if (max_size != ci->i_max_size) { + if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); ci->i_max_size = max_size; if (max_size >= ci->i_wanted_max_size) { From ed75ec2cd19b47efcd292b6e23f58e56f4c5bc34 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 19 Nov 2012 10:49:06 +0800 Subject: [PATCH 51/68] ceph: Fix infinite loop in __wake_requests __wake_requests() will enter infinite loop if we use it to wake requests in the session->s_waiting list. __wake_requests() deletes requests from the list and __do_request() adds requests back to the list. Signed-off-by: Yan, Zheng Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 62d2342eb26..9165eb8309e 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1876,9 +1876,14 @@ finish: static void __wake_requests(struct ceph_mds_client *mdsc, struct list_head *head) { - struct ceph_mds_request *req, *nreq; + struct ceph_mds_request *req; + LIST_HEAD(tmp_list); - list_for_each_entry_safe(req, nreq, head, r_wait) { + list_splice_init(head, &tmp_list); + + while (!list_empty(&tmp_list)) { + req = list_entry(tmp_list.next, + struct ceph_mds_request, r_wait); list_del_init(&req->r_wait); __do_request(mdsc, req); } From 0685235ffd9dbdb9ccbda587f8a3c83ad1d5a921 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 19 Nov 2012 10:49:07 +0800 Subject: [PATCH 52/68] ceph: Don't add dirty inode to dirty list if caps is in migration Add dirty inode to cap_dirty_migrating list instead, this can avoid ceph_flush_dirty_caps() entering infinite loop. Signed-off-by: Yan, Zheng Signed-off-by: Sage Weil --- fs/ceph/caps.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 8072aefc427..5efa3f5e2f7 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1351,11 +1351,15 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) if (!ci->i_head_snapc) ci->i_head_snapc = ceph_get_snap_context( ci->i_snap_realm->cached_context); - dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode, - ci->i_head_snapc); + dout(" inode %p now dirty snapc %p auth cap %p\n", + &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); - list_add(&ci->i_dirty_item, &mdsc->cap_dirty); + if (ci->i_auth_cap) + list_add(&ci->i_dirty_item, &mdsc->cap_dirty); + else + list_add(&ci->i_dirty_item, + &mdsc->cap_dirty_migrating); spin_unlock(&mdsc->cap_dirty_lock); if (ci->i_flushing_caps == 0) { ihold(inode); From a85f50b6ef93fbbb2ae932ce9b2376509d172796 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 19 Nov 2012 10:49:08 +0800 Subject: [PATCH 53/68] ceph: Fix __ceph_do_pending_vmtruncate we should set i_truncate_pending to 0 after page cache is truncated to i_truncate_size Signed-off-by: Yan, Zheng Signed-off-by: Sage Weil --- fs/ceph/inode.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 4b5762ef7c2..81613bced19 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1466,7 +1466,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); u64 to; - int wrbuffer_refs, wake = 0; + int wrbuffer_refs, finish = 0; retry: spin_lock(&ci->i_ceph_lock); @@ -1498,15 +1498,18 @@ retry: truncate_inode_pages(inode->i_mapping, to); spin_lock(&ci->i_ceph_lock); - ci->i_truncate_pending--; - if (ci->i_truncate_pending == 0) - wake = 1; + if (to == ci->i_truncate_size) { + ci->i_truncate_pending = 0; + finish = 1; + } spin_unlock(&ci->i_ceph_lock); + if (!finish) + goto retry; if (wrbuffer_refs == 0) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); - if (wake) - wake_up_all(&ci->i_cap_wq); + + wake_up_all(&ci->i_cap_wq); } From 0e5e1774a92e6fe9c511585de8f078b4c4c68dbb Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 19 Nov 2012 10:49:09 +0800 Subject: [PATCH 54/68] ceph: call handle_cap_grant() for cap import message If client sends cap message that requests new max size during exporting caps, the exporting MDS will drop the message quietly. So the client may wait for the reply that updates the max size forever. call handle_cap_grant() for cap import message can avoid this issue. Signed-off-by: Yan, Zheng Signed-off-by: Sage Weil --- fs/ceph/caps.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 5efa3f5e2f7..a1d9bb30c1b 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2751,6 +2751,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, /* make sure we re-request max_size, if necessary */ spin_lock(&ci->i_ceph_lock); + ci->i_wanted_max_size = 0; /* reset */ ci->i_requested_max_size = 0; spin_unlock(&ci->i_ceph_lock); } @@ -2846,8 +2847,6 @@ void ceph_handle_caps(struct ceph_mds_session *session, case CEPH_CAP_OP_IMPORT: handle_cap_import(mdsc, inode, h, session, snaptrace, snaptrace_len); - ceph_check_caps(ceph_inode(inode), 0, session); - goto done_unlocked; } /* the rest require a cap */ @@ -2864,6 +2863,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, switch (op) { case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: + case CEPH_CAP_OP_IMPORT: handle_cap_grant(inode, h, session, cap, msg->middle); goto done_unlocked; From 8884d53dd63b1d9315b343564fcbe1ede004a99e Mon Sep 17 00:00:00 2001 From: David Zafman Date: Mon, 3 Dec 2012 19:14:05 -0800 Subject: [PATCH 55/68] libceph: Unlock unprocessed pages in start_read() error path Function start_read() can get an error before processing all pages. It must not only release the remaining pages, but unlock them too. This fixes http://tracker.newdream.net/issues/3370 Signed-off-by: David Zafman Reviewed-by: Alex Elder --- fs/ceph/addr.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 21a07187df0..8e8a818cba0 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -267,6 +267,14 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) kfree(req->r_pages); } +static void ceph_unlock_page_vector(struct page **pages, int num_pages) +{ + int i; + + for (i = 0; i < num_pages; i++) + unlock_page(pages[i]); +} + /* * start an async read(ahead) operation. return nr_pages we submitted * a read for on success, or negative error code. @@ -347,6 +355,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) return nr_pages; out_pages: + ceph_unlock_page_vector(pages, nr_pages); ceph_release_page_vector(pages, nr_pages); out: ceph_osdc_put_request(req); From 42382b709bd1d143b9f0fa93e0a3a1f2f4210707 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 16 Nov 2012 09:29:16 -0600 Subject: [PATCH 56/68] rbd: do not allow remove of mounted-on image There is no check in rbd_remove() to see if anybody holds open the image being removed. That's not cool. Add a simple open count that goes up and down with opens and closes (releases) of the device, and don't allow an rbd image to be removed if the count is non-zero. Protect the updates of the open count value with ctl_mutex to ensure the underlying rbd device doesn't get removed while concurrently being opened. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- drivers/block/rbd.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 842caf4aab4..c7bf9613ad4 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -235,6 +235,7 @@ struct rbd_device { /* sysfs related */ struct device dev; + unsigned long open_count; }; static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ @@ -309,8 +310,11 @@ static int rbd_open(struct block_device *bdev, fmode_t mode) if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) return -EROFS; + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); rbd_get_dev(rbd_dev); set_device_ro(bdev, rbd_dev->mapping.read_only); + rbd_dev->open_count++; + mutex_unlock(&ctl_mutex); return 0; } @@ -319,7 +323,11 @@ static int rbd_release(struct gendisk *disk, fmode_t mode) { struct rbd_device *rbd_dev = disk->private_data; + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + rbd_assert(rbd_dev->open_count > 0); + rbd_dev->open_count--; rbd_put_dev(rbd_dev); + mutex_unlock(&ctl_mutex); return 0; } @@ -3745,6 +3753,11 @@ static ssize_t rbd_remove(struct bus_type *bus, goto done; } + if (rbd_dev->open_count) { + ret = -EBUSY; + goto done; + } + rbd_remove_all_snaps(rbd_dev); rbd_bus_del_dev(rbd_dev); From 7d5f24812bd182a2471cb69c1c2baf0648332e1f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 29 Nov 2012 08:37:03 -0600 Subject: [PATCH 57/68] ceph: don't reference req after put In __unregister_request(), there is a call to list_del_init() referencing a request that was the subject of a call to ceph_osdc_put_request() on the previous line. This is not safe, because the request structure could have been freed by the time we reach the list_del_init(). Fix this by reversing the order of these lines. Signed-off-by: Alex Elder Reviewed-off-by: Sage Weil --- net/ceph/osd_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 7ebfe13267e..ac7be7202fa 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -871,9 +871,9 @@ static void __unregister_request(struct ceph_osd_client *osdc, req->r_osd = NULL; } + list_del_init(&req->r_req_lru_item); ceph_osdc_put_request(req); - list_del_init(&req->r_req_lru_item); if (osdc->num_requests == 0) { dout(" no requests, canceling timeout\n"); __cancel_osd_timeout(osdc); From 685a7555ca69030739ddb57a47f0ea8ea80196a4 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 7 Dec 2012 09:57:58 -0600 Subject: [PATCH 58/68] libceph: avoid using freed osd in __kick_osd_requests() If an osd has no requests and no linger requests, __reset_osd() will just remove it with a call to __remove_osd(). That drops a reference to the osd, and therefore the osd may have been free by the time __reset_osd() returns. That function offers no indication this may have occurred, and as a result the osd will continue to be used even when it's no longer valid. Change__reset_osd() so it returns an error (ENODEV) when it deletes the osd being reset. And change __kick_osd_requests() so it returns immediately (before referencing osd again) if __reset_osd() returns *any* error. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ac7be7202fa..60c74c1f1ea 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -581,7 +581,7 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc, dout("__kick_osd_requests osd%d\n", osd->o_osd); err = __reset_osd(osdc, osd); - if (err == -EAGAIN) + if (err) return; list_for_each_entry(req, &osd->o_requests, r_osd_item) { @@ -745,6 +745,7 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) if (list_empty(&osd->o_requests) && list_empty(&osd->o_linger_requests)) { __remove_osd(osdc, osd); + ret = -ENODEV; } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd], &osd->o_con.peer_addr, sizeof(osd->o_con.peer_addr)) == 0 && From 2fd82b9e92c2a718ae81fc987b4468ceeee6979b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 9 Nov 2012 15:05:54 -0600 Subject: [PATCH 59/68] rbd: get rid of RBD_MAX_SEG_NAME_LEN RBD_MAX_SEG_NAME_LEN represents the maximum length of an rbd object name (i.e., one of the objects providing storage backing an rbd image). Another symbol, MAX_OBJ_NAME_SIZE, is used in the osd client code to define the maximum length of any object name in an osd request. Right now they disagree, with RBD_MAX_SEG_NAME_LEN being too big. There's no real benefit at this point to defining the rbd object name length limit separate from any other object name, so just get rid of RBD_MAX_SEG_NAME_LEN and use MAX_OBJ_NAME_SIZE in its place. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- drivers/block/rbd.c | 6 +++--- drivers/block/rbd_types.h | 2 -- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c7bf9613ad4..ce26b749ede 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -740,13 +740,13 @@ static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) u64 segment; int ret; - name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO); + name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO); if (!name) return NULL; segment = offset >> rbd_dev->header.obj_order; - ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx", + ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx", rbd_dev->header.object_prefix, segment); - if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) { + if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) { pr_err("error formatting segment name for #%llu (%d)\n", segment, ret); kfree(name); diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h index cbe77fa105b..49d77cbcf8b 100644 --- a/drivers/block/rbd_types.h +++ b/drivers/block/rbd_types.h @@ -46,8 +46,6 @@ #define RBD_MIN_OBJ_ORDER 16 #define RBD_MAX_OBJ_ORDER 30 -#define RBD_MAX_SEG_NAME_LEN 128 - #define RBD_COMP_NONE 0 #define RBD_CRYPT_NONE 0 From 61c74035626beb25a39b0273ccf7d75510bc36a1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 6 Dec 2012 09:37:23 -0600 Subject: [PATCH 60/68] rbd: remove linger unconditionally In __unregister_linger_request(), the request is being removed from the osd client's req_linger list only when the request has a non-null osd pointer. It should be done whether or not the request currently has an osd. This is most likely a non-issue because I believe the request will always have an osd when this function is called. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 60c74c1f1ea..32bd696b39a 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -906,8 +906,8 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req) { dout("__unregister_linger_request %p\n", req); + list_del_init(&req->r_linger_item); if (req->r_osd) { - list_del_init(&req->r_linger_item); list_del_init(&req->r_linger_osd); if (list_empty(&req->r_osd->o_requests) && From b8f5c6edca34ff441e1ccdec68828e933a1b905b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 1 Nov 2012 08:39:26 -0500 Subject: [PATCH 61/68] rbd: don't use ENOTSUPP ENOTSUPP is not a standard errno (it shows up as "Unknown error 524" in an error message). This is what was getting produced when the the local rbd code does not implement features required by a discovered rbd image. Change the error code returned in this case to ENXIO. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ce26b749ede..4daa400c13a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2456,7 +2456,7 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, incompat = le64_to_cpu(features_buf.incompat); if (incompat & ~RBD_FEATURES_ALL) - return -ENOTSUPP; + return -ENXIO; *snap_features = le64_to_cpu(features_buf.features); From 7bb21d68c535ad8be38e14a715632ae398b37ac1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 7 Dec 2012 19:50:07 -0600 Subject: [PATCH 62/68] libceph: socket can close in any connection state A connection's socket can close for any reason, independent of the state of the connection (and without irrespective of the connection mutex). As a result, the connectino can be in pretty much any state at the time its socket is closed. Handle those other cases at the top of con_work(). Pull this whole block of code into a separate function to reduce the clutter. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 47 ++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 1041114453d..4b04ccc60db 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2273,6 +2273,35 @@ static void queue_con(struct ceph_connection *con) (void) queue_con_delay(con, 0); } +static bool con_sock_closed(struct ceph_connection *con) +{ + if (!test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags)) + return false; + +#define CASE(x) \ + case CON_STATE_ ## x: \ + con->error_msg = "socket closed (con state " #x ")"; \ + break; + + switch (con->state) { + CASE(CLOSED); + CASE(PREOPEN); + CASE(CONNECTING); + CASE(NEGOTIATING); + CASE(OPEN); + CASE(STANDBY); + default: + pr_warning("%s con %p unrecognized state %lu\n", + __func__, con, con->state); + con->error_msg = "unrecognized con state"; + BUG(); + break; + } +#undef CASE + + return true; +} + /* * Do some work on a connection. Drop a connection ref when we're done. */ @@ -2284,24 +2313,8 @@ static void con_work(struct work_struct *work) mutex_lock(&con->mutex); restart: - if (test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags)) { - switch (con->state) { - case CON_STATE_CONNECTING: - con->error_msg = "connection failed"; - break; - case CON_STATE_NEGOTIATING: - con->error_msg = "negotiation failed"; - break; - case CON_STATE_OPEN: - con->error_msg = "socket closed"; - break; - default: - dout("unrecognized con state %d\n", (int)con->state); - con->error_msg = "unrecognized con state"; - BUG(); - } + if (con_sock_closed(con)) goto fault; - } if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) { dout("con_work %p backing off\n", con); From 28362986f8743124b3a0fda20a8ed3e80309cce1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 14 Dec 2012 16:47:41 -0600 Subject: [PATCH 63/68] libceph: report connection fault with warning When a connection's socket disconnects, or if there's a protocol error of some kind on the connection, a fault is signaled and the connection is reset (closed and reopened, basically). We currently get an error message on the log whenever this occurs. A ceph connection will attempt to reestablish a socket connection repeatedly if a fault occurs. This means that these error messages will get repeatedly added to the log, which is undesirable. Change the error message to be a warning, so they don't get logged by default. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 4b04ccc60db..4d111fd2b49 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2377,7 +2377,7 @@ fault: static void ceph_fault(struct ceph_connection *con) __releases(con->mutex) { - pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), + pr_warning("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); dout("fault %p state %lu to peer %s\n", con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); From f407731d12214e7686819018f3a1e9d7b6f83a02 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 6 Dec 2012 07:22:04 -0600 Subject: [PATCH 64/68] libceph: init osd->o_node in create_osd() The red-black node node in the ceph osd structure is not initialized in create_osd(). Because this node can be the subject of a RB_EMPTY_NODE() call later on, we should ensure the node is initialized properly for that. Add a call to RB_CLEAR_NODE() initialize it. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 32bd696b39a..a6dc6acd656 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -642,6 +642,7 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) atomic_set(&osd->o_ref, 1); osd->o_osdc = osdc; osd->o_osd = onum; + RB_CLEAR_NODE(&osd->o_node); INIT_LIST_HEAD(&osd->o_requests); INIT_LIST_HEAD(&osd->o_linger_requests); INIT_LIST_HEAD(&osd->o_osd_lru); From 3ee5234df68d253c415ba4f2db72ad250d9c21a9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 17 Dec 2012 12:23:48 -0600 Subject: [PATCH 65/68] libceph: init event->node in ceph_osdc_create_event() The red-black node node in the ceph osd event structure is not initialized in create_osdc_create_event(). Because this node can be the subject of a RB_EMPTY_NODE() call later on, we should ensure the node is initialized properly for that. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index a6dc6acd656..2bce3d4be1c 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1563,6 +1563,7 @@ int ceph_osdc_create_event(struct ceph_osd_client *osdc, event->data = data; event->osdc = osdc; INIT_LIST_HEAD(&event->osd_node); + RB_CLEAR_NODE(&event->node); kref_init(&event->kref); /* one ref for us */ kref_get(&event->kref); /* one ref for the caller */ init_completion(&event->completion); From a978fa20fb657548561dddbfb605fe43654f0825 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 17 Dec 2012 12:23:48 -0600 Subject: [PATCH 66/68] libceph: don't use rb_init_node() in ceph_osdc_alloc_request() The red-black node in the ceph osd request structure is initialized in ceph_osdc_alloc_request() using rbd_init_node(). We do need to initialize this, because in __unregister_request() we call RB_EMPTY_NODE(), which expects the node it's checking to have been initialized. But rb_init_node() is apparently overkill, and may in fact be on its way out. So use RB_CLEAR_NODE() instead. For a little more background, see this commit: 4c199a93 rbtree: empty nodes have no color" Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 2bce3d4be1c..7cd0a7f3bf4 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -221,7 +221,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, kref_init(&req->r_kref); init_completion(&req->r_completion); init_completion(&req->r_safe_completion); - rb_init_node(&req->r_node); + RB_CLEAR_NODE(&req->r_node); INIT_LIST_HEAD(&req->r_unsafe_item); INIT_LIST_HEAD(&req->r_linger_item); INIT_LIST_HEAD(&req->r_linger_osd); From c89ce05e0c5a01a256100ac6a6019f276bdd1ca6 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 6 Dec 2012 07:22:04 -0600 Subject: [PATCH 67/68] libceph: register request before unregister linger In kick_requests(), we need to register the request before we unregister the linger request. Otherwise the unregister will reset the request's osd pointer to NULL. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 7cd0a7f3bf4..780caf6b049 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1328,8 +1328,8 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid, req->r_osd ? req->r_osd->o_osd : -1); - __unregister_linger_request(osdc, req); __register_request(osdc, req); + __unregister_linger_request(osdc, req); } mutex_unlock(&osdc->request_mutex); From c3e946ce7276faf0b302acd25c7b874edbeba661 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 16 Nov 2012 09:29:16 -0600 Subject: [PATCH 68/68] rbd: get rid of rbd_{get,put}_dev() The functions rbd_get_dev() and rbd_put_dev() are trivial wrappers that add no value, and their existence suggests they may do more than what they do. Get rid of them. Signed-off-by: Alex Elder Reviewed-by: Dan Mick --- drivers/block/rbd.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4daa400c13a..89576a0b3f2 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -290,16 +290,6 @@ static struct device rbd_root_dev = { # define rbd_assert(expr) ((void) 0) #endif /* !RBD_DEBUG */ -static struct device *rbd_get_dev(struct rbd_device *rbd_dev) -{ - return get_device(&rbd_dev->dev); -} - -static void rbd_put_dev(struct rbd_device *rbd_dev) -{ - put_device(&rbd_dev->dev); -} - static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver); static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); @@ -311,7 +301,7 @@ static int rbd_open(struct block_device *bdev, fmode_t mode) return -EROFS; mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - rbd_get_dev(rbd_dev); + (void) get_device(&rbd_dev->dev); set_device_ro(bdev, rbd_dev->mapping.read_only); rbd_dev->open_count++; mutex_unlock(&ctl_mutex); @@ -326,7 +316,7 @@ static int rbd_release(struct gendisk *disk, fmode_t mode) mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); rbd_assert(rbd_dev->open_count > 0); rbd_dev->open_count--; - rbd_put_dev(rbd_dev); + put_device(&rbd_dev->dev); mutex_unlock(&ctl_mutex); return 0;