From 8618e30bc14b06bfafa0f164cca7b0e06451f88a Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 8 Oct 2012 20:37:30 -0700
Subject: [PATCH 01/68] rbd: let con_work() handle backoff

Both ceph_fault() and con_work() include handling for imposing a
delay before doing further processing on a faulted connection.
The latter is used only if ceph_fault() is unable to.

Instead, just let con_work() always be responsible for implementing
the delay.  After setting up the delay value, set the BACKOFF flag
on the connection unconditionally and call queue_con() to ensure
con_work() will get called to handle it.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 20 ++------------------
 1 file changed, 2 insertions(+), 18 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index cad0d17ec45..973c16c20c4 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2398,24 +2398,8 @@ static void ceph_fault(struct ceph_connection *con)
 			con->delay = BASE_DELAY_INTERVAL;
 		else if (con->delay < MAX_DELAY_INTERVAL)
 			con->delay *= 2;
-		con->ops->get(con);
-		if (queue_delayed_work(ceph_msgr_wq, &con->work,
-				       round_jiffies_relative(con->delay))) {
-			dout("fault queued %p delay %lu\n", con, con->delay);
-		} else {
-			con->ops->put(con);
-			dout("fault failed to queue %p delay %lu, backoff\n",
-			     con, con->delay);
-			/*
-			 * In many cases we see a socket state change
-			 * while con_work is running and end up
-			 * queuing (non-delayed) work, such that we
-			 * can't backoff with a delay.  Set a flag so
-			 * that when con_work restarts we schedule the
-			 * delay then.
-			 */
-			set_bit(CON_FLAG_BACKOFF, &con->flags);
-		}
+		set_bit(CON_FLAG_BACKOFF, &con->flags);
+		queue_con(con);
 	}
 
 out_unlock:

From 802c6d967fbdcd2cbc91b917425661bb8bbfaade Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 8 Oct 2012 20:37:30 -0700
Subject: [PATCH 02/68] rbd: define common queue_con_delay()

This patch defines a single function, queue_con_delay() to call
queue_delayed_work() for a connection.  It basically generalizes
what was previously queue_con() by adding the delay argument.
queue_con() is now a simple helper that passes 0 for its delay.
queue_con_delay() returns 0 if it queued work or an errno if it
did not for some reason.

If con_work() finds the BACKOFF flag set for a connection, it now
calls queue_con_delay() to handle arranging to start again after a
delay.

Note about connection reference counts:  con_work() only ever gets
called as a work item function.  At the time that work is scheduled,
a reference to the connection is acquired, and the corresponding
con_work() call is then responsible for dropping that reference
before it returns.

Previously, the backoff handling inside con_work() silently handed
off its reference to delayed work it scheduled.  Now that
queue_con_delay() is used, a new reference is acquired for the
newly-scheduled work, and the original reference is dropped by the
con->ops->put() call at the end of the function.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 38 +++++++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 973c16c20c4..66f6f56bcb2 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2244,22 +2244,33 @@ bad_tag:
 
 
 /*
- * Atomically queue work on a connection.  Bump @con reference to
- * avoid races with connection teardown.
+ * Atomically queue work on a connection after the specified delay.
+ * Bump @con reference to avoid races with connection teardown.
+ * Returns 0 if work was queued, or an error code otherwise.
  */
-static void queue_con(struct ceph_connection *con)
+static int queue_con_delay(struct ceph_connection *con, unsigned long delay)
 {
 	if (!con->ops->get(con)) {
-		dout("queue_con %p ref count 0\n", con);
-		return;
+		dout("%s %p ref count 0\n", __func__, con);
+
+		return -ENOENT;
 	}
 
-	if (!queue_delayed_work(ceph_msgr_wq, &con->work, 0)) {
-		dout("queue_con %p - already queued\n", con);
+	if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) {
+		dout("%s %p - already queued\n", __func__, con);
 		con->ops->put(con);
-	} else {
-		dout("queue_con %p\n", con);
+
+		return -EBUSY;
 	}
+
+	dout("%s %p %lu\n", __func__, con, delay);
+
+	return 0;
+}
+
+static void queue_con(struct ceph_connection *con)
+{
+	(void) queue_con_delay(con, 0);
 }
 
 /*
@@ -2294,14 +2305,11 @@ restart:
 
 	if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) {
 		dout("con_work %p backing off\n", con);
-		if (queue_delayed_work(ceph_msgr_wq, &con->work,
-				       round_jiffies_relative(con->delay))) {
-			dout("con_work %p backoff %lu\n", con, con->delay);
-			mutex_unlock(&con->mutex);
-			return;
-		} else {
+		ret = queue_con_delay(con, round_jiffies_relative(con->delay));
+		if (ret) {
 			dout("con_work %p FAILED to back off %lu\n", con,
 			     con->delay);
+			BUG_ON(ret == -ENOENT);
 			set_bit(CON_FLAG_BACKOFF, &con->flags);
 		}
 		goto done;

From 9478554ae5d21d65e948a3eff4ee2a8ad30d70e9 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 9 Oct 2012 13:50:17 -0700
Subject: [PATCH 03/68] rbd: define rbd_update_mapping_size()

Encapsulate the code that handles updating the size of a mapping
after an rbd image has been refreshed.  This is done in anticipation
of the next patch, which will make this common code for format 1 and
2 images.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index bb3d9be3b1b..b64125d1d7b 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1716,6 +1716,19 @@ static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
 		__rbd_remove_snap_dev(snap);
 }
 
+static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
+{
+	sector_t size;
+
+	if (rbd_dev->mapping.snap_id != CEPH_NOSNAP)
+		return;
+
+	size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
+	dout("setting size to %llu sectors", (unsigned long long) size);
+	rbd_dev->mapping.size = (u64) size;
+	set_capacity(rbd_dev->disk, size);
+}
+
 /*
  * only read the first part of the ondisk header, without the snaps info
  */
@@ -1730,17 +1743,9 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
 
 	down_write(&rbd_dev->header_rwsem);
 
-	/* resized? */
-	if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) {
-		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
-
-		if (size != (sector_t) rbd_dev->mapping.size) {
-			dout("setting size to %llu sectors",
-				(unsigned long long) size);
-			rbd_dev->mapping.size = (u64) size;
-			set_capacity(rbd_dev->disk, size);
-		}
-	}
+	/* Update image size, and check for resize of mapped image */
+	rbd_dev->header.image_size = h.image_size;
+	rbd_update_mapping_size(rbd_dev);
 
 	/* rbd_dev->header.object_prefix shouldn't change */
 	kfree(rbd_dev->header.snap_sizes);

From 117973fb4c91f3fd913127577e9f71b3aa6cb556 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 31 Aug 2012 17:29:55 -0500
Subject: [PATCH 04/68] rbd: define rbd_dev_v2_refresh()

Define a new function rbd_dev_v2_refresh() to update/refresh the
snapshot context for a format version 2 rbd image.  This function
will update anything that is not fixed for the life of an rbd
image--at the moment this is mainly the snapshot context and (for
a base mapping) the size.

Update rbd_refresh_header() so it selects which function to use
based on the image format.

Rename __rbd_refresh_header() to be rbd_dev_v1_refresh()
to be consistent with the naming of its version 2 counterpart.
Similarly rename rbd_refresh_header() to be rbd_dev_refresh().

Unrelated--we use rbd_image_format_valid() here.  Delete the other
use of it, which was primarily put in place to ensure that function
was referenced at the time it was defined.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 55 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 47 insertions(+), 8 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index b64125d1d7b..f11b839166e 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -268,7 +268,8 @@ static void rbd_put_dev(struct rbd_device *rbd_dev)
 	put_device(&rbd_dev->dev);
 }
 
-static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
+static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
+static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
 
 static int rbd_open(struct block_device *bdev, fmode_t mode)
 {
@@ -1304,7 +1305,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
 		rbd_dev->header_name, (unsigned long long) notify_id,
 		(unsigned int) opcode);
-	rc = rbd_refresh_header(rbd_dev, &hver);
+	rc = rbd_dev_refresh(rbd_dev, &hver);
 	if (rc)
 		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
 			   " update snaps: %d\n", rbd_dev->major, rc);
@@ -1732,7 +1733,7 @@ static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
 /*
  * only read the first part of the ondisk header, without the snaps info
  */
-static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
+static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
 {
 	int ret;
 	struct rbd_image_header h;
@@ -1773,12 +1774,16 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
 	return ret;
 }
 
-static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
+static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
 {
 	int ret;
 
+	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-	ret = __rbd_refresh_header(rbd_dev, hver);
+	if (rbd_dev->image_format == 1)
+		ret = rbd_dev_v1_refresh(rbd_dev, hver);
+	else
+		ret = rbd_dev_v2_refresh(rbd_dev, hver);
 	mutex_unlock(&ctl_mutex);
 
 	return ret;
@@ -1938,7 +1943,7 @@ static ssize_t rbd_image_refresh(struct device *dev,
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 	int ret;
 
-	ret = rbd_refresh_header(rbd_dev, NULL);
+	ret = rbd_dev_refresh(rbd_dev, NULL);
 
 	return ret < 0 ? ret : size;
 }
@@ -2402,6 +2407,41 @@ static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
 	return ERR_PTR(-EINVAL);
 }
 
+static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
+{
+	int ret;
+	__u8 obj_order;
+
+	down_write(&rbd_dev->header_rwsem);
+
+	/* Grab old order first, to see if it changes */
+
+	obj_order = rbd_dev->header.obj_order,
+	ret = rbd_dev_v2_image_size(rbd_dev);
+	if (ret)
+		goto out;
+	if (rbd_dev->header.obj_order != obj_order) {
+		ret = -EIO;
+		goto out;
+	}
+	rbd_update_mapping_size(rbd_dev);
+
+	ret = rbd_dev_v2_snap_context(rbd_dev, hver);
+	dout("rbd_dev_v2_snap_context returned %d\n", ret);
+	if (ret)
+		goto out;
+	ret = rbd_dev_snaps_update(rbd_dev);
+	dout("rbd_dev_snaps_update returned %d\n", ret);
+	if (ret)
+		goto out;
+	ret = rbd_dev_snaps_register(rbd_dev);
+	dout("rbd_dev_snaps_register returned %d\n", ret);
+out:
+	up_write(&rbd_dev->header_rwsem);
+
+	return ret;
+}
+
 /*
  * Scan the rbd device's current snapshot list and compare it to the
  * newly-received snapshot context.  Remove any existing snapshots
@@ -2564,7 +2604,7 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
 	do {
 		ret = rbd_req_sync_watch(rbd_dev);
 		if (ret == -ERANGE) {
-			rc = rbd_refresh_header(rbd_dev, NULL);
+			rc = rbd_dev_refresh(rbd_dev, NULL);
 			if (rc < 0)
 				return rc;
 		}
@@ -3045,7 +3085,6 @@ static ssize_t rbd_add(struct bus_type *bus,
 	rc = rbd_dev_probe(rbd_dev);
 	if (rc < 0)
 		goto err_out_client;
-	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 
 	/* no need to lock here, as rbd_dev is not registered yet */
 	rc = rbd_dev_snaps_update(rbd_dev);

From d889140c4a1c5edb6a7bd90392b9d878bfaccfb6 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 9 Oct 2012 13:50:17 -0700
Subject: [PATCH 05/68] rbd: implement feature checks

Version 2 images have two sets of feature bit fields.  The first
indicates features possibly used by the image.  The second indicates
features that the client *must* support in order to use the image.

When an image (or snapshot) is first examined, we need to make sure
that the local implementation supports the image's required
features.  If not, fail the probe for the image.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index f11b839166e..0f260a6e97c 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -70,6 +70,14 @@
 #define RBD_IMAGE_ID_LEN_MAX	64
 #define RBD_OBJ_PREFIX_LEN_MAX	64
 
+/* Feature bits */
+
+#define RBD_FEATURE_LAYERING      1
+
+/* Features supported by this (client software) implementation. */
+
+#define RBD_FEATURES_ALL          (0)
+
 /*
  * An RBD device name will be "rbd#", where the "rbd" comes from
  * RBD_DRV_NAME above, and # is a unique integer identifier.
@@ -2226,6 +2234,7 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 		__le64 features;
 		__le64 incompat;
 	} features_buf = { 0 };
+	u64 incompat;
 	int ret;
 
 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
@@ -2236,6 +2245,11 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
 	if (ret < 0)
 		return ret;
+
+	incompat = le64_to_cpu(features_buf.incompat);
+	if (incompat & ~RBD_FEATURES_ALL)
+		return -ENOTSUPP;
+
 	*snap_features = le64_to_cpu(features_buf.features);
 
 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
@@ -2977,7 +2991,7 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
 	if (ret < 0)
 		goto out_err;
 
-	/* Get the features for the image */
+	/* Get the and check features for the image */
 
 	ret = rbd_dev_v2_features(rbd_dev);
 	if (ret < 0)

From 35152979e6181b1fbb4b61c3ff641c14df53ad66 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 31 Aug 2012 17:29:55 -0500
Subject: [PATCH 06/68] rbd: activate v2 image support

Now that v2 images support is fully implemented, have
rbd_dev_v2_probe() return 0 to indicate a successful probe.

(Note that an image that implements layering will fail
the probe early because of the feature chekc.)

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 0f260a6e97c..8f56d37637a 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3014,7 +3014,7 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
 	dout("discovered version 2 image, header name is %s\n",
 		rbd_dev->header_name);
 
-	return -ENOTSUPP;
+	return 0;
 out_err:
 	kfree(rbd_dev->header_name);
 	rbd_dev->header_name = NULL;

From 0f9831a89310cebba52d3f526e6cc5c2e403e6f1 Mon Sep 17 00:00:00 2001
From: David Zafman <david.zafman@inktank.com>
Date: Thu, 18 Oct 2012 14:01:43 -0700
Subject: [PATCH 07/68] ceph: fix dentry reference leak in encode_fh()

Call to d_find_alias() needs a corresponding dput()

This fixes http://tracker.newdream.net/issues/3271

Signed-off-by: David Zafman <david.zafman@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/export.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 8e1b60e557b..862887004d2 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -90,6 +90,8 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
 		*max_len = handle_length;
 		type = 255;
 	}
+	if (dentry)
+		dput(dentry);
 	return type;
 }
 

From 7246240c7c186542f73af4fadc744d66440f616f Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Thu, 25 Oct 2012 08:49:41 -0700
Subject: [PATCH 08/68] libceph: avoid NULL kref_put from NULL alloc_msg return

The ceph_on_in_msg_alloc() method calls the ->alloc_msg() helper which
may return NULL.  It also drops con->mutex while it allocates a message,
which means that the connection state may change (e.g., get closed).  If
that happens, we clean up and bail out.  Avoid calling ceph_msg_put() on
a NULL return value and triggering a crash.

This was observed when an ->alloc_msg() call races with a timeout that
resends a zillion messages and resets the connection, and ->alloc_msg()
returns NULL (because the request was resent to another target).

Fixes http://tracker.newdream.net/issues/3342

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/messenger.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 66f6f56bcb2..1041114453d 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2742,7 +2742,8 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
 		msg = con->ops->alloc_msg(con, hdr, skip);
 		mutex_lock(&con->mutex);
 		if (con->state != CON_STATE_OPEN) {
-			ceph_msg_put(msg);
+			if (msg)
+				ceph_msg_put(msg);
 			return -EAGAIN;
 		}
 		con->in_msg = msg;

From b000056a5a8d3f5a4a9fb80184a7ec14f86a43d4 Mon Sep 17 00:00:00 2001
From: David Zafman <david.zafman@inktank.com>
Date: Thu, 25 Oct 2012 10:23:46 -0700
Subject: [PATCH 09/68] ceph: Fix NULL ptr crash in strlen()

set_request_path_attr() checks for NULL ptr before calling strlen()

This fixes http://tracker.newdream.net/issues/3404

Signed-off-by: David Zafman <david.zafman@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/mds_client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 1bcf712655d..62d2342eb26 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1590,7 +1590,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
 	} else if (rpath || rino) {
 		*ino = rino;
 		*ppath = rpath;
-		*pathlen = strlen(rpath);
+		*pathlen = rpath ? strlen(rpath) : 0;
 		dout(" path %.*s\n", *pathlen, rpath);
 	}
 

From b213e0b1a62637b2a9395a34349b13d73ca2b90a Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 10 Oct 2012 21:19:13 -0700
Subject: [PATCH 10/68] rbd: fix bug in rbd_dev_id_put()

In rbd_dev_id_put(), there's a loop that's intended to determine
the maximum device id in use.  But it isn't doing that at all,
the effect of how it's written is to simply use the just-put id
number, which ignores whole purpose of this function.

Fix the bug.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 8f56d37637a..4a16464ad5b 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2680,8 +2680,8 @@ static void rbd_dev_id_put(struct rbd_device *rbd_dev)
 		struct rbd_device *rbd_dev;
 
 		rbd_dev = list_entry(tmp, struct rbd_device, node);
-		if (rbd_id > max_id)
-			max_id = rbd_id;
+		if (rbd_dev->dev_id > max_id)
+			max_id = rbd_dev->dev_id;
 	}
 	spin_unlock(&rbd_dev_list_lock);
 

From a0ea3a40fd20b8c66381f747c454f89d6d1f50d4 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 10 Oct 2012 21:19:13 -0700
Subject: [PATCH 11/68] rbd: zero return code in rbd_dev_image_id()

When rbd_dev_probe() calls rbd_dev_image_id() it expects to get
a 0 return code if successful, but it is getting a positive value.

The reason is that rbd_dev_image_id() returns the value it gets from
rbd_req_sync_exec(), which returns the number of bytes read in as a
result of the request.  (This ultimately comes from
ceph_copy_from_page_vector() in rbd_req_sync_op()).

Force the return value to 0 when successful in rbd_dev_image_id().
Do the same in rbd_dev_v2_object_prefix().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>
---
 drivers/block/rbd.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 4a16464ad5b..94d613208c4 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2207,6 +2207,7 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
 	if (ret < 0)
 		goto out;
+	ret = 0;    /* rbd_req_sync_exec() can return positive */
 
 	p = reply_buf;
 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
@@ -2900,6 +2901,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
 	if (ret < 0)
 		goto out;
+	ret = 0;    /* rbd_req_sync_exec() can return positive */
 
 	p = response;
 	rbd_dev->image_id = ceph_extract_encoded_string(&p,

From be466c1cc36621590ef17b05a6d342dfd33f7280 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 22 Oct 2012 11:31:26 -0500
Subject: [PATCH 12/68] rbd: fix read-only option name

The name of the "read-only" mapping option was inadvertently changed
in this commit:

    f84344f3 rbd: separate mapping info in rbd_dev

Revert that hunk to return it to what it should be.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 94d613208c4..5d9e2cc5c51 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -397,7 +397,7 @@ enum {
 static match_table_t rbd_opts_tokens = {
 	/* int args above */
 	/* string args above */
-	{Opt_read_only, "mapping.read_only"},
+	{Opt_read_only, "read_only"},
 	{Opt_read_only, "ro"},		/* Alternate spelling */
 	{Opt_read_write, "read_write"},
 	{Opt_read_write, "rw"},		/* Alternate spelling */

From 13f4042c05b6a1a638ccab3f0cabdb84993803a2 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 10 Oct 2012 18:59:29 -0700
Subject: [PATCH 13/68] rbd: kill rbd_req_{read,write}()

Both rbd_req_read() and rbd_req_write() are simple wrapper routines
for rbd_do_op(), and each is only called once.  Replace each wrapper
call with a direct call to rbd_do_op(), and get rid of the wrapper
functions.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 60 ++++++++++-----------------------------------
 1 file changed, 13 insertions(+), 47 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 5d9e2cc5c51..8f2c39ad3bb 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1210,41 +1210,6 @@ done:
 	return ret;
 }
 
-/*
- * Request async osd write
- */
-static int rbd_req_write(struct request *rq,
-			 struct rbd_device *rbd_dev,
-			 struct ceph_snap_context *snapc,
-			 u64 ofs, u64 len,
-			 struct bio *bio,
-			 struct rbd_req_coll *coll,
-			 int coll_index)
-{
-	return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
-			 CEPH_OSD_OP_WRITE,
-			 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			 ofs, len, bio, coll, coll_index);
-}
-
-/*
- * Request async osd read
- */
-static int rbd_req_read(struct request *rq,
-			 struct rbd_device *rbd_dev,
-			 u64 snapid,
-			 u64 ofs, u64 len,
-			 struct bio *bio,
-			 struct rbd_req_coll *coll,
-			 int coll_index)
-{
-	return rbd_do_op(rq, rbd_dev, NULL,
-			 snapid,
-			 CEPH_OSD_OP_READ,
-			 CEPH_OSD_FLAG_READ,
-			 ofs, len, bio, coll, coll_index);
-}
-
 /*
  * Request sync osd read
  */
@@ -1550,21 +1515,22 @@ static void rbd_rq_fn(struct request_queue *q)
 				goto next_seg;
 			}
 
-
 			/* init OSD command: write or read */
 			if (do_write)
-				rbd_req_write(rq, rbd_dev,
-					      snapc,
-					      ofs,
-					      op_size, bio,
-					      coll, cur_seg);
+				(void) rbd_do_op(rq, rbd_dev,
+						snapc, CEPH_NOSNAP,
+						CEPH_OSD_OP_WRITE,
+						CEPH_OSD_FLAG_WRITE |
+						    CEPH_OSD_FLAG_ONDISK,
+						ofs, op_size, bio,
+						coll, cur_seg);
 			else
-				rbd_req_read(rq, rbd_dev,
-					     rbd_dev->mapping.snap_id,
-					     ofs,
-					     op_size, bio,
-					     coll, cur_seg);
-
+				(void) rbd_do_op(rq, rbd_dev,
+						NULL, rbd_dev->mapping.snap_id,
+						CEPH_OSD_OP_READ,
+						CEPH_OSD_FLAG_READ,
+						ofs, op_size, bio,
+						coll, cur_seg);
 next_seg:
 			size -= op_size;
 			ofs += op_size;

From ff2e4bb5b32f89c455979a4222a9b78007cde254 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 10 Oct 2012 18:59:29 -0700
Subject: [PATCH 14/68] rbd: drop rbd_do_op() opcode and flags

The only callers of rbd_do_op() are in rbd_rq_fn(), where call one
is used for writes and the other used for reads.  The request passed
to rbd_do_op() already encodes the I/O direction, and that
information can be used inside the function to set the opcode and
flags value (rather than passing them in as arguments).

So get rid of the opcode and flags arguments to rbd_do_op().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 8f2c39ad3bb..a29c6d2a49a 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1164,7 +1164,6 @@ static int rbd_do_op(struct request *rq,
 		     struct rbd_device *rbd_dev,
 		     struct ceph_snap_context *snapc,
 		     u64 snapid,
-		     int opcode, int flags,
 		     u64 ofs, u64 len,
 		     struct bio *bio,
 		     struct rbd_req_coll *coll,
@@ -1176,6 +1175,8 @@ static int rbd_do_op(struct request *rq,
 	int ret;
 	struct ceph_osd_req_op *ops;
 	u32 payload_len;
+	int opcode;
+	int flags;
 
 	seg_name = rbd_segment_name(rbd_dev, ofs);
 	if (!seg_name)
@@ -1183,7 +1184,15 @@ static int rbd_do_op(struct request *rq,
 	seg_len = rbd_segment_length(rbd_dev, ofs, len);
 	seg_ofs = rbd_segment_offset(rbd_dev, ofs);
 
-	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
+	if (rq_data_dir(rq) == WRITE) {
+		opcode = CEPH_OSD_OP_WRITE;
+		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
+		payload_len = seg_len;
+	} else {
+		opcode = CEPH_OSD_OP_READ;
+		flags = CEPH_OSD_FLAG_READ;
+		payload_len = 0;
+	}
 
 	ret = -ENOMEM;
 	ops = rbd_create_rw_ops(1, opcode, payload_len);
@@ -1519,16 +1528,11 @@ static void rbd_rq_fn(struct request_queue *q)
 			if (do_write)
 				(void) rbd_do_op(rq, rbd_dev,
 						snapc, CEPH_NOSNAP,
-						CEPH_OSD_OP_WRITE,
-						CEPH_OSD_FLAG_WRITE |
-						    CEPH_OSD_FLAG_ONDISK,
 						ofs, op_size, bio,
 						coll, cur_seg);
 			else
 				(void) rbd_do_op(rq, rbd_dev,
 						NULL, rbd_dev->mapping.snap_id,
-						CEPH_OSD_OP_READ,
-						CEPH_OSD_FLAG_READ,
 						ofs, op_size, bio,
 						coll, cur_seg);
 next_seg:

From 4634246db8cb2e5117ef7c682efcc383fa3354f8 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 10 Oct 2012 18:59:29 -0700
Subject: [PATCH 15/68] rbd: consolidate rbd_do_op() calls

The two calls to rbd_do_op() from rbd_rq_fn() differ only in the
value passed for the snapshot id and the snapshot context.

For reads the snapshot always comes from the mapping, and for writes
the snapshot id is always CEPH_NOSNAP.

The snapshot context is always null for reads.  For writes, the
snapshot context always comes from the rbd header, but it is
acquired under protection of header semaphore and could change
thereafter, so we can't simply use what's available inside
rbd_do_op().

Eliminate the snapid parameter from rbd_do_op(), and set it
based on the I/O direction inside that function instead.  Always
pass the snapshot context acquired in the caller, but reset it
to a null pointer inside rbd_do_op() if the operation is a read.

As a result, there is no difference in the read and write calls
to rbd_do_op() made in rbd_rq_fn(), so just call it unconditionally.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 26 +++++++++-----------------
 1 file changed, 9 insertions(+), 17 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index a29c6d2a49a..d0328835bbd 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1163,7 +1163,6 @@ done:
 static int rbd_do_op(struct request *rq,
 		     struct rbd_device *rbd_dev,
 		     struct ceph_snap_context *snapc,
-		     u64 snapid,
 		     u64 ofs, u64 len,
 		     struct bio *bio,
 		     struct rbd_req_coll *coll,
@@ -1177,6 +1176,7 @@ static int rbd_do_op(struct request *rq,
 	u32 payload_len;
 	int opcode;
 	int flags;
+	u64 snapid;
 
 	seg_name = rbd_segment_name(rbd_dev, ofs);
 	if (!seg_name)
@@ -1187,10 +1187,13 @@ static int rbd_do_op(struct request *rq,
 	if (rq_data_dir(rq) == WRITE) {
 		opcode = CEPH_OSD_OP_WRITE;
 		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
+		snapid = CEPH_NOSNAP;
 		payload_len = seg_len;
 	} else {
 		opcode = CEPH_OSD_OP_READ;
 		flags = CEPH_OSD_FLAG_READ;
+		snapc = NULL;
+		snapid = rbd_dev->mapping.snap_id;
 		payload_len = 0;
 	}
 
@@ -1518,24 +1521,13 @@ static void rbd_rq_fn(struct request_queue *q)
 			kref_get(&coll->kref);
 			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
 					      op_size, GFP_ATOMIC);
-			if (!bio) {
+			if (bio)
+				(void) rbd_do_op(rq, rbd_dev, snapc,
+						ofs, op_size,
+						bio, coll, cur_seg);
+			else
 				rbd_coll_end_req_index(rq, coll, cur_seg,
 						       -ENOMEM, op_size);
-				goto next_seg;
-			}
-
-			/* init OSD command: write or read */
-			if (do_write)
-				(void) rbd_do_op(rq, rbd_dev,
-						snapc, CEPH_NOSNAP,
-						ofs, op_size, bio,
-						coll, cur_seg);
-			else
-				(void) rbd_do_op(rq, rbd_dev,
-						NULL, rbd_dev->mapping.snap_id,
-						ofs, op_size, bio,
-						coll, cur_seg);
-next_seg:
 			size -= op_size;
 			ofs += op_size;
 

From db2388b6ee40a949084e4cdddc3b0a4357068a62 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Sat, 20 Oct 2012 22:17:27 -0500
Subject: [PATCH 16/68] rbd: verify rbd image order value

This adds a verification that an rbd image's object order is
within the upper and lower bounds supported by this implementation.

It must be at least 9 (SECTOR_SHIFT), because the Linux bio system
assumes that minimum granularity.

It also must be less than 32 (at the moment anyway) because there
exist spots in the code that store the size of a "segment" (object
backing an rbd image) in a signed int variable, which can be 32 bits
including the sign.  We should be able to relax this limit once
we've verified the code uses 64-bit types where needed.

Note that the CLI tool already limits the order to the range 12-25.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index d0328835bbd..4734446c3b5 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -533,6 +533,16 @@ static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
 		return false;
 
+	/* The bio layer requires at least sector-sized I/O */
+
+	if (ondisk->options.order < SECTOR_SHIFT)
+		return false;
+
+	/* If we use u64 in a few spots we may be able to loosen this */
+
+	if (ondisk->options.order > 8 * sizeof (int) - 1)
+		return false;
+
 	/*
 	 * The size of a snapshot header has to fit in a size_t, and
 	 * that limits the number of snapshots.

From d4b125e9eb43babd14538ba61718e3db71a98d29 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 3 Jul 2012 16:01:19 -0500
Subject: [PATCH 17/68] rbd: increase maximum snapshot name length

Change RBD_MAX_SNAP_NAME_LEN to be based on NAME_MAX.  That is a
practical limit for the length of a snapshot name (based on the
presence of a directory using the name under /sys/bus/rbd to
represent the snapshot).

The /sys entry is created by prefixing it with "snap_"; define that
prefix symbolically, and take its length into account in defining
the snapshot name length limit.

Enforce the limit in rbd_add_parse_args().  Also delete a dout()
call in that function that was not meant to be committed.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 4734446c3b5..4858d925b95 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -61,7 +61,10 @@
 
 #define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */
 
-#define RBD_MAX_SNAP_NAME_LEN	32
+#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
+#define RBD_MAX_SNAP_NAME_LEN	\
+			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
+
 #define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
 #define RBD_MAX_OPT_LEN		1024
 
@@ -2063,7 +2066,7 @@ static int rbd_register_snap_dev(struct rbd_snap *snap,
 	dev->type = &rbd_snap_device_type;
 	dev->parent = parent;
 	dev->release = rbd_snap_dev_release;
-	dev_set_name(dev, "snap_%s", snap->name);
+	dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
 	dout("%s: registering device for snapshot %s\n", __func__, snap->name);
 
 	ret = device_register(dev);
@@ -2797,8 +2800,13 @@ static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
 	if (!rbd_dev->image_name)
 		goto out_err;
 
-	/* Snapshot name is optional */
+	/* Snapshot name is optional; default is to use "head" */
+
 	len = next_token(&buf);
+	if (len > RBD_MAX_SNAP_NAME_LEN) {
+		err_ptr = ERR_PTR(-ENAMETOOLONG);
+		goto out_err;
+	}
 	if (!len) {
 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
@@ -2809,8 +2817,6 @@ static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
 	memcpy(snap_name, buf, len);
 	*(snap_name + len) = '\0';
 
-dout("    SNAP_NAME is <%s>, len is %zd\n", snap_name, len);
-
 	return snap_name;
 
 out_err:

From e5cfeed281a842a37c9da84bad2911c9b470347e Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Sat, 20 Oct 2012 22:17:27 -0500
Subject: [PATCH 18/68] rbd: simplify rbd_merge_bvec()

The aim of this patch is to make what's going on rbd_merge_bvec() a
bit more obvious than it was before.  This was an issue when a
recent btrfs bug led us to question whether the merge function was
working correctly.

Use "obj" rather than "chunk" to indicate the units whose boundaries
we care about we call (rados) "objects".

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>
---
 drivers/block/rbd.c | 47 +++++++++++++++++++++++++++++++--------------
 1 file changed, 33 insertions(+), 14 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 4858d925b95..76fbfa12006 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1566,22 +1566,41 @@ static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
 			  struct bio_vec *bvec)
 {
 	struct rbd_device *rbd_dev = q->queuedata;
-	unsigned int chunk_sectors;
-	sector_t sector;
-	unsigned int bio_sectors;
-	int max;
+	sector_t sector_offset;
+	sector_t sectors_per_obj;
+	sector_t obj_sector_offset;
+	int ret;
 
-	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
-	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
-	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
+	/*
+	 * Find how far into its rbd object the partition-relative
+	 * bio start sector is to offset relative to the enclosing
+	 * device.
+	 */
+	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
+	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
+	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
 
-	max =  (chunk_sectors - ((sector & (chunk_sectors - 1))
-				 + bio_sectors)) << SECTOR_SHIFT;
-	if (max < 0)
-		max = 0; /* bio_add cannot handle a negative return */
-	if (max <= bvec->bv_len && bio_sectors == 0)
-		return bvec->bv_len;
-	return max;
+	/*
+	 * Compute the number of bytes from that offset to the end
+	 * of the object.  Account for what's already used by the bio.
+	 */
+	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
+	if (ret > bmd->bi_size)
+		ret -= bmd->bi_size;
+	else
+		ret = 0;
+
+	/*
+	 * Don't send back more than was asked for.  And if the bio
+	 * was empty, let the whole thing through because:  "Note
+	 * that a block device *must* allow a single page to be
+	 * added to an empty bio."
+	 */
+	rbd_assert(bvec->bv_len <= PAGE_SIZE);
+	if (ret > (int) bvec->bv_len || !bmd->bi_size)
+		ret = (int) bvec->bv_len;
+
+	return ret;
 }
 
 static void rbd_free_disk(struct rbd_device *rbd_dev)

From 069a4b5690a952e74157fd489833c71c73f012b3 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 22 Oct 2012 11:31:27 -0500
Subject: [PATCH 19/68] rbd: kill rbd_device->rbd_opts

The rbd_device structure has an embedded rbd_options structure.
Such a structure is needed to work with the generic ceph argument
parsing code, but there's no need to keep it around once argument
parsing is done.

Use a local variable to hold the rbd options used in parsing in
rbd_get_client(), and just transfer its content (it's just a
read_only flag) into the field in the rbd_mapping sub-structure
that requires that information.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>
---
 drivers/block/rbd.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 76fbfa12006..c800047f583 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -184,7 +184,6 @@ struct rbd_device {
 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
 
 	u32			image_format;	/* Either 1 or 2 */
-	struct rbd_options	rbd_opts;
 	struct rbd_client	*rbd_client;
 
 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
@@ -456,18 +455,24 @@ static int parse_rbd_opts_token(char *c, void *private)
 static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
 				size_t mon_addr_len, char *options)
 {
-	struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
+	struct rbd_options rbd_opts;
 	struct ceph_options *ceph_opts;
 	struct rbd_client *rbdc;
 
-	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
+	/* Initialize all rbd options to the defaults */
+
+	rbd_opts.read_only = RBD_READ_ONLY_DEFAULT;
 
 	ceph_opts = ceph_parse_options(options, mon_addr,
 					mon_addr + mon_addr_len,
-					parse_rbd_opts_token, rbd_opts);
+					parse_rbd_opts_token, &rbd_opts);
 	if (IS_ERR(ceph_opts))
 		return PTR_ERR(ceph_opts);
 
+	/* Record the parsed rbd options */
+
+	rbd_dev->mapping.read_only = rbd_opts.read_only;
+
 	rbdc = rbd_client_find(ceph_opts);
 	if (rbdc) {
 		/* using an existing client */
@@ -685,7 +690,6 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
 		rbd_dev->mapping.size = rbd_dev->header.image_size;
 		rbd_dev->mapping.features = rbd_dev->header.features;
 		rbd_dev->mapping.snap_exists = false;
-		rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
 		ret = 0;
 	} else {
 		ret = snap_by_name(rbd_dev, snap_name);

From 0ed7285e0001b960c888e5455ae982025210ed3d Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 29 Oct 2012 11:01:42 -0700
Subject: [PATCH 20/68] libceph: fix osdmap decode error paths

Ensure that we set the err value correctly so that we do not pass a 0
value to ERR_PTR and confuse the calling code.  (In particular,
osd_client.c handle_map() will BUG(!newmap)).

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/osdmap.c | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 5433fb0eb3c..f552aa48fd9 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -645,10 +645,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 	ceph_decode_32_safe(p, end, max, bad);
 	while (max--) {
 		ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
+		err = -ENOMEM;
 		pi = kzalloc(sizeof(*pi), GFP_NOFS);
 		if (!pi)
 			goto bad;
 		pi->id = ceph_decode_32(p);
+		err = -EINVAL;
 		ev = ceph_decode_8(p); /* encoding version */
 		if (ev > CEPH_PG_POOL_VERSION) {
 			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
@@ -664,8 +666,13 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 		__insert_pg_pool(&map->pg_pools, pi);
 	}
 
-	if (version >= 5 && __decode_pool_names(p, end, map) < 0)
-		goto bad;
+	if (version >= 5) {
+		err = __decode_pool_names(p, end, map);
+		if (err < 0) {
+			dout("fail to decode pool names");
+			goto bad;
+		}
+	}
 
 	ceph_decode_32_safe(p, end, map->pool_max, bad);
 
@@ -745,7 +752,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 	return map;
 
 bad:
-	dout("osdmap_decode fail\n");
+	dout("osdmap_decode fail err %d\n", err);
 	ceph_osdmap_destroy(map);
 	return ERR_PTR(err);
 }
@@ -839,6 +846,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 		if (ev > CEPH_PG_POOL_VERSION) {
 			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
 				   ev, CEPH_PG_POOL_VERSION);
+			err = -EINVAL;
 			goto bad;
 		}
 		pi = __lookup_pg_pool(&map->pg_pools, pool);
@@ -855,8 +863,11 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 		if (err < 0)
 			goto bad;
 	}
-	if (version >= 5 && __decode_pool_names(p, end, map) < 0)
-		goto bad;
+	if (version >= 5) {
+		err = __decode_pool_names(p, end, map);
+		if (err < 0)
+			goto bad;
+	}
 
 	/* old_pool */
 	ceph_decode_32_safe(p, end, len, bad);
@@ -932,15 +943,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 			(void) __remove_pg_mapping(&map->pg_temp, pgid);
 
 			/* insert */
-			if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) {
-				err = -EINVAL;
+			err = -EINVAL;
+			if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
 				goto bad;
-			}
+			err = -ENOMEM;
 			pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
-			if (!pg) {
-				err = -ENOMEM;
+			if (!pg)
 				goto bad;
-			}
 			pg->pgid = pgid;
 			pg->len = pglen;
 			for (j = 0; j < pglen; j++)

From f7760dad286829682a8d36f4563ab20a65732414 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Sat, 20 Oct 2012 22:17:27 -0500
Subject: [PATCH 21/68] rbd: simplify rbd_rq_fn()

When processing a request, rbd_rq_fn() makes clones of the bio's in
the request's bio chain and submits the results to osd's to be
satisfied.  If a request bio straddles the boundary between objects
backing the rbd image, it must be represented by two cloned bio's,
one for the first part (at the end of one object) and one for the
second (at the beginning of the next object).

This has been handled by a function bio_chain_clone(), which
includes an interface only a mother could love, and which has
been found to have other problems.

This patch defines two new fairly generic bio functions (one which
replaces bio_chain_clone()) to help out the situation, and then
revises rbd_rq_fn() to make use of them.

First, bio_clone_range() clones a portion of a single bio, starting
at a given offset within the bio and including only as many bytes
as requested.  As a convenience, a request to clone the entire bio
is passed directly to bio_clone().

Second, bio_chain_clone_range() performs a similar function,
producing a chain of cloned bio's covering a sub-range of the
source chain.  No bio_pair structures are used, and if successful
the result will represent exactly the specified range.

Using bio_chain_clone_range() makes bio_rq_fn() a little easier
to understand, because it avoids the need to pass very much
state information between consecutive calls.  By avoiding the need
to track a bio_pair structure, it also eliminates the problem
described here:  http://tracker.newdream.net/issues/2933

Note that a block request (and therefore the complete length of
a bio chain processed in rbd_rq_fn()) is an unsigned int, while
the result of rbd_segment_length() is u64.  This change makes
this range trunctation explicit, and trips a bug if the the
segment boundary is too far off.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 231 +++++++++++++++++++++++++++++---------------
 1 file changed, 152 insertions(+), 79 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index c800047f583..cc06c55875b 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -826,77 +826,144 @@ static void zero_bio_chain(struct bio *chain, int start_ofs)
 }
 
 /*
- * bio_chain_clone - clone a chain of bios up to a certain length.
- * might return a bio_pair that will need to be released.
+ * Clone a portion of a bio, starting at the given byte offset
+ * and continuing for the number of bytes indicated.
  */
-static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
-				   struct bio_pair **bp,
-				   int len, gfp_t gfpmask)
+static struct bio *bio_clone_range(struct bio *bio_src,
+					unsigned int offset,
+					unsigned int len,
+					gfp_t gfpmask)
 {
-	struct bio *old_chain = *old;
-	struct bio *new_chain = NULL;
-	struct bio *tail;
-	int total = 0;
+	struct bio_vec *bv;
+	unsigned int resid;
+	unsigned short idx;
+	unsigned int voff;
+	unsigned short end_idx;
+	unsigned short vcnt;
+	struct bio *bio;
 
-	if (*bp) {
-		bio_pair_release(*bp);
-		*bp = NULL;
+	/* Handle the easy case for the caller */
+
+	if (!offset && len == bio_src->bi_size)
+		return bio_clone(bio_src, gfpmask);
+
+	if (WARN_ON_ONCE(!len))
+		return NULL;
+	if (WARN_ON_ONCE(len > bio_src->bi_size))
+		return NULL;
+	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
+		return NULL;
+
+	/* Find first affected segment... */
+
+	resid = offset;
+	__bio_for_each_segment(bv, bio_src, idx, 0) {
+		if (resid < bv->bv_len)
+			break;
+		resid -= bv->bv_len;
+	}
+	voff = resid;
+
+	/* ...and the last affected segment */
+
+	resid += len;
+	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
+		if (resid <= bv->bv_len)
+			break;
+		resid -= bv->bv_len;
+	}
+	vcnt = end_idx - idx + 1;
+
+	/* Build the clone */
+
+	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
+	if (!bio)
+		return NULL;	/* ENOMEM */
+
+	bio->bi_bdev = bio_src->bi_bdev;
+	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
+	bio->bi_rw = bio_src->bi_rw;
+	bio->bi_flags |= 1 << BIO_CLONED;
+
+	/*
+	 * Copy over our part of the bio_vec, then update the first
+	 * and last (or only) entries.
+	 */
+	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
+			vcnt * sizeof (struct bio_vec));
+	bio->bi_io_vec[0].bv_offset += voff;
+	if (vcnt > 1) {
+		bio->bi_io_vec[0].bv_len -= voff;
+		bio->bi_io_vec[vcnt - 1].bv_len = resid;
+	} else {
+		bio->bi_io_vec[0].bv_len = len;
 	}
 
-	while (old_chain && (total < len)) {
-		struct bio *tmp;
+	bio->bi_vcnt = vcnt;
+	bio->bi_size = len;
+	bio->bi_idx = 0;
 
-		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
-		if (!tmp)
-			goto err_out;
-		gfpmask &= ~__GFP_WAIT;	/* can't wait after the first */
+	return bio;
+}
 
-		if (total + old_chain->bi_size > len) {
-			struct bio_pair *bp;
+/*
+ * Clone a portion of a bio chain, starting at the given byte offset
+ * into the first bio in the source chain and continuing for the
+ * number of bytes indicated.  The result is another bio chain of
+ * exactly the given length, or a null pointer on error.
+ *
+ * The bio_src and offset parameters are both in-out.  On entry they
+ * refer to the first source bio and the offset into that bio where
+ * the start of data to be cloned is located.
+ *
+ * On return, bio_src is updated to refer to the bio in the source
+ * chain that contains first un-cloned byte, and *offset will
+ * contain the offset of that byte within that bio.
+ */
+static struct bio *bio_chain_clone_range(struct bio **bio_src,
+					unsigned int *offset,
+					unsigned int len,
+					gfp_t gfpmask)
+{
+	struct bio *bi = *bio_src;
+	unsigned int off = *offset;
+	struct bio *chain = NULL;
+	struct bio **end;
 
-			/*
-			 * this split can only happen with a single paged bio,
-			 * split_bio will BUG_ON if this is not the case
-			 */
-			dout("bio_chain_clone split! total=%d remaining=%d"
-			     "bi_size=%u\n",
-			     total, len - total, old_chain->bi_size);
+	/* Build up a chain of clone bios up to the limit */
 
-			/* split the bio. We'll release it either in the next
-			   call, or it will have to be released outside */
-			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
-			if (!bp)
-				goto err_out;
+	if (!bi || off >= bi->bi_size || !len)
+		return NULL;		/* Nothing to clone */
 
-			__bio_clone(tmp, &bp->bio1);
+	end = &chain;
+	while (len) {
+		unsigned int bi_size;
+		struct bio *bio;
 
-			*next = &bp->bio2;
-		} else {
-			__bio_clone(tmp, old_chain);
-			*next = old_chain->bi_next;
+		if (!bi)
+			goto out_err;	/* EINVAL; ran out of bio's */
+		bi_size = min_t(unsigned int, bi->bi_size - off, len);
+		bio = bio_clone_range(bi, off, bi_size, gfpmask);
+		if (!bio)
+			goto out_err;	/* ENOMEM */
+
+		*end = bio;
+		end = &bio->bi_next;
+
+		off += bi_size;
+		if (off == bi->bi_size) {
+			bi = bi->bi_next;
+			off = 0;
 		}
-
-		tmp->bi_bdev = NULL;
-		tmp->bi_next = NULL;
-		if (new_chain)
-			tail->bi_next = tmp;
-		else
-			new_chain = tmp;
-		tail = tmp;
-		old_chain = old_chain->bi_next;
-
-		total += tmp->bi_size;
+		len -= bi_size;
 	}
+	*bio_src = bi;
+	*offset = off;
 
-	rbd_assert(total == len);
+	return chain;
+out_err:
+	bio_chain_put(chain);
 
-	*old = old_chain;
-
-	return new_chain;
-
-err_out:
-	dout("bio_chain_clone with err\n");
-	bio_chain_put(new_chain);
 	return NULL;
 }
 
@@ -1014,8 +1081,9 @@ static int rbd_do_request(struct request *rq,
 		req_data->coll_index = coll_index;
 	}
 
-	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
-		(unsigned long long) ofs, (unsigned long long) len);
+	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
+		object_name, (unsigned long long) ofs,
+		(unsigned long long) len, coll, coll_index);
 
 	osdc = &rbd_dev->rbd_client->client->osdc;
 	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
@@ -1463,18 +1531,16 @@ static void rbd_rq_fn(struct request_queue *q)
 {
 	struct rbd_device *rbd_dev = q->queuedata;
 	struct request *rq;
-	struct bio_pair *bp = NULL;
 
 	while ((rq = blk_fetch_request(q))) {
 		struct bio *bio;
-		struct bio *rq_bio, *next_bio = NULL;
 		bool do_write;
 		unsigned int size;
-		u64 op_size = 0;
 		u64 ofs;
 		int num_segs, cur_seg = 0;
 		struct rbd_req_coll *coll;
 		struct ceph_snap_context *snapc;
+		unsigned int bio_offset;
 
 		dout("fetched request\n");
 
@@ -1486,10 +1552,6 @@ static void rbd_rq_fn(struct request_queue *q)
 
 		/* deduce our operation (read, write) */
 		do_write = (rq_data_dir(rq) == WRITE);
-
-		size = blk_rq_bytes(rq);
-		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
-		rq_bio = rq->bio;
 		if (do_write && rbd_dev->mapping.read_only) {
 			__blk_end_request_all(rq, -EROFS);
 			continue;
@@ -1512,6 +1574,10 @@ static void rbd_rq_fn(struct request_queue *q)
 
 		up_read(&rbd_dev->header_rwsem);
 
+		size = blk_rq_bytes(rq);
+		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
+		bio = rq->bio;
+
 		dout("%s 0x%x bytes at 0x%llx\n",
 		     do_write ? "write" : "read",
 		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
@@ -1531,30 +1597,37 @@ static void rbd_rq_fn(struct request_queue *q)
 			continue;
 		}
 
+		bio_offset = 0;
 		do {
-			/* a bio clone to be passed down to OSD req */
+			u64 limit = rbd_segment_length(rbd_dev, ofs, size);
+			unsigned int chain_size;
+			struct bio *bio_chain;
+
+			BUG_ON(limit > (u64) UINT_MAX);
+			chain_size = (unsigned int) limit;
 			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
-			op_size = rbd_segment_length(rbd_dev, ofs, size);
+
 			kref_get(&coll->kref);
-			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
-					      op_size, GFP_ATOMIC);
-			if (bio)
+
+			/* Pass a cloned bio chain via an osd request */
+
+			bio_chain = bio_chain_clone_range(&bio,
+						&bio_offset, chain_size,
+						GFP_ATOMIC);
+			if (bio_chain)
 				(void) rbd_do_op(rq, rbd_dev, snapc,
-						ofs, op_size,
-						bio, coll, cur_seg);
+						ofs, chain_size,
+						bio_chain, coll, cur_seg);
 			else
 				rbd_coll_end_req_index(rq, coll, cur_seg,
-						       -ENOMEM, op_size);
-			size -= op_size;
-			ofs += op_size;
+						       -ENOMEM, chain_size);
+			size -= chain_size;
+			ofs += chain_size;
 
 			cur_seg++;
-			rq_bio = next_bio;
 		} while (size > 0);
 		kref_put(&coll->kref, rbd_coll_release);
 
-		if (bp)
-			bio_pair_release(bp);
 		spin_lock_irq(q->queue_lock);
 
 		ceph_put_snap_context(snapc);
@@ -1564,7 +1637,7 @@ static void rbd_rq_fn(struct request_queue *q)
 /*
  * a queue callback. Makes sure that we don't create a bio that spans across
  * multiple osd objects. One exception would be with a single page bios,
- * which we handle later at bio_chain_clone
+ * which we handle later at bio_chain_clone_range()
  */
 static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
 			  struct bio_vec *bvec)

From 41f38c2b2f8b66b176a0e548ef06294343a7bfa2 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 25 Oct 2012 23:34:40 -0500
Subject: [PATCH 22/68] rbd: remove snapshots on error in rbd_add()

If rbd_dev_snaps_update() has ever been called for an rbd device
structure there could be snapshot structures on its snaps list.
In rbd_add(), this function is called but a subsequent error
path neglected to clean up any of these snapshots.

Add a call to rbd_remove_all_snaps() in the appropriate spot to
remedy this.  Change a couple of error labels to be a little
clearer while there.

Drop the leading underscores from the function name; there's nothing
special about that function that they might signify.  As suggested
in review, the leading underscores in __rbd_remove_snap_dev() have
been removed as well.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index cc06c55875b..c7681d46cf8 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -228,7 +228,7 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
 static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
 
 static void rbd_dev_release(struct device *dev);
-static void __rbd_remove_snap_dev(struct rbd_snap *snap);
+static void rbd_remove_snap_dev(struct rbd_snap *snap);
 
 static ssize_t rbd_add(struct bus_type *bus, const char *buf,
 		       size_t count);
@@ -1787,13 +1787,13 @@ static int rbd_read_header(struct rbd_device *rbd_dev,
 	return ret;
 }
 
-static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
+static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
 {
 	struct rbd_snap *snap;
 	struct rbd_snap *next;
 
 	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
-		__rbd_remove_snap_dev(snap);
+		rbd_remove_snap_dev(snap);
 }
 
 static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
@@ -2146,7 +2146,7 @@ static bool rbd_snap_registered(struct rbd_snap *snap)
 	return ret;
 }
 
-static void __rbd_remove_snap_dev(struct rbd_snap *snap)
+static void rbd_remove_snap_dev(struct rbd_snap *snap)
 {
 	list_del(&snap->node);
 	if (device_is_registered(&snap->dev))
@@ -2569,7 +2569,7 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
 
 			if (rbd_dev->mapping.snap_id == snap->id)
 				rbd_dev->mapping.snap_exists = false;
-			__rbd_remove_snap_dev(snap);
+			rbd_remove_snap_dev(snap);
 			dout("%ssnap id %llu has been removed\n",
 				rbd_dev->mapping.snap_id == snap->id ?
 								"mapped " : "",
@@ -3179,11 +3179,11 @@ static ssize_t rbd_add(struct bus_type *bus,
 	/* no need to lock here, as rbd_dev is not registered yet */
 	rc = rbd_dev_snaps_update(rbd_dev);
 	if (rc)
-		goto err_out_header;
+		goto err_out_probe;
 
 	rc = rbd_dev_set_mapping(rbd_dev, snap_name);
 	if (rc)
-		goto err_out_header;
+		goto err_out_snaps;
 
 	/* generate unique id: find highest unique id, add one */
 	rbd_dev_id_get(rbd_dev);
@@ -3247,7 +3247,9 @@ err_out_blkdev:
 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
 err_out_id:
 	rbd_dev_id_put(rbd_dev);
-err_out_header:
+err_out_snaps:
+	rbd_remove_all_snaps(rbd_dev);
+err_out_probe:
 	rbd_header_free(&rbd_dev->header);
 err_out_client:
 	kfree(rbd_dev->header_name);
@@ -3345,7 +3347,7 @@ static ssize_t rbd_remove(struct bus_type *bus,
 		goto done;
 	}
 
-	__rbd_remove_all_snaps(rbd_dev);
+	rbd_remove_all_snaps(rbd_dev);
 	rbd_bus_del_dev(rbd_dev);
 
 done:

From 86992098e7fdb97d01feb51495a952b264a55b7c Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 25 Oct 2012 23:34:41 -0500
Subject: [PATCH 23/68] rbd: make pool_id a 64 bit value

If a format 2 image has a parent, its pool id will be specified
using a 64-bit value.  Change the pool id we save for an image to
match that.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index c7681d46cf8..ef82e9091f4 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -197,7 +197,7 @@ struct rbd_device {
 	size_t			image_name_len;
 	char			*header_name;
 	char			*pool_name;
-	int			pool_id;
+	u64			pool_id;
 
 	struct ceph_osd_event   *watch_event;
 	struct ceph_osd_request *watch_request;
@@ -1113,7 +1113,7 @@ static int rbd_do_request(struct request *rq,
 	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
 	layout->fl_stripe_count = cpu_to_le32(1);
 	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
-	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
+	layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->pool_id);
 	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
 				   req, ops);
 	rbd_assert(ret == 0);
@@ -1982,7 +1982,7 @@ static ssize_t rbd_pool_id_show(struct device *dev,
 {
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
-	return sprintf(buf, "%d\n", rbd_dev->pool_id);
+	return sprintf(buf, "%llu\n", (unsigned long long) rbd_dev->pool_id);
 }
 
 static ssize_t rbd_name_show(struct device *dev,
@@ -3170,7 +3170,7 @@ static ssize_t rbd_add(struct bus_type *bus,
 	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
 	if (rc < 0)
 		goto err_out_client;
-	rbd_dev->pool_id = rc;
+	rbd_dev->pool_id = (u64) rc;
 
 	rc = rbd_dev_probe(rbd_dev);
 	if (rc < 0)

From 971f839a7670197366c04e99472943532caeb0dc Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 25 Oct 2012 23:34:41 -0500
Subject: [PATCH 24/68] rbd: move snap info out of rbd_mapping struct

Moving the snap_id and snap_name fields into the separate
rbd_mapping structure was misguided.  (And in time, perhaps
we'll do away with that structure altogether...)

Move these fields back into struct rbd_device.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index ef82e9091f4..7d28ce33056 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -166,8 +166,6 @@ struct rbd_snap {
 };
 
 struct rbd_mapping {
-	char                    *snap_name;
-	u64                     snap_id;
 	u64                     size;
 	u64                     features;
 	bool                    snap_exists;
@@ -199,6 +197,9 @@ struct rbd_device {
 	char			*pool_name;
 	u64			pool_id;
 
+	char                    *snap_name;
+	u64                     snap_id;
+
 	struct ceph_osd_event   *watch_event;
 	struct ceph_osd_request *watch_request;
 
@@ -669,7 +670,7 @@ static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
 
 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
 		if (!strcmp(snap_name, snap->name)) {
-			rbd_dev->mapping.snap_id = snap->id;
+			rbd_dev->snap_id = snap->id;
 			rbd_dev->mapping.size = snap->size;
 			rbd_dev->mapping.features = snap->features;
 
@@ -686,7 +687,7 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
 
 	if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
 		    sizeof (RBD_SNAP_HEAD_NAME))) {
-		rbd_dev->mapping.snap_id = CEPH_NOSNAP;
+		rbd_dev->snap_id = CEPH_NOSNAP;
 		rbd_dev->mapping.size = rbd_dev->header.image_size;
 		rbd_dev->mapping.features = rbd_dev->header.features;
 		rbd_dev->mapping.snap_exists = false;
@@ -698,7 +699,7 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
 		rbd_dev->mapping.snap_exists = true;
 		rbd_dev->mapping.read_only = true;
 	}
-	rbd_dev->mapping.snap_name = snap_name;
+	rbd_dev->snap_name = snap_name;
 done:
 	return ret;
 }
@@ -1278,7 +1279,7 @@ static int rbd_do_op(struct request *rq,
 		opcode = CEPH_OSD_OP_READ;
 		flags = CEPH_OSD_FLAG_READ;
 		snapc = NULL;
-		snapid = rbd_dev->mapping.snap_id;
+		snapid = rbd_dev->snap_id;
 		payload_len = 0;
 	}
 
@@ -1561,7 +1562,7 @@ static void rbd_rq_fn(struct request_queue *q)
 
 		down_read(&rbd_dev->header_rwsem);
 
-		if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
+		if (rbd_dev->snap_id != CEPH_NOSNAP &&
 				!rbd_dev->mapping.snap_exists) {
 			up_read(&rbd_dev->header_rwsem);
 			dout("request for non-existent snapshot");
@@ -1800,7 +1801,7 @@ static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
 {
 	sector_t size;
 
-	if (rbd_dev->mapping.snap_id != CEPH_NOSNAP)
+	if (rbd_dev->snap_id != CEPH_NOSNAP)
 		return;
 
 	size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
@@ -2011,7 +2012,7 @@ static ssize_t rbd_snap_show(struct device *dev,
 {
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
-	return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
+	return sprintf(buf, "%s\n", rbd_dev->snap_name);
 }
 
 static ssize_t rbd_image_refresh(struct device *dev,
@@ -2567,12 +2568,11 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
 
 			/* Existing snapshot not in the new snap context */
 
-			if (rbd_dev->mapping.snap_id == snap->id)
+			if (rbd_dev->snap_id == snap->id)
 				rbd_dev->mapping.snap_exists = false;
 			rbd_remove_snap_dev(snap);
 			dout("%ssnap id %llu has been removed\n",
-				rbd_dev->mapping.snap_id == snap->id ?
-								"mapped " : "",
+				rbd_dev->snap_id == snap->id ?  "mapped " : "",
 				(unsigned long long) snap->id);
 
 			/* Done with this list entry; advance */
@@ -3256,7 +3256,7 @@ err_out_client:
 	rbd_put_client(rbd_dev);
 	kfree(rbd_dev->image_id);
 err_out_args:
-	kfree(rbd_dev->mapping.snap_name);
+	kfree(rbd_dev->snap_name);
 	kfree(rbd_dev->image_name);
 	kfree(rbd_dev->pool_name);
 err_out_mem:
@@ -3309,7 +3309,7 @@ static void rbd_dev_release(struct device *dev)
 	rbd_header_free(&rbd_dev->header);
 
 	/* done with the id, and with the rbd_dev */
-	kfree(rbd_dev->mapping.snap_name);
+	kfree(rbd_dev->snap_name);
 	kfree(rbd_dev->image_id);
 	kfree(rbd_dev->header_name);
 	kfree(rbd_dev->pool_name);

From daba5fdb4c469838dcee4b8dd4fecf7be69fa218 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 26 Oct 2012 17:25:23 -0500
Subject: [PATCH 25/68] rbd: rename snap_exists field

A Boolean field "snap_exists" in an rbd mapping is used to indicate
whether a mapped snapshot has been removed from an image's snapshot
context, to stop sending requests for that snapshot as soon as we
know it's gone.

Generalize the interpretation of this field so it applies to
non-snapshot (i.e. "head") mappings.  That is, define its value
to be false until the mapping has been set, and then define it to be
true for both snapshot mappings or head mappings.

Rename the field "exists" to reflect the broader interpretation.
The rbd_mapping structure is on its way out, so move the field
back into the rbd_device structure.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 7d28ce33056..589f56542df 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -168,7 +168,6 @@ struct rbd_snap {
 struct rbd_mapping {
 	u64                     size;
 	u64                     features;
-	bool                    snap_exists;
 	bool			read_only;
 };
 
@@ -189,6 +188,7 @@ struct rbd_device {
 	spinlock_t		lock;		/* queue lock */
 
 	struct rbd_image_header	header;
+	bool                    exists;
 	char			*image_id;
 	size_t			image_id_len;
 	char			*image_name;
@@ -690,16 +690,15 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
 		rbd_dev->snap_id = CEPH_NOSNAP;
 		rbd_dev->mapping.size = rbd_dev->header.image_size;
 		rbd_dev->mapping.features = rbd_dev->header.features;
-		rbd_dev->mapping.snap_exists = false;
 		ret = 0;
 	} else {
 		ret = snap_by_name(rbd_dev, snap_name);
 		if (ret < 0)
 			goto done;
-		rbd_dev->mapping.snap_exists = true;
 		rbd_dev->mapping.read_only = true;
 	}
 	rbd_dev->snap_name = snap_name;
+	rbd_dev->exists = true;
 done:
 	return ret;
 }
@@ -1562,8 +1561,8 @@ static void rbd_rq_fn(struct request_queue *q)
 
 		down_read(&rbd_dev->header_rwsem);
 
-		if (rbd_dev->snap_id != CEPH_NOSNAP &&
-				!rbd_dev->mapping.snap_exists) {
+		if (!rbd_dev->exists) {
+			rbd_assert(rbd_dev->snap_id != CEPH_NOSNAP);
 			up_read(&rbd_dev->header_rwsem);
 			dout("request for non-existent snapshot");
 			spin_lock_irq(q->queue_lock);
@@ -2569,7 +2568,7 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
 			/* Existing snapshot not in the new snap context */
 
 			if (rbd_dev->snap_id == snap->id)
-				rbd_dev->mapping.snap_exists = false;
+				rbd_dev->exists = false;
 			rbd_remove_snap_dev(snap);
 			dout("%ssnap id %llu has been removed\n",
 				rbd_dev->snap_id == snap->id ?  "mapped " : "",

From 78cea76e0580befaf561c6989f4fc985bc66c8f7 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 25 Oct 2012 23:34:41 -0500
Subject: [PATCH 26/68] rbd: move ceph_parse_options() call up

Move option parsing out of rbd_get_client() and into its caller.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 48 +++++++++++++++++++++++++--------------------
 1 file changed, 27 insertions(+), 21 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 589f56542df..e83bddcca34 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -453,27 +453,11 @@ static int parse_rbd_opts_token(char *c, void *private)
  * Get a ceph client with specific addr and configuration, if one does
  * not exist create it.
  */
-static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
-				size_t mon_addr_len, char *options)
+static int rbd_get_client(struct rbd_device *rbd_dev,
+				struct ceph_options *ceph_opts)
 {
-	struct rbd_options rbd_opts;
-	struct ceph_options *ceph_opts;
 	struct rbd_client *rbdc;
 
-	/* Initialize all rbd options to the defaults */
-
-	rbd_opts.read_only = RBD_READ_ONLY_DEFAULT;
-
-	ceph_opts = ceph_parse_options(options, mon_addr,
-					mon_addr + mon_addr_len,
-					parse_rbd_opts_token, &rbd_opts);
-	if (IS_ERR(ceph_opts))
-		return PTR_ERR(ceph_opts);
-
-	/* Record the parsed rbd options */
-
-	rbd_dev->mapping.read_only = rbd_opts.read_only;
-
 	rbdc = rbd_client_find(ceph_opts);
 	if (rbdc) {
 		/* using an existing client */
@@ -3132,9 +3116,11 @@ static ssize_t rbd_add(struct bus_type *bus,
 	struct rbd_device *rbd_dev = NULL;
 	const char *mon_addrs = NULL;
 	size_t mon_addrs_size = 0;
+	char *snap_name;
+	struct rbd_options rbd_opts;
+	struct ceph_options *ceph_opts;
 	struct ceph_osd_client *osdc;
 	int rc = -ENOMEM;
-	char *snap_name;
 
 	if (!try_module_get(THIS_MODULE))
 		return -ENODEV;
@@ -3160,9 +3146,26 @@ static ssize_t rbd_add(struct bus_type *bus,
 		goto err_out_mem;
 	}
 
-	rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
-	if (rc < 0)
+	/* Initialize all rbd options to the defaults */
+
+	rbd_opts.read_only = RBD_READ_ONLY_DEFAULT;
+
+	ceph_opts = ceph_parse_options(options, mon_addrs,
+					mon_addrs + mon_addrs_size - 1,
+					parse_rbd_opts_token, &rbd_opts);
+	if (IS_ERR(ceph_opts)) {
+		rc = PTR_ERR(ceph_opts);
 		goto err_out_args;
+	}
+
+	/* Record the parsed rbd options */
+
+	rbd_dev->mapping.read_only = rbd_opts.read_only;
+
+	rc = rbd_get_client(rbd_dev, ceph_opts);
+	if (rc < 0)
+		goto err_out_opts;
+	ceph_opts = NULL;	/* ceph_opts now owned by rbd_dev client */
 
 	/* pick the pool */
 	osdc = &rbd_dev->rbd_client->client->osdc;
@@ -3254,6 +3257,9 @@ err_out_client:
 	kfree(rbd_dev->header_name);
 	rbd_put_client(rbd_dev);
 	kfree(rbd_dev->image_id);
+err_out_opts:
+	if (ceph_opts)
+		ceph_destroy_options(ceph_opts);
 err_out_args:
 	kfree(rbd_dev->snap_name);
 	kfree(rbd_dev->image_name);

From 0ddebc0c6c518ae42c376151e34d9d4b84443ba5 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 25 Oct 2012 23:34:41 -0500
Subject: [PATCH 27/68] rbd: do all argument parsing in one place

This patch makes rbd_add_parse_args() be the single place all
argument parsing occurs for an image map request:
    - Move the ceph_parse_options() call into that function
    - Use local variables rather than parameters to hold the list
      of monitor addresses supplied
    - Rather than returning it, pass the snapshot name (and its
      length) back via parameters
    - Have the function return a ceph_options structure pointer

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 79 ++++++++++++++++++++++-----------------------
 1 file changed, 38 insertions(+), 41 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index e83bddcca34..68447d83288 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2845,24 +2845,27 @@ static inline char *dup_token(const char **buf, size_t *lenp)
  *
  * Note: rbd_dev is assumed to have been initially zero-filled.
  */
-static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
-				const char *buf,
-				const char **mon_addrs,
-				size_t *mon_addrs_size,
-				char *options,
-				size_t options_size)
+static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev,
+						const char *buf,
+						char *options,
+						size_t options_size,
+						char **snap_name,
+						size_t *snap_name_len)
 {
 	size_t len;
-	char *err_ptr = ERR_PTR(-EINVAL);
-	char *snap_name;
+	const char *mon_addrs;
+	size_t mon_addrs_size;
+	struct rbd_options rbd_opts;
+	struct ceph_options *ceph_opts;
+	struct ceph_options *err_ptr = ERR_PTR(-EINVAL);
 
 	/* The first four tokens are required */
 
 	len = next_token(&buf);
 	if (!len)
 		return err_ptr;
-	*mon_addrs_size = len + 1;
-	*mon_addrs = buf;
+	mon_addrs_size = len + 1;
+	mon_addrs = buf;
 
 	buf += len;
 
@@ -2890,14 +2893,27 @@ static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
 	}
-	snap_name = kmalloc(len + 1, GFP_KERNEL);
-	if (!snap_name)
+	*snap_name = kmalloc(len + 1, GFP_KERNEL);
+	if (!*snap_name)
 		goto out_err;
-	memcpy(snap_name, buf, len);
-	*(snap_name + len) = '\0';
+	memcpy(*snap_name, buf, len);
+	*(*snap_name + len) = '\0';
+	*snap_name_len = len;
+	/* Initialize all rbd options to the defaults */
 
-	return snap_name;
+	rbd_opts.read_only = RBD_READ_ONLY_DEFAULT;
 
+	ceph_opts = ceph_parse_options(options, mon_addrs,
+					mon_addrs + mon_addrs_size - 1,
+					parse_rbd_opts_token, &rbd_opts);
+
+	/* Record the parsed rbd options */
+
+	if (!IS_ERR(ceph_opts)) {
+		rbd_dev->mapping.read_only = rbd_opts.read_only;
+	}
+
+	return ceph_opts;
 out_err:
 	kfree(rbd_dev->image_name);
 	rbd_dev->image_name = NULL;
@@ -3114,10 +3130,8 @@ static ssize_t rbd_add(struct bus_type *bus,
 {
 	char *options;
 	struct rbd_device *rbd_dev = NULL;
-	const char *mon_addrs = NULL;
-	size_t mon_addrs_size = 0;
 	char *snap_name;
-	struct rbd_options rbd_opts;
+	size_t snap_name_len = 0;
 	struct ceph_options *ceph_opts;
 	struct ceph_osd_client *osdc;
 	int rc = -ENOMEM;
@@ -3139,32 +3153,16 @@ static ssize_t rbd_add(struct bus_type *bus,
 	init_rwsem(&rbd_dev->header_rwsem);
 
 	/* parse add command */
-	snap_name = rbd_add_parse_args(rbd_dev, buf,
-				&mon_addrs, &mon_addrs_size, options, count);
-	if (IS_ERR(snap_name)) {
-		rc = PTR_ERR(snap_name);
+	ceph_opts = rbd_add_parse_args(rbd_dev, buf, options, count,
+				&snap_name, &snap_name_len);
+	if (IS_ERR(ceph_opts)) {
+		rc = PTR_ERR(ceph_opts);
 		goto err_out_mem;
 	}
 
-	/* Initialize all rbd options to the defaults */
-
-	rbd_opts.read_only = RBD_READ_ONLY_DEFAULT;
-
-	ceph_opts = ceph_parse_options(options, mon_addrs,
-					mon_addrs + mon_addrs_size - 1,
-					parse_rbd_opts_token, &rbd_opts);
-	if (IS_ERR(ceph_opts)) {
-		rc = PTR_ERR(ceph_opts);
-		goto err_out_args;
-	}
-
-	/* Record the parsed rbd options */
-
-	rbd_dev->mapping.read_only = rbd_opts.read_only;
-
 	rc = rbd_get_client(rbd_dev, ceph_opts);
 	if (rc < 0)
-		goto err_out_opts;
+		goto err_out_args;
 	ceph_opts = NULL;	/* ceph_opts now owned by rbd_dev client */
 
 	/* pick the pool */
@@ -3257,10 +3255,9 @@ err_out_client:
 	kfree(rbd_dev->header_name);
 	rbd_put_client(rbd_dev);
 	kfree(rbd_dev->image_id);
-err_out_opts:
+err_out_args:
 	if (ceph_opts)
 		ceph_destroy_options(ceph_opts);
-err_out_args:
 	kfree(rbd_dev->snap_name);
 	kfree(rbd_dev->image_name);
 	kfree(rbd_dev->pool_name);

From e5c35534042f4b5957a32bba651222c91247beba Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 25 Oct 2012 23:34:41 -0500
Subject: [PATCH 28/68] rbd: get rid of snap_name_len

The value returned in the "snap_name_len" argument to
rbd_add_parse_args() is never actually used, so get rid of it.

The snap_name_len recorded in rbd_dev_v2_snap_name() is not
useful either, so get rid of that too.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 68447d83288..7bd23139b94 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2408,7 +2408,6 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
 	int ret;
 	void *p;
 	void *end;
-	size_t snap_name_len;
 	char *snap_name;
 
 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
@@ -2428,9 +2427,7 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
 
 	p = reply_buf;
 	end = (char *) reply_buf + size;
-	snap_name_len = 0;
-	snap_name = ceph_extract_encoded_string(&p, end, &snap_name_len,
-				GFP_KERNEL);
+	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
 	if (IS_ERR(snap_name)) {
 		ret = PTR_ERR(snap_name);
 		goto out;
@@ -2849,8 +2846,7 @@ static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev,
 						const char *buf,
 						char *options,
 						size_t options_size,
-						char **snap_name,
-						size_t *snap_name_len)
+						char **snap_name)
 {
 	size_t len;
 	const char *mon_addrs;
@@ -2898,7 +2894,7 @@ static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev,
 		goto out_err;
 	memcpy(*snap_name, buf, len);
 	*(*snap_name + len) = '\0';
-	*snap_name_len = len;
+
 	/* Initialize all rbd options to the defaults */
 
 	rbd_opts.read_only = RBD_READ_ONLY_DEFAULT;
@@ -3131,7 +3127,6 @@ static ssize_t rbd_add(struct bus_type *bus,
 	char *options;
 	struct rbd_device *rbd_dev = NULL;
 	char *snap_name;
-	size_t snap_name_len = 0;
 	struct ceph_options *ceph_opts;
 	struct ceph_osd_client *osdc;
 	int rc = -ENOMEM;
@@ -3154,7 +3149,7 @@ static ssize_t rbd_add(struct bus_type *bus,
 
 	/* parse add command */
 	ceph_opts = rbd_add_parse_args(rbd_dev, buf, options, count,
-				&snap_name, &snap_name_len);
+				&snap_name);
 	if (IS_ERR(ceph_opts)) {
 		rc = PTR_ERR(ceph_opts);
 		goto err_out_mem;

From f28e565a1b15eef62618db4011d9e320089a4214 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 25 Oct 2012 23:34:41 -0500
Subject: [PATCH 29/68] rbd: remove options args from rbd_add_parse_args()

They "options" argument to rbd_add_parse_args() (and it's partner
options_size) is now only needed within the function, so there's no
need to have the caller allocate and pass the options buffer.  Just
allocate the options buffer within the function using dup_token().

Also distinguish between failures due to failed memory allocation
and failing because a required argument was missing.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 58 ++++++++++++++++++++++-----------------------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 7bd23139b94..62df67a1132 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2844,54 +2844,58 @@ static inline char *dup_token(const char **buf, size_t *lenp)
  */
 static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev,
 						const char *buf,
-						char *options,
-						size_t options_size,
 						char **snap_name)
 {
 	size_t len;
 	const char *mon_addrs;
 	size_t mon_addrs_size;
+	char *options;
+	struct ceph_options *err_ptr = ERR_PTR(-EINVAL);
 	struct rbd_options rbd_opts;
 	struct ceph_options *ceph_opts;
-	struct ceph_options *err_ptr = ERR_PTR(-EINVAL);
 
 	/* The first four tokens are required */
 
 	len = next_token(&buf);
 	if (!len)
-		return err_ptr;
-	mon_addrs_size = len + 1;
+		return err_ptr;	/* Missing monitor address(es) */
 	mon_addrs = buf;
-
+	mon_addrs_size = len + 1;
 	buf += len;
 
-	len = copy_token(&buf, options, options_size);
-	if (!len || len >= options_size)
-		return err_ptr;
+	options = dup_token(&buf, NULL);
+	if (!options)
+		goto out_mem;
+	if (!*options)
+		goto out_err;	/* Missing options */
 
-	err_ptr = ERR_PTR(-ENOMEM);
 	rbd_dev->pool_name = dup_token(&buf, NULL);
 	if (!rbd_dev->pool_name)
-		goto out_err;
+		goto out_mem;
+	if (!*rbd_dev->pool_name)
+		goto out_err;	/* Missing pool name */
 
 	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
 	if (!rbd_dev->image_name)
-		goto out_err;
-
-	/* Snapshot name is optional; default is to use "head" */
+		goto out_mem;
+	if (!*rbd_dev->image_name)
+		goto out_err;	/* Missing image name */
 
+	/*
+	 * Snapshot name is optional; default is to use "-"
+	 * (indicating the head/no snapshot).
+	 */
 	len = next_token(&buf);
-	if (len > RBD_MAX_SNAP_NAME_LEN) {
-		err_ptr = ERR_PTR(-ENAMETOOLONG);
-		goto out_err;
-	}
 	if (!len) {
 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
+	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
+		err_ptr = ERR_PTR(-ENAMETOOLONG);
+		goto out_err;
 	}
 	*snap_name = kmalloc(len + 1, GFP_KERNEL);
 	if (!*snap_name)
-		goto out_err;
+		goto out_mem;
 	memcpy(*snap_name, buf, len);
 	*(*snap_name + len) = '\0';
 
@@ -2902,20 +2906,23 @@ static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev,
 	ceph_opts = ceph_parse_options(options, mon_addrs,
 					mon_addrs + mon_addrs_size - 1,
 					parse_rbd_opts_token, &rbd_opts);
+	kfree(options);
 
 	/* Record the parsed rbd options */
 
-	if (!IS_ERR(ceph_opts)) {
+	if (!IS_ERR(ceph_opts))
 		rbd_dev->mapping.read_only = rbd_opts.read_only;
-	}
 
 	return ceph_opts;
+out_mem:
+	err_ptr = ERR_PTR(-ENOMEM);
 out_err:
 	kfree(rbd_dev->image_name);
 	rbd_dev->image_name = NULL;
 	rbd_dev->image_name_len = 0;
 	kfree(rbd_dev->pool_name);
 	rbd_dev->pool_name = NULL;
+	kfree(options);
 
 	return err_ptr;
 }
@@ -3124,7 +3131,6 @@ static ssize_t rbd_add(struct bus_type *bus,
 		       const char *buf,
 		       size_t count)
 {
-	char *options;
 	struct rbd_device *rbd_dev = NULL;
 	char *snap_name;
 	struct ceph_options *ceph_opts;
@@ -3134,9 +3140,6 @@ static ssize_t rbd_add(struct bus_type *bus,
 	if (!try_module_get(THIS_MODULE))
 		return -ENODEV;
 
-	options = kmalloc(count, GFP_KERNEL);
-	if (!options)
-		goto err_out_mem;
 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
 	if (!rbd_dev)
 		goto err_out_mem;
@@ -3148,8 +3151,7 @@ static ssize_t rbd_add(struct bus_type *bus,
 	init_rwsem(&rbd_dev->header_rwsem);
 
 	/* parse add command */
-	ceph_opts = rbd_add_parse_args(rbd_dev, buf, options, count,
-				&snap_name);
+	ceph_opts = rbd_add_parse_args(rbd_dev, buf, &snap_name);
 	if (IS_ERR(ceph_opts)) {
 		rc = PTR_ERR(ceph_opts);
 		goto err_out_mem;
@@ -3233,7 +3235,6 @@ err_out_bus:
 	/* this will also clean up rest of rbd_dev stuff */
 
 	rbd_bus_del_dev(rbd_dev);
-	kfree(options);
 	return rc;
 
 err_out_disk:
@@ -3258,7 +3259,6 @@ err_out_args:
 	kfree(rbd_dev->pool_name);
 err_out_mem:
 	kfree(rbd_dev);
-	kfree(options);
 
 	dout("Error adding device %s\n", buf);
 	module_put(THIS_MODULE);

From 819d52bf72b61a8455024ff7863eed5d681e73c7 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 25 Oct 2012 23:34:41 -0500
Subject: [PATCH 30/68] rbd: remove snap_name arg from rbd_add_parse_args()

The snapshot name returned by rbd_add_parse_args() just gets saved
in the rbd_dev eventually.  So just do that inside that function and
do away with the snap_name argument, both in rbd_add_parse_args()
and rbd_dev_set_mapping().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 62df67a1132..ae16cf615f0 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -665,23 +665,22 @@ static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
 	return -ENOENT;
 }
 
-static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
+static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
 {
 	int ret;
 
-	if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
+	if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
 		    sizeof (RBD_SNAP_HEAD_NAME))) {
 		rbd_dev->snap_id = CEPH_NOSNAP;
 		rbd_dev->mapping.size = rbd_dev->header.image_size;
 		rbd_dev->mapping.features = rbd_dev->header.features;
 		ret = 0;
 	} else {
-		ret = snap_by_name(rbd_dev, snap_name);
+		ret = snap_by_name(rbd_dev, rbd_dev->snap_name);
 		if (ret < 0)
 			goto done;
 		rbd_dev->mapping.read_only = true;
 	}
-	rbd_dev->snap_name = snap_name;
 	rbd_dev->exists = true;
 done:
 	return ret;
@@ -2843,8 +2842,7 @@ static inline char *dup_token(const char **buf, size_t *lenp)
  * Note: rbd_dev is assumed to have been initially zero-filled.
  */
 static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev,
-						const char *buf,
-						char **snap_name)
+						const char *buf)
 {
 	size_t len;
 	const char *mon_addrs;
@@ -2893,11 +2891,11 @@ static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev,
 		err_ptr = ERR_PTR(-ENAMETOOLONG);
 		goto out_err;
 	}
-	*snap_name = kmalloc(len + 1, GFP_KERNEL);
-	if (!*snap_name)
+	rbd_dev->snap_name = kmalloc(len + 1, GFP_KERNEL);
+	if (!rbd_dev->snap_name)
 		goto out_mem;
-	memcpy(*snap_name, buf, len);
-	*(*snap_name + len) = '\0';
+	memcpy(rbd_dev->snap_name, buf, len);
+	*(rbd_dev->snap_name + len) = '\0';
 
 	/* Initialize all rbd options to the defaults */
 
@@ -3132,7 +3130,6 @@ static ssize_t rbd_add(struct bus_type *bus,
 		       size_t count)
 {
 	struct rbd_device *rbd_dev = NULL;
-	char *snap_name;
 	struct ceph_options *ceph_opts;
 	struct ceph_osd_client *osdc;
 	int rc = -ENOMEM;
@@ -3151,7 +3148,7 @@ static ssize_t rbd_add(struct bus_type *bus,
 	init_rwsem(&rbd_dev->header_rwsem);
 
 	/* parse add command */
-	ceph_opts = rbd_add_parse_args(rbd_dev, buf, &snap_name);
+	ceph_opts = rbd_add_parse_args(rbd_dev, buf);
 	if (IS_ERR(ceph_opts)) {
 		rc = PTR_ERR(ceph_opts);
 		goto err_out_mem;
@@ -3178,7 +3175,7 @@ static ssize_t rbd_add(struct bus_type *bus,
 	if (rc)
 		goto err_out_probe;
 
-	rc = rbd_dev_set_mapping(rbd_dev, snap_name);
+	rc = rbd_dev_set_mapping(rbd_dev);
 	if (rc)
 		goto err_out_snaps;
 

From 4e9afeba7aa9ca925a79d9ce82f1a8e79e14c291 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 25 Oct 2012 23:34:41 -0500
Subject: [PATCH 31/68] rbd: pass and populate rbd_options structure

Have the caller pass the address of an rbd_options structure to
rbd_add_parse_args(), to be initialized with the information
gleaned as a result of the parse.

I know, this is another near-reversal of a recent change...

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index ae16cf615f0..338cfadad63 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2842,15 +2842,16 @@ static inline char *dup_token(const char **buf, size_t *lenp)
  * Note: rbd_dev is assumed to have been initially zero-filled.
  */
 static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev,
-						const char *buf)
+						const char *buf,
+						struct rbd_options **opts)
 {
 	size_t len;
 	const char *mon_addrs;
 	size_t mon_addrs_size;
 	char *options;
 	struct ceph_options *err_ptr = ERR_PTR(-EINVAL);
-	struct rbd_options rbd_opts;
 	struct ceph_options *ceph_opts;
+	struct rbd_options *rbd_opts = NULL;
 
 	/* The first four tokens are required */
 
@@ -2899,17 +2900,17 @@ static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev,
 
 	/* Initialize all rbd options to the defaults */
 
-	rbd_opts.read_only = RBD_READ_ONLY_DEFAULT;
+	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
+	if (!rbd_opts)
+		goto out_mem;
+
+	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
 
 	ceph_opts = ceph_parse_options(options, mon_addrs,
 					mon_addrs + mon_addrs_size - 1,
-					parse_rbd_opts_token, &rbd_opts);
+					parse_rbd_opts_token, rbd_opts);
 	kfree(options);
-
-	/* Record the parsed rbd options */
-
-	if (!IS_ERR(ceph_opts))
-		rbd_dev->mapping.read_only = rbd_opts.read_only;
+	*opts = rbd_opts;
 
 	return ceph_opts;
 out_mem:
@@ -3131,6 +3132,7 @@ static ssize_t rbd_add(struct bus_type *bus,
 {
 	struct rbd_device *rbd_dev = NULL;
 	struct ceph_options *ceph_opts;
+	struct rbd_options *rbd_opts = NULL;
 	struct ceph_osd_client *osdc;
 	int rc = -ENOMEM;
 
@@ -3139,7 +3141,7 @@ static ssize_t rbd_add(struct bus_type *bus,
 
 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
 	if (!rbd_dev)
-		goto err_out_mem;
+		return -ENOMEM;
 
 	/* static rbd_device initialization */
 	spin_lock_init(&rbd_dev->lock);
@@ -3148,11 +3150,12 @@ static ssize_t rbd_add(struct bus_type *bus,
 	init_rwsem(&rbd_dev->header_rwsem);
 
 	/* parse add command */
-	ceph_opts = rbd_add_parse_args(rbd_dev, buf);
+	ceph_opts = rbd_add_parse_args(rbd_dev, buf, &rbd_opts);
 	if (IS_ERR(ceph_opts)) {
 		rc = PTR_ERR(ceph_opts);
 		goto err_out_mem;
 	}
+	rbd_dev->mapping.read_only = rbd_opts->read_only;
 
 	rc = rbd_get_client(rbd_dev, ceph_opts);
 	if (rc < 0)
@@ -3219,6 +3222,8 @@ static ssize_t rbd_add(struct bus_type *bus,
 	if (rc)
 		goto err_out_bus;
 
+	kfree(rbd_opts);
+
 	/* Everything's ready.  Announce the disk to the world. */
 
 	add_disk(rbd_dev->disk);
@@ -3232,6 +3237,8 @@ err_out_bus:
 	/* this will also clean up rest of rbd_dev stuff */
 
 	rbd_bus_del_dev(rbd_dev);
+	kfree(rbd_opts);
+
 	return rc;
 
 err_out_disk:
@@ -3254,6 +3261,7 @@ err_out_args:
 	kfree(rbd_dev->snap_name);
 	kfree(rbd_dev->image_name);
 	kfree(rbd_dev->pool_name);
+	kfree(rbd_opts);
 err_out_mem:
 	kfree(rbd_dev);
 

From dc79b113d6db48ee6ee7decf0216feee48757f01 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 25 Oct 2012 23:34:41 -0500
Subject: [PATCH 32/68] rbd: have rbd_add_parse_args() return error

Change the interface to rbd_add_parse_args() so it returns an
error code rather than a pointer.  Return the ceph_options result
via a pointer whose address is passed as an argument.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 37 ++++++++++++++++++++-----------------
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 338cfadad63..ab6e6d8364e 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2841,30 +2841,31 @@ static inline char *dup_token(const char **buf, size_t *lenp)
  *
  * Note: rbd_dev is assumed to have been initially zero-filled.
  */
-static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev,
-						const char *buf,
-						struct rbd_options **opts)
+static int rbd_add_parse_args(struct rbd_device *rbd_dev,
+				const char *buf,
+				struct ceph_options **ceph_opts,
+				struct rbd_options **opts)
 {
 	size_t len;
 	const char *mon_addrs;
 	size_t mon_addrs_size;
 	char *options;
-	struct ceph_options *err_ptr = ERR_PTR(-EINVAL);
-	struct ceph_options *ceph_opts;
 	struct rbd_options *rbd_opts = NULL;
+	int ret;
 
 	/* The first four tokens are required */
 
 	len = next_token(&buf);
 	if (!len)
-		return err_ptr;	/* Missing monitor address(es) */
+		return -EINVAL;	/* Missing monitor address(es) */
 	mon_addrs = buf;
 	mon_addrs_size = len + 1;
 	buf += len;
 
+	ret = -EINVAL;
 	options = dup_token(&buf, NULL);
 	if (!options)
-		goto out_mem;
+		return -ENOMEM;
 	if (!*options)
 		goto out_err;	/* Missing options */
 
@@ -2889,7 +2890,7 @@ static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev,
 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
-		err_ptr = ERR_PTR(-ENAMETOOLONG);
+		ret = -ENAMETOOLONG;
 		goto out_err;
 	}
 	rbd_dev->snap_name = kmalloc(len + 1, GFP_KERNEL);
@@ -2906,15 +2907,19 @@ static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev,
 
 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
 
-	ceph_opts = ceph_parse_options(options, mon_addrs,
+	*ceph_opts = ceph_parse_options(options, mon_addrs,
 					mon_addrs + mon_addrs_size - 1,
 					parse_rbd_opts_token, rbd_opts);
 	kfree(options);
+	if (IS_ERR(*ceph_opts)) {
+		ret = PTR_ERR(*ceph_opts);
+		goto out_err;
+	}
 	*opts = rbd_opts;
 
-	return ceph_opts;
+	return 0;
 out_mem:
-	err_ptr = ERR_PTR(-ENOMEM);
+	ret = -ENOMEM;
 out_err:
 	kfree(rbd_dev->image_name);
 	rbd_dev->image_name = NULL;
@@ -2923,7 +2928,7 @@ out_err:
 	rbd_dev->pool_name = NULL;
 	kfree(options);
 
-	return err_ptr;
+	return ret;
 }
 
 /*
@@ -3131,7 +3136,7 @@ static ssize_t rbd_add(struct bus_type *bus,
 		       size_t count)
 {
 	struct rbd_device *rbd_dev = NULL;
-	struct ceph_options *ceph_opts;
+	struct ceph_options *ceph_opts = NULL;
 	struct rbd_options *rbd_opts = NULL;
 	struct ceph_osd_client *osdc;
 	int rc = -ENOMEM;
@@ -3150,11 +3155,9 @@ static ssize_t rbd_add(struct bus_type *bus,
 	init_rwsem(&rbd_dev->header_rwsem);
 
 	/* parse add command */
-	ceph_opts = rbd_add_parse_args(rbd_dev, buf, &rbd_opts);
-	if (IS_ERR(ceph_opts)) {
-		rc = PTR_ERR(ceph_opts);
+	rc = rbd_add_parse_args(rbd_dev, buf, &ceph_opts, &rbd_opts);
+	if (rc < 0)
 		goto err_out_mem;
-	}
 	rbd_dev->mapping.read_only = rbd_opts->read_only;
 
 	rc = rbd_get_client(rbd_dev, ceph_opts);

From 0d7dbfce9d6e3a57a6946fadf7f92b1792b8acc0 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 25 Oct 2012 23:34:41 -0500
Subject: [PATCH 33/68] rbd: define image specification structure

Group the fields that uniquely specify an rbd image into a new
reference-counted rbd_spec structure.  This structure will be used
to describe the desired image when mapping an image, and when
probing parent images in layered rbd devices.  Replace the set of
fields in the rbd device structure with a pointer to a dynamically
allocated rbd_spec.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 158 +++++++++++++++++++++++++-------------------
 1 file changed, 90 insertions(+), 68 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index ab6e6d8364e..2049810fcdc 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -112,6 +112,27 @@ struct rbd_image_header {
 	u64 obj_version;
 };
 
+/*
+ * An rbd image specification.
+ *
+ * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
+ * identify an image.
+ */
+struct rbd_spec {
+	u64		pool_id;
+	char		*pool_name;
+
+	char		*image_id;
+	size_t		image_id_len;
+	char		*image_name;
+	size_t		image_name_len;
+
+	u64		snap_id;
+	char		*snap_name;
+
+	struct kref	kref;
+};
+
 struct rbd_options {
 	bool	read_only;
 };
@@ -189,16 +210,9 @@ struct rbd_device {
 
 	struct rbd_image_header	header;
 	bool                    exists;
-	char			*image_id;
-	size_t			image_id_len;
-	char			*image_name;
-	size_t			image_name_len;
-	char			*header_name;
-	char			*pool_name;
-	u64			pool_id;
+	struct rbd_spec		*spec;
 
-	char                    *snap_name;
-	u64                     snap_id;
+	char			*header_name;
 
 	struct ceph_osd_event   *watch_event;
 	struct ceph_osd_request *watch_request;
@@ -654,7 +668,7 @@ static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
 
 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
 		if (!strcmp(snap_name, snap->name)) {
-			rbd_dev->snap_id = snap->id;
+			rbd_dev->spec->snap_id = snap->id;
 			rbd_dev->mapping.size = snap->size;
 			rbd_dev->mapping.features = snap->features;
 
@@ -669,14 +683,14 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
 {
 	int ret;
 
-	if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
+	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
 		    sizeof (RBD_SNAP_HEAD_NAME))) {
-		rbd_dev->snap_id = CEPH_NOSNAP;
+		rbd_dev->spec->snap_id = CEPH_NOSNAP;
 		rbd_dev->mapping.size = rbd_dev->header.image_size;
 		rbd_dev->mapping.features = rbd_dev->header.features;
 		ret = 0;
 	} else {
-		ret = snap_by_name(rbd_dev, rbd_dev->snap_name);
+		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
 		if (ret < 0)
 			goto done;
 		rbd_dev->mapping.read_only = true;
@@ -1096,7 +1110,7 @@ static int rbd_do_request(struct request *rq,
 	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
 	layout->fl_stripe_count = cpu_to_le32(1);
 	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
-	layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->pool_id);
+	layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
 	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
 				   req, ops);
 	rbd_assert(ret == 0);
@@ -1261,7 +1275,7 @@ static int rbd_do_op(struct request *rq,
 		opcode = CEPH_OSD_OP_READ;
 		flags = CEPH_OSD_FLAG_READ;
 		snapc = NULL;
-		snapid = rbd_dev->snap_id;
+		snapid = rbd_dev->spec->snap_id;
 		payload_len = 0;
 	}
 
@@ -1545,7 +1559,7 @@ static void rbd_rq_fn(struct request_queue *q)
 		down_read(&rbd_dev->header_rwsem);
 
 		if (!rbd_dev->exists) {
-			rbd_assert(rbd_dev->snap_id != CEPH_NOSNAP);
+			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
 			up_read(&rbd_dev->header_rwsem);
 			dout("request for non-existent snapshot");
 			spin_lock_irq(q->queue_lock);
@@ -1726,13 +1740,13 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
 			ret = -ENXIO;
 			pr_warning("short header read for image %s"
 					" (want %zd got %d)\n",
-				rbd_dev->image_name, size, ret);
+				rbd_dev->spec->image_name, size, ret);
 			goto out_err;
 		}
 		if (!rbd_dev_ondisk_valid(ondisk)) {
 			ret = -ENXIO;
 			pr_warning("invalid header for image %s\n",
-				rbd_dev->image_name);
+				rbd_dev->spec->image_name);
 			goto out_err;
 		}
 
@@ -1783,7 +1797,7 @@ static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
 {
 	sector_t size;
 
-	if (rbd_dev->snap_id != CEPH_NOSNAP)
+	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
 		return;
 
 	size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
@@ -1957,7 +1971,7 @@ static ssize_t rbd_pool_show(struct device *dev,
 {
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
-	return sprintf(buf, "%s\n", rbd_dev->pool_name);
+	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
 }
 
 static ssize_t rbd_pool_id_show(struct device *dev,
@@ -1965,7 +1979,8 @@ static ssize_t rbd_pool_id_show(struct device *dev,
 {
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
-	return sprintf(buf, "%llu\n", (unsigned long long) rbd_dev->pool_id);
+	return sprintf(buf, "%llu\n",
+		(unsigned long long) rbd_dev->spec->pool_id);
 }
 
 static ssize_t rbd_name_show(struct device *dev,
@@ -1973,7 +1988,7 @@ static ssize_t rbd_name_show(struct device *dev,
 {
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
-	return sprintf(buf, "%s\n", rbd_dev->image_name);
+	return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
 }
 
 static ssize_t rbd_image_id_show(struct device *dev,
@@ -1981,7 +1996,7 @@ static ssize_t rbd_image_id_show(struct device *dev,
 {
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
-	return sprintf(buf, "%s\n", rbd_dev->image_id);
+	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
 }
 
 /*
@@ -1994,7 +2009,7 @@ static ssize_t rbd_snap_show(struct device *dev,
 {
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
-	return sprintf(buf, "%s\n", rbd_dev->snap_name);
+	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
 }
 
 static ssize_t rbd_image_refresh(struct device *dev,
@@ -2547,11 +2562,12 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
 
 			/* Existing snapshot not in the new snap context */
 
-			if (rbd_dev->snap_id == snap->id)
+			if (rbd_dev->spec->snap_id == snap->id)
 				rbd_dev->exists = false;
 			rbd_remove_snap_dev(snap);
 			dout("%ssnap id %llu has been removed\n",
-				rbd_dev->snap_id == snap->id ?  "mapped " : "",
+				rbd_dev->spec->snap_id == snap->id ?
+							"mapped " : "",
 				(unsigned long long) snap->id);
 
 			/* Done with this list entry; advance */
@@ -2869,16 +2885,17 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev,
 	if (!*options)
 		goto out_err;	/* Missing options */
 
-	rbd_dev->pool_name = dup_token(&buf, NULL);
-	if (!rbd_dev->pool_name)
+	rbd_dev->spec->pool_name = dup_token(&buf, NULL);
+	if (!rbd_dev->spec->pool_name)
 		goto out_mem;
-	if (!*rbd_dev->pool_name)
+	if (!*rbd_dev->spec->pool_name)
 		goto out_err;	/* Missing pool name */
 
-	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
-	if (!rbd_dev->image_name)
+	rbd_dev->spec->image_name =
+		dup_token(&buf, &rbd_dev->spec->image_name_len);
+	if (!rbd_dev->spec->image_name)
 		goto out_mem;
-	if (!*rbd_dev->image_name)
+	if (!*rbd_dev->spec->image_name)
 		goto out_err;	/* Missing image name */
 
 	/*
@@ -2893,11 +2910,11 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev,
 		ret = -ENAMETOOLONG;
 		goto out_err;
 	}
-	rbd_dev->snap_name = kmalloc(len + 1, GFP_KERNEL);
-	if (!rbd_dev->snap_name)
+	rbd_dev->spec->snap_name = kmalloc(len + 1, GFP_KERNEL);
+	if (!rbd_dev->spec->snap_name)
 		goto out_mem;
-	memcpy(rbd_dev->snap_name, buf, len);
-	*(rbd_dev->snap_name + len) = '\0';
+	memcpy(rbd_dev->spec->snap_name, buf, len);
+	*(rbd_dev->spec->snap_name + len) = '\0';
 
 	/* Initialize all rbd options to the defaults */
 
@@ -2921,11 +2938,11 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev,
 out_mem:
 	ret = -ENOMEM;
 out_err:
-	kfree(rbd_dev->image_name);
-	rbd_dev->image_name = NULL;
-	rbd_dev->image_name_len = 0;
-	kfree(rbd_dev->pool_name);
-	rbd_dev->pool_name = NULL;
+	kfree(rbd_dev->spec->image_name);
+	rbd_dev->spec->image_name = NULL;
+	rbd_dev->spec->image_name_len = 0;
+	kfree(rbd_dev->spec->pool_name);
+	rbd_dev->spec->pool_name = NULL;
 	kfree(options);
 
 	return ret;
@@ -2957,11 +2974,11 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
 	 * First, see if the format 2 image id file exists, and if
 	 * so, get the image's persistent id from it.
 	 */
-	size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
+	size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len;
 	object_name = kmalloc(size, GFP_NOIO);
 	if (!object_name)
 		return -ENOMEM;
-	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
+	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
 	dout("rbd id object name is %s\n", object_name);
 
 	/* Response will be an encoded string, which includes a length */
@@ -2984,15 +3001,15 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
 	ret = 0;    /* rbd_req_sync_exec() can return positive */
 
 	p = response;
-	rbd_dev->image_id = ceph_extract_encoded_string(&p,
+	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
 						p + RBD_IMAGE_ID_LEN_MAX,
-						&rbd_dev->image_id_len,
+						&rbd_dev->spec->image_id_len,
 						GFP_NOIO);
-	if (IS_ERR(rbd_dev->image_id)) {
-		ret = PTR_ERR(rbd_dev->image_id);
-		rbd_dev->image_id = NULL;
+	if (IS_ERR(rbd_dev->spec->image_id)) {
+		ret = PTR_ERR(rbd_dev->spec->image_id);
+		rbd_dev->spec->image_id = NULL;
 	} else {
-		dout("image_id is %s\n", rbd_dev->image_id);
+		dout("image_id is %s\n", rbd_dev->spec->image_id);
 	}
 out:
 	kfree(response);
@@ -3008,20 +3025,21 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
 
 	/* Version 1 images have no id; empty string is used */
 
-	rbd_dev->image_id = kstrdup("", GFP_KERNEL);
-	if (!rbd_dev->image_id)
+	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
+	if (!rbd_dev->spec->image_id)
 		return -ENOMEM;
-	rbd_dev->image_id_len = 0;
+	rbd_dev->spec->image_id_len = 0;
 
 	/* Record the header object name for this rbd image. */
 
-	size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX);
+	size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX);
 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
 	if (!rbd_dev->header_name) {
 		ret = -ENOMEM;
 		goto out_err;
 	}
-	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
+	sprintf(rbd_dev->header_name, "%s%s",
+		rbd_dev->spec->image_name, RBD_SUFFIX);
 
 	/* Populate rbd image metadata */
 
@@ -3038,8 +3056,8 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
 out_err:
 	kfree(rbd_dev->header_name);
 	rbd_dev->header_name = NULL;
-	kfree(rbd_dev->image_id);
-	rbd_dev->image_id = NULL;
+	kfree(rbd_dev->spec->image_id);
+	rbd_dev->spec->image_id = NULL;
 
 	return ret;
 }
@@ -3054,12 +3072,12 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
 	 * Image id was filled in by the caller.  Record the header
 	 * object name for this rbd image.
 	 */
-	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len;
+	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len;
 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
 	if (!rbd_dev->header_name)
 		return -ENOMEM;
 	sprintf(rbd_dev->header_name, "%s%s",
-			RBD_HEADER_PREFIX, rbd_dev->image_id);
+			RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
 
 	/* Get the size and object order for the image */
 
@@ -3147,6 +3165,9 @@ static ssize_t rbd_add(struct bus_type *bus,
 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
 	if (!rbd_dev)
 		return -ENOMEM;
+	rbd_dev->spec = kzalloc(sizeof (*rbd_dev->spec), GFP_KERNEL);
+	if (!rbd_dev->spec)
+		goto err_out_mem;
 
 	/* static rbd_device initialization */
 	spin_lock_init(&rbd_dev->lock);
@@ -3167,10 +3188,10 @@ static ssize_t rbd_add(struct bus_type *bus,
 
 	/* pick the pool */
 	osdc = &rbd_dev->rbd_client->client->osdc;
-	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
+	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->spec->pool_name);
 	if (rc < 0)
 		goto err_out_client;
-	rbd_dev->pool_id = (u64) rc;
+	rbd_dev->spec->pool_id = (u64) rc;
 
 	rc = rbd_dev_probe(rbd_dev);
 	if (rc < 0)
@@ -3257,15 +3278,16 @@ err_out_probe:
 err_out_client:
 	kfree(rbd_dev->header_name);
 	rbd_put_client(rbd_dev);
-	kfree(rbd_dev->image_id);
+	kfree(rbd_dev->spec->image_id);
 err_out_args:
 	if (ceph_opts)
 		ceph_destroy_options(ceph_opts);
-	kfree(rbd_dev->snap_name);
-	kfree(rbd_dev->image_name);
-	kfree(rbd_dev->pool_name);
+	kfree(rbd_dev->spec->snap_name);
+	kfree(rbd_dev->spec->image_name);
+	kfree(rbd_dev->spec->pool_name);
 	kfree(rbd_opts);
 err_out_mem:
+	kfree(rbd_dev->spec);
 	kfree(rbd_dev);
 
 	dout("Error adding device %s\n", buf);
@@ -3314,11 +3336,11 @@ static void rbd_dev_release(struct device *dev)
 	rbd_header_free(&rbd_dev->header);
 
 	/* done with the id, and with the rbd_dev */
-	kfree(rbd_dev->snap_name);
-	kfree(rbd_dev->image_id);
+	kfree(rbd_dev->spec->snap_name);
+	kfree(rbd_dev->spec->image_id);
 	kfree(rbd_dev->header_name);
-	kfree(rbd_dev->pool_name);
-	kfree(rbd_dev->image_name);
+	kfree(rbd_dev->spec->pool_name);
+	kfree(rbd_dev->spec->image_name);
 	rbd_dev_id_put(rbd_dev);
 	kfree(rbd_dev);
 

From 8b8fb99c5c93a0bdfe7b0c0c9fd2d41a3244555e Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 26 Oct 2012 17:25:24 -0500
Subject: [PATCH 34/68] rbd: add reference counting to rbd_spec

With layered images we'll share rbd_spec structures, so add a
reference count to it.  It neatens up some code also.

A silly get/put pair is added to the alloc routine just to avoid
"defined but not used" warnings.  It will go away soon.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 52 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 42 insertions(+), 10 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 2049810fcdc..86206a75017 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2134,6 +2134,45 @@ static struct device_type rbd_snap_device_type = {
 	.release	= rbd_snap_dev_release,
 };
 
+static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
+{
+	kref_get(&spec->kref);
+
+	return spec;
+}
+
+static void rbd_spec_free(struct kref *kref);
+static void rbd_spec_put(struct rbd_spec *spec)
+{
+	if (spec)
+		kref_put(&spec->kref, rbd_spec_free);
+}
+
+static struct rbd_spec *rbd_spec_alloc(void)
+{
+	struct rbd_spec *spec;
+
+	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
+	if (!spec)
+		return NULL;
+	kref_init(&spec->kref);
+
+	rbd_spec_put(rbd_spec_get(spec));	/* TEMPORARY */
+
+	return spec;
+}
+
+static void rbd_spec_free(struct kref *kref)
+{
+	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
+
+	kfree(spec->pool_name);
+	kfree(spec->image_id);
+	kfree(spec->image_name);
+	kfree(spec->snap_name);
+	kfree(spec);
+}
+
 static bool rbd_snap_registered(struct rbd_snap *snap)
 {
 	bool ret = snap->dev.type == &rbd_snap_device_type;
@@ -3165,7 +3204,7 @@ static ssize_t rbd_add(struct bus_type *bus,
 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
 	if (!rbd_dev)
 		return -ENOMEM;
-	rbd_dev->spec = kzalloc(sizeof (*rbd_dev->spec), GFP_KERNEL);
+	rbd_dev->spec = rbd_spec_alloc();
 	if (!rbd_dev->spec)
 		goto err_out_mem;
 
@@ -3278,16 +3317,12 @@ err_out_probe:
 err_out_client:
 	kfree(rbd_dev->header_name);
 	rbd_put_client(rbd_dev);
-	kfree(rbd_dev->spec->image_id);
 err_out_args:
 	if (ceph_opts)
 		ceph_destroy_options(ceph_opts);
-	kfree(rbd_dev->spec->snap_name);
-	kfree(rbd_dev->spec->image_name);
-	kfree(rbd_dev->spec->pool_name);
 	kfree(rbd_opts);
 err_out_mem:
-	kfree(rbd_dev->spec);
+	rbd_spec_put(rbd_dev->spec);
 	kfree(rbd_dev);
 
 	dout("Error adding device %s\n", buf);
@@ -3336,12 +3371,9 @@ static void rbd_dev_release(struct device *dev)
 	rbd_header_free(&rbd_dev->header);
 
 	/* done with the id, and with the rbd_dev */
-	kfree(rbd_dev->spec->snap_name);
-	kfree(rbd_dev->spec->image_id);
 	kfree(rbd_dev->header_name);
-	kfree(rbd_dev->spec->pool_name);
-	kfree(rbd_dev->spec->image_name);
 	rbd_dev_id_put(rbd_dev);
+	rbd_spec_put(rbd_dev->spec);
 	kfree(rbd_dev);
 
 	/* release module ref */

From 859c31df9cee9d1e1308b3b024b61355e6a629a5 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 25 Oct 2012 23:34:42 -0500
Subject: [PATCH 35/68] rbd: fill rbd_spec in rbd_add_parse_args()

Pass the address of an rbd_spec structure to rbd_add_parse_args().
Use it to hold the information defining the rbd image to be mapped
in an rbd_add() call.

Use the result in the caller to initialize the rbd_dev->id field.

This means rbd_dev is no longer needed in rbd_add_parse_args(),
so get rid of it.

Now that this transformation of rbd_add_parse_args() is complete,
correct and expand on the its header documentation to reflect the
new reality.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 113 +++++++++++++++++++++++++++++---------------
 1 file changed, 75 insertions(+), 38 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 86206a75017..be85d925dfd 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2887,25 +2887,58 @@ static inline char *dup_token(const char **buf, size_t *lenp)
 }
 
 /*
- * This fills in the pool_name, image_name, image_name_len, rbd_dev,
- * rbd_md_name, and name fields of the given rbd_dev, based on the
- * list of monitor addresses and other options provided via
- * /sys/bus/rbd/add.  Returns a pointer to a dynamically-allocated
- * copy of the snapshot name to map if successful, or a
- * pointer-coded error otherwise.
+ * Parse the options provided for an "rbd add" (i.e., rbd image
+ * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
+ * and the data written is passed here via a NUL-terminated buffer.
+ * Returns 0 if successful or an error code otherwise.
  *
- * Note: rbd_dev is assumed to have been initially zero-filled.
+ * The information extracted from these options is recorded in
+ * the other parameters which return dynamically-allocated
+ * structures:
+ *  ceph_opts
+ *      The address of a pointer that will refer to a ceph options
+ *      structure.  Caller must release the returned pointer using
+ *      ceph_destroy_options() when it is no longer needed.
+ *  rbd_opts
+ *	Address of an rbd options pointer.  Fully initialized by
+ *	this function; caller must release with kfree().
+ *  spec
+ *	Address of an rbd image specification pointer.  Fully
+ *	initialized by this function based on parsed options.
+ *	Caller must release with rbd_spec_put().
+ *
+ * The options passed take this form:
+ *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
+ * where:
+ *  <mon_addrs>
+ *      A comma-separated list of one or more monitor addresses.
+ *      A monitor address is an ip address, optionally followed
+ *      by a port number (separated by a colon).
+ *        I.e.:  ip1[:port1][,ip2[:port2]...]
+ *  <options>
+ *      A comma-separated list of ceph and/or rbd options.
+ *  <pool_name>
+ *      The name of the rados pool containing the rbd image.
+ *  <image_name>
+ *      The name of the image in that pool to map.
+ *  <snap_id>
+ *      An optional snapshot id.  If provided, the mapping will
+ *      present data from the image at the time that snapshot was
+ *      created.  The image head is used if no snapshot id is
+ *      provided.  Snapshot mappings are always read-only.
  */
-static int rbd_add_parse_args(struct rbd_device *rbd_dev,
-				const char *buf,
+static int rbd_add_parse_args(const char *buf,
 				struct ceph_options **ceph_opts,
-				struct rbd_options **opts)
+				struct rbd_options **opts,
+				struct rbd_spec **rbd_spec)
 {
 	size_t len;
+	char *options;
 	const char *mon_addrs;
 	size_t mon_addrs_size;
-	char *options;
+	struct rbd_spec *spec = NULL;
 	struct rbd_options *rbd_opts = NULL;
+	struct ceph_options *copts;
 	int ret;
 
 	/* The first four tokens are required */
@@ -2924,17 +2957,20 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev,
 	if (!*options)
 		goto out_err;	/* Missing options */
 
-	rbd_dev->spec->pool_name = dup_token(&buf, NULL);
-	if (!rbd_dev->spec->pool_name)
+	spec = rbd_spec_alloc();
+	if (!spec)
 		goto out_mem;
-	if (!*rbd_dev->spec->pool_name)
+
+	spec->pool_name = dup_token(&buf, NULL);
+	if (!spec->pool_name)
+		goto out_mem;
+	if (!*spec->pool_name)
 		goto out_err;	/* Missing pool name */
 
-	rbd_dev->spec->image_name =
-		dup_token(&buf, &rbd_dev->spec->image_name_len);
-	if (!rbd_dev->spec->image_name)
+	spec->image_name = dup_token(&buf, &spec->image_name_len);
+	if (!spec->image_name)
 		goto out_mem;
-	if (!*rbd_dev->spec->image_name)
+	if (!*spec->image_name)
 		goto out_err;	/* Missing image name */
 
 	/*
@@ -2949,11 +2985,11 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev,
 		ret = -ENAMETOOLONG;
 		goto out_err;
 	}
-	rbd_dev->spec->snap_name = kmalloc(len + 1, GFP_KERNEL);
-	if (!rbd_dev->spec->snap_name)
+	spec->snap_name = kmalloc(len + 1, GFP_KERNEL);
+	if (!spec->snap_name)
 		goto out_mem;
-	memcpy(rbd_dev->spec->snap_name, buf, len);
-	*(rbd_dev->spec->snap_name + len) = '\0';
+	memcpy(spec->snap_name, buf, len);
+	*(spec->snap_name + len) = '\0';
 
 	/* Initialize all rbd options to the defaults */
 
@@ -2963,25 +2999,25 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev,
 
 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
 
-	*ceph_opts = ceph_parse_options(options, mon_addrs,
+	copts = ceph_parse_options(options, mon_addrs,
 					mon_addrs + mon_addrs_size - 1,
 					parse_rbd_opts_token, rbd_opts);
-	kfree(options);
-	if (IS_ERR(*ceph_opts)) {
-		ret = PTR_ERR(*ceph_opts);
+	if (IS_ERR(copts)) {
+		ret = PTR_ERR(copts);
 		goto out_err;
 	}
+	kfree(options);
+
+	*ceph_opts = copts;
 	*opts = rbd_opts;
+	*rbd_spec = spec;
 
 	return 0;
 out_mem:
 	ret = -ENOMEM;
 out_err:
-	kfree(rbd_dev->spec->image_name);
-	rbd_dev->spec->image_name = NULL;
-	rbd_dev->spec->image_name_len = 0;
-	kfree(rbd_dev->spec->pool_name);
-	rbd_dev->spec->pool_name = NULL;
+	kfree(rbd_opts);
+	rbd_spec_put(spec);
 	kfree(options);
 
 	return ret;
@@ -3195,6 +3231,7 @@ static ssize_t rbd_add(struct bus_type *bus,
 	struct rbd_device *rbd_dev = NULL;
 	struct ceph_options *ceph_opts = NULL;
 	struct rbd_options *rbd_opts = NULL;
+	struct rbd_spec *spec = NULL;
 	struct ceph_osd_client *osdc;
 	int rc = -ENOMEM;
 
@@ -3204,9 +3241,6 @@ static ssize_t rbd_add(struct bus_type *bus,
 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
 	if (!rbd_dev)
 		return -ENOMEM;
-	rbd_dev->spec = rbd_spec_alloc();
-	if (!rbd_dev->spec)
-		goto err_out_mem;
 
 	/* static rbd_device initialization */
 	spin_lock_init(&rbd_dev->lock);
@@ -3215,9 +3249,10 @@ static ssize_t rbd_add(struct bus_type *bus,
 	init_rwsem(&rbd_dev->header_rwsem);
 
 	/* parse add command */
-	rc = rbd_add_parse_args(rbd_dev, buf, &ceph_opts, &rbd_opts);
+	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
 	if (rc < 0)
 		goto err_out_mem;
+
 	rbd_dev->mapping.read_only = rbd_opts->read_only;
 
 	rc = rbd_get_client(rbd_dev, ceph_opts);
@@ -3227,10 +3262,12 @@ static ssize_t rbd_add(struct bus_type *bus,
 
 	/* pick the pool */
 	osdc = &rbd_dev->rbd_client->client->osdc;
-	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->spec->pool_name);
+	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
 	if (rc < 0)
 		goto err_out_client;
-	rbd_dev->spec->pool_id = (u64) rc;
+	spec->pool_id = (u64) rc;
+
+	rbd_dev->spec = spec;
 
 	rc = rbd_dev_probe(rbd_dev);
 	if (rc < 0)
@@ -3321,8 +3358,8 @@ err_out_args:
 	if (ceph_opts)
 		ceph_destroy_options(ceph_opts);
 	kfree(rbd_opts);
+	rbd_spec_put(spec);
 err_out_mem:
-	rbd_spec_put(rbd_dev->spec);
 	kfree(rbd_dev);
 
 	dout("Error adding device %s\n", buf);

From 9d3997fdf4c82adfb37a4886a21eaa513ee071b6 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 25 Oct 2012 23:34:42 -0500
Subject: [PATCH 36/68] rbd: don't pass rbd_dev to rbd_get_client()

The only reason rbd_dev is passed to rbd_get_client() is so its
rbd_client field can get assigned.  Instead, just return the
rbd_client pointer as a result and have the caller do the
assignment.

Change rbd_put_client() so it takes an rbd_client structure,
so follows the more typical symmetry with rbd_get_client().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index be85d925dfd..a528d4ca7a6 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -467,23 +467,17 @@ static int parse_rbd_opts_token(char *c, void *private)
  * Get a ceph client with specific addr and configuration, if one does
  * not exist create it.
  */
-static int rbd_get_client(struct rbd_device *rbd_dev,
-				struct ceph_options *ceph_opts)
+static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
 {
 	struct rbd_client *rbdc;
 
 	rbdc = rbd_client_find(ceph_opts);
-	if (rbdc) {
-		/* using an existing client */
+	if (rbdc)	/* using an existing client */
 		ceph_destroy_options(ceph_opts);
-	} else {
+	else
 		rbdc = rbd_client_create(ceph_opts);
-		if (IS_ERR(rbdc))
-			return PTR_ERR(rbdc);
-	}
-	rbd_dev->rbd_client = rbdc;
 
-	return 0;
+	return rbdc;
 }
 
 /*
@@ -508,10 +502,9 @@ static void rbd_client_release(struct kref *kref)
  * Drop reference to ceph client node. If it's not referenced anymore, release
  * it.
  */
-static void rbd_put_client(struct rbd_device *rbd_dev)
+static void rbd_put_client(struct rbd_client *rbdc)
 {
-	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
-	rbd_dev->rbd_client = NULL;
+	kref_put(&rbdc->kref, rbd_client_release);
 }
 
 /*
@@ -3232,6 +3225,7 @@ static ssize_t rbd_add(struct bus_type *bus,
 	struct ceph_options *ceph_opts = NULL;
 	struct rbd_options *rbd_opts = NULL;
 	struct rbd_spec *spec = NULL;
+	struct rbd_client *rbdc;
 	struct ceph_osd_client *osdc;
 	int rc = -ENOMEM;
 
@@ -3255,13 +3249,16 @@ static ssize_t rbd_add(struct bus_type *bus,
 
 	rbd_dev->mapping.read_only = rbd_opts->read_only;
 
-	rc = rbd_get_client(rbd_dev, ceph_opts);
-	if (rc < 0)
+	rbdc = rbd_get_client(ceph_opts);
+	if (IS_ERR(rbdc)) {
+		rc = PTR_ERR(rbdc);
 		goto err_out_args;
+	}
+	rbd_dev->rbd_client = rbdc;
 	ceph_opts = NULL;	/* ceph_opts now owned by rbd_dev client */
 
 	/* pick the pool */
-	osdc = &rbd_dev->rbd_client->client->osdc;
+	osdc = &rbdc->client->osdc;
 	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
 	if (rc < 0)
 		goto err_out_client;
@@ -3353,7 +3350,7 @@ err_out_probe:
 	rbd_header_free(&rbd_dev->header);
 err_out_client:
 	kfree(rbd_dev->header_name);
-	rbd_put_client(rbd_dev);
+	rbd_put_client(rbdc);
 err_out_args:
 	if (ceph_opts)
 		ceph_destroy_options(ceph_opts);
@@ -3398,7 +3395,7 @@ static void rbd_dev_release(struct device *dev)
 	if (rbd_dev->watch_event)
 		rbd_req_sync_unwatch(rbd_dev);
 
-	rbd_put_client(rbd_dev);
+	rbd_put_client(rbd_dev->rbd_client);
 
 	/* clean up and free blkdev */
 	rbd_free_disk(rbd_dev);

From bd4ba6554dcbae652b8b27a44f5a7795c9f3178a Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 25 Oct 2012 23:34:42 -0500
Subject: [PATCH 37/68] rbd: consolidate rbd_dev init in rbd_add()

Group the allocation and initialization of fields of the rbd device
structure created in rbd_add().  Move the grouped code down later in
the function, just prior to the call to rbd_dev_probe().  This is
for the most part simple code movement.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 37 ++++++++++++++++++-------------------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index a528d4ca7a6..4771de2fba8 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3232,29 +3232,16 @@ static ssize_t rbd_add(struct bus_type *bus,
 	if (!try_module_get(THIS_MODULE))
 		return -ENODEV;
 
-	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
-	if (!rbd_dev)
-		return -ENOMEM;
-
-	/* static rbd_device initialization */
-	spin_lock_init(&rbd_dev->lock);
-	INIT_LIST_HEAD(&rbd_dev->node);
-	INIT_LIST_HEAD(&rbd_dev->snaps);
-	init_rwsem(&rbd_dev->header_rwsem);
-
 	/* parse add command */
 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
 	if (rc < 0)
-		goto err_out_mem;
-
-	rbd_dev->mapping.read_only = rbd_opts->read_only;
+		goto err_out_module;
 
 	rbdc = rbd_get_client(ceph_opts);
 	if (IS_ERR(rbdc)) {
 		rc = PTR_ERR(rbdc);
 		goto err_out_args;
 	}
-	rbd_dev->rbd_client = rbdc;
 	ceph_opts = NULL;	/* ceph_opts now owned by rbd_dev client */
 
 	/* pick the pool */
@@ -3264,11 +3251,22 @@ static ssize_t rbd_add(struct bus_type *bus,
 		goto err_out_client;
 	spec->pool_id = (u64) rc;
 
+	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
+	if (!rbd_dev)
+		goto err_out_client;
+
+	spin_lock_init(&rbd_dev->lock);
+	INIT_LIST_HEAD(&rbd_dev->node);
+	INIT_LIST_HEAD(&rbd_dev->snaps);
+	init_rwsem(&rbd_dev->header_rwsem);
+	rbd_dev->rbd_client = rbdc;
 	rbd_dev->spec = spec;
 
+	rbd_dev->mapping.read_only = rbd_opts->read_only;
+
 	rc = rbd_dev_probe(rbd_dev);
 	if (rc < 0)
-		goto err_out_client;
+		goto err_out_mem;
 
 	/* no need to lock here, as rbd_dev is not registered yet */
 	rc = rbd_dev_snaps_update(rbd_dev);
@@ -3348,19 +3346,20 @@ err_out_snaps:
 	rbd_remove_all_snaps(rbd_dev);
 err_out_probe:
 	rbd_header_free(&rbd_dev->header);
-err_out_client:
 	kfree(rbd_dev->header_name);
+err_out_mem:
+	kfree(rbd_dev);
+err_out_client:
 	rbd_put_client(rbdc);
 err_out_args:
 	if (ceph_opts)
 		ceph_destroy_options(ceph_opts);
 	kfree(rbd_opts);
 	rbd_spec_put(spec);
-err_out_mem:
-	kfree(rbd_dev);
+err_out_module:
+	module_put(THIS_MODULE);
 
 	dout("Error adding device %s\n", buf);
-	module_put(THIS_MODULE);
 
 	return (ssize_t) rc;
 }

From c53d589337e9a211413484a604c76072e8474dc0 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 25 Oct 2012 23:34:42 -0500
Subject: [PATCH 38/68] rbd: define rbd_dev_{create,destroy}() helpers

Encapsulate the creation/initialization and destruction of rbd
device structures.  The rbd_client and the rbd_spec structures
provided on creation hold references whose ownership is transferred
to the new rbd_device structure.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 62 ++++++++++++++++++++++++++++++---------------
 1 file changed, 41 insertions(+), 21 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 4771de2fba8..a8ad8f8370b 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -504,7 +504,8 @@ static void rbd_client_release(struct kref *kref)
  */
 static void rbd_put_client(struct rbd_client *rbdc)
 {
-	kref_put(&rbdc->kref, rbd_client_release);
+	if (rbdc)
+		kref_put(&rbdc->kref, rbd_client_release);
 }
 
 /*
@@ -2166,6 +2167,34 @@ static void rbd_spec_free(struct kref *kref)
 	kfree(spec);
 }
 
+struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
+				struct rbd_spec *spec)
+{
+	struct rbd_device *rbd_dev;
+
+	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
+	if (!rbd_dev)
+		return NULL;
+
+	spin_lock_init(&rbd_dev->lock);
+	INIT_LIST_HEAD(&rbd_dev->node);
+	INIT_LIST_HEAD(&rbd_dev->snaps);
+	init_rwsem(&rbd_dev->header_rwsem);
+
+	rbd_dev->spec = spec;
+	rbd_dev->rbd_client = rbdc;
+
+	return rbd_dev;
+}
+
+static void rbd_dev_destroy(struct rbd_device *rbd_dev)
+{
+	kfree(rbd_dev->header_name);
+	rbd_put_client(rbd_dev->rbd_client);
+	rbd_spec_put(rbd_dev->spec);
+	kfree(rbd_dev);
+}
+
 static bool rbd_snap_registered(struct rbd_snap *snap)
 {
 	bool ret = snap->dev.type == &rbd_snap_device_type;
@@ -3242,7 +3271,7 @@ static ssize_t rbd_add(struct bus_type *bus,
 		rc = PTR_ERR(rbdc);
 		goto err_out_args;
 	}
-	ceph_opts = NULL;	/* ceph_opts now owned by rbd_dev client */
+	ceph_opts = NULL;	/* rbd_dev client now owns this */
 
 	/* pick the pool */
 	osdc = &rbdc->client->osdc;
@@ -3251,22 +3280,19 @@ static ssize_t rbd_add(struct bus_type *bus,
 		goto err_out_client;
 	spec->pool_id = (u64) rc;
 
-	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
+	rbd_dev = rbd_dev_create(rbdc, spec);
 	if (!rbd_dev)
 		goto err_out_client;
-
-	spin_lock_init(&rbd_dev->lock);
-	INIT_LIST_HEAD(&rbd_dev->node);
-	INIT_LIST_HEAD(&rbd_dev->snaps);
-	init_rwsem(&rbd_dev->header_rwsem);
-	rbd_dev->rbd_client = rbdc;
-	rbd_dev->spec = spec;
+	rbdc = NULL;		/* rbd_dev now owns this */
+	spec = NULL;		/* rbd_dev now owns this */
 
 	rbd_dev->mapping.read_only = rbd_opts->read_only;
+	kfree(rbd_opts);
+	rbd_opts = NULL;	/* done with this */
 
 	rc = rbd_dev_probe(rbd_dev);
 	if (rc < 0)
-		goto err_out_mem;
+		goto err_out_rbd_dev;
 
 	/* no need to lock here, as rbd_dev is not registered yet */
 	rc = rbd_dev_snaps_update(rbd_dev);
@@ -3317,8 +3343,6 @@ static ssize_t rbd_add(struct bus_type *bus,
 	if (rc)
 		goto err_out_bus;
 
-	kfree(rbd_opts);
-
 	/* Everything's ready.  Announce the disk to the world. */
 
 	add_disk(rbd_dev->disk);
@@ -3332,7 +3356,6 @@ err_out_bus:
 	/* this will also clean up rest of rbd_dev stuff */
 
 	rbd_bus_del_dev(rbd_dev);
-	kfree(rbd_opts);
 
 	return rc;
 
@@ -3346,9 +3369,8 @@ err_out_snaps:
 	rbd_remove_all_snaps(rbd_dev);
 err_out_probe:
 	rbd_header_free(&rbd_dev->header);
-	kfree(rbd_dev->header_name);
-err_out_mem:
-	kfree(rbd_dev);
+err_out_rbd_dev:
+	rbd_dev_destroy(rbd_dev);
 err_out_client:
 	rbd_put_client(rbdc);
 err_out_args:
@@ -3394,7 +3416,6 @@ static void rbd_dev_release(struct device *dev)
 	if (rbd_dev->watch_event)
 		rbd_req_sync_unwatch(rbd_dev);
 
-	rbd_put_client(rbd_dev->rbd_client);
 
 	/* clean up and free blkdev */
 	rbd_free_disk(rbd_dev);
@@ -3404,10 +3425,9 @@ static void rbd_dev_release(struct device *dev)
 	rbd_header_free(&rbd_dev->header);
 
 	/* done with the id, and with the rbd_dev */
-	kfree(rbd_dev->header_name);
 	rbd_dev_id_put(rbd_dev);
-	rbd_spec_put(rbd_dev->spec);
-	kfree(rbd_dev);
+	rbd_assert(rbd_dev->rbd_client != NULL);
+	rbd_dev_destroy(rbd_dev);
 
 	/* release module ref */
 	module_put(THIS_MODULE);

From 83a06263625b823afa3a842ddbf53473c22f24b2 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 30 Oct 2012 15:47:17 -0500
Subject: [PATCH 39/68] rbd: encapsulate last part of probe

Group the activities that now take place after an rbd_dev_probe()
call into a single function, and move the call to that function
into rbd_dev_probe() itself.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 161 +++++++++++++++++++++++---------------------
 1 file changed, 86 insertions(+), 75 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index a8ad8f8370b..8d26c0f2be1 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3221,6 +3221,84 @@ out_err:
 	return ret;
 }
 
+static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
+{
+	int ret;
+
+	/* no need to lock here, as rbd_dev is not registered yet */
+	ret = rbd_dev_snaps_update(rbd_dev);
+	if (ret)
+		return ret;
+
+	ret = rbd_dev_set_mapping(rbd_dev);
+	if (ret)
+		goto err_out_snaps;
+
+	/* generate unique id: find highest unique id, add one */
+	rbd_dev_id_get(rbd_dev);
+
+	/* Fill in the device name, now that we have its id. */
+	BUILD_BUG_ON(DEV_NAME_LEN
+			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
+	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
+
+	/* Get our block major device number. */
+
+	ret = register_blkdev(0, rbd_dev->name);
+	if (ret < 0)
+		goto err_out_id;
+	rbd_dev->major = ret;
+
+	/* Set up the blkdev mapping. */
+
+	ret = rbd_init_disk(rbd_dev);
+	if (ret)
+		goto err_out_blkdev;
+
+	ret = rbd_bus_add_dev(rbd_dev);
+	if (ret)
+		goto err_out_disk;
+
+	/*
+	 * At this point cleanup in the event of an error is the job
+	 * of the sysfs code (initiated by rbd_bus_del_dev()).
+	 */
+	down_write(&rbd_dev->header_rwsem);
+	ret = rbd_dev_snaps_register(rbd_dev);
+	up_write(&rbd_dev->header_rwsem);
+	if (ret)
+		goto err_out_bus;
+
+	ret = rbd_init_watch_dev(rbd_dev);
+	if (ret)
+		goto err_out_bus;
+
+	/* Everything's ready.  Announce the disk to the world. */
+
+	add_disk(rbd_dev->disk);
+
+	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
+		(unsigned long long) rbd_dev->mapping.size);
+
+	return ret;
+err_out_bus:
+	/* this will also clean up rest of rbd_dev stuff */
+
+	rbd_bus_del_dev(rbd_dev);
+
+	return ret;
+err_out_disk:
+	rbd_free_disk(rbd_dev);
+err_out_blkdev:
+	unregister_blkdev(rbd_dev->major, rbd_dev->name);
+err_out_id:
+	rbd_dev_id_put(rbd_dev);
+err_out_snaps:
+	rbd_remove_all_snaps(rbd_dev);
+
+	return ret;
+}
+
 /*
  * Probe for the existence of the header object for the given rbd
  * device.  For format 2 images this includes determining the image
@@ -3240,9 +3318,16 @@ static int rbd_dev_probe(struct rbd_device *rbd_dev)
 		ret = rbd_dev_v1_probe(rbd_dev);
 	else
 		ret = rbd_dev_v2_probe(rbd_dev);
-	if (ret)
+	if (ret) {
 		dout("probe failed, returning %d\n", ret);
 
+		return ret;
+	}
+
+	ret = rbd_dev_probe_finish(rbd_dev);
+	if (ret)
+		rbd_header_free(&rbd_dev->header);
+
 	return ret;
 }
 
@@ -3294,81 +3379,7 @@ static ssize_t rbd_add(struct bus_type *bus,
 	if (rc < 0)
 		goto err_out_rbd_dev;
 
-	/* no need to lock here, as rbd_dev is not registered yet */
-	rc = rbd_dev_snaps_update(rbd_dev);
-	if (rc)
-		goto err_out_probe;
-
-	rc = rbd_dev_set_mapping(rbd_dev);
-	if (rc)
-		goto err_out_snaps;
-
-	/* generate unique id: find highest unique id, add one */
-	rbd_dev_id_get(rbd_dev);
-
-	/* Fill in the device name, now that we have its id. */
-	BUILD_BUG_ON(DEV_NAME_LEN
-			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
-	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
-
-	/* Get our block major device number. */
-
-	rc = register_blkdev(0, rbd_dev->name);
-	if (rc < 0)
-		goto err_out_id;
-	rbd_dev->major = rc;
-
-	/* Set up the blkdev mapping. */
-
-	rc = rbd_init_disk(rbd_dev);
-	if (rc)
-		goto err_out_blkdev;
-
-	rc = rbd_bus_add_dev(rbd_dev);
-	if (rc)
-		goto err_out_disk;
-
-	/*
-	 * At this point cleanup in the event of an error is the job
-	 * of the sysfs code (initiated by rbd_bus_del_dev()).
-	 */
-
-	down_write(&rbd_dev->header_rwsem);
-	rc = rbd_dev_snaps_register(rbd_dev);
-	up_write(&rbd_dev->header_rwsem);
-	if (rc)
-		goto err_out_bus;
-
-	rc = rbd_init_watch_dev(rbd_dev);
-	if (rc)
-		goto err_out_bus;
-
-	/* Everything's ready.  Announce the disk to the world. */
-
-	add_disk(rbd_dev->disk);
-
-	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
-		(unsigned long long) rbd_dev->mapping.size);
-
 	return count;
-
-err_out_bus:
-	/* this will also clean up rest of rbd_dev stuff */
-
-	rbd_bus_del_dev(rbd_dev);
-
-	return rc;
-
-err_out_disk:
-	rbd_free_disk(rbd_dev);
-err_out_blkdev:
-	unregister_blkdev(rbd_dev->major, rbd_dev->name);
-err_out_id:
-	rbd_dev_id_put(rbd_dev);
-err_out_snaps:
-	rbd_remove_all_snaps(rbd_dev);
-err_out_probe:
-	rbd_header_free(&rbd_dev->header);
 err_out_rbd_dev:
 	rbd_dev_destroy(rbd_dev);
 err_out_client:

From 2c0d0a10ea89456781218f458f6bf72e99d87d2a Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 30 Oct 2012 19:40:33 -0500
Subject: [PATCH 40/68] rbd: allow null image name

We will know the image id for format 2 parent images, but won't
initially know its image name.  Avoid making the query for an image
id in rbd_dev_image_id() if it's already known.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 8d26c0f2be1..a8521338bf4 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3067,6 +3067,14 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
 	void *response;
 	void *p;
 
+	/*
+	 * When probing a parent image, the image id is already
+	 * known (and the image name likely is not).  There's no
+	 * need to fetch the image id again in this case.
+	 */
+	if (rbd_dev->spec->image_id)
+		return 0;
+
 	/*
 	 * First, see if the format 2 image id file exists, and if
 	 * so, get the image's persistent id from it.

From a92ffdf8a9b09f8fae9a8f418f87f30a5e459570 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 30 Oct 2012 19:40:33 -0500
Subject: [PATCH 41/68] rbd: allow null image name

Format 2 parent images are partially identified by their image id,
but it may not be possible to determine their image name.  The name
is not strictly needed for correct operation, so we won't be
treating it as an error if we don't know it.  Handle this case
gracefully in rbd_name_show().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index a8521338bf4..28052ff679c 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1982,7 +1982,10 @@ static ssize_t rbd_name_show(struct device *dev,
 {
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
-	return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
+	if (rbd_dev->spec->image_name)
+		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
+
+	return sprintf(buf, "(unknown)\n");
 }
 
 static ssize_t rbd_image_id_show(struct device *dev,

From 86b00e0da6be7bbc16412f126c5b548ac5d91d50 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 25 Oct 2012 23:34:42 -0500
Subject: [PATCH 42/68] rbd: get parent spec for version 2 images

Add support for getting the the information identifying the parent
image for rbd images that have them.  The child image holds a
reference to its parent image specification structure.  Create a new
entry "parent" in /sys/bus/rbd/image/N/ to report the identifying
information for the parent image, if any.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 Documentation/ABI/testing/sysfs-bus-rbd |   4 +
 drivers/block/rbd.c                     | 131 ++++++++++++++++++++++++
 include/linux/ceph/rados.h              |   2 +
 3 files changed, 137 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-bus-rbd b/Documentation/ABI/testing/sysfs-bus-rbd
index 1cf2adf46b1..cd9213ccf3d 100644
--- a/Documentation/ABI/testing/sysfs-bus-rbd
+++ b/Documentation/ABI/testing/sysfs-bus-rbd
@@ -70,6 +70,10 @@ snap_*
 
 	A directory per each snapshot
 
+parent
+
+	Information identifying the pool, image, and snapshot id for
+	the parent image in a layered rbd image (format 2 only).
 
 Entries under /sys/bus/rbd/devices/<dev-id>/snap_<snap-name>
 -------------------------------------------------------------
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 28052ff679c..bce1fcfb518 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -217,6 +217,9 @@ struct rbd_device {
 	struct ceph_osd_event   *watch_event;
 	struct ceph_osd_request *watch_request;
 
+	struct rbd_spec		*parent_spec;
+	u64			parent_overlap;
+
 	/* protects updating the header */
 	struct rw_semaphore     header_rwsem;
 
@@ -2009,6 +2012,49 @@ static ssize_t rbd_snap_show(struct device *dev,
 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
 }
 
+/*
+ * For an rbd v2 image, shows the pool id, image id, and snapshot id
+ * for the parent image.  If there is no parent, simply shows
+ * "(no parent image)".
+ */
+static ssize_t rbd_parent_show(struct device *dev,
+			     struct device_attribute *attr,
+			     char *buf)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+	struct rbd_spec *spec = rbd_dev->parent_spec;
+	int count;
+	char *bufp = buf;
+
+	if (!spec)
+		return sprintf(buf, "(no parent image)\n");
+
+	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
+			(unsigned long long) spec->pool_id, spec->pool_name);
+	if (count < 0)
+		return count;
+	bufp += count;
+
+	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
+			spec->image_name ? spec->image_name : "(unknown)");
+	if (count < 0)
+		return count;
+	bufp += count;
+
+	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
+			(unsigned long long) spec->snap_id, spec->snap_name);
+	if (count < 0)
+		return count;
+	bufp += count;
+
+	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
+	if (count < 0)
+		return count;
+	bufp += count;
+
+	return (ssize_t) (bufp - buf);
+}
+
 static ssize_t rbd_image_refresh(struct device *dev,
 				 struct device_attribute *attr,
 				 const char *buf,
@@ -2032,6 +2078,7 @@ static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
 static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
 static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
 static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
+static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
 
 static struct attribute *rbd_attrs[] = {
 	&dev_attr_size.attr,
@@ -2043,6 +2090,7 @@ static struct attribute *rbd_attrs[] = {
 	&dev_attr_name.attr,
 	&dev_attr_image_id.attr,
 	&dev_attr_current_snap.attr,
+	&dev_attr_parent.attr,
 	&dev_attr_refresh.attr,
 	NULL
 };
@@ -2192,6 +2240,7 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
 
 static void rbd_dev_destroy(struct rbd_device *rbd_dev)
 {
+	rbd_spec_put(rbd_dev->parent_spec);
 	kfree(rbd_dev->header_name);
 	rbd_put_client(rbd_dev->rbd_client);
 	rbd_spec_put(rbd_dev->spec);
@@ -2400,6 +2449,71 @@ static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
 						&rbd_dev->header.features);
 }
 
+static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
+{
+	struct rbd_spec *parent_spec;
+	size_t size;
+	void *reply_buf = NULL;
+	__le64 snapid;
+	void *p;
+	void *end;
+	char *image_id;
+	u64 overlap;
+	size_t len = 0;
+	int ret;
+
+	parent_spec = rbd_spec_alloc();
+	if (!parent_spec)
+		return -ENOMEM;
+
+	size = sizeof (__le64) +				/* pool_id */
+		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
+		sizeof (__le64) +				/* snap_id */
+		sizeof (__le64);				/* overlap */
+	reply_buf = kmalloc(size, GFP_KERNEL);
+	if (!reply_buf) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	snapid = cpu_to_le64(CEPH_NOSNAP);
+	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
+				"rbd", "get_parent",
+				(char *) &snapid, sizeof (snapid),
+				(char *) reply_buf, size,
+				CEPH_OSD_FLAG_READ, NULL);
+	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
+	if (ret < 0)
+		goto out_err;
+
+	ret = -ERANGE;
+	p = reply_buf;
+	end = (char *) reply_buf + size;
+	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
+	if (parent_spec->pool_id == CEPH_NOPOOL)
+		goto out;	/* No parent?  No problem. */
+
+	image_id = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
+	if (IS_ERR(image_id)) {
+		ret = PTR_ERR(image_id);
+		goto out_err;
+	}
+	parent_spec->image_id = image_id;
+	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
+	ceph_decode_64_safe(&p, end, overlap, out_err);
+
+	rbd_dev->parent_overlap = overlap;
+	rbd_dev->parent_spec = parent_spec;
+	parent_spec = NULL;	/* rbd_dev now owns this */
+out:
+	ret = 0;
+out_err:
+	kfree(reply_buf);
+	rbd_spec_put(parent_spec);
+
+	return ret;
+}
+
 static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
 {
 	size_t size;
@@ -3154,6 +3268,12 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
 	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
 	if (ret < 0)
 		goto out_err;
+
+	/* Version 1 images have no parent (no layering) */
+
+	rbd_dev->parent_spec = NULL;
+	rbd_dev->parent_overlap = 0;
+
 	rbd_dev->image_format = 1;
 
 	dout("discovered version 1 image, header name is %s\n",
@@ -3205,6 +3325,14 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
 	if (ret < 0)
 		goto out_err;
 
+	/* If the image supports layering, get the parent info */
+
+	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
+		ret = rbd_dev_v2_parent_info(rbd_dev);
+		if (ret < 0)
+			goto out_err;
+	}
+
 	/* crypto and compression type aren't (yet) supported for v2 images */
 
 	rbd_dev->header.crypt_type = 0;
@@ -3224,6 +3352,9 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
 
 	return 0;
 out_err:
+	rbd_dev->parent_overlap = 0;
+	rbd_spec_put(rbd_dev->parent_spec);
+	rbd_dev->parent_spec = NULL;
 	kfree(rbd_dev->header_name);
 	rbd_dev->header_name = NULL;
 	kfree(rbd_dev->header.object_prefix);
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 0a99099801a..15077db662e 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -87,6 +87,8 @@ struct ceph_pg {
  *
  *  lpgp_num -- as above.
  */
+#define CEPH_NOPOOL  ((__u64) (-1))  /* pool id not defined */
+
 #define CEPH_PG_TYPE_REP     1
 #define CEPH_PG_TYPE_RAID4   2
 #define CEPH_PG_POOL_VERSION 2

From 72afc71ffca0f444ee0e1ef8c7e34ab209bb48b3 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 30 Oct 2012 19:40:33 -0500
Subject: [PATCH 43/68] libceph: define ceph_pg_pool_name_by_id()

Define and export function ceph_pg_pool_name_by_id() to supply
the name of a pg pool whose id is given.  This will be used by
the next patch.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 include/linux/ceph/osdmap.h |  1 +
 net/ceph/osdmap.c           | 16 ++++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index e88a620b9f8..5ea57ba6932 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -123,6 +123,7 @@ extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
 				struct ceph_pg pgid);
 
+extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
 extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
 
 #endif
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index f552aa48fd9..de73214b5d2 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -469,6 +469,22 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
 	return NULL;
 }
 
+const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id)
+{
+	struct ceph_pg_pool_info *pi;
+
+	if (id == CEPH_NOPOOL)
+		return NULL;
+
+	if (WARN_ON_ONCE(id > (u64) INT_MAX))
+		return NULL;
+
+	pi = __lookup_pg_pool(&map->pg_pools, (int) id);
+
+	return pi ? pi->name : NULL;
+}
+EXPORT_SYMBOL(ceph_pg_pool_name_by_id);
+
 int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
 {
 	struct rb_node *rbp;

From 9e15b77d9af3b63dfbff14e695336dfca88c22b2 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 30 Oct 2012 19:40:33 -0500
Subject: [PATCH 44/68] rbd: get additional info in parent spec

When a layered rbd image has a parent, that parent is identified
only by its pool id, image id, and snapshot id.  Images that have
been mapped also record *names* for those three id's.

Add code to look up these names for parent images so they match
mapped images more closely.  Skip doing this for an image if it
already has its pool name defined (this will be the case for images
mapped by the user).

It is possible that an the name of a parent image can't be
determined, even if the image id is valid.  If this occurs it
does not preclude correct operation, so don't treat this as
an error.

On the other hand, defined pools will always have both an id and a
name.   And any snapshot of an image identified as a parent for a
clone image will exist, and will have a name (if not it indicates
some other internal error).  So treat failure to get these bits
of information as errors.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 133 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index bce1fcfb518..842caf4aab4 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -70,7 +70,10 @@
 
 #define RBD_SNAP_HEAD_NAME	"-"
 
+/* This allows a single page to hold an image name sent by OSD */
+#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
 #define RBD_IMAGE_ID_LEN_MAX	64
+
 #define RBD_OBJ_PREFIX_LEN_MAX	64
 
 /* Feature bits */
@@ -658,6 +661,20 @@ out_err:
 	return -ENOMEM;
 }
 
+static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
+{
+	struct rbd_snap *snap;
+
+	if (snap_id == CEPH_NOSNAP)
+		return RBD_SNAP_HEAD_NAME;
+
+	list_for_each_entry(snap, &rbd_dev->snaps, node)
+		if (snap_id == snap->id)
+			return snap->name;
+
+	return NULL;
+}
+
 static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
 {
 
@@ -2499,6 +2516,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
 		goto out_err;
 	}
 	parent_spec->image_id = image_id;
+	parent_spec->image_id_len = len;
 	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
 	ceph_decode_64_safe(&p, end, overlap, out_err);
 
@@ -2514,6 +2532,117 @@ out_err:
 	return ret;
 }
 
+static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
+{
+	size_t image_id_size;
+	char *image_id;
+	void *p;
+	void *end;
+	size_t size;
+	void *reply_buf = NULL;
+	size_t len = 0;
+	char *image_name = NULL;
+	int ret;
+
+	rbd_assert(!rbd_dev->spec->image_name);
+
+	image_id_size = sizeof (__le32) + rbd_dev->spec->image_id_len;
+	image_id = kmalloc(image_id_size, GFP_KERNEL);
+	if (!image_id)
+		return NULL;
+
+	p = image_id;
+	end = (char *) image_id + image_id_size;
+	ceph_encode_string(&p, end, rbd_dev->spec->image_id,
+				(u32) rbd_dev->spec->image_id_len);
+
+	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
+	reply_buf = kmalloc(size, GFP_KERNEL);
+	if (!reply_buf)
+		goto out;
+
+	ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
+				"rbd", "dir_get_name",
+				image_id, image_id_size,
+				(char *) reply_buf, size,
+				CEPH_OSD_FLAG_READ, NULL);
+	if (ret < 0)
+		goto out;
+	p = reply_buf;
+	end = (char *) reply_buf + size;
+	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
+	if (IS_ERR(image_name))
+		image_name = NULL;
+	else
+		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
+out:
+	kfree(reply_buf);
+	kfree(image_id);
+
+	return image_name;
+}
+
+/*
+ * When a parent image gets probed, we only have the pool, image,
+ * and snapshot ids but not the names of any of them.  This call
+ * is made later to fill in those names.  It has to be done after
+ * rbd_dev_snaps_update() has completed because some of the
+ * information (in particular, snapshot name) is not available
+ * until then.
+ */
+static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc;
+	const char *name;
+	void *reply_buf = NULL;
+	int ret;
+
+	if (rbd_dev->spec->pool_name)
+		return 0;	/* Already have the names */
+
+	/* Look up the pool name */
+
+	osdc = &rbd_dev->rbd_client->client->osdc;
+	name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
+	if (!name)
+		return -EIO;	/* pool id too large (>= 2^31) */
+
+	rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
+	if (!rbd_dev->spec->pool_name)
+		return -ENOMEM;
+
+	/* Fetch the image name; tolerate failure here */
+
+	name = rbd_dev_image_name(rbd_dev);
+	if (name) {
+		rbd_dev->spec->image_name_len = strlen(name);
+		rbd_dev->spec->image_name = (char *) name;
+	} else {
+		pr_warning(RBD_DRV_NAME "%d "
+			"unable to get image name for image id %s\n",
+			rbd_dev->major, rbd_dev->spec->image_id);
+	}
+
+	/* Look up the snapshot name. */
+
+	name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
+	if (!name) {
+		ret = -EIO;
+		goto out_err;
+	}
+	rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
+	if(!rbd_dev->spec->snap_name)
+		goto out_err;
+
+	return 0;
+out_err:
+	kfree(reply_buf);
+	kfree(rbd_dev->spec->pool_name);
+	rbd_dev->spec->pool_name = NULL;
+
+	return ret;
+}
+
 static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
 {
 	size_t size;
@@ -3372,6 +3501,10 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
 	if (ret)
 		return ret;
 
+	ret = rbd_dev_probe_update_spec(rbd_dev);
+	if (ret)
+		goto err_out_snaps;
+
 	ret = rbd_dev_set_mapping(rbd_dev);
 	if (ret)
 		goto err_out_snaps;

From 4d1d0534f53863108fdea496288cb3310f88118d Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Sat, 3 Nov 2012 10:32:37 +0800
Subject: [PATCH 45/68] ceph: Hold caps_list_lock when adjusting caps_{use,
 total}_count

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/caps.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 3251e9cc640..2d0141e95c8 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -236,8 +236,10 @@ static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
 	if (!ctx) {
 		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
 		if (cap) {
+			spin_lock(&mdsc->caps_list_lock);
 			mdsc->caps_use_count++;
 			mdsc->caps_total_count++;
+			spin_unlock(&mdsc->caps_list_lock);
 		}
 		return cap;
 	}

From 22cddde104d715600a4c218bf9224923208afe90 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 5 Nov 2012 11:07:23 -0800
Subject: [PATCH 46/68] ceph: Fix i_size update race

ceph_aio_write() has an optimization that marks cap EPH_CAP_FILE_WR
dirty before data is copied to page cache and inode size is updated.
If ceph_check_caps() flushes the dirty cap before the inode size is
updated, MDS can miss the new inode size. The fix is move
ceph_{get,put}_cap_refs() into ceph_write_{begin,end}() and call
__ceph_mark_dirty_caps() after inode size is updated.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/addr.c | 51 +++++++++++++++++++++++++++++++---
 fs/ceph/file.c | 75 +++++++++++++++++++++-----------------------------
 2 files changed, 78 insertions(+), 48 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 22b6e4583fa..21a07187df0 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1078,23 +1078,51 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
 			    struct page **pagep, void **fsdata)
 {
 	struct inode *inode = file->f_dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_file_info *fi = file->private_data;
 	struct page *page;
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-	int r;
+	int r, want, got = 0;
+
+	if (fi->fmode & CEPH_FILE_MODE_LAZY)
+		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
+	else
+		want = CEPH_CAP_FILE_BUFFER;
+
+	dout("write_begin %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
+	     inode, ceph_vinop(inode), pos, len, inode->i_size);
+	r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len);
+	if (r < 0)
+		return r;
+	dout("write_begin %p %llx.%llx %llu~%u  got cap refs on %s\n",
+	     inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
+	if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) {
+		ceph_put_cap_refs(ci, got);
+		return -EAGAIN;
+	}
 
 	do {
 		/* get a page */
 		page = grab_cache_page_write_begin(mapping, index, 0);
-		if (!page)
-			return -ENOMEM;
-		*pagep = page;
+		if (!page) {
+			r = -ENOMEM;
+			break;
+		}
 
 		dout("write_begin file %p inode %p page %p %d~%d\n", file,
 		     inode, page, (int)pos, (int)len);
 
 		r = ceph_update_writeable_page(file, pos, len, page);
+		if (r)
+			page_cache_release(page);
 	} while (r == -EAGAIN);
 
+	if (r) {
+		ceph_put_cap_refs(ci, got);
+	} else {
+		*pagep = page;
+		*(int *)fsdata = got;
+	}
 	return r;
 }
 
@@ -1108,10 +1136,12 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
 			  struct page *page, void *fsdata)
 {
 	struct inode *inode = file->f_dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_mds_client *mdsc = fsc->mdsc;
 	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
 	int check_cap = 0;
+	int got = (unsigned long)fsdata;
 
 	dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
 	     inode, page, (int)pos, (int)copied, (int)len);
@@ -1134,6 +1164,19 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
 	up_read(&mdsc->snap_rwsem);
 	page_cache_release(page);
 
+	if (copied > 0) {
+		int dirty;
+		spin_lock(&ci->i_ceph_lock);
+		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+		spin_unlock(&ci->i_ceph_lock);
+		if (dirty)
+			__mark_inode_dirty(inode, dirty);
+	}
+
+	dout("write_end %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
+	     inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
+	ceph_put_cap_refs(ci, got);
+
 	if (check_cap)
 		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
 
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 5840d2aaed1..d415096800a 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -712,63 +712,53 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	struct ceph_osd_client *osdc =
 		&ceph_sb_to_client(inode->i_sb)->client->osdc;
 	loff_t endoff = pos + iov->iov_len;
-	int want, got = 0;
-	int ret, err;
+	int got = 0;
+	int ret, err, written;
 
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EROFS;
 
 retry_snap:
+	written = 0;
 	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
 		return -ENOSPC;
 	__ceph_do_pending_vmtruncate(inode);
-	dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
-	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
-	     inode->i_size);
-	if (fi->fmode & CEPH_FILE_MODE_LAZY)
-		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
-	else
-		want = CEPH_CAP_FILE_BUFFER;
-	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
-	if (ret < 0)
-		goto out_put;
-
-	dout("aio_write %p %llx.%llx %llu~%u  got cap refs on %s\n",
-	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
-	     ceph_cap_string(got));
-
-	if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
-	    (iocb->ki_filp->f_flags & O_DIRECT) ||
-	    (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
-	    (fi->flags & CEPH_F_SYNC)) {
-		ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
-			&iocb->ki_pos);
-	} else {
-		/*
-		 * buffered write; drop Fw early to avoid slow
-		 * revocation if we get stuck on balance_dirty_pages
-		 */
-		int dirty;
-
-		spin_lock(&ci->i_ceph_lock);
-		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
-		spin_unlock(&ci->i_ceph_lock);
-		ceph_put_cap_refs(ci, got);
 
+	/*
+	 * try to do a buffered write.  if we don't have sufficient
+	 * caps, we'll get -EAGAIN from generic_file_aio_write, or a
+	 * short write if we only get caps for some pages.
+	 */
+	if (!(iocb->ki_filp->f_flags & O_DIRECT) &&
+	    !(inode->i_sb->s_flags & MS_SYNCHRONOUS) &&
+	    !(fi->flags & CEPH_F_SYNC)) {
 		ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+		if (ret >= 0)
+			written = ret;
+
 		if ((ret >= 0 || ret == -EIOCBQUEUED) &&
 		    ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
 		     || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
-			err = vfs_fsync_range(file, pos, pos + ret - 1, 1);
+			err = vfs_fsync_range(file, pos, pos + written - 1, 1);
 			if (err < 0)
 				ret = err;
 		}
-
-		if (dirty)
-			__mark_inode_dirty(inode, dirty);
-		goto out;
+		if ((ret < 0 && ret != -EAGAIN) || pos + written >= endoff)
+			goto out;
 	}
 
+	dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
+	     inode, ceph_vinop(inode), pos + written,
+	     (unsigned)iov->iov_len - written, inode->i_size);
+	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, 0, &got, endoff);
+	if (ret < 0)
+		goto out;
+
+	dout("aio_write %p %llx.%llx %llu~%u  got cap refs on %s\n",
+	     inode, ceph_vinop(inode), pos + written,
+	     (unsigned)iov->iov_len - written, ceph_cap_string(got));
+	ret = ceph_sync_write(file, iov->iov_base + written,
+			      iov->iov_len - written, &iocb->ki_pos);
 	if (ret >= 0) {
 		int dirty;
 		spin_lock(&ci->i_ceph_lock);
@@ -777,13 +767,10 @@ retry_snap:
 		if (dirty)
 			__mark_inode_dirty(inode, dirty);
 	}
-
-out_put:
 	dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
-	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
-	     ceph_cap_string(got));
+	     inode, ceph_vinop(inode), pos + written,
+	     (unsigned)iov->iov_len - written, ceph_cap_string(got));
 	ceph_put_cap_refs(ci, got);
-
 out:
 	if (ret == -EOLDSNAPC) {
 		dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",

From cfc84c9f73ab8a6933bd4f36efac1196cddad581 Mon Sep 17 00:00:00 2001
From: Cyril Roelandt <tipecaml@gmail.com>
Date: Tue, 20 Nov 2012 10:23:07 -0600
Subject: [PATCH 47/68] ceph: fix dentry reference leak in ceph_encode_fh().

dput() was not called in the error path.

Signed-off-by: Cyril Roelandt <tipecaml@gmail.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 fs/ceph/export.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 862887004d2..f350be34601 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -56,13 +56,15 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
 	struct ceph_nfs_confh *cfh = (void *)rawfh;
 	int connected_handle_length = sizeof(*cfh)/4;
 	int handle_length = sizeof(*fh)/4;
-	struct dentry *dentry = d_find_alias(inode);
+	struct dentry *dentry;
 	struct dentry *parent;
 
 	/* don't re-export snaps */
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EINVAL;
 
+	dentry = d_find_alias(inode);
+
 	/* if we found an alias, generate a connectable fh */
 	if (*max_len >= connected_handle_length && dentry) {
 		dout("encode_fh %p connectable\n", dentry);

From 83aff95eb9d60aff5497e9f44a2ae906b86d8e88 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Wed, 28 Nov 2012 12:28:24 -0800
Subject: [PATCH 48/68] libceph: remove 'osdtimeout' option

This would reset a connection with any OSD that had an outstanding
request that was taking more than N seconds.  The idea was that if the
OSD was buggy, the client could compensate by resending the request.

In reality, this only served to hide server bugs, and we haven't
actually seen such a bug in quite a while.  Moreover, the userspace
client code never did this.

More importantly, often the request is taking a long time because the
OSD is trying to recover, or overloaded, and killing the connection
and retrying would only make the situation worse by giving the OSD
more work to do.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 fs/ceph/super.c              |  2 --
 include/linux/ceph/libceph.h |  2 --
 net/ceph/ceph_common.c       |  3 +--
 net/ceph/osd_client.c        | 47 +++---------------------------------
 4 files changed, 5 insertions(+), 49 deletions(-)

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 2f586b0e5e0..fcda1c73a1e 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -403,8 +403,6 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
 		seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
 	if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
 		seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
-	if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
-		seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
 	if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
 		seq_printf(m, ",osdkeepalivetimeout=%d",
 			   opt->osd_keepalive_timeout);
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 42624789b06..317aff8feb0 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -43,7 +43,6 @@ struct ceph_options {
 	struct ceph_entity_addr my_addr;
 	int mount_timeout;
 	int osd_idle_ttl;
-	int osd_timeout;
 	int osd_keepalive_timeout;
 
 	/*
@@ -63,7 +62,6 @@ struct ceph_options {
  * defaults
  */
 #define CEPH_MOUNT_TIMEOUT_DEFAULT  60
-#define CEPH_OSD_TIMEOUT_DEFAULT    60  /* seconds */
 #define CEPH_OSD_KEEPALIVE_DEFAULT  5
 #define CEPH_OSD_IDLE_TTL_DEFAULT    60
 
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index a8020293f34..ee71ea26777 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -305,7 +305,6 @@ ceph_parse_options(char *options, const char *dev_name,
 
 	/* start with defaults */
 	opt->flags = CEPH_OPT_DEFAULT;
-	opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
 	opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
 	opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
 	opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */
@@ -391,7 +390,7 @@ ceph_parse_options(char *options, const char *dev_name,
 
 			/* misc */
 		case Opt_osdtimeout:
-			opt->osd_timeout = intval;
+			pr_warning("ignoring deprecated osdtimeout option\n");
 			break;
 		case Opt_osdkeepalivetimeout:
 			opt->osd_keepalive_timeout = intval;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index ccbdfbba9e5..7ebfe13267e 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -608,14 +608,6 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc,
 	}
 }
 
-static void kick_osd_requests(struct ceph_osd_client *osdc,
-			      struct ceph_osd *kickosd)
-{
-	mutex_lock(&osdc->request_mutex);
-	__kick_osd_requests(osdc, kickosd);
-	mutex_unlock(&osdc->request_mutex);
-}
-
 /*
  * If the osd connection drops, we need to resubmit all requests.
  */
@@ -629,7 +621,9 @@ static void osd_reset(struct ceph_connection *con)
 	dout("osd_reset osd%d\n", osd->o_osd);
 	osdc = osd->o_osdc;
 	down_read(&osdc->map_sem);
-	kick_osd_requests(osdc, osd);
+	mutex_lock(&osdc->request_mutex);
+	__kick_osd_requests(osdc, osd);
+	mutex_unlock(&osdc->request_mutex);
 	send_queued(osdc);
 	up_read(&osdc->map_sem);
 }
@@ -1091,12 +1085,10 @@ static void handle_timeout(struct work_struct *work)
 {
 	struct ceph_osd_client *osdc =
 		container_of(work, struct ceph_osd_client, timeout_work.work);
-	struct ceph_osd_request *req, *last_req = NULL;
+	struct ceph_osd_request *req;
 	struct ceph_osd *osd;
-	unsigned long timeout = osdc->client->options->osd_timeout * HZ;
 	unsigned long keepalive =
 		osdc->client->options->osd_keepalive_timeout * HZ;
-	unsigned long last_stamp = 0;
 	struct list_head slow_osds;
 	dout("timeout\n");
 	down_read(&osdc->map_sem);
@@ -1105,37 +1097,6 @@ static void handle_timeout(struct work_struct *work)
 
 	mutex_lock(&osdc->request_mutex);
 
-	/*
-	 * reset osds that appear to be _really_ unresponsive.  this
-	 * is a failsafe measure.. we really shouldn't be getting to
-	 * this point if the system is working properly.  the monitors
-	 * should mark the osd as failed and we should find out about
-	 * it from an updated osd map.
-	 */
-	while (timeout && !list_empty(&osdc->req_lru)) {
-		req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
-				 r_req_lru_item);
-
-		/* hasn't been long enough since we sent it? */
-		if (time_before(jiffies, req->r_stamp + timeout))
-			break;
-
-		/* hasn't been long enough since it was acked? */
-		if (req->r_request->ack_stamp == 0 ||
-		    time_before(jiffies, req->r_request->ack_stamp + timeout))
-			break;
-
-		BUG_ON(req == last_req && req->r_stamp == last_stamp);
-		last_req = req;
-		last_stamp = req->r_stamp;
-
-		osd = req->r_osd;
-		BUG_ON(!osd);
-		pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
-			   req->r_tid, osd->o_osd);
-		__kick_osd_requests(osdc, osd);
-	}
-
 	/*
 	 * ping osds that are a bit slow.  this ensures that if there
 	 * is a break in the TCP connection we will notice, and reopen

From d2cc4dde9206aa2c7fb237aa689d3277cc070547 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Thu, 29 Nov 2012 08:37:03 -0600
Subject: [PATCH 49/68] bdi_register: add __printf verification, fix arg
 mismatch

__printf is useful to verify format and arguments.

Signed-off-by: Joe Perches <joe@perches.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 fs/ceph/super.c             | 2 +-
 include/linux/backing-dev.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index fcda1c73a1e..1a144001b2e 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -842,7 +842,7 @@ static int ceph_register_bdi(struct super_block *sb,
 		fsc->backing_dev_info.ra_pages =
 			default_backing_dev_info.ra_pages;
 
-	err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
+	err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
 			   atomic_long_inc_return(&bdi_seq));
 	if (!err)
 		sb->s_bdi = &fsc->backing_dev_info;
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 2a9a9abc912..12731a19ef0 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -114,6 +114,7 @@ struct backing_dev_info {
 int bdi_init(struct backing_dev_info *bdi);
 void bdi_destroy(struct backing_dev_info *bdi);
 
+__printf(3, 4)
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		const char *fmt, ...);
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);

From 5e62ad30157d0da04cf40c6d1a2f4bc840948b9c Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Mon, 19 Nov 2012 10:49:04 +0800
Subject: [PATCH 50/68] ceph: Don't update i_max_size when handling non-auth
 cap

The cap from non-auth mds doesn't have a meaningful max_size value.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/caps.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 2d0141e95c8..8072aefc427 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2390,7 +2390,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 			    &atime);
 
 	/* max size increase? */
-	if (max_size != ci->i_max_size) {
+	if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
 		dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
 		ci->i_max_size = max_size;
 		if (max_size >= ci->i_wanted_max_size) {

From ed75ec2cd19b47efcd292b6e23f58e56f4c5bc34 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Mon, 19 Nov 2012 10:49:06 +0800
Subject: [PATCH 51/68] ceph: Fix infinite loop in __wake_requests

__wake_requests() will enter infinite loop if we use it to wake
requests in the session->s_waiting list. __wake_requests() deletes
requests from the list and __do_request() adds requests back to
the list.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/mds_client.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 62d2342eb26..9165eb8309e 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1876,9 +1876,14 @@ finish:
 static void __wake_requests(struct ceph_mds_client *mdsc,
 			    struct list_head *head)
 {
-	struct ceph_mds_request *req, *nreq;
+	struct ceph_mds_request *req;
+	LIST_HEAD(tmp_list);
 
-	list_for_each_entry_safe(req, nreq, head, r_wait) {
+	list_splice_init(head, &tmp_list);
+
+	while (!list_empty(&tmp_list)) {
+		req = list_entry(tmp_list.next,
+				 struct ceph_mds_request, r_wait);
 		list_del_init(&req->r_wait);
 		__do_request(mdsc, req);
 	}

From 0685235ffd9dbdb9ccbda587f8a3c83ad1d5a921 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Mon, 19 Nov 2012 10:49:07 +0800
Subject: [PATCH 52/68] ceph: Don't add dirty inode to dirty list if caps is in
 migration

Add dirty inode to cap_dirty_migrating list instead, this can avoid
ceph_flush_dirty_caps() entering infinite loop.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/caps.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 8072aefc427..5efa3f5e2f7 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1351,11 +1351,15 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 		if (!ci->i_head_snapc)
 			ci->i_head_snapc = ceph_get_snap_context(
 				ci->i_snap_realm->cached_context);
-		dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode,
-			ci->i_head_snapc);
+		dout(" inode %p now dirty snapc %p auth cap %p\n",
+		     &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
 		BUG_ON(!list_empty(&ci->i_dirty_item));
 		spin_lock(&mdsc->cap_dirty_lock);
-		list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
+		if (ci->i_auth_cap)
+			list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
+		else
+			list_add(&ci->i_dirty_item,
+				 &mdsc->cap_dirty_migrating);
 		spin_unlock(&mdsc->cap_dirty_lock);
 		if (ci->i_flushing_caps == 0) {
 			ihold(inode);

From a85f50b6ef93fbbb2ae932ce9b2376509d172796 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Mon, 19 Nov 2012 10:49:08 +0800
Subject: [PATCH 53/68] ceph: Fix __ceph_do_pending_vmtruncate

we should set i_truncate_pending to 0 after page cache is truncated
to i_truncate_size

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/inode.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 4b5762ef7c2..81613bced19 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1466,7 +1466,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	u64 to;
-	int wrbuffer_refs, wake = 0;
+	int wrbuffer_refs, finish = 0;
 
 retry:
 	spin_lock(&ci->i_ceph_lock);
@@ -1498,15 +1498,18 @@ retry:
 	truncate_inode_pages(inode->i_mapping, to);
 
 	spin_lock(&ci->i_ceph_lock);
-	ci->i_truncate_pending--;
-	if (ci->i_truncate_pending == 0)
-		wake = 1;
+	if (to == ci->i_truncate_size) {
+		ci->i_truncate_pending = 0;
+		finish = 1;
+	}
 	spin_unlock(&ci->i_ceph_lock);
+	if (!finish)
+		goto retry;
 
 	if (wrbuffer_refs == 0)
 		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
-	if (wake)
-		wake_up_all(&ci->i_cap_wq);
+
+	wake_up_all(&ci->i_cap_wq);
 }
 
 

From 0e5e1774a92e6fe9c511585de8f078b4c4c68dbb Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Mon, 19 Nov 2012 10:49:09 +0800
Subject: [PATCH 54/68] ceph: call handle_cap_grant() for cap import message

If client sends cap message that requests new max size during
exporting caps, the exporting MDS will drop the message quietly.
So the client may wait for the reply that updates the max size
forever. call handle_cap_grant() for cap import message can
avoid this issue.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/caps.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 5efa3f5e2f7..a1d9bb30c1b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2751,6 +2751,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
 
 	/* make sure we re-request max_size, if necessary */
 	spin_lock(&ci->i_ceph_lock);
+	ci->i_wanted_max_size = 0;  /* reset */
 	ci->i_requested_max_size = 0;
 	spin_unlock(&ci->i_ceph_lock);
 }
@@ -2846,8 +2847,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	case CEPH_CAP_OP_IMPORT:
 		handle_cap_import(mdsc, inode, h, session,
 				  snaptrace, snaptrace_len);
-		ceph_check_caps(ceph_inode(inode), 0, session);
-		goto done_unlocked;
 	}
 
 	/* the rest require a cap */
@@ -2864,6 +2863,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	switch (op) {
 	case CEPH_CAP_OP_REVOKE:
 	case CEPH_CAP_OP_GRANT:
+	case CEPH_CAP_OP_IMPORT:
 		handle_cap_grant(inode, h, session, cap, msg->middle);
 		goto done_unlocked;
 

From 8884d53dd63b1d9315b343564fcbe1ede004a99e Mon Sep 17 00:00:00 2001
From: David Zafman <david.zafman@inktank.com>
Date: Mon, 3 Dec 2012 19:14:05 -0800
Subject: [PATCH 55/68] libceph: Unlock unprocessed pages in start_read() error
 path

Function start_read() can get an error before processing all pages.
It must not only release the remaining pages, but unlock them too.

This fixes http://tracker.newdream.net/issues/3370

Signed-off-by: David Zafman <david.zafman@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 fs/ceph/addr.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 21a07187df0..8e8a818cba0 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -267,6 +267,14 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
 	kfree(req->r_pages);
 }
 
+static void ceph_unlock_page_vector(struct page **pages, int num_pages)
+{
+	int i;
+
+	for (i = 0; i < num_pages; i++)
+		unlock_page(pages[i]);
+}
+
 /*
  * start an async read(ahead) operation.  return nr_pages we submitted
  * a read for on success, or negative error code.
@@ -347,6 +355,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 	return nr_pages;
 
 out_pages:
+	ceph_unlock_page_vector(pages, nr_pages);
 	ceph_release_page_vector(pages, nr_pages);
 out:
 	ceph_osdc_put_request(req);

From 42382b709bd1d143b9f0fa93e0a3a1f2f4210707 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 16 Nov 2012 09:29:16 -0600
Subject: [PATCH 56/68] rbd: do not allow remove of mounted-on image

There is no check in rbd_remove() to see if anybody holds open the
image being removed.  That's not cool.

Add a simple open count that goes up and down with opens and closes
(releases) of the device, and don't allow an rbd image to be removed
if the count is non-zero.

Protect the updates of the open count value with ctl_mutex to ensure
the underlying rbd device doesn't get removed while concurrently
being opened.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 drivers/block/rbd.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 842caf4aab4..c7bf9613ad4 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -235,6 +235,7 @@ struct rbd_device {
 
 	/* sysfs related */
 	struct device		dev;
+	unsigned long		open_count;
 };
 
 static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */
@@ -309,8 +310,11 @@ static int rbd_open(struct block_device *bdev, fmode_t mode)
 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
 		return -EROFS;
 
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 	rbd_get_dev(rbd_dev);
 	set_device_ro(bdev, rbd_dev->mapping.read_only);
+	rbd_dev->open_count++;
+	mutex_unlock(&ctl_mutex);
 
 	return 0;
 }
@@ -319,7 +323,11 @@ static int rbd_release(struct gendisk *disk, fmode_t mode)
 {
 	struct rbd_device *rbd_dev = disk->private_data;
 
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+	rbd_assert(rbd_dev->open_count > 0);
+	rbd_dev->open_count--;
 	rbd_put_dev(rbd_dev);
+	mutex_unlock(&ctl_mutex);
 
 	return 0;
 }
@@ -3745,6 +3753,11 @@ static ssize_t rbd_remove(struct bus_type *bus,
 		goto done;
 	}
 
+	if (rbd_dev->open_count) {
+		ret = -EBUSY;
+		goto done;
+	}
+
 	rbd_remove_all_snaps(rbd_dev);
 	rbd_bus_del_dev(rbd_dev);
 

From 7d5f24812bd182a2471cb69c1c2baf0648332e1f Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 29 Nov 2012 08:37:03 -0600
Subject: [PATCH 57/68] ceph: don't reference req after put

In __unregister_request(), there is a call to list_del_init()
referencing a request that was the subject of a call to
ceph_osdc_put_request() on the previous line.  This is not
safe, because the request structure could have been freed
by the time we reach the list_del_init().

Fix this by reversing the order of these lines.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/osd_client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 7ebfe13267e..ac7be7202fa 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -871,9 +871,9 @@ static void __unregister_request(struct ceph_osd_client *osdc,
 			req->r_osd = NULL;
 	}
 
+	list_del_init(&req->r_req_lru_item);
 	ceph_osdc_put_request(req);
 
-	list_del_init(&req->r_req_lru_item);
 	if (osdc->num_requests == 0) {
 		dout(" no requests, canceling timeout\n");
 		__cancel_osd_timeout(osdc);

From 685a7555ca69030739ddb57a47f0ea8ea80196a4 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 7 Dec 2012 09:57:58 -0600
Subject: [PATCH 58/68] libceph: avoid using freed osd in __kick_osd_requests()

If an osd has no requests and no linger requests, __reset_osd()
will just remove it with a call to __remove_osd().  That drops
a reference to the osd, and therefore the osd may have been free
by the time __reset_osd() returns.  That function offers no
indication this may have occurred, and as a result the osd will
continue to be used even when it's no longer valid.

Change__reset_osd() so it returns an error (ENODEV) when it
deletes the osd being reset.  And change __kick_osd_requests() so it
returns immediately (before referencing osd again) if __reset_osd()
returns *any* error.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/osd_client.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index ac7be7202fa..60c74c1f1ea 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -581,7 +581,7 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc,
 
 	dout("__kick_osd_requests osd%d\n", osd->o_osd);
 	err = __reset_osd(osdc, osd);
-	if (err == -EAGAIN)
+	if (err)
 		return;
 
 	list_for_each_entry(req, &osd->o_requests, r_osd_item) {
@@ -745,6 +745,7 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 	if (list_empty(&osd->o_requests) &&
 	    list_empty(&osd->o_linger_requests)) {
 		__remove_osd(osdc, osd);
+		ret = -ENODEV;
 	} else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
 			  &osd->o_con.peer_addr,
 			  sizeof(osd->o_con.peer_addr)) == 0 &&

From 2fd82b9e92c2a718ae81fc987b4468ceeee6979b Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 9 Nov 2012 15:05:54 -0600
Subject: [PATCH 59/68] rbd: get rid of RBD_MAX_SEG_NAME_LEN

RBD_MAX_SEG_NAME_LEN represents the maximum length of an rbd object
name (i.e., one of the objects providing storage backing an rbd
image).

Another symbol, MAX_OBJ_NAME_SIZE, is used in the osd client code to
define the maximum length of any object name in an osd request.

Right now they disagree, with RBD_MAX_SEG_NAME_LEN being too big.

There's no real benefit at this point to defining the rbd object
name length limit separate from any other object name, so just
get rid of RBD_MAX_SEG_NAME_LEN and use MAX_OBJ_NAME_SIZE in its
place.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 drivers/block/rbd.c       | 6 +++---
 drivers/block/rbd_types.h | 2 --
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index c7bf9613ad4..ce26b749ede 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -740,13 +740,13 @@ static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
 	u64 segment;
 	int ret;
 
-	name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
+	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
 	if (!name)
 		return NULL;
 	segment = offset >> rbd_dev->header.obj_order;
-	ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
+	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
 			rbd_dev->header.object_prefix, segment);
-	if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
+	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
 		pr_err("error formatting segment name for #%llu (%d)\n",
 			segment, ret);
 		kfree(name);
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
index cbe77fa105b..49d77cbcf8b 100644
--- a/drivers/block/rbd_types.h
+++ b/drivers/block/rbd_types.h
@@ -46,8 +46,6 @@
 #define RBD_MIN_OBJ_ORDER       16
 #define RBD_MAX_OBJ_ORDER       30
 
-#define RBD_MAX_SEG_NAME_LEN	128
-
 #define RBD_COMP_NONE		0
 #define RBD_CRYPT_NONE		0
 

From 61c74035626beb25a39b0273ccf7d75510bc36a1 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 6 Dec 2012 09:37:23 -0600
Subject: [PATCH 60/68] rbd: remove linger unconditionally

In __unregister_linger_request(), the request is being removed
from the osd client's req_linger list only when the request
has a non-null osd pointer.  It should be done whether or not
the request currently has an osd.

This is most likely a non-issue because I believe the request
will always have an osd when this function is called.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/osd_client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 60c74c1f1ea..32bd696b39a 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -906,8 +906,8 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc,
 					struct ceph_osd_request *req)
 {
 	dout("__unregister_linger_request %p\n", req);
+	list_del_init(&req->r_linger_item);
 	if (req->r_osd) {
-		list_del_init(&req->r_linger_item);
 		list_del_init(&req->r_linger_osd);
 
 		if (list_empty(&req->r_osd->o_requests) &&

From b8f5c6edca34ff441e1ccdec68828e933a1b905b Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 1 Nov 2012 08:39:26 -0500
Subject: [PATCH 61/68] rbd: don't use ENOTSUPP

ENOTSUPP is not a standard errno (it shows up as "Unknown error 524"
in an error message).  This is what was getting produced when the
the local rbd code does not implement features required by a
discovered rbd image.

Change the error code returned in this case to ENXIO.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 drivers/block/rbd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index ce26b749ede..4daa400c13a 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2456,7 +2456,7 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 
 	incompat = le64_to_cpu(features_buf.incompat);
 	if (incompat & ~RBD_FEATURES_ALL)
-		return -ENOTSUPP;
+		return -ENXIO;
 
 	*snap_features = le64_to_cpu(features_buf.features);
 

From 7bb21d68c535ad8be38e14a715632ae398b37ac1 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 7 Dec 2012 19:50:07 -0600
Subject: [PATCH 62/68] libceph: socket can close in any connection state

A connection's socket can close for any reason, independent of the
state of the connection (and without irrespective of the connection
mutex).  As a result, the connectino can be in pretty much any state
at the time its socket is closed.

Handle those other cases at the top of con_work().  Pull this whole
block of code into a separate function to reduce the clutter.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 47 ++++++++++++++++++++++++++++----------------
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 1041114453d..4b04ccc60db 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2273,6 +2273,35 @@ static void queue_con(struct ceph_connection *con)
 	(void) queue_con_delay(con, 0);
 }
 
+static bool con_sock_closed(struct ceph_connection *con)
+{
+	if (!test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags))
+		return false;
+
+#define CASE(x)								\
+	case CON_STATE_ ## x:						\
+		con->error_msg = "socket closed (con state " #x ")";	\
+		break;
+
+	switch (con->state) {
+	CASE(CLOSED);
+	CASE(PREOPEN);
+	CASE(CONNECTING);
+	CASE(NEGOTIATING);
+	CASE(OPEN);
+	CASE(STANDBY);
+	default:
+		pr_warning("%s con %p unrecognized state %lu\n",
+			__func__, con, con->state);
+		con->error_msg = "unrecognized con state";
+		BUG();
+		break;
+	}
+#undef CASE
+
+	return true;
+}
+
 /*
  * Do some work on a connection.  Drop a connection ref when we're done.
  */
@@ -2284,24 +2313,8 @@ static void con_work(struct work_struct *work)
 
 	mutex_lock(&con->mutex);
 restart:
-	if (test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags)) {
-		switch (con->state) {
-		case CON_STATE_CONNECTING:
-			con->error_msg = "connection failed";
-			break;
-		case CON_STATE_NEGOTIATING:
-			con->error_msg = "negotiation failed";
-			break;
-		case CON_STATE_OPEN:
-			con->error_msg = "socket closed";
-			break;
-		default:
-			dout("unrecognized con state %d\n", (int)con->state);
-			con->error_msg = "unrecognized con state";
-			BUG();
-		}
+	if (con_sock_closed(con))
 		goto fault;
-	}
 
 	if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) {
 		dout("con_work %p backing off\n", con);

From 28362986f8743124b3a0fda20a8ed3e80309cce1 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 14 Dec 2012 16:47:41 -0600
Subject: [PATCH 63/68] libceph: report connection fault with warning

When a connection's socket disconnects, or if there's a protocol
error of some kind on the connection, a fault is signaled and
the connection is reset (closed and reopened, basically).  We
currently get an error message on the log whenever this occurs.

A ceph connection will attempt to reestablish a socket connection
repeatedly if a fault occurs.  This means that these error messages
will get repeatedly added to the log, which is undesirable.

Change the error message to be a warning, so they don't get
logged by default.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 4b04ccc60db..4d111fd2b49 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2377,7 +2377,7 @@ fault:
 static void ceph_fault(struct ceph_connection *con)
 	__releases(con->mutex)
 {
-	pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
+	pr_warning("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
 	       ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
 	dout("fault %p state %lu to peer %s\n",
 	     con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));

From f407731d12214e7686819018f3a1e9d7b6f83a02 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 6 Dec 2012 07:22:04 -0600
Subject: [PATCH 64/68] libceph: init osd->o_node in create_osd()

The red-black node node in the ceph osd structure is not initialized
in create_osd().  Because this node can be the subject of a
RB_EMPTY_NODE() call later on, we should ensure the node is
initialized properly for that.  Add a call to RB_CLEAR_NODE()
initialize it.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/osd_client.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 32bd696b39a..a6dc6acd656 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -642,6 +642,7 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
 	atomic_set(&osd->o_ref, 1);
 	osd->o_osdc = osdc;
 	osd->o_osd = onum;
+	RB_CLEAR_NODE(&osd->o_node);
 	INIT_LIST_HEAD(&osd->o_requests);
 	INIT_LIST_HEAD(&osd->o_linger_requests);
 	INIT_LIST_HEAD(&osd->o_osd_lru);

From 3ee5234df68d253c415ba4f2db72ad250d9c21a9 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 17 Dec 2012 12:23:48 -0600
Subject: [PATCH 65/68] libceph: init event->node in ceph_osdc_create_event()

The red-black node node in the ceph osd event structure is not
initialized in create_osdc_create_event().  Because this node can
be the subject of a RB_EMPTY_NODE() call later on, we should ensure
the node is initialized properly for that.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/osd_client.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index a6dc6acd656..2bce3d4be1c 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1563,6 +1563,7 @@ int ceph_osdc_create_event(struct ceph_osd_client *osdc,
 	event->data = data;
 	event->osdc = osdc;
 	INIT_LIST_HEAD(&event->osd_node);
+	RB_CLEAR_NODE(&event->node);
 	kref_init(&event->kref);   /* one ref for us */
 	kref_get(&event->kref);    /* one ref for the caller */
 	init_completion(&event->completion);

From a978fa20fb657548561dddbfb605fe43654f0825 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 17 Dec 2012 12:23:48 -0600
Subject: [PATCH 66/68] libceph: don't use rb_init_node() in
 ceph_osdc_alloc_request()

The red-black node in the ceph osd request structure is initialized
in ceph_osdc_alloc_request() using rbd_init_node().  We do need to
initialize this, because in __unregister_request() we call
RB_EMPTY_NODE(), which expects the node it's checking to have
been initialized.  But rb_init_node() is apparently overkill, and
may in fact be on its way out.  So use RB_CLEAR_NODE() instead.

For a little more background, see this commit:
    4c199a93 rbtree: empty nodes have no color"

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/osd_client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 2bce3d4be1c..7cd0a7f3bf4 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -221,7 +221,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	kref_init(&req->r_kref);
 	init_completion(&req->r_completion);
 	init_completion(&req->r_safe_completion);
-	rb_init_node(&req->r_node);
+	RB_CLEAR_NODE(&req->r_node);
 	INIT_LIST_HEAD(&req->r_unsafe_item);
 	INIT_LIST_HEAD(&req->r_linger_item);
 	INIT_LIST_HEAD(&req->r_linger_osd);

From c89ce05e0c5a01a256100ac6a6019f276bdd1ca6 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 6 Dec 2012 07:22:04 -0600
Subject: [PATCH 67/68] libceph: register request before unregister linger

In kick_requests(), we need to register the request before we
unregister the linger request.  Otherwise the unregister will
reset the request's osd pointer to NULL.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/osd_client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 7cd0a7f3bf4..780caf6b049 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1328,8 +1328,8 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
 
 		dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid,
 		     req->r_osd ? req->r_osd->o_osd : -1);
-		__unregister_linger_request(osdc, req);
 		__register_request(osdc, req);
+		__unregister_linger_request(osdc, req);
 	}
 	mutex_unlock(&osdc->request_mutex);
 

From c3e946ce7276faf0b302acd25c7b874edbeba661 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 16 Nov 2012 09:29:16 -0600
Subject: [PATCH 68/68] rbd: get rid of rbd_{get,put}_dev()

The functions rbd_get_dev() and rbd_put_dev() are trivial wrappers
that add no value, and their existence suggests they may do more
than what they do.

Get rid of them.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>
---
 drivers/block/rbd.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 4daa400c13a..89576a0b3f2 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -290,16 +290,6 @@ static struct device rbd_root_dev = {
 #  define rbd_assert(expr)	((void) 0)
 #endif /* !RBD_DEBUG */
 
-static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
-{
-	return get_device(&rbd_dev->dev);
-}
-
-static void rbd_put_dev(struct rbd_device *rbd_dev)
-{
-	put_device(&rbd_dev->dev);
-}
-
 static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
 static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
 
@@ -311,7 +301,7 @@ static int rbd_open(struct block_device *bdev, fmode_t mode)
 		return -EROFS;
 
 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-	rbd_get_dev(rbd_dev);
+	(void) get_device(&rbd_dev->dev);
 	set_device_ro(bdev, rbd_dev->mapping.read_only);
 	rbd_dev->open_count++;
 	mutex_unlock(&ctl_mutex);
@@ -326,7 +316,7 @@ static int rbd_release(struct gendisk *disk, fmode_t mode)
 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 	rbd_assert(rbd_dev->open_count > 0);
 	rbd_dev->open_count--;
-	rbd_put_dev(rbd_dev);
+	put_device(&rbd_dev->dev);
 	mutex_unlock(&ctl_mutex);
 
 	return 0;