aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
authorPatrick McHardy <kaber@trash.net>2012-08-08 21:03:47 +0200
committerPatrick McHardy <kaber@trash.net>2012-08-08 21:03:47 +0200
commitd53b4ed072d9779cdf53582c46436dec06d0961f (patch)
treeac95ecab33e31cd79aae69c475e8348adac51230 /fs/btrfs
parent5d4dff7f1011a81a693a9c7b1f6a0b9c842eb60c (diff)
parent28a33cbc24e4256c143dce96c7d93bf423229f92 (diff)
Merge tag 'v3.5' of 192.168.0.154:/repos/git/linux-2.6
Conflicts: drivers/Kconfig Signed-off-by: Patrick McHardy <kaber@trash.net>
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Kconfig19
-rw-r--r--fs/btrfs/Makefile3
-rw-r--r--fs/btrfs/acl.c4
-rw-r--r--fs/btrfs/async-thread.c17
-rw-r--r--fs/btrfs/async-thread.h4
-rw-r--r--fs/btrfs/backref.c1447
-rw-r--r--fs/btrfs/backref.h11
-rw-r--r--fs/btrfs/btrfs_inode.h54
-rw-r--r--fs/btrfs/check-integrity.c3358
-rw-r--r--fs/btrfs/check-integrity.h36
-rw-r--r--fs/btrfs/compression.c54
-rw-r--r--fs/btrfs/compression.h2
-rw-r--r--fs/btrfs/ctree.c1311
-rw-r--r--fs/btrfs/ctree.h449
-rw-r--r--fs/btrfs/delayed-inode.c104
-rw-r--r--fs/btrfs/delayed-inode.h3
-rw-r--r--fs/btrfs/delayed-ref.c176
-rw-r--r--fs/btrfs/delayed-ref.h80
-rw-r--r--fs/btrfs/dir-item.c10
-rw-r--r--fs/btrfs/disk-io.c902
-rw-r--r--fs/btrfs/disk-io.h18
-rw-r--r--fs/btrfs/export.c19
-rw-r--r--fs/btrfs/extent-tree.c1124
-rw-r--r--fs/btrfs/extent_io.c1353
-rw-r--r--fs/btrfs/extent_io.h75
-rw-r--r--fs/btrfs/extent_map.h4
-rw-r--r--fs/btrfs/file-item.c61
-rw-r--r--fs/btrfs/file.c197
-rw-r--r--fs/btrfs/free-space-cache.c646
-rw-r--r--fs/btrfs/inode-item.c6
-rw-r--r--fs/btrfs/inode-map.c23
-rw-r--r--fs/btrfs/inode.c1075
-rw-r--r--fs/btrfs/ioctl.c628
-rw-r--r--fs/btrfs/ioctl.h93
-rw-r--r--fs/btrfs/locking.c59
-rw-r--r--fs/btrfs/locking.h4
-rw-r--r--fs/btrfs/lzo.c4
-rw-r--r--fs/btrfs/ordered-data.c225
-rw-r--r--fs/btrfs/ordered-data.h37
-rw-r--r--fs/btrfs/orphan.c2
-rw-r--r--fs/btrfs/print-tree.c3
-rw-r--r--fs/btrfs/rcu-string.h56
-rw-r--r--fs/btrfs/reada.c65
-rw-r--r--fs/btrfs/relocation.c152
-rw-r--r--fs/btrfs/root-tree.c25
-rw-r--r--fs/btrfs/scrub.c1486
-rw-r--r--fs/btrfs/struct-funcs.c53
-rw-r--r--fs/btrfs/super.c544
-rw-r--r--fs/btrfs/transaction.c309
-rw-r--r--fs/btrfs/transaction.h3
-rw-r--r--fs/btrfs/tree-log.c147
-rw-r--r--fs/btrfs/tree-log.h2
-rw-r--r--fs/btrfs/ulist.c222
-rw-r--r--fs/btrfs/ulist.h77
-rw-r--r--fs/btrfs/volumes.c1614
-rw-r--r--fs/btrfs/volumes.h113
-rw-r--r--fs/btrfs/xattr.c3
-rw-r--r--fs/btrfs/zlib.c4
58 files changed, 14513 insertions, 4062 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index ecb9fd3be14..d33f01c08b6 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -31,3 +31,22 @@ config BTRFS_FS_POSIX_ACL
Linux website <http://acl.bestbits.at/>.
If you don't know what Access Control Lists are, say N
+
+config BTRFS_FS_CHECK_INTEGRITY
+ bool "Btrfs with integrity check tool compiled in (DANGEROUS)"
+ depends on BTRFS_FS
+ help
+ Adds code that examines all block write requests (including
+ writes of the super block). The goal is to verify that the
+ state of the filesystem on disk is always consistent, i.e.,
+ after a power-loss or kernel panic event the filesystem is
+ in a consistent state.
+
+ If the integrity check tool is included and activated in
+ the mount options, plenty of kernel memory is used, and
+ plenty of additional CPU cycles are spent. Enabling this
+ functionality is not intended for normal use.
+
+ In most cases, unless you are a btrfs developer who needs
+ to verify the integrity of (super)-block write requests
+ during the run of a regression test, say N
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index c0ddfd29c5e..0c4fa2befae 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,6 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
- reada.o backref.o
+ reada.o backref.o ulist.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
+btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 89b156d85d6..761e2cd8fed 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -227,7 +227,11 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
if (ret > 0) {
/* we need an acl */
ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
+ } else {
+ cache_no_acl(inode);
}
+ } else {
+ cache_no_acl(inode);
}
failed:
posix_acl_release(acl);
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 0b394580d86..42704149b72 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -171,11 +171,11 @@ out:
spin_unlock_irqrestore(&workers->lock, flags);
}
-static noinline int run_ordered_completions(struct btrfs_workers *workers,
+static noinline void run_ordered_completions(struct btrfs_workers *workers,
struct btrfs_work *work)
{
if (!workers->ordered)
- return 0;
+ return;
set_bit(WORK_DONE_BIT, &work->flags);
@@ -213,7 +213,6 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
}
spin_unlock(&workers->order_lock);
- return 0;
}
static void put_worker(struct btrfs_worker_thread *worker)
@@ -334,7 +333,7 @@ again:
if (freezing(current)) {
worker->working = 0;
spin_unlock_irq(&worker->lock);
- refrigerator();
+ try_to_freeze();
} else {
spin_unlock_irq(&worker->lock);
if (!kthread_should_stop()) {
@@ -399,7 +398,7 @@ again:
/*
* this will wait for all the worker threads to shutdown
*/
-int btrfs_stop_workers(struct btrfs_workers *workers)
+void btrfs_stop_workers(struct btrfs_workers *workers)
{
struct list_head *cur;
struct btrfs_worker_thread *worker;
@@ -427,7 +426,6 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
put_worker(worker);
}
spin_unlock_irq(&workers->lock);
- return 0;
}
/*
@@ -615,14 +613,14 @@ found:
* it was taken from. It is intended for use with long running work functions
* that make some progress and want to give the cpu up for others.
*/
-int btrfs_requeue_work(struct btrfs_work *work)
+void btrfs_requeue_work(struct btrfs_work *work)
{
struct btrfs_worker_thread *worker = work->worker;
unsigned long flags;
int wake = 0;
if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
- goto out;
+ return;
spin_lock_irqsave(&worker->lock, flags);
if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
@@ -649,9 +647,6 @@ int btrfs_requeue_work(struct btrfs_work *work)
if (wake)
wake_up_process(worker->task);
spin_unlock_irqrestore(&worker->lock, flags);
-out:
-
- return 0;
}
void btrfs_set_work_high_prio(struct btrfs_work *work)
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index f34cc31fa3c..063698b90ce 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -111,9 +111,9 @@ struct btrfs_workers {
void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
int btrfs_start_workers(struct btrfs_workers *workers);
-int btrfs_stop_workers(struct btrfs_workers *workers);
+void btrfs_stop_workers(struct btrfs_workers *workers);
void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
struct btrfs_workers *async_starter);
-int btrfs_requeue_work(struct btrfs_work *work);
+void btrfs_requeue_work(struct btrfs_work *work);
void btrfs_set_work_high_prio(struct btrfs_work *work);
#endif
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 22c64fff1bd..a383c18e74e 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -19,19 +19,1048 @@
#include "ctree.h"
#include "disk-io.h"
#include "backref.h"
+#include "ulist.h"
+#include "transaction.h"
+#include "delayed-ref.h"
+#include "locking.h"
-struct __data_ref {
- struct list_head list;
+struct extent_inode_elem {
u64 inum;
- u64 root;
- u64 extent_data_item_offset;
+ u64 offset;
+ struct extent_inode_elem *next;
};
-struct __shared_ref {
- struct list_head list;
+static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
+ struct btrfs_file_extent_item *fi,
+ u64 extent_item_pos,
+ struct extent_inode_elem **eie)
+{
+ u64 data_offset;
+ u64 data_len;
+ struct extent_inode_elem *e;
+
+ data_offset = btrfs_file_extent_offset(eb, fi);
+ data_len = btrfs_file_extent_num_bytes(eb, fi);
+
+ if (extent_item_pos < data_offset ||
+ extent_item_pos >= data_offset + data_len)
+ return 1;
+
+ e = kmalloc(sizeof(*e), GFP_NOFS);
+ if (!e)
+ return -ENOMEM;
+
+ e->next = *eie;
+ e->inum = key->objectid;
+ e->offset = key->offset + (extent_item_pos - data_offset);
+ *eie = e;
+
+ return 0;
+}
+
+static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte,
+ u64 extent_item_pos,
+ struct extent_inode_elem **eie)
+{
u64 disk_byte;
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ int slot;
+ int nritems;
+ int extent_type;
+ int ret;
+
+ /*
+ * from the shared data ref, we only have the leaf but we need
+ * the key. thus, we must look into all items and see that we
+ * find one (some) with a reference to our extent item.
+ */
+ nritems = btrfs_header_nritems(eb);
+ for (slot = 0; slot < nritems; ++slot) {
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(eb, fi);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
+ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+ if (disk_byte != wanted_disk_byte)
+ continue;
+
+ ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * this structure records all encountered refs on the way up to the root
+ */
+struct __prelim_ref {
+ struct list_head list;
+ u64 root_id;
+ struct btrfs_key key_for_search;
+ int level;
+ int count;
+ struct extent_inode_elem *inode_list;
+ u64 parent;
+ u64 wanted_disk_byte;
};
+/*
+ * the rules for all callers of this function are:
+ * - obtaining the parent is the goal
+ * - if you add a key, you must know that it is a correct key
+ * - if you cannot add the parent or a correct key, then we will look into the
+ * block later to set a correct key
+ *
+ * delayed refs
+ * ============
+ * backref type | shared | indirect | shared | indirect
+ * information | tree | tree | data | data
+ * --------------------+--------+----------+--------+----------
+ * parent logical | y | - | - | -
+ * key to resolve | - | y | y | y
+ * tree block logical | - | - | - | -
+ * root for resolving | y | y | y | y
+ *
+ * - column 1: we've the parent -> done
+ * - column 2, 3, 4: we use the key to find the parent
+ *
+ * on disk refs (inline or keyed)
+ * ==============================
+ * backref type | shared | indirect | shared | indirect
+ * information | tree | tree | data | data
+ * --------------------+--------+----------+--------+----------
+ * parent logical | y | - | y | -
+ * key to resolve | - | - | - | y
+ * tree block logical | y | y | y | y
+ * root for resolving | - | y | y | y
+ *
+ * - column 1, 3: we've the parent -> done
+ * - column 2: we take the first key from the block to find the parent
+ * (see __add_missing_keys)
+ * - column 4: we use the key to find the parent
+ *
+ * additional information that's available but not required to find the parent
+ * block might help in merging entries to gain some speed.
+ */
+
+static int __add_prelim_ref(struct list_head *head, u64 root_id,
+ struct btrfs_key *key, int level,
+ u64 parent, u64 wanted_disk_byte, int count)
+{
+ struct __prelim_ref *ref;
+
+ /* in case we're adding delayed refs, we're holding the refs spinlock */
+ ref = kmalloc(sizeof(*ref), GFP_ATOMIC);
+ if (!ref)
+ return -ENOMEM;
+
+ ref->root_id = root_id;
+ if (key)
+ ref->key_for_search = *key;
+ else
+ memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
+
+ ref->inode_list = NULL;
+ ref->level = level;
+ ref->count = count;
+ ref->parent = parent;
+ ref->wanted_disk_byte = wanted_disk_byte;
+ list_add_tail(&ref->list, head);
+
+ return 0;
+}
+
+static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
+ struct ulist *parents, int level,
+ struct btrfs_key *key_for_search, u64 time_seq,
+ u64 wanted_disk_byte,
+ const u64 *extent_item_pos)
+{
+ int ret = 0;
+ int slot;
+ struct extent_buffer *eb;
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ struct extent_inode_elem *eie = NULL;
+ u64 disk_byte;
+
+ if (level != 0) {
+ eb = path->nodes[level];
+ ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
+ if (ret < 0)
+ return ret;
+ return 0;
+ }
+
+ /*
+ * We normally enter this function with the path already pointing to
+ * the first item to check. But sometimes, we may enter it with
+ * slot==nritems. In that case, go to the next leaf before we continue.
+ */
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
+ ret = btrfs_next_old_leaf(root, path, time_seq);
+
+ while (!ret) {
+ eb = path->nodes[0];
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+
+ if (key.objectid != key_for_search->objectid ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+
+ if (disk_byte == wanted_disk_byte) {
+ eie = NULL;
+ if (extent_item_pos) {
+ ret = check_extent_in_eb(&key, eb, fi,
+ *extent_item_pos,
+ &eie);
+ if (ret < 0)
+ break;
+ }
+ if (!ret) {
+ ret = ulist_add(parents, eb->start,
+ (unsigned long)eie, GFP_NOFS);
+ if (ret < 0)
+ break;
+ if (!extent_item_pos) {
+ ret = btrfs_next_old_leaf(root, path,
+ time_seq);
+ continue;
+ }
+ }
+ }
+ ret = btrfs_next_old_item(root, path, time_seq);
+ }
+
+ if (ret > 0)
+ ret = 0;
+ return ret;
+}
+
+/*
+ * resolve an indirect backref in the form (root_id, key, level)
+ * to a logical address
+ */
+static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
+ int search_commit_root,
+ u64 time_seq,
+ struct __prelim_ref *ref,
+ struct ulist *parents,
+ const u64 *extent_item_pos)
+{
+ struct btrfs_path *path;
+ struct btrfs_root *root;
+ struct btrfs_key root_key;
+ struct extent_buffer *eb;
+ int ret = 0;
+ int root_level;
+ int level = ref->level;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->search_commit_root = !!search_commit_root;
+
+ root_key.objectid = ref->root_id;
+ root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root_key.offset = (u64)-1;
+ root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+
+ rcu_read_lock();
+ root_level = btrfs_header_level(root->node);
+ rcu_read_unlock();
+
+ if (root_level + 1 == level)
+ goto out;
+
+ path->lowest_level = level;
+ ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
+ pr_debug("search slot in root %llu (level %d, ref count %d) returned "
+ "%d for key (%llu %u %llu)\n",
+ (unsigned long long)ref->root_id, level, ref->count, ret,
+ (unsigned long long)ref->key_for_search.objectid,
+ ref->key_for_search.type,
+ (unsigned long long)ref->key_for_search.offset);
+ if (ret < 0)
+ goto out;
+
+ eb = path->nodes[level];
+ while (!eb) {
+ if (!level) {
+ WARN_ON(1);
+ ret = 1;
+ goto out;
+ }
+ level--;
+ eb = path->nodes[level];
+ }
+
+ ret = add_all_parents(root, path, parents, level, &ref->key_for_search,
+ time_seq, ref->wanted_disk_byte,
+ extent_item_pos);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * resolve all indirect backrefs from the list
+ */
+static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
+ int search_commit_root, u64 time_seq,
+ struct list_head *head,
+ const u64 *extent_item_pos)
+{
+ int err;
+ int ret = 0;
+ struct __prelim_ref *ref;
+ struct __prelim_ref *ref_safe;
+ struct __prelim_ref *new_ref;
+ struct ulist *parents;
+ struct ulist_node *node;
+ struct ulist_iterator uiter;
+
+ parents = ulist_alloc(GFP_NOFS);
+ if (!parents)
+ return -ENOMEM;
+
+ /*
+ * _safe allows us to insert directly after the current item without
+ * iterating over the newly inserted items.
+ * we're also allowed to re-assign ref during iteration.
+ */
+ list_for_each_entry_safe(ref, ref_safe, head, list) {
+ if (ref->parent) /* already direct */
+ continue;
+ if (ref->count == 0)
+ continue;
+ err = __resolve_indirect_ref(fs_info, search_commit_root,
+ time_seq, ref, parents,
+ extent_item_pos);
+ if (err) {
+ if (ret == 0)
+ ret = err;
+ continue;
+ }
+
+ /* we put the first parent into the ref at hand */
+ ULIST_ITER_INIT(&uiter);
+ node = ulist_next(parents, &uiter);
+ ref->parent = node ? node->val : 0;
+ ref->inode_list =
+ node ? (struct extent_inode_elem *)node->aux : 0;
+
+ /* additional parents require new refs being added here */
+ while ((node = ulist_next(parents, &uiter))) {
+ new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
+ if (!new_ref) {
+ ret = -ENOMEM;
+ break;
+ }
+ memcpy(new_ref, ref, sizeof(*ref));
+ new_ref->parent = node->val;
+ new_ref->inode_list =
+ (struct extent_inode_elem *)node->aux;
+ list_add(&new_ref->list, &ref->list);
+ }
+ ulist_reinit(parents);
+ }
+
+ ulist_free(parents);
+ return ret;
+}
+
+static inline int ref_for_same_block(struct __prelim_ref *ref1,
+ struct __prelim_ref *ref2)
+{
+ if (ref1->level != ref2->level)
+ return 0;
+ if (ref1->root_id != ref2->root_id)
+ return 0;
+ if (ref1->key_for_search.type != ref2->key_for_search.type)
+ return 0;
+ if (ref1->key_for_search.objectid != ref2->key_for_search.objectid)
+ return 0;
+ if (ref1->key_for_search.offset != ref2->key_for_search.offset)
+ return 0;
+ if (ref1->parent != ref2->parent)
+ return 0;
+
+ return 1;
+}
+
+/*
+ * read tree blocks and add keys where required.
+ */
+static int __add_missing_keys(struct btrfs_fs_info *fs_info,
+ struct list_head *head)
+{
+ struct list_head *pos;
+ struct extent_buffer *eb;
+
+ list_for_each(pos, head) {
+ struct __prelim_ref *ref;
+ ref = list_entry(pos, struct __prelim_ref, list);
+
+ if (ref->parent)
+ continue;
+ if (ref->key_for_search.type)
+ continue;
+ BUG_ON(!ref->wanted_disk_byte);
+ eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
+ fs_info->tree_root->leafsize, 0);
+ BUG_ON(!eb);
+ btrfs_tree_read_lock(eb);
+ if (btrfs_header_level(eb) == 0)
+ btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
+ else
+ btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ return 0;
+}
+
+/*
+ * merge two lists of backrefs and adjust counts accordingly
+ *
+ * mode = 1: merge identical keys, if key is set
+ * FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
+ * additionally, we could even add a key range for the blocks we
+ * looked into to merge even more (-> replace unresolved refs by those
+ * having a parent).
+ * mode = 2: merge identical parents
+ */
+static int __merge_refs(struct list_head *head, int mode)
+{
+ struct list_head *pos1;
+
+ list_for_each(pos1, head) {
+ struct list_head *n2;
+ struct list_head *pos2;
+ struct __prelim_ref *ref1;
+
+ ref1 = list_entry(pos1, struct __prelim_ref, list);
+
+ for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
+ pos2 = n2, n2 = pos2->next) {
+ struct __prelim_ref *ref2;
+ struct __prelim_ref *xchg;
+
+ ref2 = list_entry(pos2, struct __prelim_ref, list);
+
+ if (mode == 1) {
+ if (!ref_for_same_block(ref1, ref2))
+ continue;
+ if (!ref1->parent && ref2->parent) {
+ xchg = ref1;
+ ref1 = ref2;
+ ref2 = xchg;
+ }
+ ref1->count += ref2->count;
+ } else {
+ if (ref1->parent != ref2->parent)
+ continue;
+ ref1->count += ref2->count;
+ }
+ list_del(&ref2->list);
+ kfree(ref2);
+ }
+
+ }
+ return 0;
+}
+
+/*
+ * add all currently queued delayed refs from this head whose seq nr is
+ * smaller or equal that seq to the list
+ */
+static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
+ struct list_head *prefs)
+{
+ struct btrfs_delayed_extent_op *extent_op = head->extent_op;
+ struct rb_node *n = &head->node.rb_node;
+ struct btrfs_key key;
+ struct btrfs_key op_key = {0};
+ int sgn;
+ int ret = 0;
+
+ if (extent_op && extent_op->update_key)
+ btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
+
+ while ((n = rb_prev(n))) {
+ struct btrfs_delayed_ref_node *node;
+ node = rb_entry(n, struct btrfs_delayed_ref_node,
+ rb_node);
+ if (node->bytenr != head->node.bytenr)
+ break;
+ WARN_ON(node->is_head);
+
+ if (node->seq > seq)
+ continue;
+
+ switch (node->action) {
+ case BTRFS_ADD_DELAYED_EXTENT:
+ case BTRFS_UPDATE_DELAYED_HEAD:
+ WARN_ON(1);
+ continue;
+ case BTRFS_ADD_DELAYED_REF:
+ sgn = 1;
+ break;
+ case BTRFS_DROP_DELAYED_REF:
+ sgn = -1;
+ break;
+ default:
+ BUG_ON(1);
+ }
+ switch (node->type) {
+ case BTRFS_TREE_BLOCK_REF_KEY: {
+ struct btrfs_delayed_tree_ref *ref;
+
+ ref = btrfs_delayed_node_to_tree_ref(node);
+ ret = __add_prelim_ref(prefs, ref->root, &op_key,
+ ref->level + 1, 0, node->bytenr,
+ node->ref_mod * sgn);
+ break;
+ }
+ case BTRFS_SHARED_BLOCK_REF_KEY: {
+ struct btrfs_delayed_tree_ref *ref;
+
+ ref = btrfs_delayed_node_to_tree_ref(node);
+ ret = __add_prelim_ref(prefs, ref->root, NULL,
+ ref->level + 1, ref->parent,
+ node->bytenr,
+ node->ref_mod * sgn);
+ break;
+ }
+ case BTRFS_EXTENT_DATA_REF_KEY: {
+ struct btrfs_delayed_data_ref *ref;
+ ref = btrfs_delayed_node_to_data_ref(node);
+
+ key.objectid = ref->objectid;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = ref->offset;
+ ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
+ node->bytenr,
+ node->ref_mod * sgn);
+ break;
+ }
+ case BTRFS_SHARED_DATA_REF_KEY: {
+ struct btrfs_delayed_data_ref *ref;
+
+ ref = btrfs_delayed_node_to_data_ref(node);
+
+ key.objectid = ref->objectid;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = ref->offset;
+ ret = __add_prelim_ref(prefs, ref->root, &key, 0,
+ ref->parent, node->bytenr,
+ node->ref_mod * sgn);
+ break;
+ }
+ default:
+ WARN_ON(1);
+ }
+ BUG_ON(ret);
+ }
+
+ return 0;
+}
+
+/*
+ * add all inline backrefs for bytenr to the list
+ */
+static int __add_inline_refs(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, u64 bytenr,
+ int *info_level, struct list_head *prefs)
+{
+ int ret = 0;
+ int slot;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ unsigned long ptr;
+ unsigned long end;
+ struct btrfs_extent_item *ei;
+ u64 flags;
+ u64 item_size;
+
+ /*
+ * enumerate all inline refs
+ */
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ item_size = btrfs_item_size_nr(leaf, slot);
+ BUG_ON(item_size < sizeof(*ei));
+
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+ flags = btrfs_extent_flags(leaf, ei);
+
+ ptr = (unsigned long)(ei + 1);
+ end = (unsigned long)ei + item_size;
+
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ struct btrfs_tree_block_info *info;
+
+ info = (struct btrfs_tree_block_info *)ptr;
+ *info_level = btrfs_tree_block_level(leaf, info);
+ ptr += sizeof(struct btrfs_tree_block_info);
+ BUG_ON(ptr > end);
+ } else {
+ BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
+ }
+
+ while (ptr < end) {
+ struct btrfs_extent_inline_ref *iref;
+ u64 offset;
+ int type;
+
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ type = btrfs_extent_inline_ref_type(leaf, iref);
+ offset = btrfs_extent_inline_ref_offset(leaf, iref);
+
+ switch (type) {
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ ret = __add_prelim_ref(prefs, 0, NULL,
+ *info_level + 1, offset,
+ bytenr, 1);
+ break;
+ case BTRFS_SHARED_DATA_REF_KEY: {
+ struct btrfs_shared_data_ref *sdref;
+ int count;
+
+ sdref = (struct btrfs_shared_data_ref *)(iref + 1);
+ count = btrfs_shared_data_ref_count(leaf, sdref);
+ ret = __add_prelim_ref(prefs, 0, NULL, 0, offset,
+ bytenr, count);
+ break;
+ }
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ ret = __add_prelim_ref(prefs, offset, NULL,
+ *info_level + 1, 0,
+ bytenr, 1);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY: {
+ struct btrfs_extent_data_ref *dref;
+ int count;
+ u64 root;
+
+ dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ count = btrfs_extent_data_ref_count(leaf, dref);
+ key.objectid = btrfs_extent_data_ref_objectid(leaf,
+ dref);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+ root = btrfs_extent_data_ref_root(leaf, dref);
+ ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+ bytenr, count);
+ break;
+ }
+ default:
+ WARN_ON(1);
+ }
+ BUG_ON(ret);
+ ptr += btrfs_extent_inline_ref_size(type);
+ }
+
+ return 0;
+}
+
+/*
+ * add all non-inline backrefs for bytenr to the list
+ */
+static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, u64 bytenr,
+ int info_level, struct list_head *prefs)
+{
+ struct btrfs_root *extent_root = fs_info->extent_root;
+ int ret;
+ int slot;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+
+ while (1) {
+ ret = btrfs_next_item(extent_root, path);
+ if (ret < 0)
+ break;
+ if (ret) {
+ ret = 0;
+ break;
+ }
+
+ slot = path->slots[0];
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ if (key.objectid != bytenr)
+ break;
+ if (key.type < BTRFS_TREE_BLOCK_REF_KEY)
+ continue;
+ if (key.type > BTRFS_SHARED_DATA_REF_KEY)
+ break;
+
+ switch (key.type) {
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ ret = __add_prelim_ref(prefs, 0, NULL,
+ info_level + 1, key.offset,
+ bytenr, 1);
+ break;
+ case BTRFS_SHARED_DATA_REF_KEY: {
+ struct btrfs_shared_data_ref *sdref;
+ int count;
+
+ sdref = btrfs_item_ptr(leaf, slot,
+ struct btrfs_shared_data_ref);
+ count = btrfs_shared_data_ref_count(leaf, sdref);
+ ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset,
+ bytenr, count);
+ break;
+ }
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ ret = __add_prelim_ref(prefs, key.offset, NULL,
+ info_level + 1, 0,
+ bytenr, 1);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY: {
+ struct btrfs_extent_data_ref *dref;
+ int count;
+ u64 root;
+
+ dref = btrfs_item_ptr(leaf, slot,
+ struct btrfs_extent_data_ref);
+ count = btrfs_extent_data_ref_count(leaf, dref);
+ key.objectid = btrfs_extent_data_ref_objectid(leaf,
+ dref);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+ root = btrfs_extent_data_ref_root(leaf, dref);
+ ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+ bytenr, count);
+ break;
+ }
+ default:
+ WARN_ON(1);
+ }
+ BUG_ON(ret);
+ }
+
+ return ret;
+}
+
+/*
+ * this adds all existing backrefs (inline backrefs, backrefs and delayed
+ * refs) for the given bytenr to the refs list, merges duplicates and resolves
+ * indirect refs to their parent bytenr.
+ * When roots are found, they're added to the roots list
+ *
+ * FIXME some caching might speed things up
+ */
+static int find_parent_nodes(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 delayed_ref_seq, u64 time_seq,
+ struct ulist *refs, struct ulist *roots,
+ const u64 *extent_item_pos)
+{
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct btrfs_delayed_ref_root *delayed_refs = NULL;
+ struct btrfs_delayed_ref_head *head;
+ int info_level = 0;
+ int ret;
+ int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT);
+ struct list_head prefs_delayed;
+ struct list_head prefs;
+ struct __prelim_ref *ref;
+
+ INIT_LIST_HEAD(&prefs);
+ INIT_LIST_HEAD(&prefs_delayed);
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->search_commit_root = !!search_commit_root;
+
+ /*
+ * grab both a lock on the path and a lock on the delayed ref head.
+ * We need both to get a consistent picture of how the refs look
+ * at a specified point in time
+ */
+again:
+ head = NULL;
+
+ ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret == 0);
+
+ if (trans != BTRFS_BACKREF_SEARCH_COMMIT_ROOT) {
+ /*
+ * look if there are updates for this ref queued and lock the
+ * head
+ */
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_find_delayed_ref_head(trans, bytenr);
+ if (head) {
+ if (!mutex_trylock(&head->mutex)) {
+ atomic_inc(&head->node.refs);
+ spin_unlock(&delayed_refs->lock);
+
+ btrfs_release_path(path);
+
+ /*
+ * Mutex was contended, block until it's
+ * released and try again
+ */
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(&head->node);
+ goto again;
+ }
+ ret = __add_delayed_refs(head, delayed_ref_seq,
+ &prefs_delayed);
+ mutex_unlock(&head->mutex);
+ if (ret) {
+ spin_unlock(&delayed_refs->lock);
+ goto out;
+ }
+ }
+ spin_unlock(&delayed_refs->lock);
+ }
+
+ if (path->slots[0]) {
+ struct extent_buffer *leaf;
+ int slot;
+
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid == bytenr &&
+ key.type == BTRFS_EXTENT_ITEM_KEY) {
+ ret = __add_inline_refs(fs_info, path, bytenr,
+ &info_level, &prefs);
+ if (ret)
+ goto out;
+ ret = __add_keyed_refs(fs_info, path, bytenr,
+ info_level, &prefs);
+ if (ret)
+ goto out;
+ }
+ }
+ btrfs_release_path(path);
+
+ list_splice_init(&prefs_delayed, &prefs);
+
+ ret = __add_missing_keys(fs_info, &prefs);
+ if (ret)
+ goto out;
+
+ ret = __merge_refs(&prefs, 1);
+ if (ret)
+ goto out;
+
+ ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq,
+ &prefs, extent_item_pos);
+ if (ret)
+ goto out;
+
+ ret = __merge_refs(&prefs, 2);
+ if (ret)
+ goto out;
+
+ while (!list_empty(&prefs)) {
+ ref = list_first_entry(&prefs, struct __prelim_ref, list);
+ list_del(&ref->list);
+ if (ref->count < 0)
+ WARN_ON(1);
+ if (ref->count && ref->root_id && ref->parent == 0) {
+ /* no parent == root of tree */
+ ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
+ BUG_ON(ret < 0);
+ }
+ if (ref->count && ref->parent) {
+ struct extent_inode_elem *eie = NULL;
+ if (extent_item_pos && !ref->inode_list) {
+ u32 bsz;
+ struct extent_buffer *eb;
+ bsz = btrfs_level_size(fs_info->extent_root,
+ info_level);
+ eb = read_tree_block(fs_info->extent_root,
+ ref->parent, bsz, 0);
+ BUG_ON(!eb);
+ ret = find_extent_in_eb(eb, bytenr,
+ *extent_item_pos, &eie);
+ ref->inode_list = eie;
+ free_extent_buffer(eb);
+ }
+ ret = ulist_add_merge(refs, ref->parent,
+ (unsigned long)ref->inode_list,
+ (unsigned long *)&eie, GFP_NOFS);
+ if (!ret && extent_item_pos) {
+ /*
+ * we've recorded that parent, so we must extend
+ * its inode list here
+ */
+ BUG_ON(!eie);
+ while (eie->next)
+ eie = eie->next;
+ eie->next = ref->inode_list;
+ }
+ BUG_ON(ret < 0);
+ }
+ kfree(ref);
+ }
+
+out:
+ btrfs_free_path(path);
+ while (!list_empty(&prefs)) {
+ ref = list_first_entry(&prefs, struct __prelim_ref, list);
+ list_del(&ref->list);
+ kfree(ref);
+ }
+ while (!list_empty(&prefs_delayed)) {
+ ref = list_first_entry(&prefs_delayed, struct __prelim_ref,
+ list);
+ list_del(&ref->list);
+ kfree(ref);
+ }
+
+ return ret;
+}
+
+static void free_leaf_list(struct ulist *blocks)
+{
+ struct ulist_node *node = NULL;
+ struct extent_inode_elem *eie;
+ struct extent_inode_elem *eie_next;
+ struct ulist_iterator uiter;
+
+ ULIST_ITER_INIT(&uiter);
+ while ((node = ulist_next(blocks, &uiter))) {
+ if (!node->aux)
+ continue;
+ eie = (struct extent_inode_elem *)node->aux;
+ for (; eie; eie = eie_next) {
+ eie_next = eie->next;
+ kfree(eie);
+ }
+ node->aux = 0;
+ }
+
+ ulist_free(blocks);
+}
+
+/*
+ * Finds all leafs with a reference to the specified combination of bytenr and
+ * offset. key_list_head will point to a list of corresponding keys (caller must
+ * free each list element). The leafs will be stored in the leafs ulist, which
+ * must be freed with ulist_free.
+ *
+ * returns 0 on success, <0 on error
+ */
+static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 delayed_ref_seq, u64 time_seq,
+ struct ulist **leafs,
+ const u64 *extent_item_pos)
+{
+ struct ulist *tmp;
+ int ret;
+
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+ *leafs = ulist_alloc(GFP_NOFS);
+ if (!*leafs) {
+ ulist_free(tmp);
+ return -ENOMEM;
+ }
+
+ ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+ time_seq, *leafs, tmp, extent_item_pos);
+ ulist_free(tmp);
+
+ if (ret < 0 && ret != -ENOENT) {
+ free_leaf_list(*leafs);
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * walk all backrefs for a given extent to find all roots that reference this
+ * extent. Walking a backref means finding all extents that reference this
+ * extent and in turn walk the backrefs of those, too. Naturally this is a
+ * recursive process, but here it is implemented in an iterative fashion: We
+ * find all referencing extents for the extent in question and put them on a
+ * list. In turn, we find all referencing extents for those, further appending
+ * to the list. The way we iterate the list allows adding more elements after
+ * the current while iterating. The process stops when we reach the end of the
+ * list. Found roots are added to the roots list.
+ *
+ * returns 0 on success, < 0 on error.
+ */
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 delayed_ref_seq, u64 time_seq,
+ struct ulist **roots)
+{
+ struct ulist *tmp;
+ struct ulist_node *node = NULL;
+ struct ulist_iterator uiter;
+ int ret;
+
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+ *roots = ulist_alloc(GFP_NOFS);
+ if (!*roots) {
+ ulist_free(tmp);
+ return -ENOMEM;
+ }
+
+ ULIST_ITER_INIT(&uiter);
+ while (1) {
+ ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+ time_seq, tmp, *roots, NULL);
+ if (ret < 0 && ret != -ENOENT) {
+ ulist_free(tmp);
+ ulist_free(*roots);
+ return ret;
+ }
+ node = ulist_next(tmp, &uiter);
+ if (!node)
+ break;
+ bytenr = node->val;
+ }
+
+ ulist_free(tmp);
+ return 0;
+}
+
+
static int __inode_info(u64 inum, u64 ioff, u8 key_type,
struct btrfs_root *fs_root, struct btrfs_path *path,
struct btrfs_key *found_key)
@@ -108,19 +1137,25 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
s64 bytes_left = size - 1;
struct extent_buffer *eb = eb_in;
struct btrfs_key found_key;
+ int leave_spinning = path->leave_spinning;
if (bytes_left >= 0)
dest[bytes_left] = '\0';
+ path->leave_spinning = 1;
while (1) {
len = btrfs_inode_ref_name_len(eb, iref);
bytes_left -= len;
if (bytes_left >= 0)
read_extent_buffer(eb, dest + bytes_left,
(unsigned long)(iref + 1), len);
- if (eb != eb_in)
+ if (eb != eb_in) {
+ btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
+ }
ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
+ if (ret > 0)
+ ret = -ENOENT;
if (ret)
break;
next_inum = found_key.offset;
@@ -132,8 +1167,11 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
slot = path->slots[0];
eb = path->nodes[0];
/* make sure we can use eb after releasing the path */
- if (eb != eb_in)
+ if (eb != eb_in) {
atomic_inc(&eb->refs);
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ }
btrfs_release_path(path);
iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
@@ -144,6 +1182,7 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
}
btrfs_release_path(path);
+ path->leave_spinning = leave_spinning;
if (ret)
return ERR_PTR(ret);
@@ -181,8 +1220,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
found_key->objectid > logical ||
- found_key->objectid + found_key->offset <= logical)
+ found_key->objectid + found_key->offset <= logical) {
+ pr_debug("logical %llu is not within any extent\n",
+ (unsigned long long)logical);
return -ENOENT;
+ }
eb = path->nodes[0];
item_size = btrfs_item_size_nr(eb, path->slots[0]);
@@ -191,6 +1233,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
flags = btrfs_extent_flags(eb, ei);
+ pr_debug("logical %llu is at position %llu within the extent (%llu "
+ "EXTENT_ITEM %llu) flags %#llx size %u\n",
+ (unsigned long long)logical,
+ (unsigned long long)(logical - found_key->objectid),
+ (unsigned long long)found_key->objectid,
+ (unsigned long long)found_key->offset,
+ (unsigned long long)flags, item_size);
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
return BTRFS_EXTENT_FLAG_TREE_BLOCK;
if (flags & BTRFS_EXTENT_FLAG_DATA)
@@ -287,295 +1336,103 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
return 0;
}
-static int __data_list_add(struct list_head *head, u64 inum,
- u64 extent_data_item_offset, u64 root)
-{
- struct __data_ref *ref;
-
- ref = kmalloc(sizeof(*ref), GFP_NOFS);
- if (!ref)
- return -ENOMEM;
-
- ref->inum = inum;
- ref->extent_data_item_offset = extent_data_item_offset;
- ref->root = root;
- list_add_tail(&ref->list, head);
-
- return 0;
-}
-
-static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
- struct btrfs_extent_data_ref *dref)
-{
- return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
- btrfs_extent_data_ref_offset(eb, dref),
- btrfs_extent_data_ref_root(eb, dref));
-}
-
-static int __shared_list_add(struct list_head *head, u64 disk_byte)
-{
- struct __shared_ref *ref;
-
- ref = kmalloc(sizeof(*ref), GFP_NOFS);
- if (!ref)
- return -ENOMEM;
-
- ref->disk_byte = disk_byte;
- list_add_tail(&ref->list, head);
-
- return 0;
-}
-
-static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
- u64 logical, u64 inum,
- u64 extent_data_item_offset,
- u64 extent_offset,
- struct btrfs_path *path,
- struct list_head *data_refs,
- iterate_extent_inodes_t *iterate,
- void *ctx)
-{
- u64 ref_root;
- u32 item_size;
- struct btrfs_key key;
- struct extent_buffer *eb;
- struct btrfs_extent_item *ei;
- struct btrfs_extent_inline_ref *eiref;
- struct __data_ref *ref;
- int ret;
- int type;
- int last;
- unsigned long ptr = 0;
-
- WARN_ON(!list_empty(data_refs));
- ret = extent_from_logical(fs_info, logical, path, &key);
- if (ret & BTRFS_EXTENT_FLAG_DATA)
- ret = -EIO;
- if (ret < 0)
- goto out;
-
- eb = path->nodes[0];
- ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
- item_size = btrfs_item_size_nr(eb, path->slots[0]);
-
- ret = 0;
- ref_root = 0;
- /*
- * as done in iterate_extent_inodes, we first build a list of refs to
- * iterate, then free the path and then iterate them to avoid deadlocks.
- */
- do {
- last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
- &eiref, &type);
- if (last < 0) {
- ret = last;
- goto out;
- }
- if (type == BTRFS_TREE_BLOCK_REF_KEY ||
- type == BTRFS_SHARED_BLOCK_REF_KEY) {
- ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
- ret = __data_list_add(data_refs, inum,
- extent_data_item_offset,
- ref_root);
- }
- } while (!ret && !last);
-
- btrfs_release_path(path);
-
- if (ref_root == 0) {
- printk(KERN_ERR "btrfs: failed to find tree block ref "
- "for shared data backref %llu\n", logical);
- WARN_ON(1);
- ret = -EIO;
- }
-
-out:
- while (!list_empty(data_refs)) {
- ref = list_first_entry(data_refs, struct __data_ref, list);
- list_del(&ref->list);
- if (!ret)
- ret = iterate(ref->inum, extent_offset +
- ref->extent_data_item_offset,
- ref->root, ctx);
- kfree(ref);
- }
-
- return ret;
-}
-
-static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
- u64 logical, u64 orig_extent_item_objectid,
- u64 extent_offset, struct btrfs_path *path,
- struct list_head *data_refs,
- iterate_extent_inodes_t *iterate,
- void *ctx)
+static int iterate_leaf_refs(struct extent_inode_elem *inode_list,
+ u64 root, u64 extent_item_objectid,
+ iterate_extent_inodes_t *iterate, void *ctx)
{
- u64 disk_byte;
- struct btrfs_key key;
- struct btrfs_file_extent_item *fi;
- struct extent_buffer *eb;
- int slot;
- int nritems;
- int ret;
- int found = 0;
-
- eb = read_tree_block(fs_info->tree_root, logical,
- fs_info->tree_root->leafsize, 0);
- if (!eb)
- return -EIO;
-
- /*
- * from the shared data ref, we only have the leaf but we need
- * the key. thus, we must look into all items and see that we
- * find one (some) with a reference to our extent item.
- */
- nritems = btrfs_header_nritems(eb);
- for (slot = 0; slot < nritems; ++slot) {
- btrfs_item_key_to_cpu(eb, &key, slot);
- if (key.type != BTRFS_EXTENT_DATA_KEY)
- continue;
- fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
- if (!fi) {
- free_extent_buffer(eb);
- return -EIO;
- }
- disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
- if (disk_byte != orig_extent_item_objectid) {
- if (found)
- break;
- else
- continue;
- }
- ++found;
- ret = __iter_shared_inline_ref_inodes(fs_info, logical,
- key.objectid,
- key.offset,
- extent_offset, path,
- data_refs,
- iterate, ctx);
- if (ret)
+ struct extent_inode_elem *eie;
+ int ret = 0;
+
+ for (eie = inode_list; eie; eie = eie->next) {
+ pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), "
+ "root %llu\n", extent_item_objectid,
+ eie->inum, eie->offset, root);
+ ret = iterate(eie->inum, eie->offset, root, ctx);
+ if (ret) {
+ pr_debug("stopping iteration for %llu due to ret=%d\n",
+ extent_item_objectid, ret);
break;
+ }
}
- if (!found) {
- printk(KERN_ERR "btrfs: failed to follow shared data backref "
- "to parent %llu\n", logical);
- WARN_ON(1);
- ret = -EIO;
- }
-
- free_extent_buffer(eb);
return ret;
}
/*
* calls iterate() for every inode that references the extent identified by
- * the given parameters. will use the path given as a parameter and return it
- * released.
+ * the given parameters.
* when the iterator function returns a non-zero value, iteration stops.
*/
int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
- u64 extent_item_objectid,
- u64 extent_offset,
+ u64 extent_item_objectid, u64 extent_item_pos,
+ int search_commit_root,
iterate_extent_inodes_t *iterate, void *ctx)
{
- unsigned long ptr = 0;
- int last;
int ret;
- int type;
- u64 logical;
- u32 item_size;
- struct btrfs_extent_inline_ref *eiref;
- struct btrfs_extent_data_ref *dref;
- struct extent_buffer *eb;
- struct btrfs_extent_item *ei;
- struct btrfs_key key;
struct list_head data_refs = LIST_HEAD_INIT(data_refs);
struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
- struct __data_ref *ref_d;
- struct __shared_ref *ref_s;
-
- eb = path->nodes[0];
- ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
- item_size = btrfs_item_size_nr(eb, path->slots[0]);
-
- /* first we iterate the inline refs, ... */
- do {
- last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
- &eiref, &type);
- if (last == -ENOENT) {
- ret = 0;
- break;
- }
- if (last < 0) {
- ret = last;
- break;
- }
+ struct btrfs_trans_handle *trans;
+ struct ulist *refs = NULL;
+ struct ulist *roots = NULL;
+ struct ulist_node *ref_node = NULL;
+ struct ulist_node *root_node = NULL;
+ struct seq_list seq_elem = {};
+ struct seq_list tree_mod_seq_elem = {};
+ struct ulist_iterator ref_uiter;
+ struct ulist_iterator root_uiter;
+ struct btrfs_delayed_ref_root *delayed_refs = NULL;
+
+ pr_debug("resolving all inodes for extent %llu\n",
+ extent_item_objectid);
+
+ if (search_commit_root) {
+ trans = BTRFS_BACKREF_SEARCH_COMMIT_ROOT;
+ } else {
+ trans = btrfs_join_transaction(fs_info->extent_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ btrfs_get_delayed_seq(delayed_refs, &seq_elem);
+ spin_unlock(&delayed_refs->lock);
+ btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+ }
- if (type == BTRFS_EXTENT_DATA_REF_KEY) {
- dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
- ret = __data_list_add_eb(&data_refs, eb, dref);
- } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
- logical = btrfs_extent_inline_ref_offset(eb, eiref);
- ret = __shared_list_add(&shared_refs, logical);
- }
- } while (!ret && !last);
+ ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
+ seq_elem.seq, tree_mod_seq_elem.seq, &refs,
+ &extent_item_pos);
+ if (ret)
+ goto out;
- /* ... then we proceed to in-tree references and ... */
- while (!ret) {
- ++path->slots[0];
- if (path->slots[0] > btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(fs_info->extent_root, path);
- if (ret) {
- if (ret == 1)
- ret = 0; /* we're done */
- break;
- }
- eb = path->nodes[0];
- }
- btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
- if (key.objectid != extent_item_objectid)
+ ULIST_ITER_INIT(&ref_uiter);
+ while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
+ ret = btrfs_find_all_roots(trans, fs_info, ref_node->val,
+ seq_elem.seq,
+ tree_mod_seq_elem.seq, &roots);
+ if (ret)
break;
- if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
- dref = btrfs_item_ptr(eb, path->slots[0],
- struct btrfs_extent_data_ref);
- ret = __data_list_add_eb(&data_refs, eb, dref);
- } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
- ret = __shared_list_add(&shared_refs, key.offset);
+ ULIST_ITER_INIT(&root_uiter);
+ while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
+ pr_debug("root %llu references leaf %llu, data list "
+ "%#lx\n", root_node->val, ref_node->val,
+ ref_node->aux);
+ ret = iterate_leaf_refs(
+ (struct extent_inode_elem *)ref_node->aux,
+ root_node->val, extent_item_objectid,
+ iterate, ctx);
}
+ ulist_free(roots);
+ roots = NULL;
}
- btrfs_release_path(path);
-
- /*
- * ... only at the very end we can process the refs we found. this is
- * because the iterator function we call is allowed to make tree lookups
- * and we have to avoid deadlocks. additionally, we need more tree
- * lookups ourselves for shared data refs.
- */
- while (!list_empty(&data_refs)) {
- ref_d = list_first_entry(&data_refs, struct __data_ref, list);
- list_del(&ref_d->list);
- if (!ret)
- ret = iterate(ref_d->inum, extent_offset +
- ref_d->extent_data_item_offset,
- ref_d->root, ctx);
- kfree(ref_d);
- }
-
- while (!list_empty(&shared_refs)) {
- ref_s = list_first_entry(&shared_refs, struct __shared_ref,
- list);
- list_del(&ref_s->list);
- if (!ret)
- ret = __iter_shared_inline_ref(fs_info,
- ref_s->disk_byte,
- extent_item_objectid,
- extent_offset, path,
- &data_refs,
- iterate, ctx);
- kfree(ref_s);
+ free_leaf_list(refs);
+ ulist_free(roots);
+out:
+ if (!search_commit_root) {
+ btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+ btrfs_put_delayed_seq(delayed_refs, &seq_elem);
+ btrfs_end_transaction(trans, fs_info->extent_root);
}
return ret;
@@ -586,19 +1443,22 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
iterate_extent_inodes_t *iterate, void *ctx)
{
int ret;
- u64 offset;
+ u64 extent_item_pos;
struct btrfs_key found_key;
+ int search_commit_root = path->search_commit_root;
ret = extent_from_logical(fs_info, logical, path,
&found_key);
+ btrfs_release_path(path);
if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
ret = -EINVAL;
if (ret < 0)
return ret;
- offset = logical - found_key.objectid;
- ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
- offset, iterate, ctx);
+ extent_item_pos = logical - found_key.objectid;
+ ret = iterate_extent_inodes(fs_info, found_key.objectid,
+ extent_item_pos, search_commit_root,
+ iterate, ctx);
return ret;
}
@@ -607,7 +1467,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
struct btrfs_path *path,
iterate_irefs_t *iterate, void *ctx)
{
- int ret;
+ int ret = 0;
int slot;
u32 cur;
u32 len;
@@ -619,7 +1479,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
struct btrfs_inode_ref *iref;
struct btrfs_key found_key;
- while (1) {
+ while (!ret) {
+ path->leave_spinning = 1;
ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
&found_key);
if (ret < 0)
@@ -635,6 +1496,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
eb = path->nodes[0];
/* make sure we can use eb after releasing the path */
atomic_inc(&eb->refs);
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
btrfs_release_path(path);
item = btrfs_item_nr(eb, slot);
@@ -643,14 +1506,17 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
name_len = btrfs_inode_ref_name_len(eb, iref);
/* path must be released before calling iterate()! */
+ pr_debug("following ref at offset %u for inode %llu in "
+ "tree %llu\n", cur,
+ (unsigned long long)found_key.objectid,
+ (unsigned long long)fs_root->objectid);
ret = iterate(parent, iref, eb, ctx);
- if (ret) {
- free_extent_buffer(eb);
+ if (ret)
break;
- }
len = sizeof(*iref) + name_len;
iref = (struct btrfs_inode_ref *)((char *)iref + len);
}
+ btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
}
@@ -683,10 +1549,14 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
return PTR_ERR(fspath);
if (fspath > fspath_min) {
+ pr_debug("path resolved: %s\n", fspath);
ipath->fspath->val[i] = (u64)(unsigned long)fspath;
++ipath->fspath->elem_cnt;
ipath->fspath->bytes_left = fspath - fspath_min;
} else {
+ pr_debug("missed path, not enough space. missing bytes: %lu, "
+ "constructed so far: %s\n",
+ (unsigned long)(fspath_min - fspath), fspath_min);
++ipath->fspath->elem_missed;
ipath->fspath->bytes_missing += fspath_min - fspath;
ipath->fspath->bytes_left = 0;
@@ -711,12 +1581,6 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
inode_to_path, ipath);
}
-/*
- * allocates space to return multiple file system paths for an inode.
- * total_bytes to allocate are passed, note that space usable for actual path
- * information will be total_bytes - sizeof(struct inode_fs_paths).
- * the returned pointer must be freed with free_ipath() in the end.
- */
struct btrfs_data_container *init_data_container(u32 total_bytes)
{
struct btrfs_data_container *data;
@@ -772,5 +1636,8 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
void free_ipath(struct inode_fs_paths *ipath)
{
+ if (!ipath)
+ return;
+ kfree(ipath->fspath);
kfree(ipath);
}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 92618837cb8..c18d8ac7b79 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -20,6 +20,9 @@
#define __BTRFS_BACKREF__
#include "ioctl.h"
+#include "ulist.h"
+
+#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0)
struct inode_fs_paths {
struct btrfs_path *btrfs_path;
@@ -43,9 +46,8 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
u64 *out_root, u8 *out_level);
int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
u64 extent_item_objectid,
- u64 extent_offset,
+ u64 extent_offset, int search_commit_root,
iterate_extent_inodes_t *iterate, void *ctx);
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
@@ -54,6 +56,11 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 delayed_ref_seq, u64 time_seq,
+ struct ulist **roots);
+
struct btrfs_data_container *init_data_container(u32 total_bytes);
struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
struct btrfs_path *path);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 634608d2a6d..12394a90d60 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -24,6 +24,21 @@
#include "ordered-data.h"
#include "delayed-inode.h"
+/*
+ * ordered_data_close is set by truncate when a file that used
+ * to have good data has been truncated to zero. When it is set
+ * the btrfs file release call will add this inode to the
+ * ordered operations list so that we make sure to flush out any
+ * new data the application may have written before commit.
+ */
+#define BTRFS_INODE_ORDERED_DATA_CLOSE 0
+#define BTRFS_INODE_ORPHAN_META_RESERVED 1
+#define BTRFS_INODE_DUMMY 2
+#define BTRFS_INODE_IN_DEFRAG 3
+#define BTRFS_INODE_DELALLOC_META_RESERVED 4
+#define BTRFS_INODE_HAS_ORPHAN_ITEM 5
+#define BTRFS_INODE_HAS_ASYNC_EXTENT 6
+
/* in memory btrfs inode */
struct btrfs_inode {
/* which subvolume this inode belongs to */
@@ -51,12 +66,12 @@ struct btrfs_inode {
/* held while logging the inode in tree-log.c */
struct mutex log_mutex;
+ /* held while doing delalloc reservations */
+ struct mutex delalloc_mutex;
+
/* used to order data wrt metadata */
struct btrfs_ordered_inode_tree ordered_tree;
- /* for keeping track of orphaned inodes */
- struct list_head i_orphan;
-
/* list of all the delalloc inodes in the FS. There are times we need
* to write all the delalloc pages to disk, and this list is used
* to walk them all.
@@ -75,14 +90,13 @@ struct btrfs_inode {
/* the space_info for where this inode's data allocations are done */
struct btrfs_space_info *space_info;
+ unsigned long runtime_flags;
+
/* full 64 bit generation number, struct vfs_inode doesn't have a big
* enough field for this.
*/
u64 generation;
- /* sequence number for NFS changes */
- u64 sequence;
-
/*
* transid of the trans_handle that last modified this inode
*/
@@ -142,22 +156,9 @@ struct btrfs_inode {
unsigned reserved_extents;
/*
- * ordered_data_close is set by truncate when a file that used
- * to have good data has been truncated to zero. When it is set
- * the btrfs file release call will add this inode to the
- * ordered operations list so that we make sure to flush out any
- * new data the application may have written before commit.
- */
- unsigned ordered_data_close:1;
- unsigned orphan_meta_reserved:1;
- unsigned dummy_inode:1;
- unsigned in_defrag:1;
- unsigned delalloc_meta_reserved:1;
-
- /*
* always compress this one file
*/
- unsigned force_compress:4;
+ unsigned force_compress;
struct btrfs_delayed_node *delayed_node;
@@ -199,4 +200,17 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_root *root,
return false;
}
+static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret = 0;
+
+ mutex_lock(&root->log_mutex);
+ if (BTRFS_I(inode)->logged_trans == generation &&
+ BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
+ ret = 1;
+ mutex_unlock(&root->log_mutex);
+ return ret;
+}
+
#endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
new file mode 100644
index 00000000000..da6e9364a5e
--- /dev/null
+++ b/fs/btrfs/check-integrity.c
@@ -0,0 +1,3358 @@
+/*
+ * Copyright (C) STRATO AG 2011. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+/*
+ * This module can be used to catch cases when the btrfs kernel
+ * code executes write requests to the disk that bring the file
+ * system in an inconsistent state. In such a state, a power-loss
+ * or kernel panic event would cause that the data on disk is
+ * lost or at least damaged.
+ *
+ * Code is added that examines all block write requests during
+ * runtime (including writes of the super block). Three rules
+ * are verified and an error is printed on violation of the
+ * rules:
+ * 1. It is not allowed to write a disk block which is
+ * currently referenced by the super block (either directly
+ * or indirectly).
+ * 2. When a super block is written, it is verified that all
+ * referenced (directly or indirectly) blocks fulfill the
+ * following requirements:
+ * 2a. All referenced blocks have either been present when
+ * the file system was mounted, (i.e., they have been
+ * referenced by the super block) or they have been
+ * written since then and the write completion callback
+ * was called and a FLUSH request to the device where
+ * these blocks are located was received and completed.
+ * 2b. All referenced blocks need to have a generation
+ * number which is equal to the parent's number.
+ *
+ * One issue that was found using this module was that the log
+ * tree on disk became temporarily corrupted because disk blocks
+ * that had been in use for the log tree had been freed and
+ * reused too early, while being referenced by the written super
+ * block.
+ *
+ * The search term in the kernel log that can be used to filter
+ * on the existence of detected integrity issues is
+ * "btrfs: attempt".
+ *
+ * The integrity check is enabled via mount options. These
+ * mount options are only supported if the integrity check
+ * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY.
+ *
+ * Example #1, apply integrity checks to all metadata:
+ * mount /dev/sdb1 /mnt -o check_int
+ *
+ * Example #2, apply integrity checks to all metadata and
+ * to data extents:
+ * mount /dev/sdb1 /mnt -o check_int_data
+ *
+ * Example #3, apply integrity checks to all metadata and dump
+ * the tree that the super block references to kernel messages
+ * each time after a super block was written:
+ * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
+ *
+ * If the integrity check tool is included and activated in
+ * the mount options, plenty of kernel memory is used, and
+ * plenty of additional CPU cycles are spent. Enabling this
+ * functionality is not intended for normal use. In most
+ * cases, unless you are a btrfs developer who needs to verify
+ * the integrity of (super)-block write requests, do not
+ * enable the config option BTRFS_FS_CHECK_INTEGRITY to
+ * include and compile the integrity check tool.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/mutex.h>
+#include <linux/crc32c.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "extent_io.h"
+#include "volumes.h"
+#include "print-tree.h"
+#include "locking.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+
+#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
+#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
+#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100
+#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051
+#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807
+#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530
+#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
+#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters,
+ * excluding " [...]" */
+#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
+
+/*
+ * The definition of the bitmask fields for the print_mask.
+ * They are specified with the mount option check_integrity_print_mask.
+ */
+#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001
+#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002
+#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004
+#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008
+#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010
+#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020
+#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040
+#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080
+#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100
+#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200
+#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400
+#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800
+#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000
+
+struct btrfsic_dev_state;
+struct btrfsic_state;
+
+struct btrfsic_block {
+ u32 magic_num; /* only used for debug purposes */
+ unsigned int is_metadata:1; /* if it is meta-data, not data-data */
+ unsigned int is_superblock:1; /* if it is one of the superblocks */
+ unsigned int is_iodone:1; /* if is done by lower subsystem */
+ unsigned int iodone_w_error:1; /* error was indicated to endio */
+ unsigned int never_written:1; /* block was added because it was
+ * referenced, not because it was
+ * written */
+ unsigned int mirror_num:2; /* large enough to hold
+ * BTRFS_SUPER_MIRROR_MAX */
+ struct btrfsic_dev_state *dev_state;
+ u64 dev_bytenr; /* key, physical byte num on disk */
+ u64 logical_bytenr; /* logical byte num on disk */
+ u64 generation;
+ struct btrfs_disk_key disk_key; /* extra info to print in case of
+ * issues, will not always be correct */
+ struct list_head collision_resolving_node; /* list node */
+ struct list_head all_blocks_node; /* list node */
+
+ /* the following two lists contain block_link items */
+ struct list_head ref_to_list; /* list */
+ struct list_head ref_from_list; /* list */
+ struct btrfsic_block *next_in_same_bio;
+ void *orig_bio_bh_private;
+ union {
+ bio_end_io_t *bio;
+ bh_end_io_t *bh;
+ } orig_bio_bh_end_io;
+ int submit_bio_bh_rw;
+ u64 flush_gen; /* only valid if !never_written */
+};
+
+/*
+ * Elements of this type are allocated dynamically and required because
+ * each block object can refer to and can be ref from multiple blocks.
+ * The key to lookup them in the hashtable is the dev_bytenr of
+ * the block ref to plus the one from the block refered from.
+ * The fact that they are searchable via a hashtable and that a
+ * ref_cnt is maintained is not required for the btrfs integrity
+ * check algorithm itself, it is only used to make the output more
+ * beautiful in case that an error is detected (an error is defined
+ * as a write operation to a block while that block is still referenced).
+ */
+struct btrfsic_block_link {
+ u32 magic_num; /* only used for debug purposes */
+ u32 ref_cnt;
+ struct list_head node_ref_to; /* list node */
+ struct list_head node_ref_from; /* list node */
+ struct list_head collision_resolving_node; /* list node */
+ struct btrfsic_block *block_ref_to;
+ struct btrfsic_block *block_ref_from;
+ u64 parent_generation;
+};
+
+struct btrfsic_dev_state {
+ u32 magic_num; /* only used for debug purposes */
+ struct block_device *bdev;
+ struct btrfsic_state *state;
+ struct list_head collision_resolving_node; /* list node */
+ struct btrfsic_block dummy_block_for_bio_bh_flush;
+ u64 last_flush_gen;
+ char name[BDEVNAME_SIZE];
+};
+
+struct btrfsic_block_hashtable {
+ struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE];
+};
+
+struct btrfsic_block_link_hashtable {
+ struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE];
+};
+
+struct btrfsic_dev_state_hashtable {
+ struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE];
+};
+
+struct btrfsic_block_data_ctx {
+ u64 start; /* virtual bytenr */
+ u64 dev_bytenr; /* physical bytenr on device */
+ u32 len;
+ struct btrfsic_dev_state *dev;
+ char **datav;
+ struct page **pagev;
+ void *mem_to_free;
+};
+
+/* This structure is used to implement recursion without occupying
+ * any stack space, refer to btrfsic_process_metablock() */
+struct btrfsic_stack_frame {
+ u32 magic;
+ u32 nr;
+ int error;
+ int i;
+ int limit_nesting;
+ int num_copies;
+ int mirror_num;
+ struct btrfsic_block *block;
+ struct btrfsic_block_data_ctx *block_ctx;
+ struct btrfsic_block *next_block;
+ struct btrfsic_block_data_ctx next_block_ctx;
+ struct btrfs_header *hdr;
+ struct btrfsic_stack_frame *prev;
+};
+
+/* Some state per mounted filesystem */
+struct btrfsic_state {
+ u32 print_mask;
+ int include_extent_data;
+ int csum_size;
+ struct list_head all_blocks_list;
+ struct btrfsic_block_hashtable block_hashtable;
+ struct btrfsic_block_link_hashtable block_link_hashtable;
+ struct btrfs_root *root;
+ u64 max_superblock_generation;
+ struct btrfsic_block *latest_superblock;
+ u32 metablock_size;
+ u32 datablock_size;
+};
+
+static void btrfsic_block_init(struct btrfsic_block *b);
+static struct btrfsic_block *btrfsic_block_alloc(void);
+static void btrfsic_block_free(struct btrfsic_block *b);
+static void btrfsic_block_link_init(struct btrfsic_block_link *n);
+static struct btrfsic_block_link *btrfsic_block_link_alloc(void);
+static void btrfsic_block_link_free(struct btrfsic_block_link *n);
+static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds);
+static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void);
+static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds);
+static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h);
+static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
+ struct btrfsic_block_hashtable *h);
+static void btrfsic_block_hashtable_remove(struct btrfsic_block *b);
+static struct btrfsic_block *btrfsic_block_hashtable_lookup(
+ struct block_device *bdev,
+ u64 dev_bytenr,
+ struct btrfsic_block_hashtable *h);
+static void btrfsic_block_link_hashtable_init(
+ struct btrfsic_block_link_hashtable *h);
+static void btrfsic_block_link_hashtable_add(
+ struct btrfsic_block_link *l,
+ struct btrfsic_block_link_hashtable *h);
+static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l);
+static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
+ struct block_device *bdev_ref_to,
+ u64 dev_bytenr_ref_to,
+ struct block_device *bdev_ref_from,
+ u64 dev_bytenr_ref_from,
+ struct btrfsic_block_link_hashtable *h);
+static void btrfsic_dev_state_hashtable_init(
+ struct btrfsic_dev_state_hashtable *h);
+static void btrfsic_dev_state_hashtable_add(
+ struct btrfsic_dev_state *ds,
+ struct btrfsic_dev_state_hashtable *h);
+static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds);
+static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
+ struct block_device *bdev,
+ struct btrfsic_dev_state_hashtable *h);
+static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void);
+static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf);
+static int btrfsic_process_superblock(struct btrfsic_state *state,
+ struct btrfs_fs_devices *fs_devices);
+static int btrfsic_process_metablock(struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx *block_ctx,
+ int limit_nesting, int force_iodone_flag);
+static void btrfsic_read_from_block_data(
+ struct btrfsic_block_data_ctx *block_ctx,
+ void *dst, u32 offset, size_t len);
+static int btrfsic_create_link_to_next_block(
+ struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx
+ *block_ctx, u64 next_bytenr,
+ int limit_nesting,
+ struct btrfsic_block_data_ctx *next_block_ctx,
+ struct btrfsic_block **next_blockp,
+ int force_iodone_flag,
+ int *num_copiesp, int *mirror_nump,
+ struct btrfs_disk_key *disk_key,
+ u64 parent_generation);
+static int btrfsic_handle_extent_data(struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx *block_ctx,
+ u32 item_offset, int force_iodone_flag);
+static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
+ struct btrfsic_block_data_ctx *block_ctx_out,
+ int mirror_num);
+static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
+ u32 len, struct block_device *bdev,
+ struct btrfsic_block_data_ctx *block_ctx_out);
+static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
+static int btrfsic_read_block(struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *block_ctx);
+static void btrfsic_dump_database(struct btrfsic_state *state);
+static void btrfsic_complete_bio_end_io(struct bio *bio, int err);
+static int btrfsic_test_for_metadata(struct btrfsic_state *state,
+ char **datav, unsigned int num_pages);
+static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
+ u64 dev_bytenr, char **mapped_datav,
+ unsigned int num_pages,
+ struct bio *bio, int *bio_is_patched,
+ struct buffer_head *bh,
+ int submit_bio_bh_rw);
+static int btrfsic_process_written_superblock(
+ struct btrfsic_state *state,
+ struct btrfsic_block *const block,
+ struct btrfs_super_block *const super_hdr);
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status);
+static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
+static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
+ const struct btrfsic_block *block,
+ int recursion_level);
+static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
+ struct btrfsic_block *const block,
+ int recursion_level);
+static void btrfsic_print_add_link(const struct btrfsic_state *state,
+ const struct btrfsic_block_link *l);
+static void btrfsic_print_rem_link(const struct btrfsic_state *state,
+ const struct btrfsic_block_link *l);
+static char btrfsic_get_block_type(const struct btrfsic_state *state,
+ const struct btrfsic_block *block);
+static void btrfsic_dump_tree(const struct btrfsic_state *state);
+static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
+ const struct btrfsic_block *block,
+ int indent_level);
+static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
+ struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *next_block_ctx,
+ struct btrfsic_block *next_block,
+ struct btrfsic_block *from_block,
+ u64 parent_generation);
+static struct btrfsic_block *btrfsic_block_lookup_or_add(
+ struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *block_ctx,
+ const char *additional_string,
+ int is_metadata,
+ int is_iodone,
+ int never_written,
+ int mirror_num,
+ int *was_created);
+static int btrfsic_process_superblock_dev_mirror(
+ struct btrfsic_state *state,
+ struct btrfsic_dev_state *dev_state,
+ struct btrfs_device *device,
+ int superblock_mirror_num,
+ struct btrfsic_dev_state **selected_dev_state,
+ struct btrfs_super_block *selected_super);
+static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
+ struct block_device *bdev);
+static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
+ u64 bytenr,
+ struct btrfsic_dev_state *dev_state,
+ u64 dev_bytenr);
+
+static struct mutex btrfsic_mutex;
+static int btrfsic_is_initialized;
+static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable;
+
+
+static void btrfsic_block_init(struct btrfsic_block *b)
+{
+ b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER;
+ b->dev_state = NULL;
+ b->dev_bytenr = 0;
+ b->logical_bytenr = 0;
+ b->generation = BTRFSIC_GENERATION_UNKNOWN;
+ b->disk_key.objectid = 0;
+ b->disk_key.type = 0;
+ b->disk_key.offset = 0;
+ b->is_metadata = 0;
+ b->is_superblock = 0;
+ b->is_iodone = 0;
+ b->iodone_w_error = 0;
+ b->never_written = 0;
+ b->mirror_num = 0;
+ b->next_in_same_bio = NULL;
+ b->orig_bio_bh_private = NULL;
+ b->orig_bio_bh_end_io.bio = NULL;
+ INIT_LIST_HEAD(&b->collision_resolving_node);
+ INIT_LIST_HEAD(&b->all_blocks_node);
+ INIT_LIST_HEAD(&b->ref_to_list);
+ INIT_LIST_HEAD(&b->ref_from_list);
+ b->submit_bio_bh_rw = 0;
+ b->flush_gen = 0;
+}
+
+static struct btrfsic_block *btrfsic_block_alloc(void)
+{
+ struct btrfsic_block *b;
+
+ b = kzalloc(sizeof(*b), GFP_NOFS);
+ if (NULL != b)
+ btrfsic_block_init(b);
+
+ return b;
+}
+
+static void btrfsic_block_free(struct btrfsic_block *b)
+{
+ BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num));
+ kfree(b);
+}
+
+static void btrfsic_block_link_init(struct btrfsic_block_link *l)
+{
+ l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER;
+ l->ref_cnt = 1;
+ INIT_LIST_HEAD(&l->node_ref_to);
+ INIT_LIST_HEAD(&l->node_ref_from);
+ INIT_LIST_HEAD(&l->collision_resolving_node);
+ l->block_ref_to = NULL;
+ l->block_ref_from = NULL;
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_alloc(void)
+{
+ struct btrfsic_block_link *l;
+
+ l = kzalloc(sizeof(*l), GFP_NOFS);
+ if (NULL != l)
+ btrfsic_block_link_init(l);
+
+ return l;
+}
+
+static void btrfsic_block_link_free(struct btrfsic_block_link *l)
+{
+ BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num));
+ kfree(l);
+}
+
+static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
+{
+ ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
+ ds->bdev = NULL;
+ ds->state = NULL;
+ ds->name[0] = '\0';
+ INIT_LIST_HEAD(&ds->collision_resolving_node);
+ ds->last_flush_gen = 0;
+ btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
+ ds->dummy_block_for_bio_bh_flush.is_iodone = 1;
+ ds->dummy_block_for_bio_bh_flush.dev_state = ds;
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void)
+{
+ struct btrfsic_dev_state *ds;
+
+ ds = kzalloc(sizeof(*ds), GFP_NOFS);
+ if (NULL != ds)
+ btrfsic_dev_state_init(ds);
+
+ return ds;
+}
+
+static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds)
+{
+ BUG_ON(!(NULL == ds ||
+ BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num));
+ kfree(ds);
+}
+
+static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h)
+{
+ int i;
+
+ for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++)
+ INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
+ struct btrfsic_block_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)(b->dev_bytenr >> 16)) ^
+ ((unsigned int)((uintptr_t)b->dev_state->bdev))) &
+ (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
+
+ list_add(&b->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_block_hashtable_remove(struct btrfsic_block *b)
+{
+ list_del(&b->collision_resolving_node);
+}
+
+static struct btrfsic_block *btrfsic_block_hashtable_lookup(
+ struct block_device *bdev,
+ u64 dev_bytenr,
+ struct btrfsic_block_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)(dev_bytenr >> 16)) ^
+ ((unsigned int)((uintptr_t)bdev))) &
+ (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
+ struct list_head *elem;
+
+ list_for_each(elem, h->table + hashval) {
+ struct btrfsic_block *const b =
+ list_entry(elem, struct btrfsic_block,
+ collision_resolving_node);
+
+ if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
+ return b;
+ }
+
+ return NULL;
+}
+
+static void btrfsic_block_link_hashtable_init(
+ struct btrfsic_block_link_hashtable *h)
+{
+ int i;
+
+ for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++)
+ INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_block_link_hashtable_add(
+ struct btrfsic_block_link *l,
+ struct btrfsic_block_link_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^
+ ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^
+ ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^
+ ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev)))
+ & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
+
+ BUG_ON(NULL == l->block_ref_to);
+ BUG_ON(NULL == l->block_ref_from);
+ list_add(&l->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l)
+{
+ list_del(&l->collision_resolving_node);
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
+ struct block_device *bdev_ref_to,
+ u64 dev_bytenr_ref_to,
+ struct block_device *bdev_ref_from,
+ u64 dev_bytenr_ref_from,
+ struct btrfsic_block_link_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)(dev_bytenr_ref_to >> 16)) ^
+ ((unsigned int)(dev_bytenr_ref_from >> 16)) ^
+ ((unsigned int)((uintptr_t)bdev_ref_to)) ^
+ ((unsigned int)((uintptr_t)bdev_ref_from))) &
+ (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
+ struct list_head *elem;
+
+ list_for_each(elem, h->table + hashval) {
+ struct btrfsic_block_link *const l =
+ list_entry(elem, struct btrfsic_block_link,
+ collision_resolving_node);
+
+ BUG_ON(NULL == l->block_ref_to);
+ BUG_ON(NULL == l->block_ref_from);
+ if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
+ l->block_ref_to->dev_bytenr == dev_bytenr_ref_to &&
+ l->block_ref_from->dev_state->bdev == bdev_ref_from &&
+ l->block_ref_from->dev_bytenr == dev_bytenr_ref_from)
+ return l;
+ }
+
+ return NULL;
+}
+
+static void btrfsic_dev_state_hashtable_init(
+ struct btrfsic_dev_state_hashtable *h)
+{
+ int i;
+
+ for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++)
+ INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_dev_state_hashtable_add(
+ struct btrfsic_dev_state *ds,
+ struct btrfsic_dev_state_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)((uintptr_t)ds->bdev)) &
+ (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
+
+ list_add(&ds->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds)
+{
+ list_del(&ds->collision_resolving_node);
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
+ struct block_device *bdev,
+ struct btrfsic_dev_state_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)((uintptr_t)bdev)) &
+ (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
+ struct list_head *elem;
+
+ list_for_each(elem, h->table + hashval) {
+ struct btrfsic_dev_state *const ds =
+ list_entry(elem, struct btrfsic_dev_state,
+ collision_resolving_node);
+
+ if (ds->bdev == bdev)
+ return ds;
+ }
+
+ return NULL;
+}
+
+static int btrfsic_process_superblock(struct btrfsic_state *state,
+ struct btrfs_fs_devices *fs_devices)
+{
+ int ret = 0;
+ struct btrfs_super_block *selected_super;
+ struct list_head *dev_head = &fs_devices->devices;
+ struct btrfs_device *device;
+ struct btrfsic_dev_state *selected_dev_state = NULL;
+ int pass;
+
+ BUG_ON(NULL == state);
+ selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
+ if (NULL == selected_super) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ return -1;
+ }
+
+ list_for_each_entry(device, dev_head, dev_list) {
+ int i;
+ struct btrfsic_dev_state *dev_state;
+
+ if (!device->bdev || !device->name)
+ continue;
+
+ dev_state = btrfsic_dev_state_lookup(device->bdev);
+ BUG_ON(NULL == dev_state);
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ ret = btrfsic_process_superblock_dev_mirror(
+ state, dev_state, device, i,
+ &selected_dev_state, selected_super);
+ if (0 != ret && 0 == i) {
+ kfree(selected_super);
+ return ret;
+ }
+ }
+ }
+
+ if (NULL == state->latest_superblock) {
+ printk(KERN_INFO "btrfsic: no superblock found!\n");
+ kfree(selected_super);
+ return -1;
+ }
+
+ state->csum_size = btrfs_super_csum_size(selected_super);
+
+ for (pass = 0; pass < 3; pass++) {
+ int num_copies;
+ int mirror_num;
+ u64 next_bytenr;
+
+ switch (pass) {
+ case 0:
+ next_bytenr = btrfs_super_root(selected_super);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "root@%llu\n",
+ (unsigned long long)next_bytenr);
+ break;
+ case 1:
+ next_bytenr = btrfs_super_chunk_root(selected_super);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "chunk@%llu\n",
+ (unsigned long long)next_bytenr);
+ break;
+ case 2:
+ next_bytenr = btrfs_super_log_root(selected_super);
+ if (0 == next_bytenr)
+ continue;
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "log@%llu\n",
+ (unsigned long long)next_bytenr);
+ break;
+ }
+
+ num_copies =
+ btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ next_bytenr, state->metablock_size);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ (unsigned long long)next_bytenr, num_copies);
+
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ struct btrfsic_block *next_block;
+ struct btrfsic_block_data_ctx tmp_next_block_ctx;
+ struct btrfsic_block_link *l;
+
+ ret = btrfsic_map_block(state, next_bytenr,
+ state->metablock_size,
+ &tmp_next_block_ctx,
+ mirror_num);
+ if (ret) {
+ printk(KERN_INFO "btrfsic:"
+ " btrfsic_map_block(root @%llu,"
+ " mirror %d) failed!\n",
+ (unsigned long long)next_bytenr,
+ mirror_num);
+ kfree(selected_super);
+ return -1;
+ }
+
+ next_block = btrfsic_block_hashtable_lookup(
+ tmp_next_block_ctx.dev->bdev,
+ tmp_next_block_ctx.dev_bytenr,
+ &state->block_hashtable);
+ BUG_ON(NULL == next_block);
+
+ l = btrfsic_block_link_hashtable_lookup(
+ tmp_next_block_ctx.dev->bdev,
+ tmp_next_block_ctx.dev_bytenr,
+ state->latest_superblock->dev_state->
+ bdev,
+ state->latest_superblock->dev_bytenr,
+ &state->block_link_hashtable);
+ BUG_ON(NULL == l);
+
+ ret = btrfsic_read_block(state, &tmp_next_block_ctx);
+ if (ret < (int)PAGE_CACHE_SIZE) {
+ printk(KERN_INFO
+ "btrfsic: read @logical %llu failed!\n",
+ (unsigned long long)
+ tmp_next_block_ctx.start);
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ kfree(selected_super);
+ return -1;
+ }
+
+ ret = btrfsic_process_metablock(state,
+ next_block,
+ &tmp_next_block_ctx,
+ BTRFS_MAX_LEVEL + 3, 1);
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ }
+ }
+
+ kfree(selected_super);
+ return ret;
+}
+
+static int btrfsic_process_superblock_dev_mirror(
+ struct btrfsic_state *state,
+ struct btrfsic_dev_state *dev_state,
+ struct btrfs_device *device,
+ int superblock_mirror_num,
+ struct btrfsic_dev_state **selected_dev_state,
+ struct btrfs_super_block *selected_super)
+{
+ struct btrfs_super_block *super_tmp;
+ u64 dev_bytenr;
+ struct buffer_head *bh;
+ struct btrfsic_block *superblock_tmp;
+ int pass;
+ struct block_device *const superblock_bdev = device->bdev;
+
+ /* super block bytenr is always the unmapped device bytenr */
+ dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
+ if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
+ return -1;
+ bh = __bread(superblock_bdev, dev_bytenr / 4096,
+ BTRFS_SUPER_INFO_SIZE);
+ if (NULL == bh)
+ return -1;
+ super_tmp = (struct btrfs_super_block *)
+ (bh->b_data + (dev_bytenr & 4095));
+
+ if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
+ strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
+ sizeof(super_tmp->magic)) ||
+ memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
+ btrfs_super_nodesize(super_tmp) != state->metablock_size ||
+ btrfs_super_leafsize(super_tmp) != state->metablock_size ||
+ btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
+ brelse(bh);
+ return 0;
+ }
+
+ superblock_tmp =
+ btrfsic_block_hashtable_lookup(superblock_bdev,
+ dev_bytenr,
+ &state->block_hashtable);
+ if (NULL == superblock_tmp) {
+ superblock_tmp = btrfsic_block_alloc();
+ if (NULL == superblock_tmp) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ brelse(bh);
+ return -1;
+ }
+ /* for superblock, only the dev_bytenr makes sense */
+ superblock_tmp->dev_bytenr = dev_bytenr;
+ superblock_tmp->dev_state = dev_state;
+ superblock_tmp->logical_bytenr = dev_bytenr;
+ superblock_tmp->generation = btrfs_super_generation(super_tmp);
+ superblock_tmp->is_metadata = 1;
+ superblock_tmp->is_superblock = 1;
+ superblock_tmp->is_iodone = 1;
+ superblock_tmp->never_written = 0;
+ superblock_tmp->mirror_num = 1 + superblock_mirror_num;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+ printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)"
+ " @%llu (%s/%llu/%d)\n",
+ superblock_bdev,
+ rcu_str_deref(device->name),
+ (unsigned long long)dev_bytenr,
+ dev_state->name,
+ (unsigned long long)dev_bytenr,
+ superblock_mirror_num);
+ list_add(&superblock_tmp->all_blocks_node,
+ &state->all_blocks_list);
+ btrfsic_block_hashtable_add(superblock_tmp,
+ &state->block_hashtable);
+ }
+
+ /* select the one with the highest generation field */
+ if (btrfs_super_generation(super_tmp) >
+ state->max_superblock_generation ||
+ 0 == state->max_superblock_generation) {
+ memcpy(selected_super, super_tmp, sizeof(*selected_super));
+ *selected_dev_state = dev_state;
+ state->max_superblock_generation =
+ btrfs_super_generation(super_tmp);
+ state->latest_superblock = superblock_tmp;
+ }
+
+ for (pass = 0; pass < 3; pass++) {
+ u64 next_bytenr;
+ int num_copies;
+ int mirror_num;
+ const char *additional_string = NULL;
+ struct btrfs_disk_key tmp_disk_key;
+
+ tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
+ tmp_disk_key.offset = 0;
+ switch (pass) {
+ case 0:
+ tmp_disk_key.objectid =
+ cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
+ additional_string = "initial root ";
+ next_bytenr = btrfs_super_root(super_tmp);
+ break;
+ case 1:
+ tmp_disk_key.objectid =
+ cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
+ additional_string = "initial chunk ";
+ next_bytenr = btrfs_super_chunk_root(super_tmp);
+ break;
+ case 2:
+ tmp_disk_key.objectid =
+ cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
+ additional_string = "initial log ";
+ next_bytenr = btrfs_super_log_root(super_tmp);
+ if (0 == next_bytenr)
+ continue;
+ break;
+ }
+
+ num_copies =
+ btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ next_bytenr, state->metablock_size);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ (unsigned long long)next_bytenr, num_copies);
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ struct btrfsic_block *next_block;
+ struct btrfsic_block_data_ctx tmp_next_block_ctx;
+ struct btrfsic_block_link *l;
+
+ if (btrfsic_map_block(state, next_bytenr,
+ state->metablock_size,
+ &tmp_next_block_ctx,
+ mirror_num)) {
+ printk(KERN_INFO "btrfsic: btrfsic_map_block("
+ "bytenr @%llu, mirror %d) failed!\n",
+ (unsigned long long)next_bytenr,
+ mirror_num);
+ brelse(bh);
+ return -1;
+ }
+
+ next_block = btrfsic_block_lookup_or_add(
+ state, &tmp_next_block_ctx,
+ additional_string, 1, 1, 0,
+ mirror_num, NULL);
+ if (NULL == next_block) {
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ brelse(bh);
+ return -1;
+ }
+
+ next_block->disk_key = tmp_disk_key;
+ next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
+ l = btrfsic_block_link_lookup_or_add(
+ state, &tmp_next_block_ctx,
+ next_block, superblock_tmp,
+ BTRFSIC_GENERATION_UNKNOWN);
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ if (NULL == l) {
+ brelse(bh);
+ return -1;
+ }
+ }
+ }
+ if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
+ btrfsic_dump_tree_sub(state, superblock_tmp, 0);
+
+ brelse(bh);
+ return 0;
+}
+
+static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
+{
+ struct btrfsic_stack_frame *sf;
+
+ sf = kzalloc(sizeof(*sf), GFP_NOFS);
+ if (NULL == sf)
+ printk(KERN_INFO "btrfsic: alloc memory failed!\n");
+ else
+ sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
+ return sf;
+}
+
+static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf)
+{
+ BUG_ON(!(NULL == sf ||
+ BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic));
+ kfree(sf);
+}
+
+static int btrfsic_process_metablock(
+ struct btrfsic_state *state,
+ struct btrfsic_block *const first_block,
+ struct btrfsic_block_data_ctx *const first_block_ctx,
+ int first_limit_nesting, int force_iodone_flag)
+{
+ struct btrfsic_stack_frame initial_stack_frame = { 0 };
+ struct btrfsic_stack_frame *sf;
+ struct btrfsic_stack_frame *next_stack;
+ struct btrfs_header *const first_hdr =
+ (struct btrfs_header *)first_block_ctx->datav[0];
+
+ BUG_ON(!first_hdr);
+ sf = &initial_stack_frame;
+ sf->error = 0;
+ sf->i = -1;
+ sf->limit_nesting = first_limit_nesting;
+ sf->block = first_block;
+ sf->block_ctx = first_block_ctx;
+ sf->next_block = NULL;
+ sf->hdr = first_hdr;
+ sf->prev = NULL;
+
+continue_with_new_stack_frame:
+ sf->block->generation = le64_to_cpu(sf->hdr->generation);
+ if (0 == sf->hdr->level) {
+ struct btrfs_leaf *const leafhdr =
+ (struct btrfs_leaf *)sf->hdr;
+
+ if (-1 == sf->i) {
+ sf->nr = le32_to_cpu(leafhdr->header.nritems);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "leaf %llu items %d generation %llu"
+ " owner %llu\n",
+ (unsigned long long)
+ sf->block_ctx->start,
+ sf->nr,
+ (unsigned long long)
+ le64_to_cpu(leafhdr->header.generation),
+ (unsigned long long)
+ le64_to_cpu(leafhdr->header.owner));
+ }
+
+continue_with_current_leaf_stack_frame:
+ if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
+ sf->i++;
+ sf->num_copies = 0;
+ }
+
+ if (sf->i < sf->nr) {
+ struct btrfs_item disk_item;
+ u32 disk_item_offset =
+ (uintptr_t)(leafhdr->items + sf->i) -
+ (uintptr_t)leafhdr;
+ struct btrfs_disk_key *disk_key;
+ u8 type;
+ u32 item_offset;
+
+ if (disk_item_offset + sizeof(struct btrfs_item) >
+ sf->block_ctx->len) {
+leaf_item_out_of_bounce_error:
+ printk(KERN_INFO
+ "btrfsic: leaf item out of bounce at logical %llu, dev %s\n",
+ sf->block_ctx->start,
+ sf->block_ctx->dev->name);
+ goto one_stack_frame_backwards;
+ }
+ btrfsic_read_from_block_data(sf->block_ctx,
+ &disk_item,
+ disk_item_offset,
+ sizeof(struct btrfs_item));
+ item_offset = le32_to_cpu(disk_item.offset);
+ disk_key = &disk_item.key;
+ type = disk_key->type;
+
+ if (BTRFS_ROOT_ITEM_KEY == type) {
+ struct btrfs_root_item root_item;
+ u32 root_item_offset;
+ u64 next_bytenr;
+
+ root_item_offset = item_offset +
+ offsetof(struct btrfs_leaf, items);
+ if (root_item_offset +
+ sizeof(struct btrfs_root_item) >
+ sf->block_ctx->len)
+ goto leaf_item_out_of_bounce_error;
+ btrfsic_read_from_block_data(
+ sf->block_ctx, &root_item,
+ root_item_offset,
+ sizeof(struct btrfs_root_item));
+ next_bytenr = le64_to_cpu(root_item.bytenr);
+
+ sf->error =
+ btrfsic_create_link_to_next_block(
+ state,
+ sf->block,
+ sf->block_ctx,
+ next_bytenr,
+ sf->limit_nesting,
+ &sf->next_block_ctx,
+ &sf->next_block,
+ force_iodone_flag,
+ &sf->num_copies,
+ &sf->mirror_num,
+ disk_key,
+ le64_to_cpu(root_item.
+ generation));
+ if (sf->error)
+ goto one_stack_frame_backwards;
+
+ if (NULL != sf->next_block) {
+ struct btrfs_header *const next_hdr =
+ (struct btrfs_header *)
+ sf->next_block_ctx.datav[0];
+
+ next_stack =
+ btrfsic_stack_frame_alloc();
+ if (NULL == next_stack) {
+ btrfsic_release_block_ctx(
+ &sf->
+ next_block_ctx);
+ goto one_stack_frame_backwards;
+ }
+
+ next_stack->i = -1;
+ next_stack->block = sf->next_block;
+ next_stack->block_ctx =
+ &sf->next_block_ctx;
+ next_stack->next_block = NULL;
+ next_stack->hdr = next_hdr;
+ next_stack->limit_nesting =
+ sf->limit_nesting - 1;
+ next_stack->prev = sf;
+ sf = next_stack;
+ goto continue_with_new_stack_frame;
+ }
+ } else if (BTRFS_EXTENT_DATA_KEY == type &&
+ state->include_extent_data) {
+ sf->error = btrfsic_handle_extent_data(
+ state,
+ sf->block,
+ sf->block_ctx,
+ item_offset,
+ force_iodone_flag);
+ if (sf->error)
+ goto one_stack_frame_backwards;
+ }
+
+ goto continue_with_current_leaf_stack_frame;
+ }
+ } else {
+ struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr;
+
+ if (-1 == sf->i) {
+ sf->nr = le32_to_cpu(nodehdr->header.nritems);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO "node %llu level %d items %d"
+ " generation %llu owner %llu\n",
+ (unsigned long long)
+ sf->block_ctx->start,
+ nodehdr->header.level, sf->nr,
+ (unsigned long long)
+ le64_to_cpu(nodehdr->header.generation),
+ (unsigned long long)
+ le64_to_cpu(nodehdr->header.owner));
+ }
+
+continue_with_current_node_stack_frame:
+ if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
+ sf->i++;
+ sf->num_copies = 0;
+ }
+
+ if (sf->i < sf->nr) {
+ struct btrfs_key_ptr key_ptr;
+ u32 key_ptr_offset;
+ u64 next_bytenr;
+
+ key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) -
+ (uintptr_t)nodehdr;
+ if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
+ sf->block_ctx->len) {
+ printk(KERN_INFO
+ "btrfsic: node item out of bounce at logical %llu, dev %s\n",
+ sf->block_ctx->start,
+ sf->block_ctx->dev->name);
+ goto one_stack_frame_backwards;
+ }
+ btrfsic_read_from_block_data(
+ sf->block_ctx, &key_ptr, key_ptr_offset,
+ sizeof(struct btrfs_key_ptr));
+ next_bytenr = le64_to_cpu(key_ptr.blockptr);
+
+ sf->error = btrfsic_create_link_to_next_block(
+ state,
+ sf->block,
+ sf->block_ctx,
+ next_bytenr,
+ sf->limit_nesting,
+ &sf->next_block_ctx,
+ &sf->next_block,
+ force_iodone_flag,
+ &sf->num_copies,
+ &sf->mirror_num,
+ &key_ptr.key,
+ le64_to_cpu(key_ptr.generation));
+ if (sf->error)
+ goto one_stack_frame_backwards;
+
+ if (NULL != sf->next_block) {
+ struct btrfs_header *const next_hdr =
+ (struct btrfs_header *)
+ sf->next_block_ctx.datav[0];
+
+ next_stack = btrfsic_stack_frame_alloc();
+ if (NULL == next_stack)
+ goto one_stack_frame_backwards;
+
+ next_stack->i = -1;
+ next_stack->block = sf->next_block;
+ next_stack->block_ctx = &sf->next_block_ctx;
+ next_stack->next_block = NULL;
+ next_stack->hdr = next_hdr;
+ next_stack->limit_nesting =
+ sf->limit_nesting - 1;
+ next_stack->prev = sf;
+ sf = next_stack;
+ goto continue_with_new_stack_frame;
+ }
+
+ goto continue_with_current_node_stack_frame;
+ }
+ }
+
+one_stack_frame_backwards:
+ if (NULL != sf->prev) {
+ struct btrfsic_stack_frame *const prev = sf->prev;
+
+ /* the one for the initial block is freed in the caller */
+ btrfsic_release_block_ctx(sf->block_ctx);
+
+ if (sf->error) {
+ prev->error = sf->error;
+ btrfsic_stack_frame_free(sf);
+ sf = prev;
+ goto one_stack_frame_backwards;
+ }
+
+ btrfsic_stack_frame_free(sf);
+ sf = prev;
+ goto continue_with_new_stack_frame;
+ } else {
+ BUG_ON(&initial_stack_frame != sf);
+ }
+
+ return sf->error;
+}
+
+static void btrfsic_read_from_block_data(
+ struct btrfsic_block_data_ctx *block_ctx,
+ void *dstv, u32 offset, size_t len)
+{
+ size_t cur;
+ size_t offset_in_page;
+ char *kaddr;
+ char *dst = (char *)dstv;
+ size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT;
+
+ WARN_ON(offset + len > block_ctx->len);
+ offset_in_page = (start_offset + offset) &
+ ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page));
+ BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT);
+ kaddr = block_ctx->datav[i];
+ memcpy(dst, kaddr + offset_in_page, cur);
+
+ dst += cur;
+ len -= cur;
+ offset_in_page = 0;
+ i++;
+ }
+}
+
+static int btrfsic_create_link_to_next_block(
+ struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx *block_ctx,
+ u64 next_bytenr,
+ int limit_nesting,
+ struct btrfsic_block_data_ctx *next_block_ctx,
+ struct btrfsic_block **next_blockp,
+ int force_iodone_flag,
+ int *num_copiesp, int *mirror_nump,
+ struct btrfs_disk_key *disk_key,
+ u64 parent_generation)
+{
+ struct btrfsic_block *next_block = NULL;
+ int ret;
+ struct btrfsic_block_link *l;
+ int did_alloc_block_link;
+ int block_was_created;
+
+ *next_blockp = NULL;
+ if (0 == *num_copiesp) {
+ *num_copiesp =
+ btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ next_bytenr, state->metablock_size);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ (unsigned long long)next_bytenr, *num_copiesp);
+ *mirror_nump = 1;
+ }
+
+ if (*mirror_nump > *num_copiesp)
+ return 0;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "btrfsic_create_link_to_next_block(mirror_num=%d)\n",
+ *mirror_nump);
+ ret = btrfsic_map_block(state, next_bytenr,
+ state->metablock_size,
+ next_block_ctx, *mirror_nump);
+ if (ret) {
+ printk(KERN_INFO
+ "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
+ (unsigned long long)next_bytenr, *mirror_nump);
+ btrfsic_release_block_ctx(next_block_ctx);
+ *next_blockp = NULL;
+ return -1;
+ }
+
+ next_block = btrfsic_block_lookup_or_add(state,
+ next_block_ctx, "referenced ",
+ 1, force_iodone_flag,
+ !force_iodone_flag,
+ *mirror_nump,
+ &block_was_created);
+ if (NULL == next_block) {
+ btrfsic_release_block_ctx(next_block_ctx);
+ *next_blockp = NULL;
+ return -1;
+ }
+ if (block_was_created) {
+ l = NULL;
+ next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
+ } else {
+ if (next_block->logical_bytenr != next_bytenr &&
+ !(!next_block->is_metadata &&
+ 0 == next_block->logical_bytenr)) {
+ printk(KERN_INFO
+ "Referenced block @%llu (%s/%llu/%d)"
+ " found in hash table, %c,"
+ " bytenr mismatch (!= stored %llu).\n",
+ (unsigned long long)next_bytenr,
+ next_block_ctx->dev->name,
+ (unsigned long long)next_block_ctx->dev_bytenr,
+ *mirror_nump,
+ btrfsic_get_block_type(state, next_block),
+ (unsigned long long)next_block->logical_bytenr);
+ } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "Referenced block @%llu (%s/%llu/%d)"
+ " found in hash table, %c.\n",
+ (unsigned long long)next_bytenr,
+ next_block_ctx->dev->name,
+ (unsigned long long)next_block_ctx->dev_bytenr,
+ *mirror_nump,
+ btrfsic_get_block_type(state, next_block));
+ next_block->logical_bytenr = next_bytenr;
+
+ next_block->mirror_num = *mirror_nump;
+ l = btrfsic_block_link_hashtable_lookup(
+ next_block_ctx->dev->bdev,
+ next_block_ctx->dev_bytenr,
+ block_ctx->dev->bdev,
+ block_ctx->dev_bytenr,
+ &state->block_link_hashtable);
+ }
+
+ next_block->disk_key = *disk_key;
+ if (NULL == l) {
+ l = btrfsic_block_link_alloc();
+ if (NULL == l) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ btrfsic_release_block_ctx(next_block_ctx);
+ *next_blockp = NULL;
+ return -1;
+ }
+
+ did_alloc_block_link = 1;
+ l->block_ref_to = next_block;
+ l->block_ref_from = block;
+ l->ref_cnt = 1;
+ l->parent_generation = parent_generation;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_add_link(state, l);
+
+ list_add(&l->node_ref_to, &block->ref_to_list);
+ list_add(&l->node_ref_from, &next_block->ref_from_list);
+
+ btrfsic_block_link_hashtable_add(l,
+ &state->block_link_hashtable);
+ } else {
+ did_alloc_block_link = 0;
+ if (0 == limit_nesting) {
+ l->ref_cnt++;
+ l->parent_generation = parent_generation;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_add_link(state, l);
+ }
+ }
+
+ if (limit_nesting > 0 && did_alloc_block_link) {
+ ret = btrfsic_read_block(state, next_block_ctx);
+ if (ret < (int)next_block_ctx->len) {
+ printk(KERN_INFO
+ "btrfsic: read block @logical %llu failed!\n",
+ (unsigned long long)next_bytenr);
+ btrfsic_release_block_ctx(next_block_ctx);
+ *next_blockp = NULL;
+ return -1;
+ }
+
+ *next_blockp = next_block;
+ } else {
+ *next_blockp = NULL;
+ }
+ (*mirror_nump)++;
+
+ return 0;
+}
+
+static int btrfsic_handle_extent_data(
+ struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx *block_ctx,
+ u32 item_offset, int force_iodone_flag)
+{
+ int ret;
+ struct btrfs_file_extent_item file_extent_item;
+ u64 file_extent_item_offset;
+ u64 next_bytenr;
+ u64 num_bytes;
+ u64 generation;
+ struct btrfsic_block_link *l;
+
+ file_extent_item_offset = offsetof(struct btrfs_leaf, items) +
+ item_offset;
+ if (file_extent_item_offset +
+ offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
+ block_ctx->len) {
+ printk(KERN_INFO
+ "btrfsic: file item out of bounce at logical %llu, dev %s\n",
+ block_ctx->start, block_ctx->dev->name);
+ return -1;
+ }
+
+ btrfsic_read_from_block_data(block_ctx, &file_extent_item,
+ file_extent_item_offset,
+ offsetof(struct btrfs_file_extent_item, disk_num_bytes));
+ if (BTRFS_FILE_EXTENT_REG != file_extent_item.type ||
+ ((u64)0) == le64_to_cpu(file_extent_item.disk_bytenr)) {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+ printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu\n",
+ file_extent_item.type,
+ (unsigned long long)
+ le64_to_cpu(file_extent_item.disk_bytenr));
+ return 0;
+ }
+
+ if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
+ block_ctx->len) {
+ printk(KERN_INFO
+ "btrfsic: file item out of bounce at logical %llu, dev %s\n",
+ block_ctx->start, block_ctx->dev->name);
+ return -1;
+ }
+ btrfsic_read_from_block_data(block_ctx, &file_extent_item,
+ file_extent_item_offset,
+ sizeof(struct btrfs_file_extent_item));
+ next_bytenr = le64_to_cpu(file_extent_item.disk_bytenr) +
+ le64_to_cpu(file_extent_item.offset);
+ generation = le64_to_cpu(file_extent_item.generation);
+ num_bytes = le64_to_cpu(file_extent_item.num_bytes);
+ generation = le64_to_cpu(file_extent_item.generation);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+ printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
+ " offset = %llu, num_bytes = %llu\n",
+ file_extent_item.type,
+ (unsigned long long)
+ le64_to_cpu(file_extent_item.disk_bytenr),
+ (unsigned long long)le64_to_cpu(file_extent_item.offset),
+ (unsigned long long)num_bytes);
+ while (num_bytes > 0) {
+ u32 chunk_len;
+ int num_copies;
+ int mirror_num;
+
+ if (num_bytes > state->datablock_size)
+ chunk_len = state->datablock_size;
+ else
+ chunk_len = num_bytes;
+
+ num_copies =
+ btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ next_bytenr, state->datablock_size);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ (unsigned long long)next_bytenr, num_copies);
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ struct btrfsic_block_data_ctx next_block_ctx;
+ struct btrfsic_block *next_block;
+ int block_was_created;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO "btrfsic_handle_extent_data("
+ "mirror_num=%d)\n", mirror_num);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+ printk(KERN_INFO
+ "\tdisk_bytenr = %llu, num_bytes %u\n",
+ (unsigned long long)next_bytenr,
+ chunk_len);
+ ret = btrfsic_map_block(state, next_bytenr,
+ chunk_len, &next_block_ctx,
+ mirror_num);
+ if (ret) {
+ printk(KERN_INFO
+ "btrfsic: btrfsic_map_block(@%llu,"
+ " mirror=%d) failed!\n",
+ (unsigned long long)next_bytenr,
+ mirror_num);
+ return -1;
+ }
+
+ next_block = btrfsic_block_lookup_or_add(
+ state,
+ &next_block_ctx,
+ "referenced ",
+ 0,
+ force_iodone_flag,
+ !force_iodone_flag,
+ mirror_num,
+ &block_was_created);
+ if (NULL == next_block) {
+ printk(KERN_INFO
+ "btrfsic: error, kmalloc failed!\n");
+ btrfsic_release_block_ctx(&next_block_ctx);
+ return -1;
+ }
+ if (!block_was_created) {
+ if (next_block->logical_bytenr != next_bytenr &&
+ !(!next_block->is_metadata &&
+ 0 == next_block->logical_bytenr)) {
+ printk(KERN_INFO
+ "Referenced block"
+ " @%llu (%s/%llu/%d)"
+ " found in hash table, D,"
+ " bytenr mismatch"
+ " (!= stored %llu).\n",
+ (unsigned long long)next_bytenr,
+ next_block_ctx.dev->name,
+ (unsigned long long)
+ next_block_ctx.dev_bytenr,
+ mirror_num,
+ (unsigned long long)
+ next_block->logical_bytenr);
+ }
+ next_block->logical_bytenr = next_bytenr;
+ next_block->mirror_num = mirror_num;
+ }
+
+ l = btrfsic_block_link_lookup_or_add(state,
+ &next_block_ctx,
+ next_block, block,
+ generation);
+ btrfsic_release_block_ctx(&next_block_ctx);
+ if (NULL == l)
+ return -1;
+ }
+
+ next_bytenr += chunk_len;
+ num_bytes -= chunk_len;
+ }
+
+ return 0;
+}
+
+static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
+ struct btrfsic_block_data_ctx *block_ctx_out,
+ int mirror_num)
+{
+ int ret;
+ u64 length;
+ struct btrfs_bio *multi = NULL;
+ struct btrfs_device *device;
+
+ length = len;
+ ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ,
+ bytenr, &length, &multi, mirror_num);
+
+ device = multi->stripes[0].dev;
+ block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
+ block_ctx_out->dev_bytenr = multi->stripes[0].physical;
+ block_ctx_out->start = bytenr;
+ block_ctx_out->len = len;
+ block_ctx_out->datav = NULL;
+ block_ctx_out->pagev = NULL;
+ block_ctx_out->mem_to_free = NULL;
+
+ if (0 == ret)
+ kfree(multi);
+ if (NULL == block_ctx_out->dev) {
+ ret = -ENXIO;
+ printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
+ }
+
+ return ret;
+}
+
+static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
+ u32 len, struct block_device *bdev,
+ struct btrfsic_block_data_ctx *block_ctx_out)
+{
+ block_ctx_out->dev = btrfsic_dev_state_lookup(bdev);
+ block_ctx_out->dev_bytenr = bytenr;
+ block_ctx_out->start = bytenr;
+ block_ctx_out->len = len;
+ block_ctx_out->datav = NULL;
+ block_ctx_out->pagev = NULL;
+ block_ctx_out->mem_to_free = NULL;
+ if (NULL != block_ctx_out->dev) {
+ return 0;
+ } else {
+ printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n");
+ return -ENXIO;
+ }
+}
+
+static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
+{
+ if (block_ctx->mem_to_free) {
+ unsigned int num_pages;
+
+ BUG_ON(!block_ctx->datav);
+ BUG_ON(!block_ctx->pagev);
+ num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ while (num_pages > 0) {
+ num_pages--;
+ if (block_ctx->datav[num_pages]) {
+ kunmap(block_ctx->pagev[num_pages]);
+ block_ctx->datav[num_pages] = NULL;
+ }
+ if (block_ctx->pagev[num_pages]) {
+ __free_page(block_ctx->pagev[num_pages]);
+ block_ctx->pagev[num_pages] = NULL;
+ }
+ }
+
+ kfree(block_ctx->mem_to_free);
+ block_ctx->mem_to_free = NULL;
+ block_ctx->pagev = NULL;
+ block_ctx->datav = NULL;
+ }
+}
+
+static int btrfsic_read_block(struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *block_ctx)
+{
+ unsigned int num_pages;
+ unsigned int i;
+ u64 dev_bytenr;
+ int ret;
+
+ BUG_ON(block_ctx->datav);
+ BUG_ON(block_ctx->pagev);
+ BUG_ON(block_ctx->mem_to_free);
+ if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) {
+ printk(KERN_INFO
+ "btrfsic: read_block() with unaligned bytenr %llu\n",
+ (unsigned long long)block_ctx->dev_bytenr);
+ return -1;
+ }
+
+ num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) +
+ sizeof(*block_ctx->pagev)) *
+ num_pages, GFP_NOFS);
+ if (!block_ctx->mem_to_free)
+ return -1;
+ block_ctx->datav = block_ctx->mem_to_free;
+ block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
+ for (i = 0; i < num_pages; i++) {
+ block_ctx->pagev[i] = alloc_page(GFP_NOFS);
+ if (!block_ctx->pagev[i])
+ return -1;
+ }
+
+ dev_bytenr = block_ctx->dev_bytenr;
+ for (i = 0; i < num_pages;) {
+ struct bio *bio;
+ unsigned int j;
+ DECLARE_COMPLETION_ONSTACK(complete);
+
+ bio = bio_alloc(GFP_NOFS, num_pages - i);
+ if (!bio) {
+ printk(KERN_INFO
+ "btrfsic: bio_alloc() for %u pages failed!\n",
+ num_pages - i);
+ return -1;
+ }
+ bio->bi_bdev = block_ctx->dev->bdev;
+ bio->bi_sector = dev_bytenr >> 9;
+ bio->bi_end_io = btrfsic_complete_bio_end_io;
+ bio->bi_private = &complete;
+
+ for (j = i; j < num_pages; j++) {
+ ret = bio_add_page(bio, block_ctx->pagev[j],
+ PAGE_CACHE_SIZE, 0);
+ if (PAGE_CACHE_SIZE != ret)
+ break;
+ }
+ if (j == i) {
+ printk(KERN_INFO
+ "btrfsic: error, failed to add a single page!\n");
+ return -1;
+ }
+ submit_bio(READ, bio);
+
+ /* this will also unplug the queue */
+ wait_for_completion(&complete);
+
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+ printk(KERN_INFO
+ "btrfsic: read error at logical %llu dev %s!\n",
+ block_ctx->start, block_ctx->dev->name);
+ bio_put(bio);
+ return -1;
+ }
+ bio_put(bio);
+ dev_bytenr += (j - i) * PAGE_CACHE_SIZE;
+ i = j;
+ }
+ for (i = 0; i < num_pages; i++) {
+ block_ctx->datav[i] = kmap(block_ctx->pagev[i]);
+ if (!block_ctx->datav[i]) {
+ printk(KERN_INFO "btrfsic: kmap() failed (dev %s)!\n",
+ block_ctx->dev->name);
+ return -1;
+ }
+ }
+
+ return block_ctx->len;
+}
+
+static void btrfsic_complete_bio_end_io(struct bio *bio, int err)
+{
+ complete((struct completion *)bio->bi_private);
+}
+
+static void btrfsic_dump_database(struct btrfsic_state *state)
+{
+ struct list_head *elem_all;
+
+ BUG_ON(NULL == state);
+
+ printk(KERN_INFO "all_blocks_list:\n");
+ list_for_each(elem_all, &state->all_blocks_list) {
+ const struct btrfsic_block *const b_all =
+ list_entry(elem_all, struct btrfsic_block,
+ all_blocks_node);
+ struct list_head *elem_ref_to;
+ struct list_head *elem_ref_from;
+
+ printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n",
+ btrfsic_get_block_type(state, b_all),
+ (unsigned long long)b_all->logical_bytenr,
+ b_all->dev_state->name,
+ (unsigned long long)b_all->dev_bytenr,
+ b_all->mirror_num);
+
+ list_for_each(elem_ref_to, &b_all->ref_to_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to,
+ struct btrfsic_block_link,
+ node_ref_to);
+
+ printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
+ " refers %u* to"
+ " %c @%llu (%s/%llu/%d)\n",
+ btrfsic_get_block_type(state, b_all),
+ (unsigned long long)b_all->logical_bytenr,
+ b_all->dev_state->name,
+ (unsigned long long)b_all->dev_bytenr,
+ b_all->mirror_num,
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ }
+
+ list_for_each(elem_ref_from, &b_all->ref_from_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_from,
+ struct btrfsic_block_link,
+ node_ref_from);
+
+ printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
+ " is ref %u* from"
+ " %c @%llu (%s/%llu/%d)\n",
+ btrfsic_get_block_type(state, b_all),
+ (unsigned long long)b_all->logical_bytenr,
+ b_all->dev_state->name,
+ (unsigned long long)b_all->dev_bytenr,
+ b_all->mirror_num,
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_from),
+ (unsigned long long)
+ l->block_ref_from->logical_bytenr,
+ l->block_ref_from->dev_state->name,
+ (unsigned long long)
+ l->block_ref_from->dev_bytenr,
+ l->block_ref_from->mirror_num);
+ }
+
+ printk(KERN_INFO "\n");
+ }
+}
+
+/*
+ * Test whether the disk block contains a tree block (leaf or node)
+ * (note that this test fails for the super block)
+ */
+static int btrfsic_test_for_metadata(struct btrfsic_state *state,
+ char **datav, unsigned int num_pages)
+{
+ struct btrfs_header *h;
+ u8 csum[BTRFS_CSUM_SIZE];
+ u32 crc = ~(u32)0;
+ unsigned int i;
+
+ if (num_pages * PAGE_CACHE_SIZE < state->metablock_size)
+ return 1; /* not metadata */
+ num_pages = state->metablock_size >> PAGE_CACHE_SHIFT;
+ h = (struct btrfs_header *)datav[0];
+
+ if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
+ return 1;
+
+ for (i = 0; i < num_pages; i++) {
+ u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
+ size_t sublen = i ? PAGE_CACHE_SIZE :
+ (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE);
+
+ crc = crc32c(crc, data, sublen);
+ }
+ btrfs_csum_final(crc, csum);
+ if (memcmp(csum, h->csum, state->csum_size))
+ return 1;
+
+ return 0; /* is metadata */
+}
+
+static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
+ u64 dev_bytenr, char **mapped_datav,
+ unsigned int num_pages,
+ struct bio *bio, int *bio_is_patched,
+ struct buffer_head *bh,
+ int submit_bio_bh_rw)
+{
+ int is_metadata;
+ struct btrfsic_block *block;
+ struct btrfsic_block_data_ctx block_ctx;
+ int ret;
+ struct btrfsic_state *state = dev_state->state;
+ struct block_device *bdev = dev_state->bdev;
+ unsigned int processed_len;
+
+ if (NULL != bio_is_patched)
+ *bio_is_patched = 0;
+
+again:
+ if (num_pages == 0)
+ return;
+
+ processed_len = 0;
+ is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav,
+ num_pages));
+
+ block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
+ &state->block_hashtable);
+ if (NULL != block) {
+ u64 bytenr = 0;
+ struct list_head *elem_ref_to;
+ struct list_head *tmp_ref_to;
+
+ if (block->is_superblock) {
+ bytenr = le64_to_cpu(((struct btrfs_super_block *)
+ mapped_datav[0])->bytenr);
+ if (num_pages * PAGE_CACHE_SIZE <
+ BTRFS_SUPER_INFO_SIZE) {
+ printk(KERN_INFO
+ "btrfsic: cannot work with too short bios!\n");
+ return;
+ }
+ is_metadata = 1;
+ BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1));
+ processed_len = BTRFS_SUPER_INFO_SIZE;
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
+ printk(KERN_INFO
+ "[before new superblock is written]:\n");
+ btrfsic_dump_tree_sub(state, block, 0);
+ }
+ }
+ if (is_metadata) {
+ if (!block->is_superblock) {
+ if (num_pages * PAGE_CACHE_SIZE <
+ state->metablock_size) {
+ printk(KERN_INFO
+ "btrfsic: cannot work with too short bios!\n");
+ return;
+ }
+ processed_len = state->metablock_size;
+ bytenr = le64_to_cpu(((struct btrfs_header *)
+ mapped_datav[0])->bytenr);
+ btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
+ dev_state,
+ dev_bytenr);
+ }
+ if (block->logical_bytenr != bytenr) {
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/%d)"
+ " found in hash table, %c,"
+ " bytenr mismatch"
+ " (!= stored %llu).\n",
+ (unsigned long long)bytenr,
+ dev_state->name,
+ (unsigned long long)dev_bytenr,
+ block->mirror_num,
+ btrfsic_get_block_type(state, block),
+ (unsigned long long)
+ block->logical_bytenr);
+ block->logical_bytenr = bytenr;
+ } else if (state->print_mask &
+ BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/%d)"
+ " found in hash table, %c.\n",
+ (unsigned long long)bytenr,
+ dev_state->name,
+ (unsigned long long)dev_bytenr,
+ block->mirror_num,
+ btrfsic_get_block_type(state, block));
+ } else {
+ if (num_pages * PAGE_CACHE_SIZE <
+ state->datablock_size) {
+ printk(KERN_INFO
+ "btrfsic: cannot work with too short bios!\n");
+ return;
+ }
+ processed_len = state->datablock_size;
+ bytenr = block->logical_bytenr;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/%d)"
+ " found in hash table, %c.\n",
+ (unsigned long long)bytenr,
+ dev_state->name,
+ (unsigned long long)dev_bytenr,
+ block->mirror_num,
+ btrfsic_get_block_type(state, block));
+ }
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "ref_to_list: %cE, ref_from_list: %cE\n",
+ list_empty(&block->ref_to_list) ? ' ' : '!',
+ list_empty(&block->ref_from_list) ? ' ' : '!');
+ if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
+ printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
+ " @%llu (%s/%llu/%d), old(gen=%llu,"
+ " objectid=%llu, type=%d, offset=%llu),"
+ " new(gen=%llu),"
+ " which is referenced by most recent superblock"
+ " (superblockgen=%llu)!\n",
+ btrfsic_get_block_type(state, block),
+ (unsigned long long)bytenr,
+ dev_state->name,
+ (unsigned long long)dev_bytenr,
+ block->mirror_num,
+ (unsigned long long)block->generation,
+ (unsigned long long)
+ le64_to_cpu(block->disk_key.objectid),
+ block->disk_key.type,
+ (unsigned long long)
+ le64_to_cpu(block->disk_key.offset),
+ (unsigned long long)
+ le64_to_cpu(((struct btrfs_header *)
+ mapped_datav[0])->generation),
+ (unsigned long long)
+ state->max_superblock_generation);
+ btrfsic_dump_tree(state);
+ }
+
+ if (!block->is_iodone && !block->never_written) {
+ printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
+ " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu,"
+ " which is not yet iodone!\n",
+ btrfsic_get_block_type(state, block),
+ (unsigned long long)bytenr,
+ dev_state->name,
+ (unsigned long long)dev_bytenr,
+ block->mirror_num,
+ (unsigned long long)block->generation,
+ (unsigned long long)
+ le64_to_cpu(((struct btrfs_header *)
+ mapped_datav[0])->generation));
+ /* it would not be safe to go on */
+ btrfsic_dump_tree(state);
+ goto continue_loop;
+ }
+
+ /*
+ * Clear all references of this block. Do not free
+ * the block itself even if is not referenced anymore
+ * because it still carries valueable information
+ * like whether it was ever written and IO completed.
+ */
+ list_for_each_safe(elem_ref_to, tmp_ref_to,
+ &block->ref_to_list) {
+ struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to,
+ struct btrfsic_block_link,
+ node_ref_to);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_rem_link(state, l);
+ l->ref_cnt--;
+ if (0 == l->ref_cnt) {
+ list_del(&l->node_ref_to);
+ list_del(&l->node_ref_from);
+ btrfsic_block_link_hashtable_remove(l);
+ btrfsic_block_link_free(l);
+ }
+ }
+
+ if (block->is_superblock)
+ ret = btrfsic_map_superblock(state, bytenr,
+ processed_len,
+ bdev, &block_ctx);
+ else
+ ret = btrfsic_map_block(state, bytenr, processed_len,
+ &block_ctx, 0);
+ if (ret) {
+ printk(KERN_INFO
+ "btrfsic: btrfsic_map_block(root @%llu)"
+ " failed!\n", (unsigned long long)bytenr);
+ goto continue_loop;
+ }
+ block_ctx.datav = mapped_datav;
+ /* the following is required in case of writes to mirrors,
+ * use the same that was used for the lookup */
+ block_ctx.dev = dev_state;
+ block_ctx.dev_bytenr = dev_bytenr;
+
+ if (is_metadata || state->include_extent_data) {
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ if (NULL != bio) {
+ block->is_iodone = 0;
+ BUG_ON(NULL == bio_is_patched);
+ if (!*bio_is_patched) {
+ block->orig_bio_bh_private =
+ bio->bi_private;
+ block->orig_bio_bh_end_io.bio =
+ bio->bi_end_io;
+ block->next_in_same_bio = NULL;
+ bio->bi_private = block;
+ bio->bi_end_io = btrfsic_bio_end_io;
+ *bio_is_patched = 1;
+ } else {
+ struct btrfsic_block *chained_block =
+ (struct btrfsic_block *)
+ bio->bi_private;
+
+ BUG_ON(NULL == chained_block);
+ block->orig_bio_bh_private =
+ chained_block->orig_bio_bh_private;
+ block->orig_bio_bh_end_io.bio =
+ chained_block->orig_bio_bh_end_io.
+ bio;
+ block->next_in_same_bio = chained_block;
+ bio->bi_private = block;
+ }
+ } else if (NULL != bh) {
+ block->is_iodone = 0;
+ block->orig_bio_bh_private = bh->b_private;
+ block->orig_bio_bh_end_io.bh = bh->b_end_io;
+ block->next_in_same_bio = NULL;
+ bh->b_private = block;
+ bh->b_end_io = btrfsic_bh_end_io;
+ } else {
+ block->is_iodone = 1;
+ block->orig_bio_bh_private = NULL;
+ block->orig_bio_bh_end_io.bio = NULL;
+ block->next_in_same_bio = NULL;
+ }
+ }
+
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = submit_bio_bh_rw;
+ if (is_metadata) {
+ block->logical_bytenr = bytenr;
+ block->is_metadata = 1;
+ if (block->is_superblock) {
+ BUG_ON(PAGE_CACHE_SIZE !=
+ BTRFS_SUPER_INFO_SIZE);
+ ret = btrfsic_process_written_superblock(
+ state,
+ block,
+ (struct btrfs_super_block *)
+ mapped_datav[0]);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
+ printk(KERN_INFO
+ "[after new superblock is written]:\n");
+ btrfsic_dump_tree_sub(state, block, 0);
+ }
+ } else {
+ block->mirror_num = 0; /* unknown */
+ ret = btrfsic_process_metablock(
+ state,
+ block,
+ &block_ctx,
+ 0, 0);
+ }
+ if (ret)
+ printk(KERN_INFO
+ "btrfsic: btrfsic_process_metablock"
+ "(root @%llu) failed!\n",
+ (unsigned long long)dev_bytenr);
+ } else {
+ block->is_metadata = 0;
+ block->mirror_num = 0; /* unknown */
+ block->generation = BTRFSIC_GENERATION_UNKNOWN;
+ if (!state->include_extent_data
+ && list_empty(&block->ref_from_list)) {
+ /*
+ * disk block is overwritten with extent
+ * data (not meta data) and we are configured
+ * to not include extent data: take the
+ * chance and free the block's memory
+ */
+ btrfsic_block_hashtable_remove(block);
+ list_del(&block->all_blocks_node);
+ btrfsic_block_free(block);
+ }
+ }
+ btrfsic_release_block_ctx(&block_ctx);
+ } else {
+ /* block has not been found in hash table */
+ u64 bytenr;
+
+ if (!is_metadata) {
+ processed_len = state->datablock_size;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO "Written block (%s/%llu/?)"
+ " !found in hash table, D.\n",
+ dev_state->name,
+ (unsigned long long)dev_bytenr);
+ if (!state->include_extent_data) {
+ /* ignore that written D block */
+ goto continue_loop;
+ }
+
+ /* this is getting ugly for the
+ * include_extent_data case... */
+ bytenr = 0; /* unknown */
+ block_ctx.start = bytenr;
+ block_ctx.len = processed_len;
+ block_ctx.mem_to_free = NULL;
+ block_ctx.pagev = NULL;
+ } else {
+ processed_len = state->metablock_size;
+ bytenr = le64_to_cpu(((struct btrfs_header *)
+ mapped_datav[0])->bytenr);
+ btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
+ dev_bytenr);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/?)"
+ " !found in hash table, M.\n",
+ (unsigned long long)bytenr,
+ dev_state->name,
+ (unsigned long long)dev_bytenr);
+
+ ret = btrfsic_map_block(state, bytenr, processed_len,
+ &block_ctx, 0);
+ if (ret) {
+ printk(KERN_INFO
+ "btrfsic: btrfsic_map_block(root @%llu)"
+ " failed!\n",
+ (unsigned long long)dev_bytenr);
+ goto continue_loop;
+ }
+ }
+ block_ctx.datav = mapped_datav;
+ /* the following is required in case of writes to mirrors,
+ * use the same that was used for the lookup */
+ block_ctx.dev = dev_state;
+ block_ctx.dev_bytenr = dev_bytenr;
+
+ block = btrfsic_block_alloc();
+ if (NULL == block) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ btrfsic_release_block_ctx(&block_ctx);
+ goto continue_loop;
+ }
+ block->dev_state = dev_state;
+ block->dev_bytenr = dev_bytenr;
+ block->logical_bytenr = bytenr;
+ block->is_metadata = is_metadata;
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ block->mirror_num = 0; /* unknown */
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = submit_bio_bh_rw;
+ if (NULL != bio) {
+ block->is_iodone = 0;
+ BUG_ON(NULL == bio_is_patched);
+ if (!*bio_is_patched) {
+ block->orig_bio_bh_private = bio->bi_private;
+ block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+ block->next_in_same_bio = NULL;
+ bio->bi_private = block;
+ bio->bi_end_io = btrfsic_bio_end_io;
+ *bio_is_patched = 1;
+ } else {
+ struct btrfsic_block *chained_block =
+ (struct btrfsic_block *)
+ bio->bi_private;
+
+ BUG_ON(NULL == chained_block);
+ block->orig_bio_bh_private =
+ chained_block->orig_bio_bh_private;
+ block->orig_bio_bh_end_io.bio =
+ chained_block->orig_bio_bh_end_io.bio;
+ block->next_in_same_bio = chained_block;
+ bio->bi_private = block;
+ }
+ } else if (NULL != bh) {
+ block->is_iodone = 0;
+ block->orig_bio_bh_private = bh->b_private;
+ block->orig_bio_bh_end_io.bh = bh->b_end_io;
+ block->next_in_same_bio = NULL;
+ bh->b_private = block;
+ bh->b_end_io = btrfsic_bh_end_io;
+ } else {
+ block->is_iodone = 1;
+ block->orig_bio_bh_private = NULL;
+ block->orig_bio_bh_end_io.bio = NULL;
+ block->next_in_same_bio = NULL;
+ }
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "New written %c-block @%llu (%s/%llu/%d)\n",
+ is_metadata ? 'M' : 'D',
+ (unsigned long long)block->logical_bytenr,
+ block->dev_state->name,
+ (unsigned long long)block->dev_bytenr,
+ block->mirror_num);
+ list_add(&block->all_blocks_node, &state->all_blocks_list);
+ btrfsic_block_hashtable_add(block, &state->block_hashtable);
+
+ if (is_metadata) {
+ ret = btrfsic_process_metablock(state, block,
+ &block_ctx, 0, 0);
+ if (ret)
+ printk(KERN_INFO
+ "btrfsic: process_metablock(root @%llu)"
+ " failed!\n",
+ (unsigned long long)dev_bytenr);
+ }
+ btrfsic_release_block_ctx(&block_ctx);
+ }
+
+continue_loop:
+ BUG_ON(!processed_len);
+ dev_bytenr += processed_len;
+ mapped_datav += processed_len >> PAGE_CACHE_SHIFT;
+ num_pages -= processed_len >> PAGE_CACHE_SHIFT;
+ goto again;
+}
+
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
+{
+ struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
+ int iodone_w_error;
+
+ /* mutex is not held! This is not save if IO is not yet completed
+ * on umount */
+ iodone_w_error = 0;
+ if (bio_error_status)
+ iodone_w_error = 1;
+
+ BUG_ON(NULL == block);
+ bp->bi_private = block->orig_bio_bh_private;
+ bp->bi_end_io = block->orig_bio_bh_end_io.bio;
+
+ do {
+ struct btrfsic_block *next_block;
+ struct btrfsic_dev_state *const dev_state = block->dev_state;
+
+ if ((dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+ printk(KERN_INFO
+ "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
+ bio_error_status,
+ btrfsic_get_block_type(dev_state->state, block),
+ (unsigned long long)block->logical_bytenr,
+ dev_state->name,
+ (unsigned long long)block->dev_bytenr,
+ block->mirror_num);
+ next_block = block->next_in_same_bio;
+ block->iodone_w_error = iodone_w_error;
+ if (block->submit_bio_bh_rw & REQ_FLUSH) {
+ dev_state->last_flush_gen++;
+ if ((dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+ printk(KERN_INFO
+ "bio_end_io() new %s flush_gen=%llu\n",
+ dev_state->name,
+ (unsigned long long)
+ dev_state->last_flush_gen);
+ }
+ if (block->submit_bio_bh_rw & REQ_FUA)
+ block->flush_gen = 0; /* FUA completed means block is
+ * on disk */
+ block->is_iodone = 1; /* for FLUSH, this releases the block */
+ block = next_block;
+ } while (NULL != block);
+
+ bp->bi_end_io(bp, bio_error_status);
+}
+
+static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
+{
+ struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private;
+ int iodone_w_error = !uptodate;
+ struct btrfsic_dev_state *dev_state;
+
+ BUG_ON(NULL == block);
+ dev_state = block->dev_state;
+ if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+ printk(KERN_INFO
+ "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n",
+ iodone_w_error,
+ btrfsic_get_block_type(dev_state->state, block),
+ (unsigned long long)block->logical_bytenr,
+ block->dev_state->name,
+ (unsigned long long)block->dev_bytenr,
+ block->mirror_num);
+
+ block->iodone_w_error = iodone_w_error;
+ if (block->submit_bio_bh_rw & REQ_FLUSH) {
+ dev_state->last_flush_gen++;
+ if ((dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+ printk(KERN_INFO
+ "bh_end_io() new %s flush_gen=%llu\n",
+ dev_state->name,
+ (unsigned long long)dev_state->last_flush_gen);
+ }
+ if (block->submit_bio_bh_rw & REQ_FUA)
+ block->flush_gen = 0; /* FUA completed means block is on disk */
+
+ bh->b_private = block->orig_bio_bh_private;
+ bh->b_end_io = block->orig_bio_bh_end_io.bh;
+ block->is_iodone = 1; /* for FLUSH, this releases the block */
+ bh->b_end_io(bh, uptodate);
+}
+
+static int btrfsic_process_written_superblock(
+ struct btrfsic_state *state,
+ struct btrfsic_block *const superblock,
+ struct btrfs_super_block *const super_hdr)
+{
+ int pass;
+
+ superblock->generation = btrfs_super_generation(super_hdr);
+ if (!(superblock->generation > state->max_superblock_generation ||
+ 0 == state->max_superblock_generation)) {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+ printk(KERN_INFO
+ "btrfsic: superblock @%llu (%s/%llu/%d)"
+ " with old gen %llu <= %llu\n",
+ (unsigned long long)superblock->logical_bytenr,
+ superblock->dev_state->name,
+ (unsigned long long)superblock->dev_bytenr,
+ superblock->mirror_num,
+ (unsigned long long)
+ btrfs_super_generation(super_hdr),
+ (unsigned long long)
+ state->max_superblock_generation);
+ } else {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+ printk(KERN_INFO
+ "btrfsic: got new superblock @%llu (%s/%llu/%d)"
+ " with new gen %llu > %llu\n",
+ (unsigned long long)superblock->logical_bytenr,
+ superblock->dev_state->name,
+ (unsigned long long)superblock->dev_bytenr,
+ superblock->mirror_num,
+ (unsigned long long)
+ btrfs_super_generation(super_hdr),
+ (unsigned long long)
+ state->max_superblock_generation);
+
+ state->max_superblock_generation =
+ btrfs_super_generation(super_hdr);
+ state->latest_superblock = superblock;
+ }
+
+ for (pass = 0; pass < 3; pass++) {
+ int ret;
+ u64 next_bytenr;
+ struct btrfsic_block *next_block;
+ struct btrfsic_block_data_ctx tmp_next_block_ctx;
+ struct btrfsic_block_link *l;
+ int num_copies;
+ int mirror_num;
+ const char *additional_string = NULL;
+ struct btrfs_disk_key tmp_disk_key;
+
+ tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
+ tmp_disk_key.offset = 0;
+
+ switch (pass) {
+ case 0:
+ tmp_disk_key.objectid =
+ cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
+ additional_string = "root ";
+ next_bytenr = btrfs_super_root(super_hdr);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "root@%llu\n",
+ (unsigned long long)next_bytenr);
+ break;
+ case 1:
+ tmp_disk_key.objectid =
+ cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
+ additional_string = "chunk ";
+ next_bytenr = btrfs_super_chunk_root(super_hdr);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "chunk@%llu\n",
+ (unsigned long long)next_bytenr);
+ break;
+ case 2:
+ tmp_disk_key.objectid =
+ cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
+ additional_string = "log ";
+ next_bytenr = btrfs_super_log_root(super_hdr);
+ if (0 == next_bytenr)
+ continue;
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "log@%llu\n",
+ (unsigned long long)next_bytenr);
+ break;
+ }
+
+ num_copies =
+ btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ next_bytenr, BTRFS_SUPER_INFO_SIZE);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ (unsigned long long)next_bytenr, num_copies);
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ int was_created;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "btrfsic_process_written_superblock("
+ "mirror_num=%d)\n", mirror_num);
+ ret = btrfsic_map_block(state, next_bytenr,
+ BTRFS_SUPER_INFO_SIZE,
+ &tmp_next_block_ctx,
+ mirror_num);
+ if (ret) {
+ printk(KERN_INFO
+ "btrfsic: btrfsic_map_block(@%llu,"
+ " mirror=%d) failed!\n",
+ (unsigned long long)next_bytenr,
+ mirror_num);
+ return -1;
+ }
+
+ next_block = btrfsic_block_lookup_or_add(
+ state,
+ &tmp_next_block_ctx,
+ additional_string,
+ 1, 0, 1,
+ mirror_num,
+ &was_created);
+ if (NULL == next_block) {
+ printk(KERN_INFO
+ "btrfsic: error, kmalloc failed!\n");
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ return -1;
+ }
+
+ next_block->disk_key = tmp_disk_key;
+ if (was_created)
+ next_block->generation =
+ BTRFSIC_GENERATION_UNKNOWN;
+ l = btrfsic_block_link_lookup_or_add(
+ state,
+ &tmp_next_block_ctx,
+ next_block,
+ superblock,
+ BTRFSIC_GENERATION_UNKNOWN);
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ if (NULL == l)
+ return -1;
+ }
+ }
+
+ if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) {
+ WARN_ON(1);
+ btrfsic_dump_tree(state);
+ }
+
+ return 0;
+}
+
+static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
+ struct btrfsic_block *const block,
+ int recursion_level)
+{
+ struct list_head *elem_ref_to;
+ int ret = 0;
+
+ if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
+ /*
+ * Note that this situation can happen and does not
+ * indicate an error in regular cases. It happens
+ * when disk blocks are freed and later reused.
+ * The check-integrity module is not aware of any
+ * block free operations, it just recognizes block
+ * write operations. Therefore it keeps the linkage
+ * information for a block until a block is
+ * rewritten. This can temporarily cause incorrect
+ * and even circular linkage informations. This
+ * causes no harm unless such blocks are referenced
+ * by the most recent super block.
+ */
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "btrfsic: abort cyclic linkage (case 1).\n");
+
+ return ret;
+ }
+
+ /*
+ * This algorithm is recursive because the amount of used stack
+ * space is very small and the max recursion depth is limited.
+ */
+ list_for_each(elem_ref_to, &block->ref_to_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to, struct btrfsic_block_link,
+ node_ref_to);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "rl=%d, %c @%llu (%s/%llu/%d)"
+ " %u* refers to %c @%llu (%s/%llu/%d)\n",
+ recursion_level,
+ btrfsic_get_block_type(state, block),
+ (unsigned long long)block->logical_bytenr,
+ block->dev_state->name,
+ (unsigned long long)block->dev_bytenr,
+ block->mirror_num,
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ if (l->block_ref_to->never_written) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " which is never written!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ ret = -1;
+ } else if (!l->block_ref_to->is_iodone) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " which is not yet iodone!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ ret = -1;
+ } else if (l->parent_generation !=
+ l->block_ref_to->generation &&
+ BTRFSIC_GENERATION_UNKNOWN !=
+ l->parent_generation &&
+ BTRFSIC_GENERATION_UNKNOWN !=
+ l->block_ref_to->generation) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " with generation %llu !="
+ " parent generation %llu!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num,
+ (unsigned long long)l->block_ref_to->generation,
+ (unsigned long long)l->parent_generation);
+ ret = -1;
+ } else if (l->block_ref_to->flush_gen >
+ l->block_ref_to->dev_state->last_flush_gen) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " which is not flushed out of disk's write cache"
+ " (block flush_gen=%llu,"
+ " dev->flush_gen=%llu)!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num,
+ (unsigned long long)block->flush_gen,
+ (unsigned long long)
+ l->block_ref_to->dev_state->last_flush_gen);
+ ret = -1;
+ } else if (-1 == btrfsic_check_all_ref_blocks(state,
+ l->block_ref_to,
+ recursion_level +
+ 1)) {
+ ret = -1;
+ }
+ }
+
+ return ret;
+}
+
+static int btrfsic_is_block_ref_by_superblock(
+ const struct btrfsic_state *state,
+ const struct btrfsic_block *block,
+ int recursion_level)
+{
+ struct list_head *elem_ref_from;
+
+ if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
+ /* refer to comment at "abort cyclic linkage (case 1)" */
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "btrfsic: abort cyclic linkage (case 2).\n");
+
+ return 0;
+ }
+
+ /*
+ * This algorithm is recursive because the amount of used stack space
+ * is very small and the max recursion depth is limited.
+ */
+ list_for_each(elem_ref_from, &block->ref_from_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_from, struct btrfsic_block_link,
+ node_ref_from);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "rl=%d, %c @%llu (%s/%llu/%d)"
+ " is ref %u* from %c @%llu (%s/%llu/%d)\n",
+ recursion_level,
+ btrfsic_get_block_type(state, block),
+ (unsigned long long)block->logical_bytenr,
+ block->dev_state->name,
+ (unsigned long long)block->dev_bytenr,
+ block->mirror_num,
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_from),
+ (unsigned long long)
+ l->block_ref_from->logical_bytenr,
+ l->block_ref_from->dev_state->name,
+ (unsigned long long)
+ l->block_ref_from->dev_bytenr,
+ l->block_ref_from->mirror_num);
+ if (l->block_ref_from->is_superblock &&
+ state->latest_superblock->dev_bytenr ==
+ l->block_ref_from->dev_bytenr &&
+ state->latest_superblock->dev_state->bdev ==
+ l->block_ref_from->dev_state->bdev)
+ return 1;
+ else if (btrfsic_is_block_ref_by_superblock(state,
+ l->block_ref_from,
+ recursion_level +
+ 1))
+ return 1;
+ }
+
+ return 0;
+}
+
+static void btrfsic_print_add_link(const struct btrfsic_state *state,
+ const struct btrfsic_block_link *l)
+{
+ printk(KERN_INFO
+ "Add %u* link from %c @%llu (%s/%llu/%d)"
+ " to %c @%llu (%s/%llu/%d).\n",
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_from),
+ (unsigned long long)l->block_ref_from->logical_bytenr,
+ l->block_ref_from->dev_state->name,
+ (unsigned long long)l->block_ref_from->dev_bytenr,
+ l->block_ref_from->mirror_num,
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+}
+
+static void btrfsic_print_rem_link(const struct btrfsic_state *state,
+ const struct btrfsic_block_link *l)
+{
+ printk(KERN_INFO
+ "Rem %u* link from %c @%llu (%s/%llu/%d)"
+ " to %c @%llu (%s/%llu/%d).\n",
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_from),
+ (unsigned long long)l->block_ref_from->logical_bytenr,
+ l->block_ref_from->dev_state->name,
+ (unsigned long long)l->block_ref_from->dev_bytenr,
+ l->block_ref_from->mirror_num,
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+}
+
+static char btrfsic_get_block_type(const struct btrfsic_state *state,
+ const struct btrfsic_block *block)
+{
+ if (block->is_superblock &&
+ state->latest_superblock->dev_bytenr == block->dev_bytenr &&
+ state->latest_superblock->dev_state->bdev == block->dev_state->bdev)
+ return 'S';
+ else if (block->is_superblock)
+ return 's';
+ else if (block->is_metadata)
+ return 'M';
+ else
+ return 'D';
+}
+
+static void btrfsic_dump_tree(const struct btrfsic_state *state)
+{
+ btrfsic_dump_tree_sub(state, state->latest_superblock, 0);
+}
+
+static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
+ const struct btrfsic_block *block,
+ int indent_level)
+{
+ struct list_head *elem_ref_to;
+ int indent_add;
+ static char buf[80];
+ int cursor_position;
+
+ /*
+ * Should better fill an on-stack buffer with a complete line and
+ * dump it at once when it is time to print a newline character.
+ */
+
+ /*
+ * This algorithm is recursive because the amount of used stack space
+ * is very small and the max recursion depth is limited.
+ */
+ indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)",
+ btrfsic_get_block_type(state, block),
+ (unsigned long long)block->logical_bytenr,
+ block->dev_state->name,
+ (unsigned long long)block->dev_bytenr,
+ block->mirror_num);
+ if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
+ printk("[...]\n");
+ return;
+ }
+ printk(buf);
+ indent_level += indent_add;
+ if (list_empty(&block->ref_to_list)) {
+ printk("\n");
+ return;
+ }
+ if (block->mirror_num > 1 &&
+ !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) {
+ printk(" [...]\n");
+ return;
+ }
+
+ cursor_position = indent_level;
+ list_for_each(elem_ref_to, &block->ref_to_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to, struct btrfsic_block_link,
+ node_ref_to);
+
+ while (cursor_position < indent_level) {
+ printk(" ");
+ cursor_position++;
+ }
+ if (l->ref_cnt > 1)
+ indent_add = sprintf(buf, " %d*--> ", l->ref_cnt);
+ else
+ indent_add = sprintf(buf, " --> ");
+ if (indent_level + indent_add >
+ BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
+ printk("[...]\n");
+ cursor_position = 0;
+ continue;
+ }
+
+ printk(buf);
+
+ btrfsic_dump_tree_sub(state, l->block_ref_to,
+ indent_level + indent_add);
+ cursor_position = 0;
+ }
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
+ struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *next_block_ctx,
+ struct btrfsic_block *next_block,
+ struct btrfsic_block *from_block,
+ u64 parent_generation)
+{
+ struct btrfsic_block_link *l;
+
+ l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev,
+ next_block_ctx->dev_bytenr,
+ from_block->dev_state->bdev,
+ from_block->dev_bytenr,
+ &state->block_link_hashtable);
+ if (NULL == l) {
+ l = btrfsic_block_link_alloc();
+ if (NULL == l) {
+ printk(KERN_INFO
+ "btrfsic: error, kmalloc" " failed!\n");
+ return NULL;
+ }
+
+ l->block_ref_to = next_block;
+ l->block_ref_from = from_block;
+ l->ref_cnt = 1;
+ l->parent_generation = parent_generation;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_add_link(state, l);
+
+ list_add(&l->node_ref_to, &from_block->ref_to_list);
+ list_add(&l->node_ref_from, &next_block->ref_from_list);
+
+ btrfsic_block_link_hashtable_add(l,
+ &state->block_link_hashtable);
+ } else {
+ l->ref_cnt++;
+ l->parent_generation = parent_generation;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_add_link(state, l);
+ }
+
+ return l;
+}
+
+static struct btrfsic_block *btrfsic_block_lookup_or_add(
+ struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *block_ctx,
+ const char *additional_string,
+ int is_metadata,
+ int is_iodone,
+ int never_written,
+ int mirror_num,
+ int *was_created)
+{
+ struct btrfsic_block *block;
+
+ block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev,
+ block_ctx->dev_bytenr,
+ &state->block_hashtable);
+ if (NULL == block) {
+ struct btrfsic_dev_state *dev_state;
+
+ block = btrfsic_block_alloc();
+ if (NULL == block) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ return NULL;
+ }
+ dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev);
+ if (NULL == dev_state) {
+ printk(KERN_INFO
+ "btrfsic: error, lookup dev_state failed!\n");
+ btrfsic_block_free(block);
+ return NULL;
+ }
+ block->dev_state = dev_state;
+ block->dev_bytenr = block_ctx->dev_bytenr;
+ block->logical_bytenr = block_ctx->start;
+ block->is_metadata = is_metadata;
+ block->is_iodone = is_iodone;
+ block->never_written = never_written;
+ block->mirror_num = mirror_num;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "New %s%c-block @%llu (%s/%llu/%d)\n",
+ additional_string,
+ btrfsic_get_block_type(state, block),
+ (unsigned long long)block->logical_bytenr,
+ dev_state->name,
+ (unsigned long long)block->dev_bytenr,
+ mirror_num);
+ list_add(&block->all_blocks_node, &state->all_blocks_list);
+ btrfsic_block_hashtable_add(block, &state->block_hashtable);
+ if (NULL != was_created)
+ *was_created = 1;
+ } else {
+ if (NULL != was_created)
+ *was_created = 0;
+ }
+
+ return block;
+}
+
+static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
+ u64 bytenr,
+ struct btrfsic_dev_state *dev_state,
+ u64 dev_bytenr)
+{
+ int num_copies;
+ int mirror_num;
+ int ret;
+ struct btrfsic_block_data_ctx block_ctx;
+ int match = 0;
+
+ num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ bytenr, state->metablock_size);
+
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ ret = btrfsic_map_block(state, bytenr, state->metablock_size,
+ &block_ctx, mirror_num);
+ if (ret) {
+ printk(KERN_INFO "btrfsic:"
+ " btrfsic_map_block(logical @%llu,"
+ " mirror %d) failed!\n",
+ (unsigned long long)bytenr, mirror_num);
+ continue;
+ }
+
+ if (dev_state->bdev == block_ctx.dev->bdev &&
+ dev_bytenr == block_ctx.dev_bytenr) {
+ match++;
+ btrfsic_release_block_ctx(&block_ctx);
+ break;
+ }
+ btrfsic_release_block_ctx(&block_ctx);
+ }
+
+ if (!match) {
+ printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio,"
+ " buffer->log_bytenr=%llu, submit_bio(bdev=%s,"
+ " phys_bytenr=%llu)!\n",
+ (unsigned long long)bytenr, dev_state->name,
+ (unsigned long long)dev_bytenr);
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ ret = btrfsic_map_block(state, bytenr,
+ state->metablock_size,
+ &block_ctx, mirror_num);
+ if (ret)
+ continue;
+
+ printk(KERN_INFO "Read logical bytenr @%llu maps to"
+ " (%s/%llu/%d)\n",
+ (unsigned long long)bytenr,
+ block_ctx.dev->name,
+ (unsigned long long)block_ctx.dev_bytenr,
+ mirror_num);
+ }
+ WARN_ON(1);
+ }
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
+ struct block_device *bdev)
+{
+ struct btrfsic_dev_state *ds;
+
+ ds = btrfsic_dev_state_hashtable_lookup(bdev,
+ &btrfsic_dev_state_hashtable);
+ return ds;
+}
+
+int btrfsic_submit_bh(int rw, struct buffer_head *bh)
+{
+ struct btrfsic_dev_state *dev_state;
+
+ if (!btrfsic_is_initialized)
+ return submit_bh(rw, bh);
+
+ mutex_lock(&btrfsic_mutex);
+ /* since btrfsic_submit_bh() might also be called before
+ * btrfsic_mount(), this might return NULL */
+ dev_state = btrfsic_dev_state_lookup(bh->b_bdev);
+
+ /* Only called to write the superblock (incl. FLUSH/FUA) */
+ if (NULL != dev_state &&
+ (rw & WRITE) && bh->b_size > 0) {
+ u64 dev_bytenr;
+
+ dev_bytenr = 4096 * bh->b_blocknr;
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ printk(KERN_INFO
+ "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu),"
+ " size=%lu, data=%p, bdev=%p)\n",
+ rw, (unsigned long)bh->b_blocknr,
+ (unsigned long long)dev_bytenr,
+ (unsigned long)bh->b_size, bh->b_data,
+ bh->b_bdev);
+ btrfsic_process_written_block(dev_state, dev_bytenr,
+ &bh->b_data, 1, NULL,
+ NULL, bh, rw);
+ } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ printk(KERN_INFO
+ "submit_bh(rw=0x%x FLUSH, bdev=%p)\n",
+ rw, bh->b_bdev);
+ if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+ if ((dev_state->state->print_mask &
+ (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+ BTRFSIC_PRINT_MASK_VERBOSE)))
+ printk(KERN_INFO
+ "btrfsic_submit_bh(%s) with FLUSH"
+ " but dummy block already in use"
+ " (ignored)!\n",
+ dev_state->name);
+ } else {
+ struct btrfsic_block *const block =
+ &dev_state->dummy_block_for_bio_bh_flush;
+
+ block->is_iodone = 0;
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = rw;
+ block->orig_bio_bh_private = bh->b_private;
+ block->orig_bio_bh_end_io.bh = bh->b_end_io;
+ block->next_in_same_bio = NULL;
+ bh->b_private = block;
+ bh->b_end_io = btrfsic_bh_end_io;
+ }
+ }
+ mutex_unlock(&btrfsic_mutex);
+ return submit_bh(rw, bh);
+}
+
+void btrfsic_submit_bio(int rw, struct bio *bio)
+{
+ struct btrfsic_dev_state *dev_state;
+
+ if (!btrfsic_is_initialized) {
+ submit_bio(rw, bio);
+ return;
+ }
+
+ mutex_lock(&btrfsic_mutex);
+ /* since btrfsic_submit_bio() is also called before
+ * btrfsic_mount(), this might return NULL */
+ dev_state = btrfsic_dev_state_lookup(bio->bi_bdev);
+ if (NULL != dev_state &&
+ (rw & WRITE) && NULL != bio->bi_io_vec) {
+ unsigned int i;
+ u64 dev_bytenr;
+ int bio_is_patched;
+ char **mapped_datav;
+
+ dev_bytenr = 512 * bio->bi_sector;
+ bio_is_patched = 0;
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ printk(KERN_INFO
+ "submit_bio(rw=0x%x, bi_vcnt=%u,"
+ " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n",
+ rw, bio->bi_vcnt, (unsigned long)bio->bi_sector,
+ (unsigned long long)dev_bytenr,
+ bio->bi_bdev);
+
+ mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt,
+ GFP_NOFS);
+ if (!mapped_datav)
+ goto leave;
+ for (i = 0; i < bio->bi_vcnt; i++) {
+ BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE);
+ mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page);
+ if (!mapped_datav[i]) {
+ while (i > 0) {
+ i--;
+ kunmap(bio->bi_io_vec[i].bv_page);
+ }
+ kfree(mapped_datav);
+ goto leave;
+ }
+ if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+ BTRFSIC_PRINT_MASK_VERBOSE) ==
+ (dev_state->state->print_mask &
+ (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+ BTRFSIC_PRINT_MASK_VERBOSE)))
+ printk(KERN_INFO
+ "#%u: page=%p, len=%u, offset=%u\n",
+ i, bio->bi_io_vec[i].bv_page,
+ bio->bi_io_vec[i].bv_len,
+ bio->bi_io_vec[i].bv_offset);
+ }
+ btrfsic_process_written_block(dev_state, dev_bytenr,
+ mapped_datav, bio->bi_vcnt,
+ bio, &bio_is_patched,
+ NULL, rw);
+ while (i > 0) {
+ i--;
+ kunmap(bio->bi_io_vec[i].bv_page);
+ }
+ kfree(mapped_datav);
+ } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ printk(KERN_INFO
+ "submit_bio(rw=0x%x FLUSH, bdev=%p)\n",
+ rw, bio->bi_bdev);
+ if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+ if ((dev_state->state->print_mask &
+ (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+ BTRFSIC_PRINT_MASK_VERBOSE)))
+ printk(KERN_INFO
+ "btrfsic_submit_bio(%s) with FLUSH"
+ " but dummy block already in use"
+ " (ignored)!\n",
+ dev_state->name);
+ } else {
+ struct btrfsic_block *const block =
+ &dev_state->dummy_block_for_bio_bh_flush;
+
+ block->is_iodone = 0;
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = rw;
+ block->orig_bio_bh_private = bio->bi_private;
+ block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+ block->next_in_same_bio = NULL;
+ bio->bi_private = block;
+ bio->bi_end_io = btrfsic_bio_end_io;
+ }
+ }
+leave:
+ mutex_unlock(&btrfsic_mutex);
+
+ submit_bio(rw, bio);
+}
+
+int btrfsic_mount(struct btrfs_root *root,
+ struct btrfs_fs_devices *fs_devices,
+ int including_extent_data, u32 print_mask)
+{
+ int ret;
+ struct btrfsic_state *state;
+ struct list_head *dev_head = &fs_devices->devices;
+ struct btrfs_device *device;
+
+ if (root->nodesize != root->leafsize) {
+ printk(KERN_INFO
+ "btrfsic: cannot handle nodesize %d != leafsize %d!\n",
+ root->nodesize, root->leafsize);
+ return -1;
+ }
+ if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) {
+ printk(KERN_INFO
+ "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+ root->nodesize, (unsigned long)PAGE_CACHE_SIZE);
+ return -1;
+ }
+ if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) {
+ printk(KERN_INFO
+ "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+ root->leafsize, (unsigned long)PAGE_CACHE_SIZE);
+ return -1;
+ }
+ if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) {
+ printk(KERN_INFO
+ "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+ root->sectorsize, (unsigned long)PAGE_CACHE_SIZE);
+ return -1;
+ }
+ state = kzalloc(sizeof(*state), GFP_NOFS);
+ if (NULL == state) {
+ printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
+ return -1;
+ }
+
+ if (!btrfsic_is_initialized) {
+ mutex_init(&btrfsic_mutex);
+ btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable);
+ btrfsic_is_initialized = 1;
+ }
+ mutex_lock(&btrfsic_mutex);
+ state->root = root;
+ state->print_mask = print_mask;
+ state->include_extent_data = including_extent_data;
+ state->csum_size = 0;
+ state->metablock_size = root->nodesize;
+ state->datablock_size = root->sectorsize;
+ INIT_LIST_HEAD(&state->all_blocks_list);
+ btrfsic_block_hashtable_init(&state->block_hashtable);
+ btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
+ state->max_superblock_generation = 0;
+ state->latest_superblock = NULL;
+
+ list_for_each_entry(device, dev_head, dev_list) {
+ struct btrfsic_dev_state *ds;
+ char *p;
+
+ if (!device->bdev || !device->name)
+ continue;
+
+ ds = btrfsic_dev_state_alloc();
+ if (NULL == ds) {
+ printk(KERN_INFO
+ "btrfs check-integrity: kmalloc() failed!\n");
+ mutex_unlock(&btrfsic_mutex);
+ return -1;
+ }
+ ds->bdev = device->bdev;
+ ds->state = state;
+ bdevname(ds->bdev, ds->name);
+ ds->name[BDEVNAME_SIZE - 1] = '\0';
+ for (p = ds->name; *p != '\0'; p++);
+ while (p > ds->name && *p != '/')
+ p--;
+ if (*p == '/')
+ p++;
+ strlcpy(ds->name, p, sizeof(ds->name));
+ btrfsic_dev_state_hashtable_add(ds,
+ &btrfsic_dev_state_hashtable);
+ }
+
+ ret = btrfsic_process_superblock(state, fs_devices);
+ if (0 != ret) {
+ mutex_unlock(&btrfsic_mutex);
+ btrfsic_unmount(root, fs_devices);
+ return ret;
+ }
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE)
+ btrfsic_dump_database(state);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE)
+ btrfsic_dump_tree(state);
+
+ mutex_unlock(&btrfsic_mutex);
+ return 0;
+}
+
+void btrfsic_unmount(struct btrfs_root *root,
+ struct btrfs_fs_devices *fs_devices)
+{
+ struct list_head *elem_all;
+ struct list_head *tmp_all;
+ struct btrfsic_state *state;
+ struct list_head *dev_head = &fs_devices->devices;
+ struct btrfs_device *device;
+
+ if (!btrfsic_is_initialized)
+ return;
+
+ mutex_lock(&btrfsic_mutex);
+
+ state = NULL;
+ list_for_each_entry(device, dev_head, dev_list) {
+ struct btrfsic_dev_state *ds;
+
+ if (!device->bdev || !device->name)
+ continue;
+
+ ds = btrfsic_dev_state_hashtable_lookup(
+ device->bdev,
+ &btrfsic_dev_state_hashtable);
+ if (NULL != ds) {
+ state = ds->state;
+ btrfsic_dev_state_hashtable_remove(ds);
+ btrfsic_dev_state_free(ds);
+ }
+ }
+
+ if (NULL == state) {
+ printk(KERN_INFO
+ "btrfsic: error, cannot find state information"
+ " on umount!\n");
+ mutex_unlock(&btrfsic_mutex);
+ return;
+ }
+
+ /*
+ * Don't care about keeping the lists' state up to date,
+ * just free all memory that was allocated dynamically.
+ * Free the blocks and the block_links.
+ */
+ list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) {
+ struct btrfsic_block *const b_all =
+ list_entry(elem_all, struct btrfsic_block,
+ all_blocks_node);
+ struct list_head *elem_ref_to;
+ struct list_head *tmp_ref_to;
+
+ list_for_each_safe(elem_ref_to, tmp_ref_to,
+ &b_all->ref_to_list) {
+ struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to,
+ struct btrfsic_block_link,
+ node_ref_to);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_rem_link(state, l);
+
+ l->ref_cnt--;
+ if (0 == l->ref_cnt)
+ btrfsic_block_link_free(l);
+ }
+
+ if (b_all->is_iodone || b_all->never_written)
+ btrfsic_block_free(b_all);
+ else
+ printk(KERN_INFO "btrfs: attempt to free %c-block"
+ " @%llu (%s/%llu/%d) on umount which is"
+ " not yet iodone!\n",
+ btrfsic_get_block_type(state, b_all),
+ (unsigned long long)b_all->logical_bytenr,
+ b_all->dev_state->name,
+ (unsigned long long)b_all->dev_bytenr,
+ b_all->mirror_num);
+ }
+
+ mutex_unlock(&btrfsic_mutex);
+
+ kfree(state);
+}
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
new file mode 100644
index 00000000000..8b59175cc50
--- /dev/null
+++ b/fs/btrfs/check-integrity.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) STRATO AG 2011. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#if !defined(__BTRFS_CHECK_INTEGRITY__)
+#define __BTRFS_CHECK_INTEGRITY__
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+int btrfsic_submit_bh(int rw, struct buffer_head *bh);
+void btrfsic_submit_bio(int rw, struct bio *bio);
+#else
+#define btrfsic_submit_bh submit_bh
+#define btrfsic_submit_bio submit_bio
+#endif
+
+int btrfsic_mount(struct btrfs_root *root,
+ struct btrfs_fs_devices *fs_devices,
+ int including_extent_data, u32 print_mask);
+void btrfsic_unmount(struct btrfs_root *root,
+ struct btrfs_fs_devices *fs_devices);
+
+#endif
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 14f1c5a0b2d..86eff48dab7 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -120,10 +120,10 @@ static int check_compressed_csum(struct inode *inode,
page = cb->compressed_pages[i];
csum = ~(u32)0;
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE);
btrfs_csum_final(csum, (char *)&csum);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
if (csum != *cb_sum) {
printk(KERN_INFO "btrfs csum failed ino %llu "
@@ -226,8 +226,8 @@ out:
* Clear the writeback bits on all of the file
* pages for a compressed write
*/
-static noinline int end_compressed_writeback(struct inode *inode, u64 start,
- unsigned long ram_size)
+static noinline void end_compressed_writeback(struct inode *inode, u64 start,
+ unsigned long ram_size)
{
unsigned long index = start >> PAGE_CACHE_SHIFT;
unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
@@ -253,7 +253,6 @@ static noinline int end_compressed_writeback(struct inode *inode, u64 start,
index += ret;
}
/* the inode may be gone now */
- return 0;
}
/*
@@ -392,20 +391,21 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
*/
atomic_inc(&cb->pending_bios);
ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
if (!skip_sum) {
ret = btrfs_csum_one_bio(root, inode, bio,
start, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
bio_put(bio);
bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+ BUG_ON(!bio);
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
@@ -421,15 +421,15 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
bio_get(bio);
ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
if (!skip_sum) {
ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
bio_put(bio);
return 0;
@@ -497,7 +497,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* sure they map to this compressed extent on disk.
*/
set_page_extent_mapped(page);
- lock_extent(tree, last_offset, end, GFP_NOFS);
+ lock_extent(tree, last_offset, end);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, last_offset,
PAGE_CACHE_SIZE);
@@ -507,7 +507,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
(last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
(em->block_start >> 9) != cb->orig_bio->bi_sector) {
free_extent_map(em);
- unlock_extent(tree, last_offset, end, GFP_NOFS);
+ unlock_extent(tree, last_offset, end);
unlock_page(page);
page_cache_release(page);
break;
@@ -521,10 +521,10 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (zero_offset) {
int zeros;
zeros = PAGE_CACHE_SIZE - zero_offset;
- userpage = kmap_atomic(page, KM_USER0);
+ userpage = kmap_atomic(page);
memset(userpage + zero_offset, 0, zeros);
flush_dcache_page(page);
- kunmap_atomic(userpage, KM_USER0);
+ kunmap_atomic(userpage);
}
}
@@ -535,7 +535,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
nr_pages++;
page_cache_release(page);
} else {
- unlock_extent(tree, last_offset, end, GFP_NOFS);
+ unlock_extent(tree, last_offset, end);
unlock_page(page);
page_cache_release(page);
break;
@@ -588,6 +588,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
page_offset(bio->bi_io_vec->bv_page),
PAGE_CACHE_SIZE);
read_unlock(&em_tree->lock);
+ if (!em)
+ return -EIO;
compressed_len = em->block_len;
cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
@@ -660,7 +662,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
bio_get(comp_bio);
ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
/*
* inc the count before we submit the bio so
@@ -673,19 +675,20 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
ret = btrfs_lookup_bio_sums(root, inode,
comp_bio, sums);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
sums += (comp_bio->bi_size + root->sectorsize - 1) /
root->sectorsize;
ret = btrfs_map_bio(root, READ, comp_bio,
mirror_num, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
bio_put(comp_bio);
comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
GFP_NOFS);
+ BUG_ON(!comp_bio);
comp_bio->bi_private = cb;
comp_bio->bi_end_io = end_compressed_bio_read;
@@ -696,15 +699,15 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
bio_get(comp_bio);
ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
bio_put(comp_bio);
return 0;
@@ -732,7 +735,7 @@ struct btrfs_compress_op *btrfs_compress_op[] = {
&btrfs_lzo_compress,
};
-int __init btrfs_init_compress(void)
+void __init btrfs_init_compress(void)
{
int i;
@@ -742,7 +745,6 @@ int __init btrfs_init_compress(void)
atomic_set(&comp_alloc_workspace[i], 0);
init_waitqueue_head(&comp_workspace_wait[i]);
}
- return 0;
}
/*
@@ -991,9 +993,9 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
bytes = min(PAGE_CACHE_SIZE - *pg_offset,
PAGE_CACHE_SIZE - buf_offset);
bytes = min(bytes, working_bytes);
- kaddr = kmap_atomic(page_out, KM_USER0);
+ kaddr = kmap_atomic(page_out);
memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
flush_dcache_page(page_out);
*pg_offset += bytes;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index a12059f4f0f..9afb0a62ae8 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -19,7 +19,7 @@
#ifndef __BTRFS_COMPRESSION_
#define __BTRFS_COMPRESSION_
-int btrfs_init_compress(void);
+void btrfs_init_compress(void);
void btrfs_exit_compress(void);
int btrfs_compress_pages(int type, struct address_space *mapping,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index dede441bdee..8206b390058 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -18,6 +18,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/rbtree.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -36,8 +37,17 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *dst_buf,
struct extent_buffer *src_buf);
-static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct btrfs_path *path, int level, int slot);
+static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_path *path, int level, int slot,
+ int tree_mod_log);
+static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb);
+struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
+ u32 blocksize, u64 parent_transid,
+ u64 time_seq);
+struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root,
+ u64 bytenr, u32 blocksize,
+ u64 time_seq);
struct btrfs_path *btrfs_alloc_path(void)
{
@@ -156,10 +166,23 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
struct extent_buffer *eb;
- rcu_read_lock();
- eb = rcu_dereference(root->node);
- extent_buffer_get(eb);
- rcu_read_unlock();
+ while (1) {
+ rcu_read_lock();
+ eb = rcu_dereference(root->node);
+
+ /*
+ * RCU really hurts here, we could free up the root node because
+ * it was cow'ed but we may not get the new root node yet so do
+ * the inc_not_zero dance and if it doesn't work then
+ * synchronize_rcu and try again.
+ */
+ if (atomic_inc_not_zero(&eb->refs)) {
+ rcu_read_unlock();
+ break;
+ }
+ rcu_read_unlock();
+ synchronize_rcu();
+ }
return eb;
}
@@ -207,10 +230,12 @@ struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
*/
static void add_root_to_dirty_list(struct btrfs_root *root)
{
+ spin_lock(&root->fs_info->trans_lock);
if (root->track_dirty && list_empty(&root->dirty_list)) {
list_add(&root->dirty_list,
&root->fs_info->dirty_cowonly_roots);
}
+ spin_unlock(&root->fs_info->trans_lock);
}
/*
@@ -261,9 +286,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
WARN_ON(btrfs_header_generation(buf) > trans->transid);
if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
- ret = btrfs_inc_ref(trans, root, cow, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 1, 1);
else
- ret = btrfs_inc_ref(trans, root, cow, 0);
+ ret = btrfs_inc_ref(trans, root, cow, 0, 1);
if (ret)
return ret;
@@ -273,6 +298,449 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
return 0;
}
+enum mod_log_op {
+ MOD_LOG_KEY_REPLACE,
+ MOD_LOG_KEY_ADD,
+ MOD_LOG_KEY_REMOVE,
+ MOD_LOG_KEY_REMOVE_WHILE_FREEING,
+ MOD_LOG_KEY_REMOVE_WHILE_MOVING,
+ MOD_LOG_MOVE_KEYS,
+ MOD_LOG_ROOT_REPLACE,
+};
+
+struct tree_mod_move {
+ int dst_slot;
+ int nr_items;
+};
+
+struct tree_mod_root {
+ u64 logical;
+ u8 level;
+};
+
+struct tree_mod_elem {
+ struct rb_node node;
+ u64 index; /* shifted logical */
+ struct seq_list elem;
+ enum mod_log_op op;
+
+ /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
+ int slot;
+
+ /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
+ u64 generation;
+
+ /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
+ struct btrfs_disk_key key;
+ u64 blockptr;
+
+ /* this is used for op == MOD_LOG_MOVE_KEYS */
+ struct tree_mod_move move;
+
+ /* this is used for op == MOD_LOG_ROOT_REPLACE */
+ struct tree_mod_root old_root;
+};
+
+static inline void
+__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem)
+{
+ elem->seq = atomic_inc_return(&fs_info->tree_mod_seq);
+ list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+}
+
+void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem)
+{
+ elem->flags = 1;
+ spin_lock(&fs_info->tree_mod_seq_lock);
+ __get_tree_mod_seq(fs_info, elem);
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+}
+
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem)
+{
+ struct rb_root *tm_root;
+ struct rb_node *node;
+ struct rb_node *next;
+ struct seq_list *cur_elem;
+ struct tree_mod_elem *tm;
+ u64 min_seq = (u64)-1;
+ u64 seq_putting = elem->seq;
+
+ if (!seq_putting)
+ return;
+
+ BUG_ON(!(elem->flags & 1));
+ spin_lock(&fs_info->tree_mod_seq_lock);
+ list_del(&elem->list);
+
+ list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
+ if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) {
+ if (seq_putting > cur_elem->seq) {
+ /*
+ * blocker with lower sequence number exists, we
+ * cannot remove anything from the log
+ */
+ goto out;
+ }
+ min_seq = cur_elem->seq;
+ }
+ }
+
+ /*
+ * anything that's lower than the lowest existing (read: blocked)
+ * sequence number can be removed from the tree.
+ */
+ write_lock(&fs_info->tree_mod_log_lock);
+ tm_root = &fs_info->tree_mod_log;
+ for (node = rb_first(tm_root); node; node = next) {
+ next = rb_next(node);
+ tm = container_of(node, struct tree_mod_elem, node);
+ if (tm->elem.seq > min_seq)
+ continue;
+ rb_erase(node, tm_root);
+ list_del(&tm->elem.list);
+ kfree(tm);
+ }
+ write_unlock(&fs_info->tree_mod_log_lock);
+out:
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+}
+
+/*
+ * key order of the log:
+ * index -> sequence
+ *
+ * the index is the shifted logical of the *new* root node for root replace
+ * operations, or the shifted logical of the affected block for all other
+ * operations.
+ */
+static noinline int
+__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
+{
+ struct rb_root *tm_root;
+ struct rb_node **new;
+ struct rb_node *parent = NULL;
+ struct tree_mod_elem *cur;
+ int ret = 0;
+
+ BUG_ON(!tm || !tm->elem.seq);
+
+ write_lock(&fs_info->tree_mod_log_lock);
+ tm_root = &fs_info->tree_mod_log;
+ new = &tm_root->rb_node;
+ while (*new) {
+ cur = container_of(*new, struct tree_mod_elem, node);
+ parent = *new;
+ if (cur->index < tm->index)
+ new = &((*new)->rb_left);
+ else if (cur->index > tm->index)
+ new = &((*new)->rb_right);
+ else if (cur->elem.seq < tm->elem.seq)
+ new = &((*new)->rb_left);
+ else if (cur->elem.seq > tm->elem.seq)
+ new = &((*new)->rb_right);
+ else {
+ kfree(tm);
+ ret = -EEXIST;
+ goto unlock;
+ }
+ }
+
+ rb_link_node(&tm->node, parent, new);
+ rb_insert_color(&tm->node, tm_root);
+unlock:
+ write_unlock(&fs_info->tree_mod_log_lock);
+ return ret;
+}
+
+static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb) {
+ smp_mb();
+ if (list_empty(&(fs_info)->tree_mod_seq_list))
+ return 1;
+ if (!eb)
+ return 0;
+ if (btrfs_header_level(eb) == 0)
+ return 1;
+ return 0;
+}
+
+/*
+ * This allocates memory and gets a tree modification sequence number when
+ * needed.
+ *
+ * Returns 0 when no sequence number is needed, < 0 on error.
+ * Returns 1 when a sequence number was added. In this case,
+ * fs_info->tree_mod_seq_lock was acquired and must be released by the caller
+ * after inserting into the rb tree.
+ */
+static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
+ struct tree_mod_elem **tm_ret)
+{
+ struct tree_mod_elem *tm;
+ int seq;
+
+ if (tree_mod_dont_log(fs_info, NULL))
+ return 0;
+
+ tm = *tm_ret = kzalloc(sizeof(*tm), flags);
+ if (!tm)
+ return -ENOMEM;
+
+ tm->elem.flags = 0;
+ spin_lock(&fs_info->tree_mod_seq_lock);
+ if (list_empty(&fs_info->tree_mod_seq_list)) {
+ /*
+ * someone emptied the list while we were waiting for the lock.
+ * we must not add to the list, because no blocker exists. items
+ * are removed from the list only when the existing blocker is
+ * removed from the list.
+ */
+ kfree(tm);
+ seq = 0;
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+ } else {
+ __get_tree_mod_seq(fs_info, &tm->elem);
+ seq = tm->elem.seq;
+ }
+
+ return seq;
+}
+
+static noinline int
+tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb, int slot,
+ enum mod_log_op op, gfp_t flags)
+{
+ struct tree_mod_elem *tm;
+ int ret;
+
+ ret = tree_mod_alloc(fs_info, flags, &tm);
+ if (ret <= 0)
+ return ret;
+
+ tm->index = eb->start >> PAGE_CACHE_SHIFT;
+ if (op != MOD_LOG_KEY_ADD) {
+ btrfs_node_key(eb, &tm->key, slot);
+ tm->blockptr = btrfs_node_blockptr(eb, slot);
+ }
+ tm->op = op;
+ tm->slot = slot;
+ tm->generation = btrfs_node_ptr_generation(eb, slot);
+
+ ret = __tree_mod_log_insert(fs_info, tm);
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+ return ret;
+}
+
+static noinline int
+tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+ int slot, enum mod_log_op op)
+{
+ return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
+}
+
+static noinline int
+tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb, int dst_slot, int src_slot,
+ int nr_items, gfp_t flags)
+{
+ struct tree_mod_elem *tm;
+ int ret;
+ int i;
+
+ if (tree_mod_dont_log(fs_info, eb))
+ return 0;
+
+ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
+ ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot,
+ MOD_LOG_KEY_REMOVE_WHILE_MOVING);
+ BUG_ON(ret < 0);
+ }
+
+ ret = tree_mod_alloc(fs_info, flags, &tm);
+ if (ret <= 0)
+ return ret;
+
+ tm->index = eb->start >> PAGE_CACHE_SHIFT;
+ tm->slot = src_slot;
+ tm->move.dst_slot = dst_slot;
+ tm->move.nr_items = nr_items;
+ tm->op = MOD_LOG_MOVE_KEYS;
+
+ ret = __tree_mod_log_insert(fs_info, tm);
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+ return ret;
+}
+
+static noinline int
+tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *old_root,
+ struct extent_buffer *new_root, gfp_t flags)
+{
+ struct tree_mod_elem *tm;
+ int ret;
+
+ ret = tree_mod_alloc(fs_info, flags, &tm);
+ if (ret <= 0)
+ return ret;
+
+ tm->index = new_root->start >> PAGE_CACHE_SHIFT;
+ tm->old_root.logical = old_root->start;
+ tm->old_root.level = btrfs_header_level(old_root);
+ tm->generation = btrfs_header_generation(old_root);
+ tm->op = MOD_LOG_ROOT_REPLACE;
+
+ ret = __tree_mod_log_insert(fs_info, tm);
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+ return ret;
+}
+
+static struct tree_mod_elem *
+__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
+ int smallest)
+{
+ struct rb_root *tm_root;
+ struct rb_node *node;
+ struct tree_mod_elem *cur = NULL;
+ struct tree_mod_elem *found = NULL;
+ u64 index = start >> PAGE_CACHE_SHIFT;
+
+ read_lock(&fs_info->tree_mod_log_lock);
+ tm_root = &fs_info->tree_mod_log;
+ node = tm_root->rb_node;
+ while (node) {
+ cur = container_of(node, struct tree_mod_elem, node);
+ if (cur->index < index) {
+ node = node->rb_left;
+ } else if (cur->index > index) {
+ node = node->rb_right;
+ } else if (cur->elem.seq < min_seq) {
+ node = node->rb_left;
+ } else if (!smallest) {
+ /* we want the node with the highest seq */
+ if (found)
+ BUG_ON(found->elem.seq > cur->elem.seq);
+ found = cur;
+ node = node->rb_left;
+ } else if (cur->elem.seq > min_seq) {
+ /* we want the node with the smallest seq */
+ if (found)
+ BUG_ON(found->elem.seq < cur->elem.seq);
+ found = cur;
+ node = node->rb_right;
+ } else {
+ found = cur;
+ break;
+ }
+ }
+ read_unlock(&fs_info->tree_mod_log_lock);
+
+ return found;
+}
+
+/*
+ * this returns the element from the log with the smallest time sequence
+ * value that's in the log (the oldest log item). any element with a time
+ * sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *
+tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
+ u64 min_seq)
+{
+ return __tree_mod_log_search(fs_info, start, min_seq, 1);
+}
+
+/*
+ * this returns the element from the log with the largest time sequence
+ * value that's in the log (the most recent log item). any element with
+ * a time sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *
+tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
+{
+ return __tree_mod_log_search(fs_info, start, min_seq, 0);
+}
+
+static inline void
+tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
+ struct extent_buffer *src, unsigned long dst_offset,
+ unsigned long src_offset, int nr_items)
+{
+ int ret;
+ int i;
+
+ if (tree_mod_dont_log(fs_info, NULL))
+ return;
+
+ if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
+ return;
+
+ /* speed this up by single seq for all operations? */
+ for (i = 0; i < nr_items; i++) {
+ ret = tree_mod_log_insert_key(fs_info, src, i + src_offset,
+ MOD_LOG_KEY_REMOVE);
+ BUG_ON(ret < 0);
+ ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset,
+ MOD_LOG_KEY_ADD);
+ BUG_ON(ret < 0);
+ }
+}
+
+static inline void
+tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
+ int dst_offset, int src_offset, int nr_items)
+{
+ int ret;
+ ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
+ nr_items, GFP_NOFS);
+ BUG_ON(ret < 0);
+}
+
+static inline void
+tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int slot, int atomic)
+{
+ int ret;
+
+ ret = tree_mod_log_insert_key_mask(fs_info, eb, slot,
+ MOD_LOG_KEY_REPLACE,
+ atomic ? GFP_ATOMIC : GFP_NOFS);
+ BUG_ON(ret < 0);
+}
+
+static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb)
+{
+ int i;
+ int ret;
+ u32 nritems;
+
+ if (tree_mod_dont_log(fs_info, eb))
+ return;
+
+ nritems = btrfs_header_nritems(eb);
+ for (i = nritems - 1; i >= 0; i--) {
+ ret = tree_mod_log_insert_key(fs_info, eb, i,
+ MOD_LOG_KEY_REMOVE_WHILE_FREEING);
+ BUG_ON(ret < 0);
+ }
+}
+
+static inline void
+tree_mod_log_set_root_pointer(struct btrfs_root *root,
+ struct extent_buffer *new_root_node)
+{
+ int ret;
+ tree_mod_log_free_eb(root->fs_info, root->node);
+ ret = tree_mod_log_insert_root(root->fs_info, root->node,
+ new_root_node, GFP_NOFS);
+ BUG_ON(ret < 0);
+}
+
/*
* check if the tree block can be shared by multiple trees
*/
@@ -331,8 +799,13 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (btrfs_block_can_be_shared(root, buf)) {
ret = btrfs_lookup_extent_info(trans, root, buf->start,
buf->len, &refs, &flags);
- BUG_ON(ret);
- BUG_ON(refs == 0);
+ if (ret)
+ return ret;
+ if (refs == 0) {
+ ret = -EROFS;
+ btrfs_std_error(root->fs_info, ret);
+ return ret;
+ }
} else {
refs = 1;
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
@@ -350,44 +823,51 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if ((owner == root->root_key.objectid ||
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
- ret = btrfs_inc_ref(trans, root, buf, 1);
- BUG_ON(ret);
+ ret = btrfs_inc_ref(trans, root, buf, 1, 1);
+ BUG_ON(ret); /* -ENOMEM */
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID) {
- ret = btrfs_dec_ref(trans, root, buf, 0);
- BUG_ON(ret);
- ret = btrfs_inc_ref(trans, root, cow, 1);
- BUG_ON(ret);
+ ret = btrfs_dec_ref(trans, root, buf, 0, 1);
+ BUG_ON(ret); /* -ENOMEM */
+ ret = btrfs_inc_ref(trans, root, cow, 1, 1);
+ BUG_ON(ret); /* -ENOMEM */
}
new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
} else {
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID)
- ret = btrfs_inc_ref(trans, root, cow, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 1, 1);
else
- ret = btrfs_inc_ref(trans, root, cow, 0);
- BUG_ON(ret);
+ ret = btrfs_inc_ref(trans, root, cow, 0, 1);
+ BUG_ON(ret); /* -ENOMEM */
}
if (new_flags != 0) {
ret = btrfs_set_disk_extent_flags(trans, root,
buf->start,
buf->len,
new_flags, 0);
- BUG_ON(ret);
+ if (ret)
+ return ret;
}
} else {
if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID)
- ret = btrfs_inc_ref(trans, root, cow, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 1, 1);
else
- ret = btrfs_inc_ref(trans, root, cow, 0);
- BUG_ON(ret);
- ret = btrfs_dec_ref(trans, root, buf, 1);
- BUG_ON(ret);
+ ret = btrfs_inc_ref(trans, root, cow, 0, 1);
+ BUG_ON(ret); /* -ENOMEM */
+ ret = btrfs_dec_ref(trans, root, buf, 1, 1);
+ BUG_ON(ret); /* -ENOMEM */
}
+ /*
+ * don't log freeing in case we're freeing the root node, this
+ * is done by tree_mod_log_set_root_pointer later
+ */
+ if (buf != root->node && btrfs_header_level(buf) != 0)
+ tree_mod_log_free_eb(root->fs_info, buf);
clean_tree_block(trans, root, buf);
*last_ref = 1;
}
@@ -415,7 +895,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
{
struct btrfs_disk_key disk_key;
struct extent_buffer *cow;
- int level;
+ int level, ret;
int last_ref = 0;
int unlock_orig = 0;
u64 parent_start;
@@ -467,7 +947,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
(unsigned long)btrfs_header_fsid(cow),
BTRFS_FSID_SIZE);
- update_ref_for_cow(trans, root, buf, cow, &last_ref);
+ ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
if (root->ref_cows)
btrfs_reloc_cow_block(trans, root, buf, cow);
@@ -481,6 +965,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
parent_start = 0;
extent_buffer_get(cow);
+ tree_mod_log_set_root_pointer(root, cow);
rcu_assign_pointer(root->node, cow);
btrfs_free_tree_block(trans, root, buf, parent_start,
@@ -494,6 +979,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
parent_start = 0;
WARN_ON(trans->transid != btrfs_header_generation(parent));
+ tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
+ MOD_LOG_KEY_REPLACE);
btrfs_set_node_blockptr(parent, parent_slot,
cow->start);
btrfs_set_node_ptr_generation(parent, parent_slot,
@@ -504,12 +991,235 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
}
if (unlock_orig)
btrfs_tree_unlock(buf);
- free_extent_buffer(buf);
+ free_extent_buffer_stale(buf);
btrfs_mark_buffer_dirty(cow);
*cow_ret = cow;
return 0;
}
+/*
+ * returns the logical address of the oldest predecessor of the given root.
+ * entries older than time_seq are ignored.
+ */
+static struct tree_mod_elem *
+__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root, u64 time_seq)
+{
+ struct tree_mod_elem *tm;
+ struct tree_mod_elem *found = NULL;
+ u64 root_logical = root->node->start;
+ int looped = 0;
+
+ if (!time_seq)
+ return 0;
+
+ /*
+ * the very last operation that's logged for a root is the replacement
+ * operation (if it is replaced at all). this has the index of the *new*
+ * root, making it the very first operation that's logged for this root.
+ */
+ while (1) {
+ tm = tree_mod_log_search_oldest(fs_info, root_logical,
+ time_seq);
+ if (!looped && !tm)
+ return 0;
+ /*
+ * if there are no tree operation for the oldest root, we simply
+ * return it. this should only happen if that (old) root is at
+ * level 0.
+ */
+ if (!tm)
+ break;
+
+ /*
+ * if there's an operation that's not a root replacement, we
+ * found the oldest version of our root. normally, we'll find a
+ * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
+ */
+ if (tm->op != MOD_LOG_ROOT_REPLACE)
+ break;
+
+ found = tm;
+ root_logical = tm->old_root.logical;
+ BUG_ON(root_logical == root->node->start);
+ looped = 1;
+ }
+
+ /* if there's no old root to return, return what we found instead */
+ if (!found)
+ found = tm;
+
+ return found;
+}
+
+/*
+ * tm is a pointer to the first operation to rewind within eb. then, all
+ * previous operations will be rewinded (until we reach something older than
+ * time_seq).
+ */
+static void
+__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
+ struct tree_mod_elem *first_tm)
+{
+ u32 n;
+ struct rb_node *next;
+ struct tree_mod_elem *tm = first_tm;
+ unsigned long o_dst;
+ unsigned long o_src;
+ unsigned long p_size = sizeof(struct btrfs_key_ptr);
+
+ n = btrfs_header_nritems(eb);
+ while (tm && tm->elem.seq >= time_seq) {
+ /*
+ * all the operations are recorded with the operator used for
+ * the modification. as we're going backwards, we do the
+ * opposite of each operation here.
+ */
+ switch (tm->op) {
+ case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
+ BUG_ON(tm->slot < n);
+ case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
+ case MOD_LOG_KEY_REMOVE:
+ btrfs_set_node_key(eb, &tm->key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_ptr_generation(eb, tm->slot,
+ tm->generation);
+ n++;
+ break;
+ case MOD_LOG_KEY_REPLACE:
+ BUG_ON(tm->slot >= n);
+ btrfs_set_node_key(eb, &tm->key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_ptr_generation(eb, tm->slot,
+ tm->generation);
+ break;
+ case MOD_LOG_KEY_ADD:
+ /* if a move operation is needed it's in the log */
+ n--;
+ break;
+ case MOD_LOG_MOVE_KEYS:
+ o_dst = btrfs_node_key_ptr_offset(tm->slot);
+ o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
+ memmove_extent_buffer(eb, o_dst, o_src,
+ tm->move.nr_items * p_size);
+ break;
+ case MOD_LOG_ROOT_REPLACE:
+ /*
+ * this operation is special. for roots, this must be
+ * handled explicitly before rewinding.
+ * for non-roots, this operation may exist if the node
+ * was a root: root A -> child B; then A gets empty and
+ * B is promoted to the new root. in the mod log, we'll
+ * have a root-replace operation for B, a tree block
+ * that is no root. we simply ignore that operation.
+ */
+ break;
+ }
+ next = rb_next(&tm->node);
+ if (!next)
+ break;
+ tm = container_of(next, struct tree_mod_elem, node);
+ if (tm->index != first_tm->index)
+ break;
+ }
+ btrfs_set_header_nritems(eb, n);
+}
+
+static struct extent_buffer *
+tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+ u64 time_seq)
+{
+ struct extent_buffer *eb_rewin;
+ struct tree_mod_elem *tm;
+
+ if (!time_seq)
+ return eb;
+
+ if (btrfs_header_level(eb) == 0)
+ return eb;
+
+ tm = tree_mod_log_search(fs_info, eb->start, time_seq);
+ if (!tm)
+ return eb;
+
+ if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+ BUG_ON(tm->slot != 0);
+ eb_rewin = alloc_dummy_extent_buffer(eb->start,
+ fs_info->tree_root->nodesize);
+ BUG_ON(!eb_rewin);
+ btrfs_set_header_bytenr(eb_rewin, eb->start);
+ btrfs_set_header_backref_rev(eb_rewin,
+ btrfs_header_backref_rev(eb));
+ btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
+ btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
+ } else {
+ eb_rewin = btrfs_clone_extent_buffer(eb);
+ BUG_ON(!eb_rewin);
+ }
+
+ extent_buffer_get(eb_rewin);
+ free_extent_buffer(eb);
+
+ __tree_mod_log_rewind(eb_rewin, time_seq, tm);
+
+ return eb_rewin;
+}
+
+/*
+ * get_old_root() rewinds the state of @root's root node to the given @time_seq
+ * value. If there are no changes, the current root->root_node is returned. If
+ * anything changed in between, there's a fresh buffer allocated on which the
+ * rewind operations are done. In any case, the returned buffer is read locked.
+ * Returns NULL on error (with no locks held).
+ */
+static inline struct extent_buffer *
+get_old_root(struct btrfs_root *root, u64 time_seq)
+{
+ struct tree_mod_elem *tm;
+ struct extent_buffer *eb;
+ struct tree_mod_root *old_root = NULL;
+ u64 old_generation = 0;
+ u64 logical;
+
+ eb = btrfs_read_lock_root_node(root);
+ tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
+ if (!tm)
+ return root->node;
+
+ if (tm->op == MOD_LOG_ROOT_REPLACE) {
+ old_root = &tm->old_root;
+ old_generation = tm->generation;
+ logical = old_root->logical;
+ } else {
+ logical = root->node->start;
+ }
+
+ tm = tree_mod_log_search(root->fs_info, logical, time_seq);
+ if (old_root)
+ eb = alloc_dummy_extent_buffer(logical, root->nodesize);
+ else
+ eb = btrfs_clone_extent_buffer(root->node);
+ btrfs_tree_read_unlock(root->node);
+ free_extent_buffer(root->node);
+ if (!eb)
+ return NULL;
+ btrfs_tree_read_lock(eb);
+ if (old_root) {
+ btrfs_set_header_bytenr(eb, eb->start);
+ btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
+ btrfs_set_header_owner(eb, root->root_key.objectid);
+ btrfs_set_header_level(eb, old_root->level);
+ btrfs_set_header_generation(eb, old_generation);
+ }
+ if (tm)
+ __tree_mod_log_rewind(eb, time_seq, tm);
+ else
+ WARN_ON(btrfs_header_level(eb) != 0);
+ extent_buffer_get(eb);
+
+ return eb;
+}
+
static inline int should_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf)
@@ -700,7 +1410,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
cur = btrfs_find_tree_block(root, blocknr, blocksize);
if (cur)
- uptodate = btrfs_buffer_uptodate(cur, gen);
+ uptodate = btrfs_buffer_uptodate(cur, gen, 0);
else
uptodate = 0;
if (!cur || !uptodate) {
@@ -714,7 +1424,11 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (!cur)
return -EIO;
} else if (!uptodate) {
- btrfs_read_buffer(cur, gen);
+ err = btrfs_read_buffer(cur, gen);
+ if (err) {
+ free_extent_buffer(cur);
+ return err;
+ }
}
}
if (search_start == 0)
@@ -829,20 +1543,18 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot)
{
- if (level == 0) {
+ if (level == 0)
return generic_bin_search(eb,
offsetof(struct btrfs_leaf, items),
sizeof(struct btrfs_item),
key, btrfs_header_nritems(eb),
slot);
- } else {
+ else
return generic_bin_search(eb,
offsetof(struct btrfs_node, ptrs),
sizeof(struct btrfs_key_ptr),
key, btrfs_header_nritems(eb),
slot);
- }
- return -1;
}
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
@@ -934,7 +1646,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* promote the child to a root */
child = read_node_slot(root, mid, 0);
- BUG_ON(!child);
+ if (!child) {
+ ret = -EROFS;
+ btrfs_std_error(root->fs_info, ret);
+ goto enospc;
+ }
+
btrfs_tree_lock(child);
btrfs_set_lock_blocking(child);
ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
@@ -944,6 +1661,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto enospc;
}
+ tree_mod_log_set_root_pointer(root, child);
rcu_assign_pointer(root->node, child);
add_root_to_dirty_list(root);
@@ -959,15 +1677,13 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
root_sub_used(root, mid->len);
btrfs_free_tree_block(trans, root, mid, 0, 1);
/* once for the root ptr */
- free_extent_buffer(mid);
+ free_extent_buffer_stale(mid);
return 0;
}
if (btrfs_header_nritems(mid) >
BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
return 0;
- btrfs_header_nritems(mid);
-
left = read_node_slot(root, parent, pslot - 1);
if (left) {
btrfs_tree_lock(left);
@@ -997,7 +1713,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
wret = push_node_left(trans, root, left, mid, 1);
if (wret < 0)
ret = wret;
- btrfs_header_nritems(mid);
}
/*
@@ -1010,17 +1725,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (btrfs_header_nritems(right) == 0) {
clean_tree_block(trans, root, right);
btrfs_tree_unlock(right);
- wret = del_ptr(trans, root, path, level + 1, pslot +
- 1);
- if (wret)
- ret = wret;
+ del_ptr(trans, root, path, level + 1, pslot + 1, 1);
root_sub_used(root, right->len);
btrfs_free_tree_block(trans, root, right, 0, 1);
- free_extent_buffer(right);
+ free_extent_buffer_stale(right);
right = NULL;
} else {
struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
+ tree_mod_log_set_node_key(root->fs_info, parent,
+ &right_key, pslot + 1, 0);
btrfs_set_node_key(parent, &right_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
}
@@ -1035,7 +1749,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
* otherwise we would have pulled some pointers from the
* right
*/
- BUG_ON(!left);
+ if (!left) {
+ ret = -EROFS;
+ btrfs_std_error(root->fs_info, ret);
+ goto enospc;
+ }
wret = balance_node_right(trans, root, mid, left);
if (wret < 0) {
ret = wret;
@@ -1051,17 +1769,17 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (btrfs_header_nritems(mid) == 0) {
clean_tree_block(trans, root, mid);
btrfs_tree_unlock(mid);
- wret = del_ptr(trans, root, path, level + 1, pslot);
- if (wret)
- ret = wret;
+ del_ptr(trans, root, path, level + 1, pslot, 1);
root_sub_used(root, mid->len);
btrfs_free_tree_block(trans, root, mid, 0, 1);
- free_extent_buffer(mid);
+ free_extent_buffer_stale(mid);
mid = NULL;
} else {
/* update the parent key to reflect our changes */
struct btrfs_disk_key mid_key;
btrfs_node_key(mid, &mid_key, 0);
+ tree_mod_log_set_node_key(root->fs_info, parent, &mid_key,
+ pslot, 0);
btrfs_set_node_key(parent, &mid_key, pslot);
btrfs_mark_buffer_dirty(parent);
}
@@ -1159,6 +1877,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
orig_slot += left_nr;
btrfs_node_key(mid, &disk_key, 0);
+ tree_mod_log_set_node_key(root->fs_info, parent,
+ &disk_key, pslot, 0);
btrfs_set_node_key(parent, &disk_key, pslot);
btrfs_mark_buffer_dirty(parent);
if (btrfs_header_nritems(left) > orig_slot) {
@@ -1210,6 +1930,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
btrfs_node_key(right, &disk_key, 0);
+ tree_mod_log_set_node_key(root->fs_info, parent,
+ &disk_key, pslot + 1, 0);
btrfs_set_node_key(parent, &disk_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@@ -1331,7 +2053,12 @@ static noinline int reada_for_balance(struct btrfs_root *root,
block1 = btrfs_node_blockptr(parent, slot - 1);
gen = btrfs_node_ptr_generation(parent, slot - 1);
eb = btrfs_find_tree_block(root, block1, blocksize);
- if (eb && btrfs_buffer_uptodate(eb, gen))
+ /*
+ * if we get -eagain from btrfs_buffer_uptodate, we
+ * don't want to return eagain here. That will loop
+ * forever
+ */
+ if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
block1 = 0;
free_extent_buffer(eb);
}
@@ -1339,7 +2066,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
block2 = btrfs_node_blockptr(parent, slot + 1);
gen = btrfs_node_ptr_generation(parent, slot + 1);
eb = btrfs_find_tree_block(root, block2, blocksize);
- if (eb && btrfs_buffer_uptodate(eb, gen))
+ if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
block2 = 0;
free_extent_buffer(eb);
}
@@ -1382,7 +2109,8 @@ static noinline int reada_for_balance(struct btrfs_root *root,
* if lowest_unlock is 1, level 0 won't be unlocked
*/
static noinline void unlock_up(struct btrfs_path *path, int level,
- int lowest_unlock)
+ int lowest_unlock, int min_write_lock_level,
+ int *write_lock_level)
{
int i;
int skip_level = level;
@@ -1414,6 +2142,11 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
btrfs_tree_unlock_rw(t, path->locks[i]);
path->locks[i] = 0;
+ if (write_lock_level &&
+ i > min_write_lock_level &&
+ i <= *write_lock_level) {
+ *write_lock_level = i - 1;
+ }
}
}
}
@@ -1456,7 +2189,7 @@ static int
read_block_for_search(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *p,
struct extent_buffer **eb_ret, int level, int slot,
- struct btrfs_key *key)
+ struct btrfs_key *key, u64 time_seq)
{
u64 blocknr;
u64 gen;
@@ -1471,8 +2204,9 @@ read_block_for_search(struct btrfs_trans_handle *trans,
tmp = btrfs_find_tree_block(root, blocknr, blocksize);
if (tmp) {
- if (btrfs_buffer_uptodate(tmp, 0)) {
- if (btrfs_buffer_uptodate(tmp, gen)) {
+ /* first we do an atomic uptodate check */
+ if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) {
+ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
/*
* we found an up to date block without
* sleeping, return
@@ -1490,8 +2224,9 @@ read_block_for_search(struct btrfs_trans_handle *trans,
free_extent_buffer(tmp);
btrfs_set_path_blocking(p);
+ /* now we're allowed to do a blocking uptodate check */
tmp = read_tree_block(root, blocknr, blocksize, gen);
- if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
+ if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) {
*eb_ret = tmp;
return 0;
}
@@ -1526,7 +2261,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
* and give up so that our caller doesn't loop forever
* on our EAGAINs.
*/
- if (!btrfs_buffer_uptodate(tmp, 0))
+ if (!btrfs_buffer_uptodate(tmp, 0, 0))
ret = -EIO;
free_extent_buffer(tmp);
}
@@ -1637,6 +2372,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
/* everything at write_lock_level or lower must be write locked */
int write_lock_level = 0;
u8 lowest_level = 0;
+ int min_write_lock_level;
lowest_level = p->lowest_level;
WARN_ON(lowest_level && ins_len > 0);
@@ -1664,6 +2400,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
if (cow && (p->keep_locks || p->lowest_level))
write_lock_level = BTRFS_MAX_LEVEL;
+ min_write_lock_level = write_lock_level;
+
again:
/*
* we try very hard to do read locks on the root
@@ -1795,7 +2533,8 @@ cow_done:
goto again;
}
- unlock_up(p, level, lowest_unlock);
+ unlock_up(p, level, lowest_unlock,
+ min_write_lock_level, &write_lock_level);
if (level == lowest_level) {
if (dec)
@@ -1804,7 +2543,7 @@ cow_done:
}
err = read_block_for_search(trans, root, p,
- &b, level, slot, key);
+ &b, level, slot, key, 0);
if (err == -EAGAIN)
goto again;
if (err) {
@@ -1857,7 +2596,8 @@ cow_done:
}
}
if (!p->search_for_split)
- unlock_up(p, level, lowest_unlock);
+ unlock_up(p, level, lowest_unlock,
+ min_write_lock_level, &write_lock_level);
goto done;
}
}
@@ -1875,21 +2615,125 @@ done:
}
/*
+ * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
+ * current state of the tree together with the operations recorded in the tree
+ * modification log to search for the key in a previous version of this tree, as
+ * denoted by the time_seq parameter.
+ *
+ * Naturally, there is no support for insert, delete or cow operations.
+ *
+ * The resulting path and return value will be set up as if we called
+ * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
+ */
+int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *p, u64 time_seq)
+{
+ struct extent_buffer *b;
+ int slot;
+ int ret;
+ int err;
+ int level;
+ int lowest_unlock = 1;
+ u8 lowest_level = 0;
+
+ lowest_level = p->lowest_level;
+ WARN_ON(p->nodes[0] != NULL);
+
+ if (p->search_commit_root) {
+ BUG_ON(time_seq);
+ return btrfs_search_slot(NULL, root, key, p, 0, 0);
+ }
+
+again:
+ b = get_old_root(root, time_seq);
+ level = btrfs_header_level(b);
+ p->locks[level] = BTRFS_READ_LOCK;
+
+ while (b) {
+ level = btrfs_header_level(b);
+ p->nodes[level] = b;
+ btrfs_clear_path_blocking(p, NULL, 0);
+
+ /*
+ * we have a lock on b and as long as we aren't changing
+ * the tree, there is no way to for the items in b to change.
+ * It is safe to drop the lock on our parent before we
+ * go through the expensive btree search on b.
+ */
+ btrfs_unlock_up_safe(p, level + 1);
+
+ ret = bin_search(b, key, level, &slot);
+
+ if (level != 0) {
+ int dec = 0;
+ if (ret && slot > 0) {
+ dec = 1;
+ slot -= 1;
+ }
+ p->slots[level] = slot;
+ unlock_up(p, level, lowest_unlock, 0, NULL);
+
+ if (level == lowest_level) {
+ if (dec)
+ p->slots[level]++;
+ goto done;
+ }
+
+ err = read_block_for_search(NULL, root, p, &b, level,
+ slot, key, time_seq);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ ret = err;
+ goto done;
+ }
+
+ level = btrfs_header_level(b);
+ err = btrfs_try_tree_read_lock(b);
+ if (!err) {
+ btrfs_set_path_blocking(p);
+ btrfs_tree_read_lock(b);
+ btrfs_clear_path_blocking(p, b,
+ BTRFS_READ_LOCK);
+ }
+ p->locks[level] = BTRFS_READ_LOCK;
+ p->nodes[level] = b;
+ b = tree_mod_log_rewind(root->fs_info, b, time_seq);
+ if (b != p->nodes[level]) {
+ btrfs_tree_unlock_rw(p->nodes[level],
+ p->locks[level]);
+ p->locks[level] = 0;
+ p->nodes[level] = b;
+ }
+ } else {
+ p->slots[level] = slot;
+ unlock_up(p, level, lowest_unlock, 0, NULL);
+ goto done;
+ }
+ }
+ ret = 1;
+done:
+ if (!p->leave_spinning)
+ btrfs_set_path_blocking(p);
+ if (ret < 0)
+ btrfs_release_path(p);
+
+ return ret;
+}
+
+/*
* adjust the pointers going up the tree, starting at level
* making sure the right key of each node is points to 'key'.
* This is used after shifting pointers to the left, so it stops
* fixing up pointers when a given leaf/node is not in slot 0 of the
* higher levels
*
- * If this fails to write a tree block, it returns -1, but continues
- * fixing up the blocks in ram so the tree is consistent.
*/
-static int fixup_low_keys(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_disk_key *key, int level)
+static void fixup_low_keys(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_disk_key *key, int level)
{
int i;
- int ret = 0;
struct extent_buffer *t;
for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -1897,12 +2741,12 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans,
if (!path->nodes[i])
break;
t = path->nodes[i];
+ tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1);
btrfs_set_node_key(t, key, tslot);
btrfs_mark_buffer_dirty(path->nodes[i]);
if (tslot != 0)
break;
}
- return ret;
}
/*
@@ -1911,9 +2755,9 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans,
* This function isn't completely safe. It's the caller's responsibility
* that the new key won't break the order
*/
-int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *new_key)
+void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *new_key)
{
struct btrfs_disk_key disk_key;
struct extent_buffer *eb;
@@ -1923,13 +2767,11 @@ int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
slot = path->slots[0];
if (slot > 0) {
btrfs_item_key(eb, &disk_key, slot - 1);
- if (comp_keys(&disk_key, new_key) >= 0)
- return -1;
+ BUG_ON(comp_keys(&disk_key, new_key) >= 0);
}
if (slot < btrfs_header_nritems(eb) - 1) {
btrfs_item_key(eb, &disk_key, slot + 1);
- if (comp_keys(&disk_key, new_key) <= 0)
- return -1;
+ BUG_ON(comp_keys(&disk_key, new_key) <= 0);
}
btrfs_cpu_key_to_disk(&disk_key, new_key);
@@ -1937,7 +2779,6 @@ int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(eb);
if (slot == 0)
fixup_low_keys(trans, root, path, &disk_key, 1);
- return 0;
}
/*
@@ -1983,12 +2824,16 @@ static int push_node_left(struct btrfs_trans_handle *trans,
} else
push_items = min(src_nritems - 8, push_items);
+ tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
+ push_items);
copy_extent_buffer(dst, src,
btrfs_node_key_ptr_offset(dst_nritems),
btrfs_node_key_ptr_offset(0),
push_items * sizeof(struct btrfs_key_ptr));
if (push_items < src_nritems) {
+ tree_mod_log_eb_move(root->fs_info, src, 0, push_items,
+ src_nritems - push_items);
memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(push_items),
(src_nritems - push_items) *
@@ -2042,11 +2887,14 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
if (max_push < push_items)
push_items = max_push;
+ tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
btrfs_node_key_ptr_offset(0),
(dst_nritems) *
sizeof(struct btrfs_key_ptr));
+ tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
+ src_nritems - push_items, push_items);
copy_extent_buffer(dst, src,
btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(src_nritems - push_items),
@@ -2121,6 +2969,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(c);
old = root->node;
+ tree_mod_log_set_root_pointer(root, c);
rcu_assign_pointer(root->node, c);
/* the super has an extra ref to root->node */
@@ -2140,36 +2989,42 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
*
* slot and level indicate where you want the key to go, and
* blocknr is the block the key points to.
- *
- * returns zero on success and < 0 on any error
*/
-static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path, struct btrfs_disk_key
- *key, u64 bytenr, int slot, int level)
+static void insert_ptr(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_disk_key *key, u64 bytenr,
+ int slot, int level)
{
struct extent_buffer *lower;
int nritems;
+ int ret;
BUG_ON(!path->nodes[level]);
btrfs_assert_tree_locked(path->nodes[level]);
lower = path->nodes[level];
nritems = btrfs_header_nritems(lower);
BUG_ON(slot > nritems);
- if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
- BUG();
+ BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
if (slot != nritems) {
+ if (level)
+ tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
+ slot, nritems - slot);
memmove_extent_buffer(lower,
btrfs_node_key_ptr_offset(slot + 1),
btrfs_node_key_ptr_offset(slot),
(nritems - slot) * sizeof(struct btrfs_key_ptr));
}
+ if (level) {
+ ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
+ MOD_LOG_KEY_ADD);
+ BUG_ON(ret < 0);
+ }
btrfs_set_node_key(lower, key, slot);
btrfs_set_node_blockptr(lower, slot, bytenr);
WARN_ON(trans->transid == 0);
btrfs_set_node_ptr_generation(lower, slot, trans->transid);
btrfs_set_header_nritems(lower, nritems + 1);
btrfs_mark_buffer_dirty(lower);
- return 0;
}
/*
@@ -2190,7 +3045,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
int mid;
int ret;
- int wret;
u32 c_nritems;
c = path->nodes[level];
@@ -2235,7 +3089,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
(unsigned long)btrfs_header_chunk_tree_uuid(split),
BTRFS_UUID_SIZE);
-
+ tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);
copy_extent_buffer(split, c,
btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(mid),
@@ -2247,11 +3101,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(c);
btrfs_mark_buffer_dirty(split);
- wret = insert_ptr(trans, root, path, &disk_key, split->start,
- path->slots[level + 1] + 1,
- level + 1);
- if (wret)
- ret = wret;
+ insert_ptr(trans, root, path, &disk_key, split->start,
+ path->slots[level + 1] + 1, level + 1);
if (path->slots[level] >= mid) {
path->slots[level] -= mid;
@@ -2320,6 +3171,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
{
struct extent_buffer *left = path->nodes[0];
struct extent_buffer *upper = path->nodes[1];
+ struct btrfs_map_token token;
struct btrfs_disk_key disk_key;
int slot;
u32 i;
@@ -2331,6 +3183,8 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
u32 data_end;
u32 this_item_size;
+ btrfs_init_map_token(&token);
+
if (empty)
nr = 0;
else
@@ -2408,8 +3262,8 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
push_space = BTRFS_LEAF_DATA_SIZE(root);
for (i = 0; i < right_nritems; i++) {
item = btrfs_item_nr(right, i);
- push_space -= btrfs_item_size(right, item);
- btrfs_set_item_offset(right, item, push_space);
+ push_space -= btrfs_token_item_size(right, item, &token);
+ btrfs_set_token_item_offset(right, item, push_space, &token);
}
left_nritems -= push_items;
@@ -2537,9 +3391,11 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
u32 old_left_nritems;
u32 nr;
int ret = 0;
- int wret;
u32 this_item_size;
u32 old_left_item_size;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
if (empty)
nr = min(right_nritems, max_slot);
@@ -2600,9 +3456,10 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
item = btrfs_item_nr(left, i);
- ioff = btrfs_item_offset(left, item);
- btrfs_set_item_offset(left, item,
- ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
+ ioff = btrfs_token_item_offset(left, item, &token);
+ btrfs_set_token_item_offset(left, item,
+ ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
+ &token);
}
btrfs_set_header_nritems(left, old_left_nritems + push_items);
@@ -2632,8 +3489,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
for (i = 0; i < right_nritems; i++) {
item = btrfs_item_nr(right, i);
- push_space = push_space - btrfs_item_size(right, item);
- btrfs_set_item_offset(right, item, push_space);
+ push_space = push_space - btrfs_token_item_size(right,
+ item, &token);
+ btrfs_set_token_item_offset(right, item, push_space, &token);
}
btrfs_mark_buffer_dirty(left);
@@ -2643,9 +3501,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
clean_tree_block(trans, root, right);
btrfs_item_key(right, &disk_key, 0);
- wret = fixup_low_keys(trans, root, path, &disk_key, 1);
- if (wret)
- ret = wret;
+ fixup_low_keys(trans, root, path, &disk_key, 1);
/* then fixup the leaf pointer in the path */
if (path->slots[0] < push_items) {
@@ -2716,7 +3572,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
path->nodes[1], slot - 1, &left);
if (ret) {
/* we hit -ENOSPC, but it isn't fatal here */
- ret = 1;
+ if (ret == -ENOSPC)
+ ret = 1;
goto out;
}
@@ -2738,22 +3595,21 @@ out:
/*
* split the path's leaf in two, making sure there is at least data_size
* available for the resulting leaf level of the path.
- *
- * returns 0 if all went well and < 0 on failure.
*/
-static noinline int copy_for_split(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *l,
- struct extent_buffer *right,
- int slot, int mid, int nritems)
+static noinline void copy_for_split(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *l,
+ struct extent_buffer *right,
+ int slot, int mid, int nritems)
{
int data_copy_size;
int rt_data_off;
int i;
- int ret = 0;
- int wret;
struct btrfs_disk_key disk_key;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
nritems = nritems - mid;
btrfs_set_header_nritems(right, nritems);
@@ -2775,17 +3631,15 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
struct btrfs_item *item = btrfs_item_nr(right, i);
u32 ioff;
- ioff = btrfs_item_offset(right, item);
- btrfs_set_item_offset(right, item, ioff + rt_data_off);
+ ioff = btrfs_token_item_offset(right, item, &token);
+ btrfs_set_token_item_offset(right, item,
+ ioff + rt_data_off, &token);
}
btrfs_set_header_nritems(l, mid);
- ret = 0;
btrfs_item_key(right, &disk_key, 0);
- wret = insert_ptr(trans, root, path, &disk_key, right->start,
- path->slots[1] + 1, 1);
- if (wret)
- ret = wret;
+ insert_ptr(trans, root, path, &disk_key, right->start,
+ path->slots[1] + 1, 1);
btrfs_mark_buffer_dirty(right);
btrfs_mark_buffer_dirty(l);
@@ -2803,8 +3657,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
}
BUG_ON(path->slots[0] < 0);
-
- return ret;
}
/*
@@ -2993,12 +3845,8 @@ again:
if (split == 0) {
if (mid <= slot) {
btrfs_set_header_nritems(right, 0);
- wret = insert_ptr(trans, root, path,
- &disk_key, right->start,
- path->slots[1] + 1, 1);
- if (wret)
- ret = wret;
-
+ insert_ptr(trans, root, path, &disk_key, right->start,
+ path->slots[1] + 1, 1);
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
@@ -3006,29 +3854,21 @@ again:
path->slots[1] += 1;
} else {
btrfs_set_header_nritems(right, 0);
- wret = insert_ptr(trans, root, path,
- &disk_key,
- right->start,
+ insert_ptr(trans, root, path, &disk_key, right->start,
path->slots[1], 1);
- if (wret)
- ret = wret;
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
path->slots[0] = 0;
- if (path->slots[1] == 0) {
- wret = fixup_low_keys(trans, root,
- path, &disk_key, 1);
- if (wret)
- ret = wret;
- }
+ if (path->slots[1] == 0)
+ fixup_low_keys(trans, root, path,
+ &disk_key, 1);
}
btrfs_mark_buffer_dirty(right);
return ret;
}
- ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
- BUG_ON(ret);
+ copy_for_split(trans, root, path, l, right, slot, mid, nritems);
if (split == 2) {
BUG_ON(num_doubles != 0);
@@ -3036,7 +3876,7 @@ again:
goto again;
}
- return ret;
+ return 0;
push_for_double:
push_for_double_split(trans, root, path, data_size);
@@ -3238,11 +4078,9 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
return ret;
path->slots[0]++;
- ret = setup_items_for_insert(trans, root, path, new_key, &item_size,
- item_size, item_size +
- sizeof(struct btrfs_item), 1);
- BUG_ON(ret);
-
+ setup_items_for_insert(trans, root, path, new_key, &item_size,
+ item_size, item_size +
+ sizeof(struct btrfs_item), 1);
leaf = path->nodes[0];
memcpy_extent_buffer(leaf,
btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -3257,10 +4095,10 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
* off the end of the item or if we shift the item to chop bytes off
* the front.
*/
-int btrfs_truncate_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- u32 new_size, int from_end)
+void btrfs_truncate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u32 new_size, int from_end)
{
int slot;
struct extent_buffer *leaf;
@@ -3271,13 +4109,16 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
unsigned int old_size;
unsigned int size_diff;
int i;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
leaf = path->nodes[0];
slot = path->slots[0];
old_size = btrfs_item_size_nr(leaf, slot);
if (old_size == new_size)
- return 0;
+ return;
nritems = btrfs_header_nritems(leaf);
data_end = leaf_data_end(root, leaf);
@@ -3297,8 +4138,9 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
u32 ioff;
item = btrfs_item_nr(leaf, i);
- ioff = btrfs_item_offset(leaf, item);
- btrfs_set_item_offset(leaf, item, ioff + size_diff);
+ ioff = btrfs_token_item_offset(leaf, item, &token);
+ btrfs_set_token_item_offset(leaf, item,
+ ioff + size_diff, &token);
}
/* shift the data */
@@ -3350,15 +4192,14 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
btrfs_print_leaf(root, leaf);
BUG();
}
- return 0;
}
/*
* make the item pointed to by the path bigger, data_size is the new size.
*/
-int btrfs_extend_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- u32 data_size)
+void btrfs_extend_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ u32 data_size)
{
int slot;
struct extent_buffer *leaf;
@@ -3368,6 +4209,9 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
unsigned int old_data;
unsigned int old_size;
int i;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
leaf = path->nodes[0];
@@ -3397,8 +4241,9 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
u32 ioff;
item = btrfs_item_nr(leaf, i);
- ioff = btrfs_item_offset(leaf, item);
- btrfs_set_item_offset(leaf, item, ioff - data_size);
+ ioff = btrfs_token_item_offset(leaf, item, &token);
+ btrfs_set_token_item_offset(leaf, item,
+ ioff - data_size, &token);
}
/* shift the data */
@@ -3416,7 +4261,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
btrfs_print_leaf(root, leaf);
BUG();
}
- return 0;
}
/*
@@ -3441,6 +4285,9 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
unsigned int data_end;
struct btrfs_disk_key disk_key;
struct btrfs_key found_key;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
for (i = 0; i < nr; i++) {
if (total_size + data_size[i] + sizeof(struct btrfs_item) >
@@ -3506,8 +4353,9 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
u32 ioff;
item = btrfs_item_nr(leaf, i);
- ioff = btrfs_item_offset(leaf, item);
- btrfs_set_item_offset(leaf, item, ioff - total_data);
+ ioff = btrfs_token_item_offset(leaf, item, &token);
+ btrfs_set_token_item_offset(leaf, item,
+ ioff - total_data, &token);
}
/* shift the items */
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
@@ -3534,9 +4382,10 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
btrfs_set_item_key(leaf, &disk_key, slot + i);
item = btrfs_item_nr(leaf, slot + i);
- btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
+ btrfs_set_token_item_offset(leaf, item,
+ data_end - data_size[i], &token);
data_end -= data_size[i];
- btrfs_set_item_size(leaf, item, data_size[i]);
+ btrfs_set_token_item_size(leaf, item, data_size[i], &token);
}
btrfs_set_header_nritems(leaf, nritems + nr);
btrfs_mark_buffer_dirty(leaf);
@@ -3544,7 +4393,7 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
ret = 0;
if (slot == 0) {
btrfs_cpu_key_to_disk(&disk_key, cpu_key);
- ret = fixup_low_keys(trans, root, path, &disk_key, 1);
+ fixup_low_keys(trans, root, path, &disk_key, 1);
}
if (btrfs_leaf_free_space(root, leaf) < 0) {
@@ -3562,19 +4411,21 @@ out:
* to save stack depth by doing the bulk of the work in a function
* that doesn't call btrfs_search_slot
*/
-int setup_items_for_insert(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr)
+void setup_items_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ u32 total_data, u32 total_size, int nr)
{
struct btrfs_item *item;
int i;
u32 nritems;
unsigned int data_end;
struct btrfs_disk_key disk_key;
- int ret;
struct extent_buffer *leaf;
int slot;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
leaf = path->nodes[0];
slot = path->slots[0];
@@ -3606,8 +4457,9 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans,
u32 ioff;
item = btrfs_item_nr(leaf, i);
- ioff = btrfs_item_offset(leaf, item);
- btrfs_set_item_offset(leaf, item, ioff - total_data);
+ ioff = btrfs_token_item_offset(leaf, item, &token);
+ btrfs_set_token_item_offset(leaf, item,
+ ioff - total_data, &token);
}
/* shift the items */
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
@@ -3626,17 +4478,17 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans,
btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
btrfs_set_item_key(leaf, &disk_key, slot + i);
item = btrfs_item_nr(leaf, slot + i);
- btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
+ btrfs_set_token_item_offset(leaf, item,
+ data_end - data_size[i], &token);
data_end -= data_size[i];
- btrfs_set_item_size(leaf, item, data_size[i]);
+ btrfs_set_token_item_size(leaf, item, data_size[i], &token);
}
btrfs_set_header_nritems(leaf, nritems + nr);
- ret = 0;
if (slot == 0) {
btrfs_cpu_key_to_disk(&disk_key, cpu_key);
- ret = fixup_low_keys(trans, root, path, &disk_key, 1);
+ fixup_low_keys(trans, root, path, &disk_key, 1);
}
btrfs_unlock_up_safe(path, 1);
btrfs_mark_buffer_dirty(leaf);
@@ -3645,7 +4497,6 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans,
btrfs_print_leaf(root, leaf);
BUG();
}
- return ret;
}
/*
@@ -3672,16 +4523,14 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
if (ret == 0)
return -EEXIST;
if (ret < 0)
- goto out;
+ return ret;
slot = path->slots[0];
BUG_ON(slot < 0);
- ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
+ setup_items_for_insert(trans, root, path, cpu_key, data_size,
total_data, total_size, nr);
-
-out:
- return ret;
+ return 0;
}
/*
@@ -3717,22 +4566,30 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
* the tree should have been previously balanced so the deletion does not
* empty a node.
*/
-static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct btrfs_path *path, int level, int slot)
+static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_path *path, int level, int slot,
+ int tree_mod_log)
{
struct extent_buffer *parent = path->nodes[level];
u32 nritems;
- int ret = 0;
- int wret;
+ int ret;
nritems = btrfs_header_nritems(parent);
if (slot != nritems - 1) {
+ if (tree_mod_log && level)
+ tree_mod_log_eb_move(root->fs_info, parent, slot,
+ slot + 1, nritems - slot - 1);
memmove_extent_buffer(parent,
btrfs_node_key_ptr_offset(slot),
btrfs_node_key_ptr_offset(slot + 1),
sizeof(struct btrfs_key_ptr) *
(nritems - slot - 1));
+ } else if (tree_mod_log && level) {
+ ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
+ MOD_LOG_KEY_REMOVE);
+ BUG_ON(ret < 0);
}
+
nritems--;
btrfs_set_header_nritems(parent, nritems);
if (nritems == 0 && parent == root->node) {
@@ -3743,12 +4600,9 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_disk_key disk_key;
btrfs_node_key(parent, &disk_key, 0);
- wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
- if (wret)
- ret = wret;
+ fixup_low_keys(trans, root, path, &disk_key, level + 1);
}
btrfs_mark_buffer_dirty(parent);
- return ret;
}
/*
@@ -3761,17 +4615,13 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
* The path must have already been setup for deleting the leaf, including
* all the proper balancing. path->nodes[1] must be locked.
*/
-static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *leaf)
+static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *leaf)
{
- int ret;
-
WARN_ON(btrfs_header_generation(leaf) != trans->transid);
- ret = del_ptr(trans, root, path, 1, path->slots[1]);
- if (ret)
- return ret;
+ del_ptr(trans, root, path, 1, path->slots[1], 1);
/*
* btrfs_free_extent is expensive, we want to make sure we
@@ -3781,8 +4631,9 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
root_sub_used(root, leaf->len);
+ extent_buffer_get(leaf);
btrfs_free_tree_block(trans, root, leaf, 0, 1);
- return 0;
+ free_extent_buffer_stale(leaf);
}
/*
* delete the item at the leaf level in path. If that empties
@@ -3799,6 +4650,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int wret;
int i;
u32 nritems;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
leaf = path->nodes[0];
last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
@@ -3820,8 +4674,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u32 ioff;
item = btrfs_item_nr(leaf, i);
- ioff = btrfs_item_offset(leaf, item);
- btrfs_set_item_offset(leaf, item, ioff + dsize);
+ ioff = btrfs_token_item_offset(leaf, item, &token);
+ btrfs_set_token_item_offset(leaf, item,
+ ioff + dsize, &token);
}
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
@@ -3839,8 +4694,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
} else {
btrfs_set_path_blocking(path);
clean_tree_block(trans, root, leaf);
- ret = btrfs_del_leaf(trans, root, path, leaf);
- BUG_ON(ret);
+ btrfs_del_leaf(trans, root, path, leaf);
}
} else {
int used = leaf_space_used(leaf, 0, nritems);
@@ -3848,10 +4702,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_disk_key disk_key;
btrfs_item_key(leaf, &disk_key, 0);
- wret = fixup_low_keys(trans, root, path,
- &disk_key, 1);
- if (wret)
- ret = wret;
+ fixup_low_keys(trans, root, path, &disk_key, 1);
}
/* delete the leaf if it is mostly empty */
@@ -3879,9 +4730,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (btrfs_header_nritems(leaf) == 0) {
path->slots[1] = slot;
- ret = btrfs_del_leaf(trans, root, path, leaf);
- BUG_ON(ret);
+ btrfs_del_leaf(trans, root, path, leaf);
free_extent_buffer(leaf);
+ ret = 0;
} else {
/* if we're still in the path, make sure
* we're dirty. Otherwise, one of the
@@ -4029,7 +4880,7 @@ again:
tmp = btrfs_find_tree_block(root, blockptr,
btrfs_level_size(root, level - 1));
- if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
+ if (tmp && btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
free_extent_buffer(tmp);
break;
}
@@ -4059,18 +4910,18 @@ find_next_key:
path->slots[level] = slot;
if (level == path->lowest_level) {
ret = 0;
- unlock_up(path, level, 1);
+ unlock_up(path, level, 1, 0, NULL);
goto out;
}
btrfs_set_path_blocking(path);
cur = read_node_slot(root, cur, slot);
- BUG_ON(!cur);
+ BUG_ON(!cur); /* -ENOMEM */
btrfs_tree_read_lock(cur);
path->locks[level - 1] = BTRFS_READ_LOCK;
path->nodes[level - 1] = cur;
- unlock_up(path, level, 1);
+ unlock_up(path, level, 1, 0, NULL);
btrfs_clear_path_blocking(path, NULL, 0);
}
out:
@@ -4152,7 +5003,8 @@ next:
struct extent_buffer *cur;
cur = btrfs_find_tree_block(root, blockptr,
btrfs_level_size(root, level - 1));
- if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
+ if (!cur ||
+ btrfs_buffer_uptodate(cur, gen, 1) <= 0) {
slot++;
if (cur)
free_extent_buffer(cur);
@@ -4178,6 +5030,12 @@ next:
*/
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
+ return btrfs_next_old_leaf(root, path, 0);
+}
+
+int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
+ u64 time_seq)
+{
int slot;
int level;
struct extent_buffer *c;
@@ -4202,7 +5060,10 @@ again:
path->keep_locks = 1;
path->leave_spinning = 1;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (time_seq)
+ ret = btrfs_search_old_slot(root, &key, path, time_seq);
+ else
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
path->keep_locks = 0;
if (ret < 0)
@@ -4247,7 +5108,7 @@ again:
next = c;
next_rw_lock = path->locks[level];
ret = read_block_for_search(NULL, root, path, &next, level,
- slot, &key);
+ slot, &key, 0);
if (ret == -EAGAIN)
goto again;
@@ -4258,6 +5119,18 @@ again:
if (!path->skip_locking) {
ret = btrfs_try_tree_read_lock(next);
+ if (!ret && time_seq) {
+ /*
+ * If we don't get the lock, we may be racing
+ * with push_leaf_left, holding that lock while
+ * itself waiting for the leaf we've currently
+ * locked. To solve this situation, we give up
+ * on our lock and cycle.
+ */
+ btrfs_release_path(path);
+ cond_resched();
+ goto again;
+ }
if (!ret) {
btrfs_set_path_blocking(path);
btrfs_tree_read_lock(next);
@@ -4284,7 +5157,7 @@ again:
break;
ret = read_block_for_search(NULL, root, path, &next, level,
- 0, &key);
+ 0, &key, 0);
if (ret == -EAGAIN)
goto again;
@@ -4306,7 +5179,7 @@ again:
}
ret = 0;
done:
- unlock_up(path, 0, 1);
+ unlock_up(path, 0, 1, 0, NULL);
path->leave_spinning = old_spinning;
if (!old_spinning)
btrfs_set_path_blocking(path);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 67385033323..fa5c45b3907 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -48,6 +48,8 @@ struct btrfs_ordered_sum;
#define BTRFS_MAGIC "_BHRfS_M"
+#define BTRFS_MAX_MIRRORS 2
+
#define BTRFS_MAX_LEVEL 8
#define BTRFS_COMPAT_EXTENT_TREE_V0
@@ -86,6 +88,9 @@ struct btrfs_ordered_sum;
/* holds checksums of all the data extents */
#define BTRFS_CSUM_TREE_OBJECTID 7ULL
+/* for storing balance parameters in the root tree */
+#define BTRFS_BALANCE_OBJECTID -4ULL
+
/* orhpan objectid for tracking unlinked/truncated files */
#define BTRFS_ORPHAN_OBJECTID -5ULL
@@ -135,6 +140,12 @@ struct btrfs_ordered_sum;
#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
/*
+ * the max metadata block size. This limit is somewhat artificial,
+ * but the memmove costs go through the roof for larger blocks.
+ */
+#define BTRFS_MAX_METADATA_BLOCKSIZE 65536
+
+/*
* we can actually store much bigger names, but lets not confuse the rest
* of linux
*/
@@ -162,6 +173,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };
#define BTRFS_FT_XATTR 8
#define BTRFS_FT_MAX 9
+/* ioprio of readahead is set to idle */
+#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
+
/*
* The key defines the order in the tree, and so it also defines (optimal)
* block layout.
@@ -458,6 +472,19 @@ struct btrfs_super_block {
#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3)
+/*
+ * some patches floated around with a second compression method
+ * lets save that incompat here for when they do get in
+ * Note we don't actually support it, we're just reserving the
+ * number
+ */
+#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4)
+
+/*
+ * older kernels tried to do bigger metadata blocks, but the
+ * code was pretty buggy. Lets not let them try anymore.
+ */
+#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
@@ -465,6 +492,7 @@ struct btrfs_super_block {
(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
+ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
/*
@@ -692,6 +720,54 @@ struct btrfs_root_ref {
__le16 name_len;
} __attribute__ ((__packed__));
+struct btrfs_disk_balance_args {
+ /*
+ * profiles to operate on, single is denoted by
+ * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+ */
+ __le64 profiles;
+
+ /* usage filter */
+ __le64 usage;
+
+ /* devid filter */
+ __le64 devid;
+
+ /* devid subset filter [pstart..pend) */
+ __le64 pstart;
+ __le64 pend;
+
+ /* btrfs virtual address space subset filter [vstart..vend) */
+ __le64 vstart;
+ __le64 vend;
+
+ /*
+ * profile to convert to, single is denoted by
+ * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+ */
+ __le64 target;
+
+ /* BTRFS_BALANCE_ARGS_* */
+ __le64 flags;
+
+ __le64 unused[8];
+} __attribute__ ((__packed__));
+
+/*
+ * store balance parameters to disk so that balance can be properly
+ * resumed after crash or unmount
+ */
+struct btrfs_balance_item {
+ /* BTRFS_BALANCE_* */
+ __le64 flags;
+
+ struct btrfs_disk_balance_args data;
+ struct btrfs_disk_balance_args meta;
+ struct btrfs_disk_balance_args sys;
+
+ __le64 unused[4];
+} __attribute__ ((__packed__));
+
#define BTRFS_FILE_EXTENT_INLINE 0
#define BTRFS_FILE_EXTENT_REG 1
#define BTRFS_FILE_EXTENT_PREALLOC 2
@@ -750,15 +826,56 @@ struct btrfs_csum_item {
u8 csum;
} __attribute__ ((__packed__));
+struct btrfs_dev_stats_item {
+ /*
+ * grow this item struct at the end for future enhancements and keep
+ * the existing values unchanged
+ */
+ __le64 values[BTRFS_DEV_STAT_VALUES_MAX];
+} __attribute__ ((__packed__));
+
/* different types of block groups (and chunks) */
-#define BTRFS_BLOCK_GROUP_DATA (1 << 0)
-#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1)
-#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
-#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3)
-#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
-#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
-#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
-#define BTRFS_NR_RAID_TYPES 5
+#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
+#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
+#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2)
+#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3)
+#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
+#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
+#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
+#define BTRFS_NR_RAID_TYPES 5
+
+#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
+ BTRFS_BLOCK_GROUP_SYSTEM | \
+ BTRFS_BLOCK_GROUP_METADATA)
+
+#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
+ BTRFS_BLOCK_GROUP_RAID1 | \
+ BTRFS_BLOCK_GROUP_DUP | \
+ BTRFS_BLOCK_GROUP_RAID10)
+/*
+ * We need a bit for restriper to be able to tell when chunks of type
+ * SINGLE are available. This "extended" profile format is used in
+ * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
+ * (on-disk). The corresponding on-disk bit in chunk.type is reserved
+ * to avoid remappings between two formats in future.
+ */
+#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)
+
+#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \
+ BTRFS_AVAIL_ALLOC_BIT_SINGLE)
+
+static inline u64 chunk_to_extended(u64 flags)
+{
+ if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0)
+ flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+ return flags;
+}
+static inline u64 extended_to_chunk(u64 flags)
+{
+ return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+}
struct btrfs_block_group_item {
__le64 used;
@@ -817,7 +934,7 @@ struct btrfs_block_rsv {
u64 reserved;
struct btrfs_space_info *space_info;
spinlock_t lock;
- unsigned int full:1;
+ unsigned int full;
};
/*
@@ -916,6 +1033,7 @@ struct btrfs_block_group_cache {
struct reloc_control;
struct btrfs_device;
struct btrfs_fs_devices;
+struct btrfs_balance_control;
struct btrfs_delayed_root;
struct btrfs_fs_info {
u8 fsid[BTRFS_FSID_SIZE];
@@ -971,7 +1089,7 @@ struct btrfs_fs_info {
* is required instead of the faster short fsync log commits
*/
u64 last_trans_log_full_commit;
- unsigned long mount_opt:20;
+ unsigned long mount_opt;
unsigned long compress_type:4;
u64 max_inline;
u64 alloc_start;
@@ -1022,6 +1140,15 @@ struct btrfs_fs_info {
spinlock_t delayed_iput_lock;
struct list_head delayed_iputs;
+ /* this protects tree_mod_seq_list */
+ spinlock_t tree_mod_seq_lock;
+ atomic_t tree_mod_seq;
+ struct list_head tree_mod_seq_list;
+
+ /* this protects tree_mod_log */
+ rwlock_t tree_mod_log_lock;
+ struct rb_root tree_mod_log;
+
atomic_t nr_async_submits;
atomic_t async_submit_draining;
atomic_t nr_async_bios;
@@ -1132,12 +1259,23 @@ struct btrfs_fs_info {
spinlock_t ref_cache_lock;
u64 total_ref_cache_size;
+ /*
+ * these three are in extended format (availability of single
+ * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
+ * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
+ */
u64 avail_data_alloc_bits;
u64 avail_metadata_alloc_bits;
u64 avail_system_alloc_bits;
- u64 data_alloc_profile;
- u64 metadata_alloc_profile;
- u64 system_alloc_profile;
+
+ /* restriper state */
+ spinlock_t balance_lock;
+ struct mutex balance_mutex;
+ atomic_t balance_running;
+ atomic_t balance_pause_req;
+ atomic_t balance_cancel_req;
+ struct btrfs_balance_control *balance_ctl;
+ wait_queue_head_t balance_wait_q;
unsigned data_chunk_allocations;
unsigned metadata_ratio;
@@ -1155,6 +1293,10 @@ struct btrfs_fs_info {
int scrub_workers_refcnt;
struct btrfs_workers scrub_workers;
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ u32 check_integrity_print_mask;
+#endif
+
/* filesystem state */
u64 fs_state;
@@ -1253,7 +1395,7 @@ struct btrfs_root {
struct list_head root_list;
spinlock_t orphan_lock;
- struct list_head orphan_list;
+ atomic_t orphan_inodes;
struct btrfs_block_rsv *orphan_block_rsv;
int orphan_item_inserted;
int orphan_cleanup_state;
@@ -1383,6 +1525,14 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_DEV_ITEM_KEY 216
#define BTRFS_CHUNK_ITEM_KEY 228
+#define BTRFS_BALANCE_ITEM_KEY 248
+
+/*
+ * Persistantly stores the io stats in the device tree.
+ * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid).
+ */
+#define BTRFS_DEV_STATS_KEY 249
+
/*
* string items are for debugging. They just store a short string of
* data in the FS
@@ -1413,6 +1563,10 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
#define BTRFS_MOUNT_RECOVERY (1 << 18)
+#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19)
+#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20)
+#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
+#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1436,6 +1590,17 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31)
+struct btrfs_map_token {
+ struct extent_buffer *eb;
+ char *kaddr;
+ unsigned long offset;
+};
+
+static inline void btrfs_init_map_token (struct btrfs_map_token *token)
+{
+ memset(token, 0, sizeof(*token));
+}
+
/* some macros to generate set/get funcs for the struct fields. This
* assumes there is a lefoo_to_cpu for every type, so lets make a simple
* one for u8:
@@ -1459,20 +1624,22 @@ struct btrfs_ioctl_defrag_range_args {
#ifndef BTRFS_SETGET_FUNCS
#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
+u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, struct btrfs_map_token *token); \
+void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token);\
void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
#endif
#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
static inline u##bits btrfs_##name(struct extent_buffer *eb) \
{ \
- type *p = page_address(eb->first_page); \
+ type *p = page_address(eb->pages[0]); \
u##bits res = le##bits##_to_cpu(p->member); \
return res; \
} \
static inline void btrfs_set_##name(struct extent_buffer *eb, \
u##bits val) \
{ \
- type *p = page_address(eb->first_page); \
+ type *p = page_address(eb->pages[0]); \
p->member = cpu_to_le##bits(val); \
}
@@ -2025,7 +2192,7 @@ BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
static inline bool btrfs_root_readonly(struct btrfs_root *root)
{
- return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
+ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0;
}
/* struct btrfs_root_backup */
@@ -2077,8 +2244,86 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
num_devices, 64);
-/* struct btrfs_super_block */
+/* struct btrfs_balance_item */
+BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
+
+static inline void btrfs_balance_data(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_set_balance_data(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_balance_meta(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_balance_sys(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+static inline void
+btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
+ struct btrfs_disk_balance_args *disk)
+{
+ memset(cpu, 0, sizeof(*cpu));
+
+ cpu->profiles = le64_to_cpu(disk->profiles);
+ cpu->usage = le64_to_cpu(disk->usage);
+ cpu->devid = le64_to_cpu(disk->devid);
+ cpu->pstart = le64_to_cpu(disk->pstart);
+ cpu->pend = le64_to_cpu(disk->pend);
+ cpu->vstart = le64_to_cpu(disk->vstart);
+ cpu->vend = le64_to_cpu(disk->vend);
+ cpu->target = le64_to_cpu(disk->target);
+ cpu->flags = le64_to_cpu(disk->flags);
+}
+
+static inline void
+btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
+ struct btrfs_balance_args *cpu)
+{
+ memset(disk, 0, sizeof(*disk));
+
+ disk->profiles = cpu_to_le64(cpu->profiles);
+ disk->usage = cpu_to_le64(cpu->usage);
+ disk->devid = cpu_to_le64(cpu->devid);
+ disk->pstart = cpu_to_le64(cpu->pstart);
+ disk->pend = cpu_to_le64(cpu->pend);
+ disk->vstart = cpu_to_le64(cpu->vstart);
+ disk->vend = cpu_to_le64(cpu->vend);
+ disk->target = cpu_to_le64(cpu->target);
+ disk->flags = cpu_to_le64(cpu->flags);
+}
+
+/* struct btrfs_super_block */
BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
@@ -2196,7 +2441,31 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
return btrfs_item_size(eb, e) - offset;
}
-static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
+/* btrfs_dev_stats_item */
+static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb,
+ struct btrfs_dev_stats_item *ptr,
+ int index)
+{
+ u64 val;
+
+ read_extent_buffer(eb, &val,
+ offsetof(struct btrfs_dev_stats_item, values) +
+ ((unsigned long)ptr) + (index * sizeof(u64)),
+ sizeof(val));
+ return val;
+}
+
+static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb,
+ struct btrfs_dev_stats_item *ptr,
+ int index, u64 val)
+{
+ write_extent_buffer(eb, &val,
+ offsetof(struct btrfs_dev_stats_item, values) +
+ ((unsigned long)ptr) + (index * sizeof(u64)),
+ sizeof(val));
+}
+
+static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
{
return sb->s_fs_info;
}
@@ -2298,32 +2567,31 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
- u64 search_end, struct btrfs_key *ins,
- u64 data);
+ struct btrfs_key *ins, u64 data);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref);
+ struct extent_buffer *buf, int full_backref, int for_cow);
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref);
+ struct extent_buffer *buf, int full_backref, int for_cow);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 flags,
int is_data);
int btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset);
+ u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int for_cow);
int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
u64 start, u64 len);
-int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
+void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset);
+ u64 root_objectid, u64 owner, u64 offset, int for_cow);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
@@ -2380,8 +2648,8 @@ void btrfs_block_rsv_release(struct btrfs_root *root,
u64 num_bytes);
int btrfs_set_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache);
-int btrfs_set_block_group_rw(struct btrfs_root *root,
- struct btrfs_block_group_cache *cache);
+void btrfs_set_block_group_rw(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
int btrfs_error_unpin_extent_range(struct btrfs_root *root,
@@ -2400,9 +2668,9 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2);
int btrfs_previous_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid,
int type);
-int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *new_key);
+void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *new_key);
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
@@ -2422,12 +2690,13 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct extent_buffer **cow_ret, u64 new_root_objectid);
int btrfs_block_can_be_shared(struct btrfs_root *root,
struct extent_buffer *buf);
-int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path, u32 data_size);
-int btrfs_truncate_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- u32 new_size, int from_end);
+void btrfs_extend_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ u32 data_size);
+void btrfs_truncate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u32 new_size, int from_end);
int btrfs_split_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -2440,6 +2709,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_key *key, struct btrfs_path *p, int
ins_len, int cow);
+int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *p, u64 time_seq);
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *parent,
int start_slot, int cache_only, u64 *last_ret,
@@ -2461,10 +2732,10 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
return btrfs_del_items(trans, root, path, path->slots[0], 1);
}
-int setup_items_for_insert(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr);
+void setup_items_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ u32 total_data, u32 total_size, int nr);
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_key *key, void *data, u32 data_size);
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
@@ -2482,10 +2753,25 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
}
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
+ u64 time_seq);
+static inline int btrfs_next_old_item(struct btrfs_root *root,
+ struct btrfs_path *p, u64 time_seq)
+{
+ ++p->slots[0];
+ if (p->slots[0] >= btrfs_header_nritems(p->nodes[0]))
+ return btrfs_next_old_leaf(root, p, time_seq);
+ return 0;
+}
+static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
+{
+ return btrfs_next_old_item(root, p, 0);
+}
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
-void btrfs_drop_snapshot(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv, int update_ref);
+int __must_check btrfs_drop_snapshot(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ int update_ref, int for_reloc);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
@@ -2500,6 +2786,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
}
static inline void free_fs_info(struct btrfs_fs_info *fs_info)
{
+ kfree(fs_info->balance_ctl);
kfree(fs_info->delayed_root);
kfree(fs_info->extent_root);
kfree(fs_info->tree_root);
@@ -2528,9 +2815,10 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_key *key, struct btrfs_root_item
*item);
-int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_key *key, struct btrfs_root_item
- *item);
+int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_key *key,
+ struct btrfs_root_item *item);
int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
btrfs_root_item *item, struct btrfs_key *key);
int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
@@ -2693,7 +2981,6 @@ int btrfs_readpage(struct file *file, struct page *page);
void btrfs_evict_inode(struct inode *inode);
int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
int btrfs_dirty_inode(struct inode *inode);
-int btrfs_update_time(struct file *file);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
@@ -2714,7 +3001,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root);
void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
-int btrfs_invalidate_inodes(struct btrfs_root *root);
+void btrfs_invalidate_inodes(struct btrfs_root *root);
void btrfs_add_delayed_iput(struct inode *inode);
void btrfs_run_delayed_iputs(struct btrfs_root *root);
int btrfs_prealloc_file_range(struct inode *inode, int mode,
@@ -2766,13 +3053,41 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
/* super.c */
int btrfs_parse_options(struct btrfs_root *root, char *options);
int btrfs_sync_fs(struct super_block *sb, int wait);
+void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...);
void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno);
+ unsigned int line, int errno, const char *fmt, ...);
+
+void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, const char *function,
+ unsigned int line, int errno);
+
+#define btrfs_abort_transaction(trans, root, errno) \
+do { \
+ __btrfs_abort_transaction(trans, root, __func__, \
+ __LINE__, errno); \
+} while (0)
#define btrfs_std_error(fs_info, errno) \
do { \
if ((errno)) \
- __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\
+ __btrfs_std_error((fs_info), __func__, \
+ __LINE__, (errno), NULL); \
+} while (0)
+
+#define btrfs_error(fs_info, errno, fmt, args...) \
+do { \
+ __btrfs_std_error((fs_info), __func__, __LINE__, \
+ (errno), fmt, ##args); \
+} while (0)
+
+void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...);
+
+#define btrfs_panic(fs_info, errno, fmt, args...) \
+do { \
+ struct btrfs_fs_info *_i = (fs_info); \
+ __btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args); \
+ BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)); \
} while (0)
/* acl.c */
@@ -2808,16 +3123,17 @@ void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending,
u64 *bytes_to_reserve);
-void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending);
/* scrub.c */
int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
struct btrfs_scrub_progress *progress, int readonly);
-int btrfs_scrub_pause(struct btrfs_root *root);
-int btrfs_scrub_pause_super(struct btrfs_root *root);
-int btrfs_scrub_continue(struct btrfs_root *root);
-int btrfs_scrub_continue_super(struct btrfs_root *root);
+void btrfs_scrub_pause(struct btrfs_root *root);
+void btrfs_scrub_pause_super(struct btrfs_root *root);
+void btrfs_scrub_continue(struct btrfs_root *root);
+void btrfs_scrub_continue_super(struct btrfs_root *root);
+int __btrfs_scrub_cancel(struct btrfs_fs_info *info);
int btrfs_scrub_cancel(struct btrfs_root *root);
int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev);
int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
@@ -2840,4 +3156,23 @@ void btrfs_reada_detach(void *handle);
int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
u64 start, int err);
+/* delayed seq elem */
+struct seq_list {
+ struct list_head list;
+ u64 seq;
+ u32 flags;
+};
+
+void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem);
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem);
+
+static inline int is_fstree(u64 rootid)
+{
+ if (rootid == BTRFS_FS_TREE_OBJECTID ||
+ (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
+ return 1;
+ return 0;
+}
#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 9c1eccc2c50..2399f408691 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -115,6 +115,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode)
return NULL;
}
+/* Will return either the node or PTR_ERR(-ENOMEM) */
static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
struct inode *inode)
{
@@ -595,8 +596,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
num_bytes = btrfs_calc_trans_metadata_size(root, 1);
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
- if (!ret)
+ if (!ret) {
+ trace_btrfs_space_reservation(root->fs_info, "delayed_item",
+ item->key.objectid,
+ num_bytes, 1);
item->bytes_reserved = num_bytes;
+ }
return ret;
}
@@ -610,6 +615,9 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
return;
rsv = &root->fs_info->delayed_block_rsv;
+ trace_btrfs_space_reservation(root->fs_info, "delayed_item",
+ item->key.objectid, item->bytes_reserved,
+ 0);
btrfs_block_rsv_release(root, rsv,
item->bytes_reserved);
}
@@ -624,7 +632,7 @@ static int btrfs_delayed_inode_reserve_metadata(
struct btrfs_block_rsv *dst_rsv;
u64 num_bytes;
int ret;
- int release = false;
+ bool release = false;
src_rsv = trans->block_rsv;
dst_rsv = &root->fs_info->delayed_block_rsv;
@@ -651,13 +659,18 @@ static int btrfs_delayed_inode_reserve_metadata(
*/
if (ret == -EAGAIN)
ret = -ENOSPC;
- if (!ret)
+ if (!ret) {
node->bytes_reserved = num_bytes;
+ trace_btrfs_space_reservation(root->fs_info,
+ "delayed_inode",
+ btrfs_ino(inode),
+ num_bytes, 1);
+ }
return ret;
} else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
spin_lock(&BTRFS_I(inode)->lock);
- if (BTRFS_I(inode)->delalloc_meta_reserved) {
- BTRFS_I(inode)->delalloc_meta_reserved = 0;
+ if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags)) {
spin_unlock(&BTRFS_I(inode)->lock);
release = true;
goto migrate;
@@ -707,11 +720,17 @@ out:
* reservation here. I think it may be time for a documentation page on
* how block rsvs. work.
*/
- if (!ret)
+ if (!ret) {
+ trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
+ btrfs_ino(inode), num_bytes, 1);
node->bytes_reserved = num_bytes;
+ }
- if (release)
+ if (release) {
+ trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ btrfs_ino(inode), num_bytes, 0);
btrfs_block_rsv_release(root, src_rsv, num_bytes);
+ }
return ret;
}
@@ -725,6 +744,8 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
return;
rsv = &root->fs_info->delayed_block_rsv;
+ trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
+ node->inode_id, node->bytes_reserved, 0);
btrfs_block_rsv_release(root, rsv,
node->bytes_reserved);
node->bytes_reserved = 0;
@@ -816,10 +837,8 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
btrfs_clear_path_blocking(path, NULL, 0);
/* insert the keys of the items */
- ret = setup_items_for_insert(trans, root, path, keys, data_size,
- total_data_size, total_size, nitems);
- if (ret)
- goto error;
+ setup_items_for_insert(trans, root, path, keys, data_size,
+ total_data_size, total_size, nitems);
/* insert the dir index items */
slot = path->slots[0];
@@ -1088,16 +1107,25 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
return 0;
}
-/* Called when committing the transaction. */
+/*
+ * Called when committing the transaction.
+ * Returns 0 on success.
+ * Returns < 0 on error and returns with an aborted transaction with any
+ * outstanding delayed items cleaned up.
+ */
int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
+ struct btrfs_root *curr_root = root;
struct btrfs_delayed_root *delayed_root;
struct btrfs_delayed_node *curr_node, *prev_node;
struct btrfs_path *path;
struct btrfs_block_rsv *block_rsv;
int ret = 0;
+ if (trans->aborted)
+ return -EIO;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -1110,17 +1138,18 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
curr_node = btrfs_first_delayed_node(delayed_root);
while (curr_node) {
- root = curr_node->root;
- ret = btrfs_insert_delayed_items(trans, path, root,
+ curr_root = curr_node->root;
+ ret = btrfs_insert_delayed_items(trans, path, curr_root,
curr_node);
if (!ret)
- ret = btrfs_delete_delayed_items(trans, path, root,
- curr_node);
+ ret = btrfs_delete_delayed_items(trans, path,
+ curr_root, curr_node);
if (!ret)
- ret = btrfs_update_delayed_inode(trans, root, path,
- curr_node);
+ ret = btrfs_update_delayed_inode(trans, curr_root,
+ path, curr_node);
if (ret) {
btrfs_release_delayed_node(curr_node);
+ btrfs_abort_transaction(trans, root, ret);
break;
}
@@ -1131,6 +1160,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
btrfs_free_path(path);
trans->block_rsv = block_rsv;
+
return ret;
}
@@ -1351,6 +1381,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root)
btrfs_wq_run_delayed_node(delayed_root, root, 0);
}
+/* Will return 0 or -ENOMEM */
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
struct btrfs_root *root, const char *name,
int name_len, struct inode *dir,
@@ -1372,13 +1403,6 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
goto release_node;
}
- ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
- /*
- * we have reserved enough space when we start a new transaction,
- * so reserving metadata failure is impossible
- */
- BUG_ON(ret);
-
delayed_item->key.objectid = btrfs_ino(dir);
btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY);
delayed_item->key.offset = index;
@@ -1391,6 +1415,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
dir_item->type = type;
memcpy((char *)(dir_item + 1), name, name_len);
+ ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
+ /*
+ * we have reserved enough space when we start a new transaction,
+ * so reserving metadata failure is impossible
+ */
+ BUG_ON(ret);
+
+
mutex_lock(&delayed_node->mutex);
ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
if (unlikely(ret)) {
@@ -1674,7 +1706,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
btrfs_set_stack_inode_generation(inode_item,
BTRFS_I(inode)->generation);
- btrfs_set_stack_inode_sequence(inode_item, BTRFS_I(inode)->sequence);
+ btrfs_set_stack_inode_sequence(inode_item, inode->i_version);
btrfs_set_stack_inode_transid(inode_item, trans->transid);
btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
@@ -1722,7 +1754,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
- BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item);
+ inode->i_version = btrfs_stack_inode_sequence(inode_item);
inode->i_rdev = 0;
*rdev = btrfs_stack_inode_rdev(inode_item);
BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
@@ -1847,3 +1879,21 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
}
}
}
+
+void btrfs_destroy_delayed_inodes(struct btrfs_root *root)
+{
+ struct btrfs_delayed_root *delayed_root;
+ struct btrfs_delayed_node *curr_node, *prev_node;
+
+ delayed_root = btrfs_get_delayed_root(root);
+
+ curr_node = btrfs_first_delayed_node(delayed_root);
+ while (curr_node) {
+ __btrfs_kill_delayed_node(curr_node);
+
+ prev_node = curr_node;
+ curr_node = btrfs_next_delayed_node(curr_node);
+ btrfs_release_delayed_node(prev_node);
+ }
+}
+
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 7083d08b2a2..f5aa4023d3e 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -124,6 +124,9 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev);
/* Used for drop dead root */
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root);
+/* Used for clean the transaction */
+void btrfs_destroy_delayed_inodes(struct btrfs_root *root);
+
/* Used for readdir() */
void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
struct list_head *del_list);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 125cf76fcd0..13ae7b04790 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -101,6 +101,11 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
return -1;
if (ref1->type > ref2->type)
return 1;
+ /* merging of sequenced refs is not allowed */
+ if (ref1->seq < ref2->seq)
+ return -1;
+ if (ref1->seq > ref2->seq)
+ return 1;
if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
@@ -150,16 +155,22 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
/*
* find an head entry based on bytenr. This returns the delayed ref
- * head if it was able to find one, or NULL if nothing was in that spot
+ * head if it was able to find one, or NULL if nothing was in that spot.
+ * If return_bigger is given, the next bigger entry is returned if no exact
+ * match is found.
*/
static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
u64 bytenr,
- struct btrfs_delayed_ref_node **last)
+ struct btrfs_delayed_ref_node **last,
+ int return_bigger)
{
- struct rb_node *n = root->rb_node;
+ struct rb_node *n;
struct btrfs_delayed_ref_node *entry;
- int cmp;
+ int cmp = 0;
+again:
+ n = root->rb_node;
+ entry = NULL;
while (n) {
entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
WARN_ON(!entry->in_tree);
@@ -182,6 +193,19 @@ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
else
return entry;
}
+ if (entry && return_bigger) {
+ if (cmp > 0) {
+ n = rb_next(&entry->rb_node);
+ if (!n)
+ n = rb_first(root);
+ entry = rb_entry(n, struct btrfs_delayed_ref_node,
+ rb_node);
+ bytenr = entry->bytenr;
+ return_bigger = 0;
+ goto again;
+ }
+ return entry;
+ }
return NULL;
}
@@ -209,6 +233,24 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
return 0;
}
+int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+ u64 seq)
+{
+ struct seq_list *elem;
+
+ assert_spin_locked(&delayed_refs->lock);
+ if (list_empty(&delayed_refs->seq_head))
+ return 0;
+
+ elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list);
+ if (seq >= elem->seq) {
+ pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n",
+ seq, elem->seq, delayed_refs);
+ return 1;
+ }
+ return 0;
+}
+
int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
struct list_head *cluster, u64 start)
{
@@ -223,20 +265,8 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
node = rb_first(&delayed_refs->root);
} else {
ref = NULL;
- find_ref_head(&delayed_refs->root, start, &ref);
+ find_ref_head(&delayed_refs->root, start + 1, &ref, 1);
if (ref) {
- struct btrfs_delayed_ref_node *tmp;
-
- node = rb_prev(&ref->rb_node);
- while (node) {
- tmp = rb_entry(node,
- struct btrfs_delayed_ref_node,
- rb_node);
- if (tmp->bytenr < start)
- break;
- ref = tmp;
- node = rb_prev(&ref->rb_node);
- }
node = &ref->rb_node;
} else
node = rb_first(&delayed_refs->root);
@@ -390,7 +420,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
* this does all the dirty work in terms of maintaining the correct
* overall modification count.
*/
-static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
+static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref,
u64 bytenr, u64 num_bytes,
int action, int is_data)
@@ -437,6 +468,7 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
ref->action = 0;
ref->is_head = 1;
ref->in_tree = 1;
+ ref->seq = 0;
head_ref = btrfs_delayed_node_to_head(ref);
head_ref->must_insert_reserved = must_insert_reserved;
@@ -455,27 +487,29 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
* we've updated the existing ref, free the newly
* allocated ref
*/
- kfree(ref);
+ kfree(head_ref);
} else {
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
delayed_refs->num_entries++;
trans->delayed_ref_updates++;
}
- return 0;
}
/*
* helper to insert a delayed tree ref into the rbtree.
*/
-static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref,
u64 bytenr, u64 num_bytes, u64 parent,
- u64 ref_root, int level, int action)
+ u64 ref_root, int level, int action,
+ int for_cow)
{
struct btrfs_delayed_ref_node *existing;
struct btrfs_delayed_tree_ref *full_ref;
struct btrfs_delayed_ref_root *delayed_refs;
+ u64 seq = 0;
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
@@ -491,14 +525,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
ref->is_head = 0;
ref->in_tree = 1;
+ if (is_fstree(ref_root))
+ seq = inc_delayed_seq(delayed_refs);
+ ref->seq = seq;
+
full_ref = btrfs_delayed_node_to_tree_ref(ref);
- if (parent) {
- full_ref->parent = parent;
+ full_ref->parent = parent;
+ full_ref->root = ref_root;
+ if (parent)
ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
- } else {
- full_ref->root = ref_root;
+ else
ref->type = BTRFS_TREE_BLOCK_REF_KEY;
- }
full_ref->level = level;
trace_btrfs_delayed_tree_ref(ref, full_ref, action);
@@ -511,26 +548,27 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
* we've updated the existing ref, free the newly
* allocated ref
*/
- kfree(ref);
+ kfree(full_ref);
} else {
delayed_refs->num_entries++;
trans->delayed_ref_updates++;
}
- return 0;
}
/*
* helper to insert a delayed data ref into the rbtree.
*/
-static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
+static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, u64 owner, u64 offset,
- int action)
+ int action, int for_cow)
{
struct btrfs_delayed_ref_node *existing;
struct btrfs_delayed_data_ref *full_ref;
struct btrfs_delayed_ref_root *delayed_refs;
+ u64 seq = 0;
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
@@ -546,14 +584,18 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
ref->is_head = 0;
ref->in_tree = 1;
+ if (is_fstree(ref_root))
+ seq = inc_delayed_seq(delayed_refs);
+ ref->seq = seq;
+
full_ref = btrfs_delayed_node_to_data_ref(ref);
- if (parent) {
- full_ref->parent = parent;
+ full_ref->parent = parent;
+ full_ref->root = ref_root;
+ if (parent)
ref->type = BTRFS_SHARED_DATA_REF_KEY;
- } else {
- full_ref->root = ref_root;
+ else
ref->type = BTRFS_EXTENT_DATA_REF_KEY;
- }
+
full_ref->objectid = owner;
full_ref->offset = offset;
@@ -567,12 +609,11 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
* we've updated the existing ref, free the newly
* allocated ref
*/
- kfree(ref);
+ kfree(full_ref);
} else {
delayed_refs->num_entries++;
trans->delayed_ref_updates++;
}
- return 0;
}
/*
@@ -580,15 +621,16 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
* to make sure the delayed ref is eventually processed before this
* transaction commits.
*/
-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, int level, int action,
- struct btrfs_delayed_extent_op *extent_op)
+ struct btrfs_delayed_extent_op *extent_op,
+ int for_cow)
{
struct btrfs_delayed_tree_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
- int ret;
BUG_ON(extent_op && extent_op->is_data);
ref = kmalloc(sizeof(*ref), GFP_NOFS);
@@ -610,30 +652,34 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
- action, 0);
- BUG_ON(ret);
-
- ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes,
- parent, ref_root, level, action);
- BUG_ON(ret);
+ add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
+ num_bytes, action, 0);
+
+ add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
+ num_bytes, parent, ref_root, level, action,
+ for_cow);
+ if (!is_fstree(ref_root) &&
+ waitqueue_active(&delayed_refs->seq_wait))
+ wake_up(&delayed_refs->seq_wait);
spin_unlock(&delayed_refs->lock);
+
return 0;
}
/*
* add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
*/
-int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 ref_root,
u64 owner, u64 offset, int action,
- struct btrfs_delayed_extent_op *extent_op)
+ struct btrfs_delayed_extent_op *extent_op,
+ int for_cow)
{
struct btrfs_delayed_data_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
- int ret;
BUG_ON(extent_op && !extent_op->is_data);
ref = kmalloc(sizeof(*ref), GFP_NOFS);
@@ -655,24 +701,27 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
- action, 1);
- BUG_ON(ret);
-
- ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes,
- parent, ref_root, owner, offset, action);
- BUG_ON(ret);
+ add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
+ num_bytes, action, 1);
+
+ add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
+ num_bytes, parent, ref_root, owner, offset,
+ action, for_cow);
+ if (!is_fstree(ref_root) &&
+ waitqueue_active(&delayed_refs->seq_wait))
+ wake_up(&delayed_refs->seq_wait);
spin_unlock(&delayed_refs->lock);
+
return 0;
}
-int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
- int ret;
head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
if (!head_ref)
@@ -683,11 +732,12 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- ret = add_delayed_ref_head(trans, &head_ref->node, bytenr,
+ add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
extent_op->is_data);
- BUG_ON(ret);
+ if (waitqueue_active(&delayed_refs->seq_wait))
+ wake_up(&delayed_refs->seq_wait);
spin_unlock(&delayed_refs->lock);
return 0;
}
@@ -704,7 +754,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
struct btrfs_delayed_ref_root *delayed_refs;
delayed_refs = &trans->transaction->delayed_refs;
- ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
+ ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0);
if (ref)
return btrfs_delayed_node_to_head(ref);
return NULL;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index e287e3b0eab..413927fb995 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -33,6 +33,9 @@ struct btrfs_delayed_ref_node {
/* the size of the extent */
u64 num_bytes;
+ /* seq number to keep track of insertion order */
+ u64 seq;
+
/* ref count on this data structure */
atomic_t refs;
@@ -98,19 +101,15 @@ struct btrfs_delayed_ref_head {
struct btrfs_delayed_tree_ref {
struct btrfs_delayed_ref_node node;
- union {
- u64 root;
- u64 parent;
- };
+ u64 root;
+ u64 parent;
int level;
};
struct btrfs_delayed_data_ref {
struct btrfs_delayed_ref_node node;
- union {
- u64 root;
- u64 parent;
- };
+ u64 root;
+ u64 parent;
u64 objectid;
u64 offset;
};
@@ -140,6 +139,26 @@ struct btrfs_delayed_ref_root {
int flushing;
u64 run_delayed_start;
+
+ /*
+ * seq number of delayed refs. We need to know if a backref was being
+ * added before the currently processed ref or afterwards.
+ */
+ u64 seq;
+
+ /*
+ * seq_list holds a list of all seq numbers that are currently being
+ * added to the list. While walking backrefs (btrfs_find_all_roots,
+ * qgroups), which might take some time, no newer ref must be processed,
+ * as it might influence the outcome of the walk.
+ */
+ struct list_head seq_head;
+
+ /*
+ * when the only refs we have in the list must not be processed, we want
+ * to wait for more refs to show up or for the end of backref walking.
+ */
+ wait_queue_head_t seq_wait;
};
static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
@@ -151,16 +170,21 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
}
}
-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, int level, int action,
- struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_extent_op *extent_op,
+ int for_cow);
+int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 ref_root,
u64 owner, u64 offset, int action,
- struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_extent_op *extent_op,
+ int for_cow);
+int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
struct btrfs_delayed_extent_op *extent_op);
@@ -170,6 +194,36 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head);
int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
struct list_head *cluster, u64 search_start);
+
+static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
+{
+ assert_spin_locked(&delayed_refs->lock);
+ ++delayed_refs->seq;
+ return delayed_refs->seq;
+}
+
+static inline void
+btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+ struct seq_list *elem)
+{
+ assert_spin_locked(&delayed_refs->lock);
+ elem->seq = delayed_refs->seq;
+ list_add_tail(&elem->list, &delayed_refs->seq_head);
+}
+
+static inline void
+btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+ struct seq_list *elem)
+{
+ spin_lock(&delayed_refs->lock);
+ list_del(&elem->list);
+ wake_up(&delayed_refs->seq_wait);
+ spin_unlock(&delayed_refs->lock);
+}
+
+int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+ u64 seq);
+
/*
* a node might live in a head or a regular ref, this lets you
* test for the proper type to use.
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 31d84e78129..c1a074d0696 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -49,9 +49,8 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
di = btrfs_match_dir_item_name(root, path, name, name_len);
if (di)
return ERR_PTR(-EEXIST);
- ret = btrfs_extend_item(trans, root, path, data_size);
- }
- if (ret < 0)
+ btrfs_extend_item(trans, root, path, data_size);
+ } else if (ret < 0)
return ERR_PTR(ret);
WARN_ON(ret > 0);
leaf = path->nodes[0];
@@ -116,6 +115,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
* 'location' is the key to stuff into the directory item, 'type' is the
* type of the inode we're pointing to, and 'index' is the sequence number
* to use for the second index (if one is created).
+ * Will return 0 or -ENOMEM
*/
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
*root, const char *name, int name_len,
@@ -383,8 +383,8 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_len - (ptr + sub_item_len - start));
- ret = btrfs_truncate_item(trans, root, path,
- item_len - sub_item_len, 1);
+ btrfs_truncate_item(trans, root, path,
+ item_len - sub_item_len, 1);
}
return ret;
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f44b3928dc2..2936ca49b3b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -43,24 +43,25 @@
#include "tree-log.h"
#include "free-space-cache.h"
#include "inode-map.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
-static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only);
-static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
-static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
+static void btrfs_destroy_ordered_operations(struct btrfs_root *root);
+static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_root *root);
-static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
-static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
+static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
+static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
static int btrfs_destroy_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages,
int mark);
static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
struct extent_io_tree *pinned_extents);
-static int btrfs_cleanup_transaction(struct btrfs_root *root);
/*
* end_io_wq structs are used to do processing in task context when an IO is
@@ -98,6 +99,7 @@ struct async_submit_bio {
*/
u64 bio_offset;
struct btrfs_work work;
+ int error;
};
/*
@@ -322,7 +324,8 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
* in the wrong place.
*/
static int verify_parent_transid(struct extent_io_tree *io_tree,
- struct extent_buffer *eb, u64 parent_transid)
+ struct extent_buffer *eb, u64 parent_transid,
+ int atomic)
{
struct extent_state *cached_state = NULL;
int ret;
@@ -330,9 +333,12 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
return 0;
+ if (atomic)
+ return -EAGAIN;
+
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
- 0, &cached_state, GFP_NOFS);
- if (extent_buffer_uptodate(io_tree, eb, cached_state) &&
+ 0, &cached_state);
+ if (extent_buffer_uptodate(eb) &&
btrfs_header_generation(eb) == parent_transid) {
ret = 0;
goto out;
@@ -343,7 +349,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
(unsigned long long)parent_transid,
(unsigned long long)btrfs_header_generation(eb));
ret = 1;
- clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
+ clear_extent_buffer_uptodate(eb);
out:
unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
&cached_state, GFP_NOFS);
@@ -359,9 +365,11 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
u64 start, u64 parent_transid)
{
struct extent_io_tree *io_tree;
+ int failed = 0;
int ret;
int num_copies = 0;
int mirror_num = 0;
+ int failed_mirror = 0;
clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
@@ -369,9 +377,9 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
ret = read_extent_buffer_pages(io_tree, eb, start,
WAIT_COMPLETE,
btree_get_extent, mirror_num);
- if (!ret &&
- !verify_parent_transid(io_tree, eb, parent_transid))
- return ret;
+ if (!ret && !verify_parent_transid(io_tree, eb,
+ parent_transid, 0))
+ break;
/*
* This buffer's crc is fine, but its contents are corrupted, so
@@ -379,18 +387,30 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
* any less wrong.
*/
if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
- return ret;
+ break;
num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
eb->start, eb->len);
if (num_copies == 1)
- return ret;
+ break;
+
+ if (!failed_mirror) {
+ failed = 1;
+ failed_mirror = eb->read_mirror;
+ }
mirror_num++;
+ if (mirror_num == failed_mirror)
+ mirror_num++;
+
if (mirror_num > num_copies)
- return ret;
+ break;
}
- return -EIO;
+
+ if (failed && !ret)
+ repair_eb_io_failure(root, eb, failed_mirror);
+
+ return ret;
}
/*
@@ -403,50 +423,27 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
struct extent_io_tree *tree;
u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 found_start;
- unsigned long len;
struct extent_buffer *eb;
- int ret;
tree = &BTRFS_I(page->mapping->host)->io_tree;
- if (page->private == EXTENT_PAGE_PRIVATE) {
- WARN_ON(1);
- goto out;
- }
- if (!page->private) {
- WARN_ON(1);
- goto out;
- }
- len = page->private >> 2;
- WARN_ON(len == 0);
-
- eb = alloc_extent_buffer(tree, start, len, page);
- if (eb == NULL) {
- WARN_ON(1);
- goto out;
- }
- ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
- btrfs_header_generation(eb));
- BUG_ON(ret);
- WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN));
-
+ eb = (struct extent_buffer *)page->private;
+ if (page != eb->pages[0])
+ return 0;
found_start = btrfs_header_bytenr(eb);
if (found_start != start) {
WARN_ON(1);
- goto err;
+ return 0;
}
- if (eb->first_page != page) {
+ if (eb->pages[0] != page) {
WARN_ON(1);
- goto err;
+ return 0;
}
if (!PageUptodate(page)) {
WARN_ON(1);
- goto err;
+ return 0;
}
csum_tree_block(root, eb, 0);
-err:
- free_extent_buffer(eb);
-out:
return 0;
}
@@ -536,34 +533,75 @@ static noinline int check_leaf(struct btrfs_root *root,
return 0;
}
+struct extent_buffer *find_eb_for_page(struct extent_io_tree *tree,
+ struct page *page, int max_walk)
+{
+ struct extent_buffer *eb;
+ u64 start = page_offset(page);
+ u64 target = start;
+ u64 min_start;
+
+ if (start < max_walk)
+ min_start = 0;
+ else
+ min_start = start - max_walk;
+
+ while (start >= min_start) {
+ eb = find_extent_buffer(tree, start, 0);
+ if (eb) {
+ /*
+ * we found an extent buffer and it contains our page
+ * horray!
+ */
+ if (eb->start <= target &&
+ eb->start + eb->len > target)
+ return eb;
+
+ /* we found an extent buffer that wasn't for us */
+ free_extent_buffer(eb);
+ return NULL;
+ }
+ if (start == 0)
+ break;
+ start -= PAGE_CACHE_SIZE;
+ }
+ return NULL;
+}
+
static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
- struct extent_state *state)
+ struct extent_state *state, int mirror)
{
struct extent_io_tree *tree;
u64 found_start;
int found_level;
- unsigned long len;
struct extent_buffer *eb;
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
int ret = 0;
+ int reads_done;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- if (page->private == EXTENT_PAGE_PRIVATE)
- goto out;
if (!page->private)
goto out;
- len = page->private >> 2;
- WARN_ON(len == 0);
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ eb = (struct extent_buffer *)page->private;
+
+ /* the pending IO might have been the only thing that kept this buffer
+ * in memory. Make sure we have a ref for all this other checks
+ */
+ extent_buffer_get(eb);
+
+ reads_done = atomic_dec_and_test(&eb->io_pages);
+ if (!reads_done)
+ goto err;
- eb = alloc_extent_buffer(tree, start, len, page);
- if (eb == NULL) {
+ eb->read_mirror = mirror;
+ if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
ret = -EIO;
- goto out;
+ goto err;
}
found_start = btrfs_header_bytenr(eb);
- if (found_start != start) {
+ if (found_start != eb->start) {
printk_ratelimited(KERN_INFO "btrfs bad tree block start "
"%llu %llu\n",
(unsigned long long)found_start,
@@ -571,13 +609,6 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
ret = -EIO;
goto err;
}
- if (eb->first_page != page) {
- printk(KERN_INFO "btrfs bad first page %lu %lu\n",
- eb->first_page->index, page->index);
- WARN_ON(1);
- ret = -EIO;
- goto err;
- }
if (check_tree_block_fsid(root, eb)) {
printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n",
(unsigned long long)eb->start);
@@ -605,48 +636,31 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
ret = -EIO;
}
- end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
- end = eb->start + end - 1;
+ if (!ret)
+ set_extent_buffer_uptodate(eb);
err:
if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
btree_readahead_hook(root, eb, eb->start, ret);
}
+ if (ret)
+ clear_extent_buffer_uptodate(eb);
free_extent_buffer(eb);
out:
return ret;
}
-static int btree_io_failed_hook(struct bio *failed_bio,
- struct page *page, u64 start, u64 end,
- int mirror_num, struct extent_state *state)
+static int btree_io_failed_hook(struct page *page, int failed_mirror)
{
- struct extent_io_tree *tree;
- unsigned long len;
struct extent_buffer *eb;
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- if (page->private == EXTENT_PAGE_PRIVATE)
- goto out;
- if (!page->private)
- goto out;
-
- len = page->private >> 2;
- WARN_ON(len == 0);
-
- eb = alloc_extent_buffer(tree, start, len, page);
- if (eb == NULL)
- goto out;
-
- if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
- clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+ eb = (struct extent_buffer *)page->private;
+ set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ eb->read_mirror = failed_mirror;
+ if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
btree_readahead_hook(root, eb, eb->start, -EIO);
- }
- free_extent_buffer(eb);
-
-out:
return -EIO; /* we fixed nothing */
}
@@ -718,11 +732,14 @@ unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
static void run_one_async_start(struct btrfs_work *work)
{
struct async_submit_bio *async;
+ int ret;
async = container_of(work, struct async_submit_bio, work);
- async->submit_bio_start(async->inode, async->rw, async->bio,
- async->mirror_num, async->bio_flags,
- async->bio_offset);
+ ret = async->submit_bio_start(async->inode, async->rw, async->bio,
+ async->mirror_num, async->bio_flags,
+ async->bio_offset);
+ if (ret)
+ async->error = ret;
}
static void run_one_async_done(struct btrfs_work *work)
@@ -743,6 +760,12 @@ static void run_one_async_done(struct btrfs_work *work)
waitqueue_active(&fs_info->async_submit_wait))
wake_up(&fs_info->async_submit_wait);
+ /* If an error occured we just want to clean up the bio and move on */
+ if (async->error) {
+ bio_endio(async->bio, async->error);
+ return;
+ }
+
async->submit_bio_done(async->inode, async->rw, async->bio,
async->mirror_num, async->bio_flags,
async->bio_offset);
@@ -784,6 +807,8 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
async->bio_flags = bio_flags;
async->bio_offset = bio_offset;
+ async->error = 0;
+
atomic_inc(&fs_info->nr_async_submits);
if (rw & REQ_SYNC)
@@ -805,15 +830,18 @@ static int btree_csum_one_bio(struct bio *bio)
struct bio_vec *bvec = bio->bi_io_vec;
int bio_index = 0;
struct btrfs_root *root;
+ int ret = 0;
WARN_ON(bio->bi_vcnt <= 0);
while (bio_index < bio->bi_vcnt) {
root = BTRFS_I(bvec->bv_page->mapping->host)->root;
- csum_dirty_buffer(root, bvec->bv_page);
+ ret = csum_dirty_buffer(root, bvec->bv_page);
+ if (ret)
+ break;
bio_index++;
bvec++;
}
- return 0;
+ return ret;
}
static int __btree_submit_bio_start(struct inode *inode, int rw,
@@ -825,8 +853,7 @@ static int __btree_submit_bio_start(struct inode *inode, int rw,
* when we're called for a write, we're already in the async
* submission context. Just jump into btrfs_map_bio
*/
- btree_csum_one_bio(bio);
- return 0;
+ return btree_csum_one_bio(bio);
}
static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
@@ -846,15 +873,16 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
{
int ret;
- ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
- bio, 1);
- BUG_ON(ret);
-
if (!(rw & REQ_WRITE)) {
+
/*
* called for a read, do the setup so that checksum validation
* can happen in the async kernel threads
*/
+ ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
+ bio, 1);
+ if (ret)
+ return ret;
return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
mirror_num, 0);
}
@@ -872,7 +900,8 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
#ifdef CONFIG_MIGRATION
static int btree_migratepage(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
{
/*
* we can't safely write a btree page from here,
@@ -887,38 +916,10 @@ static int btree_migratepage(struct address_space *mapping,
if (page_has_private(page) &&
!try_to_release_page(page, GFP_KERNEL))
return -EAGAIN;
- return migrate_page(mapping, newpage, page);
+ return migrate_page(mapping, newpage, page, mode);
}
#endif
-static int btree_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct extent_io_tree *tree;
- struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
- struct extent_buffer *eb;
- int was_dirty;
-
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- if (!(current->flags & PF_MEMALLOC)) {
- return extent_write_full_page(tree, page,
- btree_get_extent, wbc);
- }
-
- redirty_page_for_writepage(wbc, page);
- eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE);
- WARN_ON(!eb);
-
- was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
- if (!was_dirty) {
- spin_lock(&root->fs_info->delalloc_lock);
- root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
- spin_unlock(&root->fs_info->delalloc_lock);
- }
- free_extent_buffer(eb);
-
- unlock_page(page);
- return 0;
-}
static int btree_writepages(struct address_space *mapping,
struct writeback_control *wbc)
@@ -938,7 +939,7 @@ static int btree_writepages(struct address_space *mapping,
if (num_dirty < thresh)
return 0;
}
- return extent_writepages(tree, mapping, btree_get_extent, wbc);
+ return btree_write_cache_pages(mapping, wbc);
}
static int btree_readpage(struct file *file, struct page *page)
@@ -950,28 +951,16 @@ static int btree_readpage(struct file *file, struct page *page)
static int btree_releasepage(struct page *page, gfp_t gfp_flags)
{
- struct extent_io_tree *tree;
- struct extent_map_tree *map;
- int ret;
-
if (PageWriteback(page) || PageDirty(page))
return 0;
+ /*
+ * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing
+ * slab allocation from alloc_extent_state down the callchain where
+ * it'd hit a BUG_ON as those flags are not allowed.
+ */
+ gfp_flags &= ~GFP_SLAB_BUG_MASK;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- map = &BTRFS_I(page->mapping->host)->extent_tree;
-
- ret = try_release_extent_state(map, tree, page, gfp_flags);
- if (!ret)
- return 0;
-
- ret = try_release_extent_buffer(tree, page);
- if (ret == 1) {
- ClearPagePrivate(page);
- set_page_private(page, 0);
- page_cache_release(page);
- }
-
- return ret;
+ return try_release_extent_buffer(page, gfp_flags);
}
static void btree_invalidatepage(struct page *page, unsigned long offset)
@@ -989,15 +978,28 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
}
}
+static int btree_set_page_dirty(struct page *page)
+{
+ struct extent_buffer *eb;
+
+ BUG_ON(!PagePrivate(page));
+ eb = (struct extent_buffer *)page->private;
+ BUG_ON(!eb);
+ BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+ BUG_ON(!atomic_read(&eb->refs));
+ btrfs_assert_tree_locked(eb);
+ return __set_page_dirty_nobuffers(page);
+}
+
static const struct address_space_operations btree_aops = {
.readpage = btree_readpage,
- .writepage = btree_writepage,
.writepages = btree_writepages,
.releasepage = btree_releasepage,
.invalidatepage = btree_invalidatepage,
#ifdef CONFIG_MIGRATION
.migratepage = btree_migratepage,
#endif
+ .set_page_dirty = btree_set_page_dirty,
};
int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -1040,7 +1042,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
free_extent_buffer(buf);
return -EIO;
- } else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
+ } else if (extent_buffer_uptodate(buf)) {
*eb = buf;
} else {
free_extent_buffer(buf);
@@ -1065,20 +1067,20 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
struct extent_buffer *eb;
eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
- bytenr, blocksize, NULL);
+ bytenr, blocksize);
return eb;
}
int btrfs_write_tree_block(struct extent_buffer *buf)
{
- return filemap_fdatawrite_range(buf->first_page->mapping, buf->start,
+ return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
buf->start + buf->len - 1);
}
int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
- return filemap_fdatawait_range(buf->first_page->mapping,
+ return filemap_fdatawait_range(buf->pages[0]->mapping,
buf->start, buf->start + buf->len - 1);
}
@@ -1093,17 +1095,13 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
return NULL;
ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
-
- if (ret == 0)
- set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
return buf;
}
-int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf)
+void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf)
{
- struct inode *btree_inode = root->fs_info->btree_inode;
if (btrfs_header_generation(buf) ==
root->fs_info->running_transaction->transid) {
btrfs_assert_tree_locked(buf);
@@ -1112,23 +1110,27 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
spin_lock(&root->fs_info->delalloc_lock);
if (root->fs_info->dirty_metadata_bytes >= buf->len)
root->fs_info->dirty_metadata_bytes -= buf->len;
- else
- WARN_ON(1);
+ else {
+ spin_unlock(&root->fs_info->delalloc_lock);
+ btrfs_panic(root->fs_info, -EOVERFLOW,
+ "Can't clear %lu bytes from "
+ " dirty_mdatadata_bytes (%lu)",
+ buf->len,
+ root->fs_info->dirty_metadata_bytes);
+ }
spin_unlock(&root->fs_info->delalloc_lock);
}
/* ugh, clear_extent_buffer_dirty needs to lock the page */
btrfs_set_lock_blocking(buf);
- clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
- buf);
+ clear_extent_buffer_dirty(buf);
}
- return 0;
}
-static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
- u32 stripesize, struct btrfs_root *root,
- struct btrfs_fs_info *fs_info,
- u64 objectid)
+static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
+ u32 stripesize, struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
+ u64 objectid)
{
root->node = NULL;
root->commit_root = NULL;
@@ -1142,7 +1144,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->orphan_item_inserted = 0;
root->orphan_cleanup_state = 0;
- root->fs_info = fs_info;
root->objectid = objectid;
root->last_trans = 0;
root->highest_objectid = 0;
@@ -1153,7 +1154,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->orphan_block_rsv = NULL;
INIT_LIST_HEAD(&root->dirty_list);
- INIT_LIST_HEAD(&root->orphan_list);
INIT_LIST_HEAD(&root->root_list);
spin_lock_init(&root->orphan_lock);
spin_lock_init(&root->inode_lock);
@@ -1166,6 +1166,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
atomic_set(&root->log_commit[0], 0);
atomic_set(&root->log_commit[1], 0);
atomic_set(&root->log_writers, 0);
+ atomic_set(&root->orphan_inodes, 0);
root->log_batch = 0;
root->log_transid = 0;
root->last_log_commit = 0;
@@ -1181,13 +1182,12 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->defrag_running = 0;
root->root_key.objectid = objectid;
root->anon_dev = 0;
- return 0;
}
-static int find_and_setup_root(struct btrfs_root *tree_root,
- struct btrfs_fs_info *fs_info,
- u64 objectid,
- struct btrfs_root *root)
+static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
+ struct btrfs_fs_info *fs_info,
+ u64 objectid,
+ struct btrfs_root *root)
{
int ret;
u32 blocksize;
@@ -1200,14 +1200,15 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
&root->root_item, &root->root_key);
if (ret > 0)
return -ENOENT;
- BUG_ON(ret);
+ else if (ret < 0)
+ return ret;
generation = btrfs_root_generation(&root->root_item);
blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
root->commit_root = NULL;
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
blocksize, generation);
- if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
+ if (!root->node || !btrfs_buffer_uptodate(root->node, generation, 0)) {
free_extent_buffer(root->node);
root->node = NULL;
return -EIO;
@@ -1216,6 +1217,14 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
return 0;
}
+static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
+ if (root)
+ root->fs_info = fs_info;
+ return root;
+}
+
static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
@@ -1223,7 +1232,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root = fs_info->tree_root;
struct extent_buffer *leaf;
- root = kzalloc(sizeof(*root), GFP_NOFS);
+ root = btrfs_alloc_root(fs_info);
if (!root)
return ERR_PTR(-ENOMEM);
@@ -1243,7 +1252,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
root->ref_cows = 0;
leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
- BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
+ BTRFS_TREE_LOG_OBJECTID, NULL,
+ 0, 0, 0);
if (IS_ERR(leaf)) {
kfree(root);
return ERR_CAST(leaf);
@@ -1317,7 +1327,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
u32 blocksize;
int ret = 0;
- root = kzalloc(sizeof(*root), GFP_NOFS);
+ root = btrfs_alloc_root(fs_info);
if (!root)
return ERR_PTR(-ENOMEM);
if (location->offset == (u64)-1) {
@@ -1360,7 +1370,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
blocksize, generation);
root->commit_root = btrfs_root_node(root);
- BUG_ON(!root->node);
+ BUG_ON(!root->node); /* -ENOMEM */
out:
if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
root->ref_cows = 1;
@@ -1496,41 +1506,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
return 0;
}
-static int bio_ready_for_csum(struct bio *bio)
-{
- u64 length = 0;
- u64 buf_len = 0;
- u64 start = 0;
- struct page *page;
- struct extent_io_tree *io_tree = NULL;
- struct bio_vec *bvec;
- int i;
- int ret;
-
- bio_for_each_segment(bvec, bio, i) {
- page = bvec->bv_page;
- if (page->private == EXTENT_PAGE_PRIVATE) {
- length += bvec->bv_len;
- continue;
- }
- if (!page->private) {
- length += bvec->bv_len;
- continue;
- }
- length = bvec->bv_len;
- buf_len = page->private >> 2;
- start = page_offset(page) + bvec->bv_offset;
- io_tree = &BTRFS_I(page->mapping->host)->io_tree;
- }
- /* are we fully contained in this bio? */
- if (buf_len <= length)
- return 1;
-
- ret = extent_range_uptodate(io_tree, start + length,
- start + buf_len - 1);
- return ret;
-}
-
/*
* called by the kthread helper functions to finally call the bio end_io
* functions. This is where read checksum verification actually happens
@@ -1546,17 +1521,6 @@ static void end_workqueue_fn(struct btrfs_work *work)
bio = end_io_wq->bio;
fs_info = end_io_wq->info;
- /* metadata bio reads are special because the whole tree block must
- * be checksummed at once. This makes sure the entire block is in
- * ram and up to date before trying to verify things. For
- * blocksize <= pagesize, it is basically a noop
- */
- if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata &&
- !bio_ready_for_csum(bio)) {
- btrfs_queue_worker(&fs_info->endio_meta_workers,
- &end_io_wq->work);
- return;
- }
error = end_io_wq->error;
bio->bi_private = end_io_wq->private;
bio->bi_end_io = end_io_wq->end_io;
@@ -1579,9 +1543,7 @@ static int cleaner_kthread(void *arg)
btrfs_run_defrag_inodes(root->fs_info);
}
- if (freezing(current)) {
- refrigerator();
- } else {
+ if (!try_to_freeze()) {
set_current_state(TASK_INTERRUPTIBLE);
if (!kthread_should_stop())
schedule();
@@ -1599,9 +1561,10 @@ static int transaction_kthread(void *arg)
u64 transid;
unsigned long now;
unsigned long delay;
- int ret;
+ bool cannot_commit;
do {
+ cannot_commit = false;
delay = HZ * 30;
vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
mutex_lock(&root->fs_info->transaction_kthread_mutex);
@@ -1623,11 +1586,14 @@ static int transaction_kthread(void *arg)
transid = cur->transid;
spin_unlock(&root->fs_info->trans_lock);
+ /* If the file system is aborted, this will always fail. */
trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ cannot_commit = true;
+ goto sleep;
+ }
if (transid == trans->transid) {
- ret = btrfs_commit_transaction(trans, root);
- BUG_ON(ret);
+ btrfs_commit_transaction(trans, root);
} else {
btrfs_end_transaction(trans, root);
}
@@ -1635,12 +1601,11 @@ sleep:
wake_up_process(root->fs_info->cleaner_kthread);
mutex_unlock(&root->fs_info->transaction_kthread_mutex);
- if (freezing(current)) {
- refrigerator();
- } else {
+ if (!try_to_freeze()) {
set_current_state(TASK_INTERRUPTIBLE);
if (!kthread_should_stop() &&
- !btrfs_transaction_blocked(root->fs_info))
+ (!btrfs_transaction_blocked(root->fs_info) ||
+ cannot_commit))
schedule_timeout(delay);
__set_current_state(TASK_RUNNING);
}
@@ -1877,9 +1842,9 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
}
-struct btrfs_root *open_ctree(struct super_block *sb,
- struct btrfs_fs_devices *fs_devices,
- char *options)
+int open_ctree(struct super_block *sb,
+ struct btrfs_fs_devices *fs_devices,
+ char *options)
{
u32 sectorsize;
u32 nodesize;
@@ -1891,8 +1856,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
struct btrfs_key location;
struct buffer_head *bh;
struct btrfs_super_block *disk_super;
- struct btrfs_root *tree_root = btrfs_sb(sb);
- struct btrfs_fs_info *fs_info = tree_root->fs_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *tree_root;
struct btrfs_root *extent_root;
struct btrfs_root *csum_root;
struct btrfs_root *chunk_root;
@@ -1903,16 +1868,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
int num_backups_tried = 0;
int backup_index = 0;
- extent_root = fs_info->extent_root =
- kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
- csum_root = fs_info->csum_root =
- kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
- chunk_root = fs_info->chunk_root =
- kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
- dev_root = fs_info->dev_root =
- kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+ tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
+ extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
+ csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
+ chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
+ dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
- if (!extent_root || !csum_root || !chunk_root || !dev_root) {
+ if (!tree_root || !extent_root || !csum_root ||
+ !chunk_root || !dev_root) {
err = -ENOMEM;
goto fail;
}
@@ -1952,11 +1915,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
spin_lock_init(&fs_info->free_chunk_lock);
+ spin_lock_init(&fs_info->tree_mod_seq_lock);
+ rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->reloc_mutex);
init_completion(&fs_info->kobj_unregister);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
+ INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
btrfs_mapping_init(&fs_info->mapping_tree);
btrfs_init_block_rsv(&fs_info->global_block_rsv);
btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
@@ -1969,12 +1935,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
atomic_set(&fs_info->async_submit_draining, 0);
atomic_set(&fs_info->nr_async_bios, 0);
atomic_set(&fs_info->defrag_running, 0);
+ atomic_set(&fs_info->tree_mod_seq, 0);
fs_info->sb = sb;
fs_info->max_inline = 8192 * 1024;
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
fs_info->trans_no_join = 0;
fs_info->free_chunk_space = 0;
+ fs_info->tree_mod_log = RB_ROOT;
/* readahead state */
INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
@@ -2001,6 +1969,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
init_waitqueue_head(&fs_info->scrub_pause_wait);
init_rwsem(&fs_info->scrub_super_lock);
fs_info->scrub_workers_refcnt = 0;
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ fs_info->check_integrity_print_mask = 0;
+#endif
+
+ spin_lock_init(&fs_info->balance_lock);
+ mutex_init(&fs_info->balance_mutex);
+ atomic_set(&fs_info->balance_running, 0);
+ atomic_set(&fs_info->balance_pause_req, 0);
+ atomic_set(&fs_info->balance_cancel_req, 0);
+ fs_info->balance_ctl = NULL;
+ init_waitqueue_head(&fs_info->balance_wait_q);
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
@@ -2020,6 +1999,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
fs_info->btree_inode->i_mapping);
+ BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
@@ -2027,7 +2007,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
BTRFS_I(fs_info->btree_inode)->root = tree_root;
memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
sizeof(struct btrfs_key));
- BTRFS_I(fs_info->btree_inode)->dummy_inode = 1;
+ set_bit(BTRFS_INODE_DUMMY,
+ &BTRFS_I(fs_info->btree_inode)->runtime_flags);
insert_inode_hash(fs_info->btree_inode);
spin_lock_init(&fs_info->block_group_cache_lock);
@@ -2062,6 +2043,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
__setup_root(4096, 4096, 4096, 4096, tree_root,
fs_info, BTRFS_ROOT_TREE_OBJECTID);
+ invalidate_bdev(fs_devices->latest_bdev);
bh = btrfs_read_dev_super(fs_devices->latest_bdev);
if (!bh) {
err = -EINVAL;
@@ -2082,7 +2064,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
/* check FS state, whether FS is broken. */
fs_info->fs_state |= btrfs_super_flags(disk_super);
- btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+ ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+ if (ret) {
+ printk(KERN_ERR "btrfs: superblock contains fatal errors\n");
+ err = ret;
+ goto fail_alloc;
+ }
/*
* run through our array of backup supers and setup
@@ -2113,10 +2100,55 @@ struct btrfs_root *open_ctree(struct super_block *sb,
goto fail_alloc;
}
+ if (btrfs_super_leafsize(disk_super) !=
+ btrfs_super_nodesize(disk_super)) {
+ printk(KERN_ERR "BTRFS: couldn't mount because metadata "
+ "blocksizes don't match. node %d leaf %d\n",
+ btrfs_super_nodesize(disk_super),
+ btrfs_super_leafsize(disk_super));
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+ if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
+ printk(KERN_ERR "BTRFS: couldn't mount because metadata "
+ "blocksize (%d) was too large\n",
+ btrfs_super_leafsize(disk_super));
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+
features = btrfs_super_incompat_flags(disk_super);
features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
- if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
+ if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+
+ /*
+ * flag our filesystem as having big metadata blocks if
+ * they are bigger than the page size
+ */
+ if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) {
+ if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
+ printk(KERN_INFO "btrfs flagging fs with big metadata feature\n");
+ features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
+ }
+
+ nodesize = btrfs_super_nodesize(disk_super);
+ leafsize = btrfs_super_leafsize(disk_super);
+ sectorsize = btrfs_super_sectorsize(disk_super);
+ stripesize = btrfs_super_stripesize(disk_super);
+
+ /*
+ * mixed block groups end up with duplicate but slightly offset
+ * extent buffers for the same range. It leads to corruptions
+ */
+ if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
+ (sectorsize != leafsize)) {
+ printk(KERN_WARNING "btrfs: unequal leaf/node/sector sizes "
+ "are not allowed for mixed block groups on %s\n",
+ sb->s_id);
+ goto fail_alloc;
+ }
+
btrfs_set_super_incompat_flags(disk_super, features);
features = btrfs_super_compat_ro_flags(disk_super) &
@@ -2220,10 +2252,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
4 * 1024 * 1024 / PAGE_CACHE_SIZE);
- nodesize = btrfs_super_nodesize(disk_super);
- leafsize = btrfs_super_leafsize(disk_super);
- sectorsize = btrfs_super_sectorsize(disk_super);
- stripesize = btrfs_super_stripesize(disk_super);
tree_root->nodesize = nodesize;
tree_root->leafsize = leafsize;
tree_root->sectorsize = sectorsize;
@@ -2238,6 +2266,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
goto fail_sb_buffer;
}
+ if (sectorsize != PAGE_SIZE) {
+ printk(KERN_WARNING "btrfs: Incompatible sector size(%lu) "
+ "found on %s\n", (unsigned long)sectorsize, sb->s_id);
+ goto fail_sb_buffer;
+ }
+
mutex_lock(&fs_info->chunk_mutex);
ret = btrfs_read_sys_array(tree_root);
mutex_unlock(&fs_info->chunk_mutex);
@@ -2257,7 +2291,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
chunk_root->node = read_tree_block(chunk_root,
btrfs_super_chunk_root(disk_super),
blocksize, generation);
- BUG_ON(!chunk_root->node);
+ BUG_ON(!chunk_root->node); /* -ENOMEM */
if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
sb->s_id);
@@ -2270,9 +2304,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
(unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
BTRFS_UUID_SIZE);
- mutex_lock(&fs_info->chunk_mutex);
ret = btrfs_read_chunk_tree(chunk_root);
- mutex_unlock(&fs_info->chunk_mutex);
if (ret) {
printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
sb->s_id);
@@ -2281,6 +2313,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_close_extra_devices(fs_devices);
+ if (!fs_devices->latest_bdev) {
+ printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
+ sb->s_id);
+ goto fail_tree_roots;
+ }
+
retry_root_backup:
blocksize = btrfs_level_size(tree_root,
btrfs_super_root_level(disk_super));
@@ -2316,14 +2354,23 @@ retry_root_backup:
BTRFS_CSUM_TREE_OBJECTID, csum_root);
if (ret)
goto recovery_tree_root;
-
csum_root->track_dirty = 1;
fs_info->generation = generation;
fs_info->last_trans_committed = generation;
- fs_info->data_alloc_profile = (u64)-1;
- fs_info->metadata_alloc_profile = (u64)-1;
- fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
+
+ ret = btrfs_recover_balance(fs_info);
+ if (ret) {
+ printk(KERN_WARNING "btrfs: failed to recover balance\n");
+ goto fail_block_groups;
+ }
+
+ ret = btrfs_init_dev_stats(fs_info);
+ if (ret) {
+ printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n",
+ ret);
+ goto fail_block_groups;
+ }
ret = btrfs_init_space_info(fs_info);
if (ret) {
@@ -2356,6 +2403,19 @@ retry_root_backup:
btrfs_set_opt(fs_info->mount_opt, SSD);
}
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
+ ret = btrfsic_mount(tree_root, fs_devices,
+ btrfs_test_opt(tree_root,
+ CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
+ 1 : 0,
+ fs_info->check_integrity_print_mask);
+ if (ret)
+ printk(KERN_WARNING "btrfs: failed to initialize"
+ " integrity check module %s\n", sb->s_id);
+ }
+#endif
+
/* do not make disk changes in broken FS */
if (btrfs_super_log_root(disk_super) != 0 &&
!(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
@@ -2371,7 +2431,7 @@ retry_root_backup:
btrfs_level_size(tree_root,
btrfs_super_log_root_level(disk_super));
- log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+ log_tree_root = btrfs_alloc_root(fs_info);
if (!log_tree_root) {
err = -ENOMEM;
goto fail_trans_kthread;
@@ -2383,21 +2443,31 @@ retry_root_backup:
log_tree_root->node = read_tree_block(tree_root, bytenr,
blocksize,
generation + 1);
+ /* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_error(tree_root->fs_info, ret,
+ "Failed to recover log tree");
+ free_extent_buffer(log_tree_root->node);
+ kfree(log_tree_root);
+ goto fail_trans_kthread;
+ }
if (sb->s_flags & MS_RDONLY) {
- ret = btrfs_commit_super(tree_root);
- BUG_ON(ret);
+ ret = btrfs_commit_super(tree_root);
+ if (ret)
+ goto fail_trans_kthread;
}
}
ret = btrfs_find_orphan_roots(tree_root);
- BUG_ON(ret);
+ if (ret)
+ goto fail_trans_kthread;
if (!(sb->s_flags & MS_RDONLY)) {
ret = btrfs_cleanup_fs_roots(fs_info);
- BUG_ON(ret);
+ if (ret) {
+ }
ret = btrfs_recover_relocation(tree_root);
if (ret < 0) {
@@ -2420,19 +2490,26 @@ retry_root_backup:
goto fail_trans_kthread;
}
- if (!(sb->s_flags & MS_RDONLY)) {
- down_read(&fs_info->cleanup_work_sem);
- err = btrfs_orphan_cleanup(fs_info->fs_root);
- if (!err)
- err = btrfs_orphan_cleanup(fs_info->tree_root);
+ if (sb->s_flags & MS_RDONLY)
+ return 0;
+
+ down_read(&fs_info->cleanup_work_sem);
+ if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
+ (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
up_read(&fs_info->cleanup_work_sem);
- if (err) {
- close_ctree(tree_root);
- return ERR_PTR(err);
- }
+ close_ctree(tree_root);
+ return ret;
+ }
+ up_read(&fs_info->cleanup_work_sem);
+
+ ret = btrfs_resume_balance_async(fs_info);
+ if (ret) {
+ printk(KERN_WARNING "btrfs: failed to resume balance\n");
+ close_ctree(tree_root);
+ return ret;
}
- return tree_root;
+ return 0;
fail_trans_kthread:
kthread_stop(fs_info->transaction_kthread);
@@ -2478,8 +2555,7 @@ fail_srcu:
cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
btrfs_close_devices(fs_info->fs_devices);
- free_fs_info(fs_info);
- return ERR_PTR(err);
+ return err;
recovery_tree_root:
if (!btrfs_test_opt(tree_root, RECOVERY))
@@ -2502,18 +2578,20 @@ recovery_tree_root:
static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
- char b[BDEVNAME_SIZE];
-
if (uptodate) {
set_buffer_uptodate(bh);
} else {
- printk_ratelimited(KERN_WARNING "lost page write due to "
- "I/O error on %s\n",
- bdevname(bh->b_bdev, b));
+ struct btrfs_device *device = (struct btrfs_device *)
+ bh->b_private;
+
+ printk_ratelimited_in_rcu(KERN_WARNING "lost page write due to "
+ "I/O error on %s\n",
+ rcu_str_deref(device->name));
/* note, we dont' set_buffer_write_io_error because we have
* our own ways of dealing with the IO errors
*/
clear_buffer_uptodate(bh);
+ btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
}
unlock_buffer(bh);
put_bh(bh);
@@ -2628,13 +2706,14 @@ static int write_dev_supers(struct btrfs_device *device,
set_buffer_uptodate(bh);
lock_buffer(bh);
bh->b_end_io = btrfs_end_buffer_write_sync;
+ bh->b_private = device;
}
/*
* we fua the first super. The others we allow
* to go down lazy.
*/
- ret = submit_bh(WRITE_FUA, bh);
+ ret = btrfsic_submit_bh(WRITE_FUA, bh);
if (ret)
errors++;
}
@@ -2680,12 +2759,15 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
wait_for_completion(&device->flush_wait);
if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
- printk("btrfs: disabling barriers on dev %s\n",
- device->name);
+ printk_in_rcu("btrfs: disabling barriers on dev %s\n",
+ rcu_str_deref(device->name));
device->nobarriers = 1;
}
if (!bio_flagged(bio, BIO_UPTODATE)) {
ret = -EIO;
+ if (!bio_flagged(bio, BIO_EOPNOTSUPP))
+ btrfs_dev_stat_inc_and_print(device,
+ BTRFS_DEV_STAT_FLUSH_ERRS);
}
/* drop the reference from the wait == 0 run */
@@ -2699,7 +2781,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
* one reference for us, and we leave it for the
* caller
*/
- device->flush_bio = NULL;;
+ device->flush_bio = NULL;
bio = bio_alloc(GFP_NOFS, 0);
if (!bio)
return -ENOMEM;
@@ -2711,7 +2793,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
device->flush_bio = bio;
bio_get(bio);
- submit_bio(WRITE_FLUSH, bio);
+ btrfsic_submit_bio(WRITE_FLUSH, bio);
return 0;
}
@@ -2814,6 +2896,8 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
if (total_errors > max_errors) {
printk(KERN_ERR "btrfs: %d errors while writing supers\n",
total_errors);
+
+ /* This shouldn't happen. FUA is masked off if unsupported */
BUG();
}
@@ -2830,9 +2914,9 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
}
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (total_errors > max_errors) {
- printk(KERN_ERR "btrfs: %d errors while writing supers\n",
- total_errors);
- BUG();
+ btrfs_error(root->fs_info, -EIO,
+ "%d errors while writing supers", total_errors);
+ return -EIO;
}
return 0;
}
@@ -2846,7 +2930,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
return ret;
}
-int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
+void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
{
spin_lock(&fs_info->fs_roots_radix_lock);
radix_tree_delete(&fs_info->fs_roots_radix,
@@ -2859,7 +2943,6 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
__btrfs_remove_free_space_cache(root->free_ino_pinned);
__btrfs_remove_free_space_cache(root->free_ino_ctl);
free_fs_root(root);
- return 0;
}
static void free_fs_root(struct btrfs_root *root)
@@ -2876,7 +2959,7 @@ static void free_fs_root(struct btrfs_root *root)
kfree(root);
}
-static int del_fs_roots(struct btrfs_fs_info *fs_info)
+static void del_fs_roots(struct btrfs_fs_info *fs_info)
{
int ret;
struct btrfs_root *gang[8];
@@ -2905,7 +2988,6 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info)
for (i = 0; i < ret; i++)
btrfs_free_fs_root(fs_info, gang[i]);
}
- return 0;
}
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
@@ -2954,14 +3036,21 @@ int btrfs_commit_super(struct btrfs_root *root)
if (IS_ERR(trans))
return PTR_ERR(trans);
ret = btrfs_commit_transaction(trans, root);
- BUG_ON(ret);
+ if (ret)
+ return ret;
/* run commit again to drop the original snapshot */
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
- btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ return ret;
ret = btrfs_write_and_wait_transaction(NULL, root);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_error(root->fs_info, ret,
+ "Failed to sync btree inode to disk.");
+ return ret;
+ }
ret = write_ctree_super(NULL, root, 0);
return ret;
@@ -2975,6 +3064,9 @@ int close_ctree(struct btrfs_root *root)
fs_info->closing = 1;
smp_mb();
+ /* pause restriper - we want to resume on mount */
+ btrfs_pause_balance(root->fs_info);
+
btrfs_scrub_cancel(root);
/* wait for any defraggers to finish */
@@ -2982,7 +3074,7 @@ int close_ctree(struct btrfs_root *root)
(atomic_read(&fs_info->defrag_running) == 0));
/* clear out the rbtree of defraggable inodes */
- btrfs_run_defrag_inodes(root->fs_info);
+ btrfs_run_defrag_inodes(fs_info);
/*
* Here come 2 situations when btrfs is broken to flip readonly:
@@ -3011,8 +3103,8 @@ int close_ctree(struct btrfs_root *root)
btrfs_put_block_group_cache(fs_info);
- kthread_stop(root->fs_info->transaction_kthread);
- kthread_stop(root->fs_info->cleaner_kthread);
+ kthread_stop(fs_info->transaction_kthread);
+ kthread_stop(fs_info->cleaner_kthread);
fs_info->closing = 2;
smp_mb();
@@ -3030,14 +3122,14 @@ int close_ctree(struct btrfs_root *root)
free_extent_buffer(fs_info->extent_root->commit_root);
free_extent_buffer(fs_info->tree_root->node);
free_extent_buffer(fs_info->tree_root->commit_root);
- free_extent_buffer(root->fs_info->chunk_root->node);
- free_extent_buffer(root->fs_info->chunk_root->commit_root);
- free_extent_buffer(root->fs_info->dev_root->node);
- free_extent_buffer(root->fs_info->dev_root->commit_root);
- free_extent_buffer(root->fs_info->csum_root->node);
- free_extent_buffer(root->fs_info->csum_root->commit_root);
+ free_extent_buffer(fs_info->chunk_root->node);
+ free_extent_buffer(fs_info->chunk_root->commit_root);
+ free_extent_buffer(fs_info->dev_root->node);
+ free_extent_buffer(fs_info->dev_root->commit_root);
+ free_extent_buffer(fs_info->csum_root->node);
+ free_extent_buffer(fs_info->csum_root->commit_root);
- btrfs_free_block_groups(root->fs_info);
+ btrfs_free_block_groups(fs_info);
del_fs_roots(fs_info);
@@ -3057,44 +3149,46 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(&fs_info->caching_workers);
btrfs_stop_workers(&fs_info->readahead_workers);
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ if (btrfs_test_opt(root, CHECK_INTEGRITY))
+ btrfsic_unmount(root, fs_info->fs_devices);
+#endif
+
btrfs_close_devices(fs_info->fs_devices);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
bdi_destroy(&fs_info->bdi);
cleanup_srcu_struct(&fs_info->subvol_srcu);
- free_fs_info(fs_info);
-
return 0;
}
-int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
+ int atomic)
{
int ret;
- struct inode *btree_inode = buf->first_page->mapping->host;
+ struct inode *btree_inode = buf->pages[0]->mapping->host;
- ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf,
- NULL);
+ ret = extent_buffer_uptodate(buf);
if (!ret)
return ret;
ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
- parent_transid);
+ parent_transid, atomic);
+ if (ret == -EAGAIN)
+ return ret;
return !ret;
}
int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
{
- struct inode *btree_inode = buf->first_page->mapping->host;
- return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree,
- buf);
+ return set_extent_buffer_uptodate(buf);
}
void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
{
- struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
+ struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
u64 transid = btrfs_header_generation(buf);
- struct inode *btree_inode = root->fs_info->btree_inode;
int was_dirty;
btrfs_assert_tree_locked(buf);
@@ -3106,8 +3200,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
(unsigned long long)root->fs_info->generation);
WARN_ON(1);
}
- was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
- buf);
+ was_dirty = set_extent_buffer_dirty(buf);
if (!was_dirty) {
spin_lock(&root->fs_info->delalloc_lock);
root->fs_info->dirty_metadata_bytes += buf->len;
@@ -3161,12 +3254,8 @@ void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
{
- struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
- int ret;
- ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
- if (ret == 0)
- set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
- return ret;
+ struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+ return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
}
static int btree_lock_page_hook(struct page *page, void *data,
@@ -3174,17 +3263,21 @@ static int btree_lock_page_hook(struct page *page, void *data,
{
struct inode *inode = page->mapping->host;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_buffer *eb;
- unsigned long len;
- u64 bytenr = page_offset(page);
- if (page->private == EXTENT_PAGE_PRIVATE)
+ /*
+ * We culled this eb but the page is still hanging out on the mapping,
+ * carry on.
+ */
+ if (!PagePrivate(page))
goto out;
- len = page->private >> 2;
- eb = find_extent_buffer(io_tree, bytenr, len);
- if (!eb)
+ eb = (struct extent_buffer *)page->private;
+ if (!eb) {
+ WARN_ON(1);
+ goto out;
+ }
+ if (page != eb->pages[0])
goto out;
if (!btrfs_try_tree_write_lock(eb)) {
@@ -3203,7 +3296,6 @@ static int btree_lock_page_hook(struct page *page, void *data,
}
btrfs_tree_unlock(eb);
- free_extent_buffer(eb);
out:
if (!trylock_page(page)) {
flush_fn(data);
@@ -3212,15 +3304,23 @@ out:
return 0;
}
-static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only)
{
+ if (btrfs_super_csum_type(fs_info->super_copy) >= ARRAY_SIZE(btrfs_csum_sizes)) {
+ printk(KERN_ERR "btrfs: unsupported checksum algorithm\n");
+ return -EINVAL;
+ }
+
if (read_only)
- return;
+ return 0;
- if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+ if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
printk(KERN_WARNING "warning: mount fs with errors, "
"running btrfsck is recommended\n");
+ }
+
+ return 0;
}
int btrfs_error_commit_super(struct btrfs_root *root)
@@ -3242,7 +3342,7 @@ int btrfs_error_commit_super(struct btrfs_root *root)
return ret;
}
-static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
+static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
{
struct btrfs_inode *btrfs_inode;
struct list_head splice;
@@ -3264,11 +3364,9 @@ static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
spin_unlock(&root->fs_info->ordered_extent_lock);
mutex_unlock(&root->fs_info->ordered_operations_mutex);
-
- return 0;
}
-static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
+static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
{
struct list_head splice;
struct btrfs_ordered_extent *ordered;
@@ -3300,12 +3398,10 @@ static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
}
spin_unlock(&root->fs_info->ordered_extent_lock);
-
- return 0;
}
-static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
- struct btrfs_root *root)
+int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+ struct btrfs_root *root)
{
struct rb_node *node;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -3321,28 +3417,36 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
return ret;
}
- node = rb_first(&delayed_refs->root);
- while (node) {
+ while ((node = rb_first(&delayed_refs->root)) != NULL) {
ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- node = rb_next(node);
-
- ref->in_tree = 0;
- rb_erase(&ref->rb_node, &delayed_refs->root);
- delayed_refs->num_entries--;
atomic_set(&ref->refs, 1);
if (btrfs_delayed_ref_is_head(ref)) {
struct btrfs_delayed_ref_head *head;
head = btrfs_delayed_node_to_head(ref);
- mutex_lock(&head->mutex);
+ if (!mutex_trylock(&head->mutex)) {
+ atomic_inc(&ref->refs);
+ spin_unlock(&delayed_refs->lock);
+
+ /* Need to wait for the delayed ref to run */
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(ref);
+
+ spin_lock(&delayed_refs->lock);
+ continue;
+ }
+
kfree(head->extent_op);
delayed_refs->num_heads--;
if (list_empty(&head->cluster))
delayed_refs->num_heads_ready--;
list_del_init(&head->cluster);
- mutex_unlock(&head->mutex);
}
+ ref->in_tree = 0;
+ rb_erase(&ref->rb_node, &delayed_refs->root);
+ delayed_refs->num_entries--;
spin_unlock(&delayed_refs->lock);
btrfs_put_delayed_ref(ref);
@@ -3356,7 +3460,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
return ret;
}
-static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
+static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
{
struct btrfs_pending_snapshot *snapshot;
struct list_head splice;
@@ -3374,11 +3478,9 @@ static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
kfree(snapshot);
}
-
- return 0;
}
-static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
+static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
{
struct btrfs_inode *btrfs_inode;
struct list_head splice;
@@ -3398,8 +3500,6 @@ static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
}
spin_unlock(&root->fs_info->delalloc_lock);
-
- return 0;
}
static int btrfs_destroy_marked_extents(struct btrfs_root *root,
@@ -3435,11 +3535,9 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
&(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
offset >> PAGE_CACHE_SHIFT);
spin_unlock(&dirty_pages->buffer_lock);
- if (eb) {
+ if (eb)
ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
&eb->bflags);
- atomic_set(&eb->refs, 1);
- }
if (PageWriteback(page))
end_page_writeback(page);
@@ -3453,8 +3551,8 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
spin_unlock_irq(&page->mapping->tree_lock);
}
- page->mapping->a_ops->invalidatepage(page, 0);
unlock_page(page);
+ page_cache_release(page);
}
}
@@ -3468,8 +3566,10 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
u64 start;
u64 end;
int ret;
+ bool loop = true;
unpin = pinned_extents;
+again:
while (1) {
ret = find_first_extent_bit(unpin, 0, &start, &end,
EXTENT_DIRTY);
@@ -3487,16 +3587,57 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
cond_resched();
}
+ if (loop) {
+ if (unpin == &root->fs_info->freed_extents[0])
+ unpin = &root->fs_info->freed_extents[1];
+ else
+ unpin = &root->fs_info->freed_extents[0];
+ loop = false;
+ goto again;
+ }
+
return 0;
}
-static int btrfs_cleanup_transaction(struct btrfs_root *root)
+void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
+ struct btrfs_root *root)
+{
+ btrfs_destroy_delayed_refs(cur_trans, root);
+ btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
+ cur_trans->dirty_pages.dirty_bytes);
+
+ /* FIXME: cleanup wait for commit */
+ cur_trans->in_commit = 1;
+ cur_trans->blocked = 1;
+ wake_up(&root->fs_info->transaction_blocked_wait);
+
+ cur_trans->blocked = 0;
+ wake_up(&root->fs_info->transaction_wait);
+
+ cur_trans->commit_done = 1;
+ wake_up(&cur_trans->commit_wait);
+
+ btrfs_destroy_delayed_inodes(root);
+ btrfs_assert_delayed_root_empty(root);
+
+ btrfs_destroy_pending_snapshots(cur_trans);
+
+ btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages,
+ EXTENT_DIRTY);
+ btrfs_destroy_pinned_extent(root,
+ root->fs_info->pinned_extents);
+
+ /*
+ memset(cur_trans, 0, sizeof(*cur_trans));
+ kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+ */
+}
+
+int btrfs_cleanup_transaction(struct btrfs_root *root)
{
struct btrfs_transaction *t;
LIST_HEAD(list);
- WARN_ON(1);
-
mutex_lock(&root->fs_info->transaction_kthread_mutex);
spin_lock(&root->fs_info->trans_lock);
@@ -3533,6 +3674,9 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
if (waitqueue_active(&t->commit_wait))
wake_up(&t->commit_wait);
+ btrfs_destroy_delayed_inodes(root);
+ btrfs_assert_delayed_root_empty(root);
+
btrfs_destroy_pending_snapshots(t);
btrfs_destroy_delalloc_inodes(root);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c99d0a8f13f..05b3fab39f7 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -44,11 +44,11 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
int mirror_num, struct extent_buffer **eb);
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize);
-int clean_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *buf);
-struct btrfs_root *open_ctree(struct super_block *sb,
- struct btrfs_fs_devices *fs_devices,
- char *options);
+void clean_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf);
+int open_ctree(struct super_block *sb,
+ struct btrfs_fs_devices *fs_devices,
+ char *options);
int close_ctree(struct btrfs_root *root);
int write_ctree_super(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int max_mirrors);
@@ -64,9 +64,10 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
-int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
+void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
-int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
+ int atomic);
int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
@@ -85,6 +86,9 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
+int btrfs_cleanup_transaction(struct btrfs_root *root);
+void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
+ struct btrfs_root *root);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 1b8dc33778f..614f34a899c 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -13,15 +13,14 @@
parent_root_objectid) / 4)
#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4)
-static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
- int connectable)
+static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+ struct inode *parent)
{
struct btrfs_fid *fid = (struct btrfs_fid *)fh;
- struct inode *inode = dentry->d_inode;
int len = *max_len;
int type;
- if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
+ if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
*max_len = BTRFS_FID_SIZE_CONNECTABLE;
return 255;
} else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
@@ -36,19 +35,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
fid->root_objectid = BTRFS_I(inode)->root->objectid;
fid->gen = inode->i_generation;
- if (connectable && !S_ISDIR(inode->i_mode)) {
- struct inode *parent;
+ if (parent) {
u64 parent_root_id;
- spin_lock(&dentry->d_lock);
-
- parent = dentry->d_parent->d_inode;
fid->parent_objectid = BTRFS_I(parent)->location.objectid;
fid->parent_gen = parent->i_generation;
parent_root_id = BTRFS_I(parent)->root->objectid;
- spin_unlock(&dentry->d_lock);
-
if (parent_root_id != fid->root_objectid) {
fid->parent_root_objectid = parent_root_id;
len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
@@ -67,7 +60,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
u64 root_objectid, u32 generation,
int check_generation)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root;
struct inode *inode;
struct btrfs_key key;
@@ -193,7 +186,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
if (ret < 0)
goto fail;
- BUG_ON(ret == 0);
+ BUG_ON(ret == 0); /* Key with offset of -1 found */
if (path->slots[0] == 0) {
ret = -ENOENT;
goto fail;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5fbe576d2b..6e1d36702ff 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -34,23 +34,24 @@
#include "locking.h"
#include "free-space-cache.h"
-/* control flags for do_chunk_alloc's force field
+/*
+ * control flags for do_chunk_alloc's force field
* CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
* if we really need one.
*
- * CHUNK_ALLOC_FORCE means it must try to allocate one
- *
* CHUNK_ALLOC_LIMITED means to only try and allocate one
* if we have very few chunks already allocated. This is
* used as part of the clustering code to help make sure
* we have a good pool of storage to cluster in, without
* filling the FS with empty chunks
*
+ * CHUNK_ALLOC_FORCE means it must try to allocate one
+ *
*/
enum {
CHUNK_ALLOC_NO_FORCE = 0,
- CHUNK_ALLOC_FORCE = 1,
- CHUNK_ALLOC_LIMITED = 2,
+ CHUNK_ALLOC_LIMITED = 1,
+ CHUNK_ALLOC_FORCE = 2,
};
/*
@@ -244,7 +245,7 @@ static int exclude_super_stripes(struct btrfs_root *root,
cache->bytes_super += stripe_len;
ret = add_excluded_extent(root, cache->key.objectid,
stripe_len);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
@@ -252,13 +253,13 @@ static int exclude_super_stripes(struct btrfs_root *root,
ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
cache->key.objectid, bytenr,
0, &logical, &nr, &stripe_len);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
while (nr--) {
cache->bytes_super += stripe_len;
ret = add_excluded_extent(root, logical[nr],
stripe_len);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
kfree(logical);
@@ -320,7 +321,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
total_added += size;
ret = btrfs_add_free_space(block_group, start,
size);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM or logic error */
start = extent_end + 1;
} else {
break;
@@ -331,7 +332,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
size = end - start;
total_added += size;
ret = btrfs_add_free_space(block_group, start, size);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM or logic error */
}
return total_added;
@@ -473,7 +474,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
int ret = 0;
caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
- BUG_ON(!caching_ctl);
+ if (!caching_ctl)
+ return -ENOMEM;
INIT_LIST_HEAD(&caching_ctl->list);
mutex_init(&caching_ctl->mutex);
@@ -527,9 +529,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
* allocate blocks for the tree root we can't do the fast caching since
* we likely hold important locks.
*/
- if (trans && (!trans->transaction->in_commit) &&
- (root && root != root->fs_info->tree_root) &&
- btrfs_test_opt(root, SPACE_CACHE)) {
+ if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
ret = load_free_space_cache(fs_info, cache);
spin_lock(&cache->lock);
@@ -618,8 +618,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
- flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
- BTRFS_BLOCK_GROUP_METADATA;
+ flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
@@ -982,7 +981,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
ret = btrfs_next_leaf(root, path);
if (ret < 0)
return ret;
- BUG_ON(ret > 0);
+ BUG_ON(ret > 0); /* Corruption */
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &found_key,
@@ -1008,9 +1007,9 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
new_size + extra_size, 1);
if (ret < 0)
return ret;
- BUG_ON(ret);
+ BUG_ON(ret); /* Corruption */
- ret = btrfs_extend_item(trans, root, path, new_size);
+ btrfs_extend_item(trans, root, path, new_size);
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
@@ -1478,7 +1477,11 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
err = ret;
goto out;
}
- BUG_ON(ret);
+ if (ret && !insert) {
+ err = -ENOENT;
+ goto out;
+ }
+ BUG_ON(ret); /* Corruption */
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
@@ -1592,13 +1595,13 @@ out:
* helper to add new inline back ref
*/
static noinline_for_stack
-int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_extent_inline_ref *iref,
- u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int refs_to_add,
- struct btrfs_delayed_extent_op *extent_op)
+void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref *iref,
+ u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int refs_to_add,
+ struct btrfs_delayed_extent_op *extent_op)
{
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
@@ -1608,7 +1611,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
u64 refs;
int size;
int type;
- int ret;
leaf = path->nodes[0];
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
@@ -1617,7 +1619,7 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
type = extent_ref_type(parent, owner);
size = btrfs_extent_inline_ref_size(type);
- ret = btrfs_extend_item(trans, root, path, size);
+ btrfs_extend_item(trans, root, path, size);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, ei);
@@ -1652,7 +1654,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
}
btrfs_mark_buffer_dirty(leaf);
- return 0;
}
static int lookup_extent_backref(struct btrfs_trans_handle *trans,
@@ -1687,12 +1688,12 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans,
* helper to update/remove inline back ref
*/
static noinline_for_stack
-int update_inline_extent_backref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_extent_inline_ref *iref,
- int refs_to_mod,
- struct btrfs_delayed_extent_op *extent_op)
+void update_inline_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref *iref,
+ int refs_to_mod,
+ struct btrfs_delayed_extent_op *extent_op)
{
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
@@ -1703,7 +1704,6 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans,
u32 item_size;
int size;
int type;
- int ret;
u64 refs;
leaf = path->nodes[0];
@@ -1745,10 +1745,9 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans,
memmove_extent_buffer(leaf, ptr, ptr + size,
end - ptr - size);
item_size -= size;
- ret = btrfs_truncate_item(trans, root, path, item_size, 1);
+ btrfs_truncate_item(trans, root, path, item_size, 1);
}
btrfs_mark_buffer_dirty(leaf);
- return 0;
}
static noinline_for_stack
@@ -1768,13 +1767,13 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
root_objectid, owner, offset, 1);
if (ret == 0) {
BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
- ret = update_inline_extent_backref(trans, root, path, iref,
- refs_to_add, extent_op);
+ update_inline_extent_backref(trans, root, path, iref,
+ refs_to_add, extent_op);
} else if (ret == -ENOENT) {
- ret = setup_inline_extent_backref(trans, root, path, iref,
- parent, root_objectid,
- owner, offset, refs_to_add,
- extent_op);
+ setup_inline_extent_backref(trans, root, path, iref, parent,
+ root_objectid, owner, offset,
+ refs_to_add, extent_op);
+ ret = 0;
}
return ret;
}
@@ -1804,12 +1803,12 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_extent_inline_ref *iref,
int refs_to_drop, int is_data)
{
- int ret;
+ int ret = 0;
BUG_ON(!is_data && refs_to_drop != 1);
if (iref) {
- ret = update_inline_extent_backref(trans, root, path, iref,
- -refs_to_drop, NULL);
+ update_inline_extent_backref(trans, root, path, iref,
+ -refs_to_drop, NULL);
} else if (is_data) {
ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
} else {
@@ -1835,6 +1834,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
/* Tell the block device(s) that the sectors can be discarded */
ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
bytenr, &num_bytes, &bbio, 0);
+ /* Error condition is -ENOMEM */
if (!ret) {
struct btrfs_bio_stripe *stripe = bbio->stripes;
int i;
@@ -1850,7 +1850,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
if (!ret)
discarded_bytes += stripe->length;
else if (ret != -EOPNOTSUPP)
- break;
+ break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
/*
* Just in case we get back EOPNOTSUPP for some reason,
@@ -1869,23 +1869,28 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
return ret;
}
+/* Can return -ENOMEM */
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset)
+ u64 root_objectid, u64 owner, u64 offset, int for_cow)
{
int ret;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
root_objectid == BTRFS_TREE_LOG_OBJECTID);
if (owner < BTRFS_FIRST_FREE_OBJECTID) {
- ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+ ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+ num_bytes,
parent, root_objectid, (int)owner,
- BTRFS_ADD_DELAYED_REF, NULL);
+ BTRFS_ADD_DELAYED_REF, NULL, for_cow);
} else {
- ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
+ ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+ num_bytes,
parent, root_objectid, owner, offset,
- BTRFS_ADD_DELAYED_REF, NULL);
+ BTRFS_ADD_DELAYED_REF, NULL, for_cow);
}
return ret;
}
@@ -1940,7 +1945,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
ret = insert_extent_backref(trans, root->fs_info->extent_root,
path, bytenr, parent, root_objectid,
owner, offset, refs_to_add);
- BUG_ON(ret);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
out:
btrfs_free_path(path);
return err;
@@ -2027,6 +2033,9 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
int ret;
int err = 0;
+ if (trans->aborted)
+ return 0;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -2124,7 +2133,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_extent_op *extent_op,
int insert_reserved)
{
- int ret;
+ int ret = 0;
+
+ if (trans->aborted)
+ return 0;
+
if (btrfs_delayed_ref_is_head(node)) {
struct btrfs_delayed_ref_head *head;
/*
@@ -2142,11 +2155,10 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
ret = btrfs_del_csums(trans, root,
node->bytenr,
node->num_bytes);
- BUG_ON(ret);
}
}
mutex_unlock(&head->mutex);
- return 0;
+ return ret;
}
if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
@@ -2193,6 +2205,10 @@ again:
return NULL;
}
+/*
+ * Returns 0 on success or if called with an already aborted transaction.
+ * Returns -ENOMEM or -EIO on failure and will abort the transaction.
+ */
static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct list_head *cluster)
@@ -2233,6 +2249,28 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
}
/*
+ * locked_ref is the head node, so we have to go one
+ * node back for any delayed ref updates
+ */
+ ref = select_delayed_ref(locked_ref);
+
+ if (ref && ref->seq &&
+ btrfs_check_delayed_seq(delayed_refs, ref->seq)) {
+ /*
+ * there are still refs with lower seq numbers in the
+ * process of being added. Don't run this ref yet.
+ */
+ list_del_init(&locked_ref->cluster);
+ mutex_unlock(&locked_ref->mutex);
+ locked_ref = NULL;
+ delayed_refs->num_heads_ready++;
+ spin_unlock(&delayed_refs->lock);
+ cond_resched();
+ spin_lock(&delayed_refs->lock);
+ continue;
+ }
+
+ /*
* record the must insert reserved flag before we
* drop the spin lock.
*/
@@ -2242,11 +2280,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
extent_op = locked_ref->extent_op;
locked_ref->extent_op = NULL;
- /*
- * locked_ref is the head node, so we have to go one
- * node back for any delayed ref updates
- */
- ref = select_delayed_ref(locked_ref);
if (!ref) {
/* All delayed refs have been processed, Go ahead
* and send the head node to run_one_delayed_ref,
@@ -2264,12 +2297,15 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
ret = run_delayed_extent_op(trans, root,
ref, extent_op);
- BUG_ON(ret);
kfree(extent_op);
- cond_resched();
- spin_lock(&delayed_refs->lock);
- continue;
+ if (ret) {
+ printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
+ spin_lock(&delayed_refs->lock);
+ return ret;
+ }
+
+ goto next;
}
list_del_init(&locked_ref->cluster);
@@ -2279,29 +2315,62 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
ref->in_tree = 0;
rb_erase(&ref->rb_node, &delayed_refs->root);
delayed_refs->num_entries--;
-
+ /*
+ * we modified num_entries, but as we're currently running
+ * delayed refs, skip
+ * wake_up(&delayed_refs->seq_wait);
+ * here.
+ */
spin_unlock(&delayed_refs->lock);
ret = run_one_delayed_ref(trans, root, ref, extent_op,
must_insert_reserved);
- BUG_ON(ret);
btrfs_put_delayed_ref(ref);
kfree(extent_op);
count++;
+ if (ret) {
+ printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
+ spin_lock(&delayed_refs->lock);
+ return ret;
+ }
+
+next:
+ do_chunk_alloc(trans, root->fs_info->extent_root,
+ 2 * 1024 * 1024,
+ btrfs_get_alloc_profile(root, 0),
+ CHUNK_ALLOC_NO_FORCE);
cond_resched();
spin_lock(&delayed_refs->lock);
}
return count;
}
+static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
+ unsigned long num_refs,
+ struct list_head *first_seq)
+{
+ spin_unlock(&delayed_refs->lock);
+ pr_debug("waiting for more refs (num %ld, first %p)\n",
+ num_refs, first_seq);
+ wait_event(delayed_refs->seq_wait,
+ num_refs != delayed_refs->num_entries ||
+ delayed_refs->seq_head.next != first_seq);
+ pr_debug("done waiting for more refs (num %ld, first %p)\n",
+ delayed_refs->num_entries, delayed_refs->seq_head.next);
+ spin_lock(&delayed_refs->lock);
+}
+
/*
* this starts processing the delayed reference count updates and
* extent insertions we have queued up so far. count can be
* 0, which means to process everything in the tree at the start
* of the run (but not newly added entries), or it can be some target
* number you'd like to process.
+ *
+ * Returns 0 on success or if called with an aborted transaction
+ * Returns <0 on error and aborts the transaction
*/
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count)
@@ -2310,16 +2379,29 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_node *ref;
struct list_head cluster;
+ struct list_head *first_seq = NULL;
int ret;
+ u64 delayed_start;
int run_all = count == (unsigned long)-1;
int run_most = 0;
+ unsigned long num_refs = 0;
+ int consider_waiting;
+
+ /* We'll clean this up in btrfs_cleanup_transaction */
+ if (trans->aborted)
+ return 0;
if (root == root->fs_info->extent_root)
root = root->fs_info->tree_root;
+ do_chunk_alloc(trans, root->fs_info->extent_root,
+ 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
+ CHUNK_ALLOC_NO_FORCE);
+
delayed_refs = &trans->transaction->delayed_refs;
INIT_LIST_HEAD(&cluster);
again:
+ consider_waiting = 0;
spin_lock(&delayed_refs->lock);
if (count == 0) {
count = delayed_refs->num_entries * 2;
@@ -2336,18 +2418,53 @@ again:
* of refs to process starting at the first one we are able to
* lock
*/
+ delayed_start = delayed_refs->run_delayed_start;
ret = btrfs_find_ref_cluster(trans, &cluster,
delayed_refs->run_delayed_start);
if (ret)
break;
+ if (delayed_start >= delayed_refs->run_delayed_start) {
+ if (consider_waiting == 0) {
+ /*
+ * btrfs_find_ref_cluster looped. let's do one
+ * more cycle. if we don't run any delayed ref
+ * during that cycle (because we can't because
+ * all of them are blocked) and if the number of
+ * refs doesn't change, we avoid busy waiting.
+ */
+ consider_waiting = 1;
+ num_refs = delayed_refs->num_entries;
+ first_seq = root->fs_info->tree_mod_seq_list.next;
+ } else {
+ wait_for_more_refs(delayed_refs,
+ num_refs, first_seq);
+ /*
+ * after waiting, things have changed. we
+ * dropped the lock and someone else might have
+ * run some refs, built new clusters and so on.
+ * therefore, we restart staleness detection.
+ */
+ consider_waiting = 0;
+ }
+ }
+
ret = run_clustered_refs(trans, root, &cluster);
- BUG_ON(ret < 0);
+ if (ret < 0) {
+ spin_unlock(&delayed_refs->lock);
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
count -= min_t(unsigned long, ret, count);
if (count == 0)
break;
+
+ if (ret || delayed_refs->run_delayed_start == 0) {
+ /* refs were run, let's reset staleness detection */
+ consider_waiting = 0;
+ }
}
if (run_all) {
@@ -2405,7 +2522,8 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
extent_op->update_key = 0;
extent_op->is_data = is_data ? 1 : 0;
- ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
+ ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
+ num_bytes, extent_op);
if (ret)
kfree(extent_op);
return ret;
@@ -2501,7 +2619,7 @@ static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
goto out;
- BUG_ON(ret == 0);
+ BUG_ON(ret == 0); /* Corruption */
ret = -ENOENT;
if (path->slots[0] == 0)
@@ -2590,7 +2708,7 @@ out:
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- int full_backref, int inc)
+ int full_backref, int inc, int for_cow)
{
u64 bytenr;
u64 num_bytes;
@@ -2603,7 +2721,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
int level;
int ret = 0;
int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
- u64, u64, u64, u64, u64, u64);
+ u64, u64, u64, u64, u64, u64, int);
ref_root = btrfs_header_owner(buf);
nritems = btrfs_header_nritems(buf);
@@ -2640,34 +2758,34 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(buf, fi);
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, key.objectid,
- key.offset);
+ key.offset, for_cow);
if (ret)
goto fail;
} else {
bytenr = btrfs_node_blockptr(buf, i);
num_bytes = btrfs_level_size(root, level - 1);
ret = process_func(trans, root, bytenr, num_bytes,
- parent, ref_root, level - 1, 0);
+ parent, ref_root, level - 1, 0,
+ for_cow);
if (ret)
goto fail;
}
}
return 0;
fail:
- BUG();
return ret;
}
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref)
+ struct extent_buffer *buf, int full_backref, int for_cow)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
}
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref)
+ struct extent_buffer *buf, int full_backref, int for_cow)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
}
static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -2683,7 +2801,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
if (ret < 0)
goto fail;
- BUG_ON(ret);
+ BUG_ON(ret); /* Corruption */
leaf = path->nodes[0];
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
@@ -2691,8 +2809,10 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
fail:
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
return ret;
+ }
return 0;
}
@@ -2865,7 +2985,8 @@ again:
if (last == 0) {
err = btrfs_run_delayed_refs(trans, root,
(unsigned long)-1);
- BUG_ON(err);
+ if (err) /* File system offline */
+ goto out;
}
cache = btrfs_lookup_first_block_group(root->fs_info, last);
@@ -2892,7 +3013,9 @@ again:
last = cache->key.objectid + cache->key.offset;
err = write_one_cache_group(trans, root, path, cache);
- BUG_ON(err);
+ if (err) /* File system offline */
+ goto out;
+
btrfs_put_block_group(cache);
}
@@ -2905,7 +3028,8 @@ again:
if (last == 0) {
err = btrfs_run_delayed_refs(trans, root,
(unsigned long)-1);
- BUG_ON(err);
+ if (err) /* File system offline */
+ goto out;
}
cache = btrfs_lookup_first_block_group(root->fs_info, last);
@@ -2930,20 +3054,21 @@ again:
continue;
}
- btrfs_write_out_cache(root, trans, cache, path);
+ err = btrfs_write_out_cache(root, trans, cache, path);
/*
* If we didn't have an error then the cache state is still
* NEED_WRITE, so we can set it to WRITTEN.
*/
- if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
+ if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
cache->disk_cache_state = BTRFS_DC_WRITTEN;
last = cache->key.objectid + cache->key.offset;
btrfs_put_block_group(cache);
}
+out:
btrfs_free_path(path);
- return 0;
+ return err;
}
int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@@ -2993,9 +3118,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
INIT_LIST_HEAD(&found->block_groups[i]);
init_rwsem(&found->groups_sem);
spin_lock_init(&found->lock);
- found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
- BTRFS_BLOCK_GROUP_SYSTEM |
- BTRFS_BLOCK_GROUP_METADATA);
+ found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
found->total_bytes = total_bytes;
found->disk_total = total_bytes * factor;
found->bytes_used = bytes_used;
@@ -3016,20 +3139,52 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
- u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_DUP);
- if (extra_flags) {
- if (flags & BTRFS_BLOCK_GROUP_DATA)
- fs_info->avail_data_alloc_bits |= extra_flags;
- if (flags & BTRFS_BLOCK_GROUP_METADATA)
- fs_info->avail_metadata_alloc_bits |= extra_flags;
- if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- fs_info->avail_system_alloc_bits |= extra_flags;
+ u64 extra_flags = chunk_to_extended(flags) &
+ BTRFS_EXTENDED_PROFILE_MASK;
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ fs_info->avail_data_alloc_bits |= extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ fs_info->avail_metadata_alloc_bits |= extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ fs_info->avail_system_alloc_bits |= extra_flags;
+}
+
+/*
+ * returns target flags in extended format or 0 if restripe for this
+ * chunk_type is not in progress
+ *
+ * should be called with either volume_mutex or balance_lock held
+ */
+static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+ u64 target = 0;
+
+ if (!bctl)
+ return 0;
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA &&
+ bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+ } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
+ bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+ } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
+ bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
}
+
+ return target;
}
+/*
+ * @flags: available profiles in extended format (see ctree.h)
+ *
+ * Returns reduced profile in chunk format. If profile changing is in
+ * progress (either running or paused) picks the target profile (if it's
+ * already available), otherwise falls back to plain reducing.
+ */
u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
{
/*
@@ -3039,6 +3194,22 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
*/
u64 num_devices = root->fs_info->fs_devices->rw_devices +
root->fs_info->fs_devices->missing_devices;
+ u64 target;
+
+ /*
+ * see if restripe for this chunk_type is in progress, if so
+ * try to reduce to the target profile
+ */
+ spin_lock(&root->fs_info->balance_lock);
+ target = get_restripe_target(root->fs_info, flags);
+ if (target) {
+ /* pick target profile only if it's already available */
+ if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
+ spin_unlock(&root->fs_info->balance_lock);
+ return extended_to_chunk(target);
+ }
+ }
+ spin_unlock(&root->fs_info->balance_lock);
if (num_devices == 1)
flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
@@ -3059,22 +3230,22 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
((flags & BTRFS_BLOCK_GROUP_RAID1) |
(flags & BTRFS_BLOCK_GROUP_RAID10) |
- (flags & BTRFS_BLOCK_GROUP_DUP)))
+ (flags & BTRFS_BLOCK_GROUP_DUP))) {
flags &= ~BTRFS_BLOCK_GROUP_RAID0;
- return flags;
+ }
+
+ return extended_to_chunk(flags);
}
static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
{
if (flags & BTRFS_BLOCK_GROUP_DATA)
- flags |= root->fs_info->avail_data_alloc_bits &
- root->fs_info->data_alloc_profile;
+ flags |= root->fs_info->avail_data_alloc_bits;
else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- flags |= root->fs_info->avail_system_alloc_bits &
- root->fs_info->system_alloc_profile;
+ flags |= root->fs_info->avail_system_alloc_bits;
else if (flags & BTRFS_BLOCK_GROUP_METADATA)
- flags |= root->fs_info->avail_metadata_alloc_bits &
- root->fs_info->metadata_alloc_profile;
+ flags |= root->fs_info->avail_metadata_alloc_bits;
+
return btrfs_reduce_alloc_profile(root, flags);
}
@@ -3191,6 +3362,8 @@ commit_trans:
return -ENOSPC;
}
data_sinfo->bytes_may_use += bytes;
+ trace_btrfs_space_reservation(root->fs_info, "space_info",
+ data_sinfo->flags, bytes, 1);
spin_unlock(&data_sinfo->lock);
return 0;
@@ -3210,6 +3383,8 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
data_sinfo = BTRFS_I(inode)->space_info;
spin_lock(&data_sinfo->lock);
data_sinfo->bytes_may_use -= bytes;
+ trace_btrfs_space_reservation(root->fs_info, "space_info",
+ data_sinfo->flags, bytes, 0);
spin_unlock(&data_sinfo->lock);
}
@@ -3257,29 +3432,61 @@ static int should_alloc_chunk(struct btrfs_root *root,
if (num_bytes - num_allocated < thresh)
return 1;
}
+ thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
- /*
- * we have two similar checks here, one based on percentage
- * and once based on a hard number of 256MB. The idea
- * is that if we have a good amount of free
- * room, don't allocate a chunk. A good mount is
- * less than 80% utilized of the chunks we have allocated,
- * or more than 256MB free
- */
- if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
- return 0;
+ /* 256MB or 2% of the FS */
+ thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
+ /* system chunks need a much small threshold */
+ if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ thresh = 32 * 1024 * 1024;
- if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
+ if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
return 0;
+ return 1;
+}
- thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
+static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
+{
+ u64 num_dev;
- /* 256MB or 5% of the FS */
- thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
+ if (type & BTRFS_BLOCK_GROUP_RAID10 ||
+ type & BTRFS_BLOCK_GROUP_RAID0)
+ num_dev = root->fs_info->fs_devices->rw_devices;
+ else if (type & BTRFS_BLOCK_GROUP_RAID1)
+ num_dev = 2;
+ else
+ num_dev = 1; /* DUP or single */
- if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
- return 0;
- return 1;
+ /* metadata for updaing devices and chunk tree */
+ return btrfs_calc_trans_metadata_size(root, num_dev + 1);
+}
+
+static void check_system_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 type)
+{
+ struct btrfs_space_info *info;
+ u64 left;
+ u64 thresh;
+
+ info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+ spin_lock(&info->lock);
+ left = info->total_bytes - info->bytes_used - info->bytes_pinned -
+ info->bytes_reserved - info->bytes_readonly;
+ spin_unlock(&info->lock);
+
+ thresh = get_system_chunk_thresh(root, type);
+ if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ printk(KERN_INFO "left=%llu, need=%llu, flags=%llu\n",
+ left, thresh, type);
+ dump_space_info(info, 0, 0);
+ }
+
+ if (left < thresh) {
+ u64 flags;
+
+ flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
+ btrfs_alloc_chunk(trans, root, flags);
+ }
}
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
@@ -3291,19 +3498,17 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
int wait_for_alloc = 0;
int ret = 0;
- flags = btrfs_reduce_alloc_profile(extent_root, flags);
-
space_info = __find_space_info(extent_root->fs_info, flags);
if (!space_info) {
ret = update_space_info(extent_root->fs_info, flags,
0, 0, &space_info);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
- BUG_ON(!space_info);
+ BUG_ON(!space_info); /* Logic error */
again:
spin_lock(&space_info->lock);
- if (space_info->force_alloc)
+ if (force < space_info->force_alloc)
force = space_info->force_alloc;
if (space_info->full) {
spin_unlock(&space_info->lock);
@@ -3354,6 +3559,12 @@ again:
force_metadata_allocation(fs_info);
}
+ /*
+ * Check if we have enough space in SYSTEM chunk because we may need
+ * to update devices.
+ */
+ check_system_chunk(trans, extent_root, flags);
+
ret = btrfs_alloc_chunk(trans, extent_root, flags);
if (ret < 0 && ret != -ENOSPC)
goto out;
@@ -3368,7 +3579,7 @@ again:
space_info->chunk_alloc = 0;
spin_unlock(&space_info->lock);
out:
- mutex_unlock(&extent_root->fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->chunk_mutex);
return ret;
}
@@ -3499,12 +3710,15 @@ static int may_commit_transaction(struct btrfs_root *root,
if (space_info != delayed_rsv->space_info)
return -ENOSPC;
+ spin_lock(&space_info->lock);
spin_lock(&delayed_rsv->lock);
- if (delayed_rsv->size < bytes) {
+ if (space_info->bytes_pinned + delayed_rsv->size < bytes) {
spin_unlock(&delayed_rsv->lock);
+ spin_unlock(&space_info->lock);
return -ENOSPC;
}
spin_unlock(&delayed_rsv->lock);
+ spin_unlock(&space_info->lock);
commit:
trans = btrfs_join_transaction(root);
@@ -3558,9 +3772,8 @@ again:
*/
if (current->journal_info)
return -EAGAIN;
- ret = wait_event_interruptible(space_info->wait,
- !space_info->flush);
- /* Must have been interrupted, return */
+ ret = wait_event_killable(space_info->wait, !space_info->flush);
+ /* Must have been killed, return */
if (ret)
return -EINTR;
@@ -3582,6 +3795,8 @@ again:
if (used <= space_info->total_bytes) {
if (used + orig_bytes <= space_info->total_bytes) {
space_info->bytes_may_use += orig_bytes;
+ trace_btrfs_space_reservation(root->fs_info,
+ "space_info", space_info->flags, orig_bytes, 1);
ret = 0;
} else {
/*
@@ -3649,6 +3864,8 @@ again:
if (used + num_bytes < space_info->total_bytes + avail) {
space_info->bytes_may_use += orig_bytes;
+ trace_btrfs_space_reservation(root->fs_info,
+ "space_info", space_info->flags, orig_bytes, 1);
ret = 0;
} else {
wait_ordered = true;
@@ -3711,8 +3928,9 @@ out:
return ret;
}
-static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static struct btrfs_block_rsv *get_block_rsv(
+ const struct btrfs_trans_handle *trans,
+ const struct btrfs_root *root)
{
struct btrfs_block_rsv *block_rsv = NULL;
@@ -3755,7 +3973,8 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
spin_unlock(&block_rsv->lock);
}
-static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
+static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv,
struct btrfs_block_rsv *dest, u64 num_bytes)
{
struct btrfs_space_info *space_info = block_rsv->space_info;
@@ -3791,6 +4010,8 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
if (num_bytes) {
spin_lock(&space_info->lock);
space_info->bytes_may_use -= num_bytes;
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ space_info->flags, num_bytes, 0);
space_info->reservation_progress++;
spin_unlock(&space_info->lock);
}
@@ -3947,7 +4168,8 @@ void btrfs_block_rsv_release(struct btrfs_root *root,
if (global_rsv->full || global_rsv == block_rsv ||
block_rsv->space_info != global_rsv->space_info)
global_rsv = NULL;
- block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
+ block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
+ num_bytes);
}
/*
@@ -3993,8 +4215,8 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
num_bytes = calc_global_metadata_size(fs_info);
- spin_lock(&block_rsv->lock);
spin_lock(&sinfo->lock);
+ spin_lock(&block_rsv->lock);
block_rsv->size = num_bytes;
@@ -4006,18 +4228,22 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
num_bytes = sinfo->total_bytes - num_bytes;
block_rsv->reserved += num_bytes;
sinfo->bytes_may_use += num_bytes;
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ sinfo->flags, num_bytes, 1);
}
if (block_rsv->reserved >= block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
sinfo->bytes_may_use -= num_bytes;
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ sinfo->flags, num_bytes, 0);
sinfo->reservation_progress++;
block_rsv->reserved = block_rsv->size;
block_rsv->full = 1;
}
- spin_unlock(&sinfo->lock);
spin_unlock(&block_rsv->lock);
+ spin_unlock(&sinfo->lock);
}
static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
@@ -4045,7 +4271,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
{
- block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
+ block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
+ (u64)-1);
WARN_ON(fs_info->delalloc_block_rsv.size > 0);
WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
WARN_ON(fs_info->trans_block_rsv.size > 0);
@@ -4062,10 +4289,13 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
if (!trans->bytes_reserved)
return;
+ trace_btrfs_space_reservation(root->fs_info, "transaction",
+ trans->transid, trans->bytes_reserved, 0);
btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
trans->bytes_reserved = 0;
}
+/* Can only return 0 or -ENOSPC */
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct inode *inode)
{
@@ -4079,6 +4309,8 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
* when we are truly done with the orphan item.
*/
u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+ trace_btrfs_space_reservation(root->fs_info, "orphan",
+ btrfs_ino(inode), num_bytes, 1);
return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
}
@@ -4086,6 +4318,8 @@ void btrfs_orphan_release_metadata(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+ trace_btrfs_space_reservation(root->fs_info, "orphan",
+ btrfs_ino(inode), num_bytes, 0);
btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
}
@@ -4122,10 +4356,9 @@ static unsigned drop_outstanding_extent(struct inode *inode)
BTRFS_I(inode)->outstanding_extents--;
if (BTRFS_I(inode)->outstanding_extents == 0 &&
- BTRFS_I(inode)->delalloc_meta_reserved) {
+ test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags))
drop_inode_space = 1;
- BTRFS_I(inode)->delalloc_meta_reserved = 0;
- }
/*
* If we have more or the same amount of outsanding extents than we have
@@ -4213,12 +4446,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
/* Need to be holding the i_mutex here if we aren't free space cache */
if (btrfs_is_free_space_inode(root, inode))
flush = 0;
- else
- WARN_ON(!mutex_is_locked(&inode->i_mutex));
if (flush && btrfs_transaction_in_commit(root->fs_info))
schedule_timeout(1);
+ mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
num_bytes = ALIGN(num_bytes, root->sectorsize);
spin_lock(&BTRFS_I(inode)->lock);
@@ -4233,7 +4465,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
* Add an item to reserve for updating the inode when we complete the
* delalloc io.
*/
- if (!BTRFS_I(inode)->delalloc_meta_reserved) {
+ if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags)) {
nr_extents++;
extra_reserve = 1;
}
@@ -4266,19 +4499,30 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
if (dropped)
to_free += btrfs_calc_trans_metadata_size(root, dropped);
- if (to_free)
+ if (to_free) {
btrfs_block_rsv_release(root, block_rsv, to_free);
+ trace_btrfs_space_reservation(root->fs_info,
+ "delalloc",
+ btrfs_ino(inode),
+ to_free, 0);
+ }
+ mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
return ret;
}
spin_lock(&BTRFS_I(inode)->lock);
if (extra_reserve) {
- BTRFS_I(inode)->delalloc_meta_reserved = 1;
+ set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags);
nr_extents--;
}
BTRFS_I(inode)->reserved_extents += nr_extents;
spin_unlock(&BTRFS_I(inode)->lock);
+ mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+ if (to_reserve)
+ trace_btrfs_space_reservation(root->fs_info,"delalloc",
+ btrfs_ino(inode), to_reserve, 1);
block_rsv_add_bytes(block_rsv, to_reserve, 1);
return 0;
@@ -4308,6 +4552,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
if (dropped > 0)
to_free += btrfs_calc_trans_metadata_size(root, dropped);
+ trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ btrfs_ino(inode), to_free, 0);
btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
to_free);
}
@@ -4387,7 +4633,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
while (total) {
cache = btrfs_lookup_block_group(info, bytenr);
if (!cache)
- return -1;
+ return -ENOENT;
if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
@@ -4490,7 +4736,7 @@ int btrfs_pin_extent(struct btrfs_root *root,
struct btrfs_block_group_cache *cache;
cache = btrfs_lookup_block_group(root->fs_info, bytenr);
- BUG_ON(!cache);
+ BUG_ON(!cache); /* Logic error */
pin_down_extent(root, cache, bytenr, num_bytes, reserved);
@@ -4508,7 +4754,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *cache;
cache = btrfs_lookup_block_group(root->fs_info, bytenr);
- BUG_ON(!cache);
+ BUG_ON(!cache); /* Logic error */
/*
* pull in the free space cache (if any) so that our pin
@@ -4553,6 +4799,7 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
{
struct btrfs_space_info *space_info = cache->space_info;
int ret = 0;
+
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
if (reserve != RESERVE_FREE) {
@@ -4562,7 +4809,9 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
cache->reserved += num_bytes;
space_info->bytes_reserved += num_bytes;
if (reserve == RESERVE_ALLOC) {
- BUG_ON(space_info->bytes_may_use < num_bytes);
+ trace_btrfs_space_reservation(cache->fs_info,
+ "space_info", space_info->flags,
+ num_bytes, 0);
space_info->bytes_may_use -= num_bytes;
}
}
@@ -4578,7 +4827,7 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
return ret;
}
-int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4608,7 +4857,6 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
up_write(&fs_info->extent_commit_sem);
update_global_block_rsv(fs_info);
- return 0;
}
static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
@@ -4623,7 +4871,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
if (cache)
btrfs_put_block_group(cache);
cache = btrfs_lookup_block_group(fs_info, start);
- BUG_ON(!cache);
+ BUG_ON(!cache); /* Logic error */
}
len = cache->key.objectid + cache->key.offset - start;
@@ -4660,6 +4908,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
u64 end;
int ret;
+ if (trans->aborted)
+ return 0;
+
if (fs_info->pinned_extents == &fs_info->freed_extents[0])
unpin = &fs_info->freed_extents[1];
else
@@ -4745,7 +4996,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = remove_extent_backref(trans, extent_root, path,
NULL, refs_to_drop,
is_data);
- BUG_ON(ret);
+ if (ret)
+ goto abort;
btrfs_release_path(path);
path->leave_spinning = 1;
@@ -4763,10 +5015,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_print_leaf(extent_root,
path->nodes[0]);
}
- BUG_ON(ret);
+ if (ret < 0)
+ goto abort;
extent_slot = path->slots[0];
}
- } else {
+ } else if (ret == -ENOENT) {
btrfs_print_leaf(extent_root, path->nodes[0]);
WARN_ON(1);
printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
@@ -4776,6 +5029,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
(unsigned long long)root_objectid,
(unsigned long long)owner_objectid,
(unsigned long long)owner_offset);
+ } else {
+ goto abort;
}
leaf = path->nodes[0];
@@ -4785,7 +5040,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
BUG_ON(found_extent || extent_slot != path->slots[0]);
ret = convert_extent_item_v0(trans, extent_root, path,
owner_objectid, 0);
- BUG_ON(ret < 0);
+ if (ret < 0)
+ goto abort;
btrfs_release_path(path);
path->leave_spinning = 1;
@@ -4802,7 +5058,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
(unsigned long long)bytenr);
btrfs_print_leaf(extent_root, path->nodes[0]);
}
- BUG_ON(ret);
+ if (ret < 0)
+ goto abort;
extent_slot = path->slots[0];
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, extent_slot);
@@ -4839,7 +5096,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = remove_extent_backref(trans, extent_root, path,
iref, refs_to_drop,
is_data);
- BUG_ON(ret);
+ if (ret)
+ goto abort;
}
} else {
if (found_extent) {
@@ -4856,23 +5114,27 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
- BUG_ON(ret);
+ if (ret)
+ goto abort;
btrfs_release_path(path);
if (is_data) {
ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
- BUG_ON(ret);
- } else {
- invalidate_mapping_pages(info->btree_inode->i_mapping,
- bytenr >> PAGE_CACHE_SHIFT,
- (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
+ if (ret)
+ goto abort;
}
ret = update_block_group(trans, root, bytenr, num_bytes, 0);
- BUG_ON(ret);
+ if (ret)
+ goto abort;
}
+out:
btrfs_free_path(path);
return ret;
+
+abort:
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
}
/*
@@ -4928,6 +5190,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
rb_erase(&head->node.rb_node, &delayed_refs->root);
delayed_refs->num_entries--;
+ if (waitqueue_active(&delayed_refs->seq_wait))
+ wake_up(&delayed_refs->seq_wait);
/*
* we don't take a ref on the node because we're removing it from the
@@ -4961,11 +5225,12 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
int ret;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
- parent, root->root_key.objectid,
- btrfs_header_level(buf),
- BTRFS_DROP_DELAYED_REF, NULL);
- BUG_ON(ret);
+ ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+ buf->start, buf->len,
+ parent, root->root_key.objectid,
+ btrfs_header_level(buf),
+ BTRFS_DROP_DELAYED_REF, NULL, 0);
+ BUG_ON(ret); /* -ENOMEM */
}
if (!last_ref)
@@ -4999,12 +5264,13 @@ out:
btrfs_put_block_group(cache);
}
-int btrfs_free_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset)
+/* Can return -ENOMEM */
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int for_cow)
{
int ret;
+ struct btrfs_fs_info *fs_info = root->fs_info;
/*
* tree log blocks never actually go into the extent allocation
@@ -5016,15 +5282,16 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_pin_extent(root, bytenr, num_bytes, 1);
ret = 0;
} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
- ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+ ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+ num_bytes,
parent, root_objectid, (int)owner,
- BTRFS_DROP_DELAYED_REF, NULL);
- BUG_ON(ret);
+ BTRFS_DROP_DELAYED_REF, NULL, for_cow);
} else {
- ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
- parent, root_objectid, owner,
- offset, BTRFS_DROP_DELAYED_REF, NULL);
- BUG_ON(ret);
+ ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+ num_bytes,
+ parent, root_objectid, owner,
+ offset, BTRFS_DROP_DELAYED_REF,
+ NULL, for_cow);
}
return ret;
}
@@ -5081,28 +5348,34 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
return 0;
}
-static int get_block_group_index(struct btrfs_block_group_cache *cache)
+static int __get_block_group_index(u64 flags)
{
int index;
- if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
+
+ if (flags & BTRFS_BLOCK_GROUP_RAID10)
index = 0;
- else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
+ else if (flags & BTRFS_BLOCK_GROUP_RAID1)
index = 1;
- else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
+ else if (flags & BTRFS_BLOCK_GROUP_DUP)
index = 2;
- else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
+ else if (flags & BTRFS_BLOCK_GROUP_RAID0)
index = 3;
else
index = 4;
+
return index;
}
+static int get_block_group_index(struct btrfs_block_group_cache *cache)
+{
+ return __get_block_group_index(cache->flags);
+}
+
enum btrfs_loop_type {
- LOOP_FIND_IDEAL = 0,
- LOOP_CACHING_NOWAIT = 1,
- LOOP_CACHING_WAIT = 2,
- LOOP_ALLOC_CHUNK = 3,
- LOOP_NO_EMPTY_SIZE = 4,
+ LOOP_CACHING_NOWAIT = 0,
+ LOOP_CACHING_WAIT = 1,
+ LOOP_ALLOC_CHUNK = 2,
+ LOOP_NO_EMPTY_SIZE = 3,
};
/*
@@ -5116,7 +5389,6 @@ enum btrfs_loop_type {
static noinline int find_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *orig_root,
u64 num_bytes, u64 empty_size,
- u64 search_start, u64 search_end,
u64 hint_byte, struct btrfs_key *ins,
u64 data)
{
@@ -5125,6 +5397,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_free_cluster *last_ptr = NULL;
struct btrfs_block_group_cache *block_group = NULL;
struct btrfs_block_group_cache *used_block_group;
+ u64 search_start = 0;
int empty_cluster = 2 * 1024 * 1024;
int allowed_chunk_alloc = 0;
int done_chunk_alloc = 0;
@@ -5138,14 +5411,14 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
bool failed_alloc = false;
bool use_cluster = true;
bool have_caching_bg = false;
- u64 ideal_cache_percent = 0;
- u64 ideal_cache_offset = 0;
WARN_ON(num_bytes < root->sectorsize);
btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
ins->objectid = 0;
ins->offset = 0;
+ trace_find_free_extent(orig_root, num_bytes, empty_size, data);
+
space_info = __find_space_info(root->fs_info, data);
if (!space_info) {
printk(KERN_ERR "No space info for %llu\n", data);
@@ -5187,7 +5460,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
empty_cluster = 0;
if (search_start == hint_byte) {
-ideal_cache:
block_group = btrfs_lookup_block_group(root->fs_info,
search_start);
used_block_group = block_group;
@@ -5199,8 +5471,7 @@ ideal_cache:
* picked out then we don't care that the block group is cached.
*/
if (block_group && block_group_bits(block_group, data) &&
- (block_group->cached != BTRFS_CACHE_NO ||
- search_start == ideal_cache_offset)) {
+ block_group->cached != BTRFS_CACHE_NO) {
down_read(&space_info->groups_sem);
if (list_empty(&block_group->list) ||
block_group->ro) {
@@ -5254,56 +5525,16 @@ search:
have_block_group:
cached = block_group_cache_done(block_group);
if (unlikely(!cached)) {
- u64 free_percent;
-
found_uncached_bg = true;
ret = cache_block_group(block_group, trans,
- orig_root, 1);
- if (block_group->cached == BTRFS_CACHE_FINISHED)
- goto alloc;
-
- free_percent = btrfs_block_group_used(&block_group->item);
- free_percent *= 100;
- free_percent = div64_u64(free_percent,
- block_group->key.offset);
- free_percent = 100 - free_percent;
- if (free_percent > ideal_cache_percent &&
- likely(!block_group->ro)) {
- ideal_cache_offset = block_group->key.objectid;
- ideal_cache_percent = free_percent;
- }
-
- /*
- * The caching workers are limited to 2 threads, so we
- * can queue as much work as we care to.
- */
- if (loop > LOOP_FIND_IDEAL) {
- ret = cache_block_group(block_group, trans,
- orig_root, 0);
- BUG_ON(ret);
- }
-
- /*
- * If loop is set for cached only, try the next block
- * group.
- */
- if (loop == LOOP_FIND_IDEAL)
- goto loop;
+ orig_root, 0);
+ BUG_ON(ret < 0);
+ ret = 0;
}
-alloc:
if (unlikely(block_group->ro))
goto loop;
- spin_lock(&block_group->free_space_ctl->tree_lock);
- if (cached &&
- block_group->free_space_ctl->free_space <
- num_bytes + empty_cluster + empty_size) {
- spin_unlock(&block_group->free_space_ctl->tree_lock);
- goto loop;
- }
- spin_unlock(&block_group->free_space_ctl->tree_lock);
-
/*
* Ok we want to try and use the cluster allocator, so
* lets look there
@@ -5331,6 +5562,8 @@ alloc:
if (offset) {
/* we have a block, we're done */
spin_unlock(&last_ptr->refill_lock);
+ trace_btrfs_reserve_extent_cluster(root,
+ block_group, search_start, num_bytes);
goto checks;
}
@@ -5349,8 +5582,15 @@ refill_cluster:
* plenty of times and not have found
* anything, so we are likely way too
* fragmented for the clustering stuff to find
- * anything. */
- if (loop >= LOOP_NO_EMPTY_SIZE) {
+ * anything.
+ *
+ * However, if the cluster is taken from the
+ * current block group, release the cluster
+ * first, so that we stand a better chance of
+ * succeeding in the unclustered
+ * allocation. */
+ if (loop >= LOOP_NO_EMPTY_SIZE &&
+ last_ptr->block_group != block_group) {
spin_unlock(&last_ptr->refill_lock);
goto unclustered_alloc;
}
@@ -5361,6 +5601,11 @@ refill_cluster:
*/
btrfs_return_cluster_to_free_space(NULL, last_ptr);
+ if (loop >= LOOP_NO_EMPTY_SIZE) {
+ spin_unlock(&last_ptr->refill_lock);
+ goto unclustered_alloc;
+ }
+
/* allocate a cluster in this block group */
ret = btrfs_find_space_cluster(trans, root,
block_group, last_ptr,
@@ -5377,6 +5622,9 @@ refill_cluster:
if (offset) {
/* we found one, proceed */
spin_unlock(&last_ptr->refill_lock);
+ trace_btrfs_reserve_extent_cluster(root,
+ block_group, search_start,
+ num_bytes);
goto checks;
}
} else if (!cached && loop > LOOP_CACHING_NOWAIT
@@ -5401,6 +5649,15 @@ refill_cluster:
}
unclustered_alloc:
+ spin_lock(&block_group->free_space_ctl->tree_lock);
+ if (cached &&
+ block_group->free_space_ctl->free_space <
+ num_bytes + empty_cluster + empty_size) {
+ spin_unlock(&block_group->free_space_ctl->tree_lock);
+ goto loop;
+ }
+ spin_unlock(&block_group->free_space_ctl->tree_lock);
+
offset = btrfs_find_space_for_alloc(block_group, search_start,
num_bytes, empty_size);
/*
@@ -5425,11 +5682,6 @@ unclustered_alloc:
}
checks:
search_start = stripe_align(root, offset);
- /* move on to the next group */
- if (search_start + num_bytes >= search_end) {
- btrfs_add_free_space(used_block_group, offset, num_bytes);
- goto loop;
- }
/* move on to the next group */
if (search_start + num_bytes >
@@ -5438,9 +5690,6 @@ checks:
goto loop;
}
- ins->objectid = search_start;
- ins->offset = num_bytes;
-
if (offset < search_start)
btrfs_add_free_space(used_block_group, offset,
search_start - offset);
@@ -5457,6 +5706,8 @@ checks:
ins->objectid = search_start;
ins->offset = num_bytes;
+ trace_btrfs_reserve_extent(orig_root, block_group,
+ search_start, num_bytes);
if (offset < search_start)
btrfs_add_free_space(used_block_group, offset,
search_start - offset);
@@ -5481,9 +5732,7 @@ loop:
if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
goto search;
- /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
- * for them to make caching progress. Also
- * determine the best possible bg to cache
+ /*
* LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
* caching kthreads as we move along
* LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
@@ -5493,50 +5742,17 @@ loop:
*/
if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
index = 0;
- if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
- found_uncached_bg = false;
- loop++;
- if (!ideal_cache_percent)
- goto search;
-
- /*
- * 1 of the following 2 things have happened so far
- *
- * 1) We found an ideal block group for caching that
- * is mostly full and will cache quickly, so we might
- * as well wait for it.
- *
- * 2) We searched for cached only and we didn't find
- * anything, and we didn't start any caching kthreads
- * either, so chances are we will loop through and
- * start a couple caching kthreads, and then come back
- * around and just wait for them. This will be slower
- * because we will have 2 caching kthreads reading at
- * the same time when we could have just started one
- * and waited for it to get far enough to give us an
- * allocation, so go ahead and go to the wait caching
- * loop.
- */
- loop = LOOP_CACHING_WAIT;
- search_start = ideal_cache_offset;
- ideal_cache_percent = 0;
- goto ideal_cache;
- } else if (loop == LOOP_FIND_IDEAL) {
- /*
- * Didn't find a uncached bg, wait on anything we find
- * next.
- */
- loop = LOOP_CACHING_WAIT;
- goto search;
- }
-
loop++;
-
if (loop == LOOP_ALLOC_CHUNK) {
if (allowed_chunk_alloc) {
ret = do_chunk_alloc(trans, root, num_bytes +
2 * 1024 * 1024, data,
CHUNK_ALLOC_LIMITED);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans,
+ root, ret);
+ goto out;
+ }
allowed_chunk_alloc = 0;
if (ret == 1)
done_chunk_alloc = 1;
@@ -5565,6 +5781,7 @@ loop:
} else if (ins->objectid) {
ret = 0;
}
+out:
return ret;
}
@@ -5618,11 +5835,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
- u64 search_end, struct btrfs_key *ins,
- u64 data)
+ struct btrfs_key *ins, u64 data)
{
+ bool final_tried = false;
int ret;
- u64 search_start = 0;
data = btrfs_get_alloc_profile(root, data);
again:
@@ -5630,32 +5846,44 @@ again:
* the only place that sets empty_size is btrfs_realloc_node, which
* is not called recursively on allocations
*/
- if (empty_size || root->ref_cows)
+ if (empty_size || root->ref_cows) {
ret = do_chunk_alloc(trans, root->fs_info->extent_root,
num_bytes + 2 * 1024 * 1024, data,
CHUNK_ALLOC_NO_FORCE);
+ if (ret < 0 && ret != -ENOSPC) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
+ }
WARN_ON(num_bytes < root->sectorsize);
ret = find_free_extent(trans, root, num_bytes, empty_size,
- search_start, search_end, hint_byte,
- ins, data);
+ hint_byte, ins, data);
- if (ret == -ENOSPC && num_bytes > min_alloc_size) {
- num_bytes = num_bytes >> 1;
- num_bytes = num_bytes & ~(root->sectorsize - 1);
- num_bytes = max(num_bytes, min_alloc_size);
- do_chunk_alloc(trans, root->fs_info->extent_root,
- num_bytes, data, CHUNK_ALLOC_FORCE);
- goto again;
- }
- if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
- struct btrfs_space_info *sinfo;
-
- sinfo = __find_space_info(root->fs_info, data);
- printk(KERN_ERR "btrfs allocation failed flags %llu, "
- "wanted %llu\n", (unsigned long long)data,
- (unsigned long long)num_bytes);
- dump_space_info(sinfo, num_bytes, 1);
+ if (ret == -ENOSPC) {
+ if (!final_tried) {
+ num_bytes = num_bytes >> 1;
+ num_bytes = num_bytes & ~(root->sectorsize - 1);
+ num_bytes = max(num_bytes, min_alloc_size);
+ ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+ num_bytes, data, CHUNK_ALLOC_FORCE);
+ if (ret < 0 && ret != -ENOSPC) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
+ if (num_bytes == min_alloc_size)
+ final_tried = true;
+ goto again;
+ } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ struct btrfs_space_info *sinfo;
+
+ sinfo = __find_space_info(root->fs_info, data);
+ printk(KERN_ERR "btrfs allocation failed flags %llu, "
+ "wanted %llu\n", (unsigned long long)data,
+ (unsigned long long)num_bytes);
+ if (sinfo)
+ dump_space_info(sinfo, num_bytes, 1);
+ }
}
trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
@@ -5733,7 +5961,10 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
ins, size);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_free_path(path);
+ return ret;
+ }
leaf = path->nodes[0];
extent_item = btrfs_item_ptr(leaf, path->slots[0],
@@ -5763,7 +5994,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_free_path(path);
ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
- if (ret) {
+ if (ret) { /* -ENOENT, logic error */
printk(KERN_ERR "btrfs update block group failed for %llu "
"%llu\n", (unsigned long long)ins->objectid,
(unsigned long long)ins->offset);
@@ -5794,7 +6025,10 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
ins, size);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_free_path(path);
+ return ret;
+ }
leaf = path->nodes[0];
extent_item = btrfs_item_ptr(leaf, path->slots[0],
@@ -5824,7 +6058,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_free_path(path);
ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
- if (ret) {
+ if (ret) { /* -ENOENT, logic error */
printk(KERN_ERR "btrfs update block group failed for %llu "
"%llu\n", (unsigned long long)ins->objectid,
(unsigned long long)ins->offset);
@@ -5842,9 +6076,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
- 0, root_objectid, owner, offset,
- BTRFS_ADD_DELAYED_EXTENT, NULL);
+ ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
+ ins->offset, 0,
+ root_objectid, owner, offset,
+ BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
return ret;
}
@@ -5871,28 +6106,28 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
if (!caching_ctl) {
BUG_ON(!block_group_cache_done(block_group));
ret = btrfs_remove_free_space(block_group, start, num_bytes);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
} else {
mutex_lock(&caching_ctl->mutex);
if (start >= caching_ctl->progress) {
ret = add_excluded_extent(root, start, num_bytes);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
} else if (start + num_bytes <= caching_ctl->progress) {
ret = btrfs_remove_free_space(block_group,
start, num_bytes);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
} else {
num_bytes = caching_ctl->progress - start;
ret = btrfs_remove_free_space(block_group,
start, num_bytes);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
start = caching_ctl->progress;
num_bytes = ins->objectid + ins->offset -
caching_ctl->progress;
ret = add_excluded_extent(root, start, num_bytes);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
mutex_unlock(&caching_ctl->mutex);
@@ -5901,7 +6136,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
ret = btrfs_update_reserved_bytes(block_group, ins->offset,
RESERVE_ALLOC_NO_ACCOUNT);
- BUG_ON(ret);
+ BUG_ON(ret); /* logic error */
btrfs_put_block_group(block_group);
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
0, owner, offset, ins, 1);
@@ -5922,6 +6157,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
btrfs_tree_lock(buf);
clean_tree_block(trans, root, buf);
+ clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
btrfs_set_lock_blocking(buf);
btrfs_set_buffer_uptodate(buf);
@@ -5997,10 +6233,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,
return ERR_PTR(-ENOSPC);
}
-static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
+static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv, u32 blocksize)
{
block_rsv_add_bytes(block_rsv, blocksize, 0);
- block_rsv_release_bytes(block_rsv, NULL, 0);
+ block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
}
/*
@@ -6028,15 +6265,15 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
return ERR_CAST(block_rsv);
ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
- empty_size, hint, (u64)-1, &ins, 0);
+ empty_size, hint, &ins, 0);
if (ret) {
- unuse_block_rsv(block_rsv, blocksize);
+ unuse_block_rsv(root->fs_info, block_rsv, blocksize);
return ERR_PTR(ret);
}
buf = btrfs_init_new_buffer(trans, root, ins.objectid,
blocksize, level);
- BUG_ON(IS_ERR(buf));
+ BUG_ON(IS_ERR(buf)); /* -ENOMEM */
if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
if (parent == 0)
@@ -6048,7 +6285,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
struct btrfs_delayed_extent_op *extent_op;
extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
- BUG_ON(!extent_op);
+ BUG_ON(!extent_op); /* -ENOMEM */
if (key)
memcpy(&extent_op->key, key, sizeof(extent_op->key));
else
@@ -6058,11 +6295,12 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
extent_op->update_flags = 1;
extent_op->is_data = 0;
- ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
+ ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+ ins.objectid,
ins.offset, parent, root_objectid,
level, BTRFS_ADD_DELAYED_EXTENT,
- extent_op);
- BUG_ON(ret);
+ extent_op, 0);
+ BUG_ON(ret); /* -ENOMEM */
}
return buf;
}
@@ -6078,6 +6316,7 @@ struct walk_control {
int keep_locks;
int reada_slot;
int reada_count;
+ int for_reloc;
};
#define DROP_REFERENCE 1
@@ -6131,7 +6370,9 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
/* We don't lock the tree block, it's OK to be racy here */
ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
&refs, &flags);
- BUG_ON(ret);
+ /* We don't care about errors in readahead. */
+ if (ret < 0)
+ continue;
BUG_ON(refs == 0);
if (wc->stage == DROP_REFERENCE) {
@@ -6198,7 +6439,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
eb->start, eb->len,
&wc->refs[level],
&wc->flags[level]);
- BUG_ON(ret);
+ BUG_ON(ret == -ENOMEM);
+ if (ret)
+ return ret;
BUG_ON(wc->refs[level] == 0);
}
@@ -6216,13 +6459,13 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
/* wc->stage == UPDATE_BACKREF */
if (!(wc->flags[level] & flag)) {
BUG_ON(!path->locks[level]);
- ret = btrfs_inc_ref(trans, root, eb, 1);
- BUG_ON(ret);
- ret = btrfs_dec_ref(trans, root, eb, 0);
- BUG_ON(ret);
+ ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
+ BUG_ON(ret); /* -ENOMEM */
+ ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
+ BUG_ON(ret); /* -ENOMEM */
ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
eb->len, flag, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
wc->flags[level] |= flag;
}
@@ -6294,7 +6537,11 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
&wc->refs[level - 1],
&wc->flags[level - 1]);
- BUG_ON(ret);
+ if (ret < 0) {
+ btrfs_tree_unlock(next);
+ return ret;
+ }
+
BUG_ON(wc->refs[level - 1] == 0);
*lookup_info = 0;
@@ -6323,7 +6570,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
goto skip;
}
- if (!btrfs_buffer_uptodate(next, generation)) {
+ if (!btrfs_buffer_uptodate(next, generation, 0)) {
btrfs_tree_unlock(next);
free_extent_buffer(next);
next = NULL;
@@ -6362,8 +6609,8 @@ skip:
}
ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
- root->root_key.objectid, level - 1, 0);
- BUG_ON(ret);
+ root->root_key.objectid, level - 1, 0, 0);
+ BUG_ON(ret); /* -ENOMEM */
}
btrfs_tree_unlock(next);
free_extent_buffer(next);
@@ -6421,7 +6668,10 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
eb->start, eb->len,
&wc->refs[level],
&wc->flags[level]);
- BUG_ON(ret);
+ if (ret < 0) {
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
+ return ret;
+ }
BUG_ON(wc->refs[level] == 0);
if (wc->refs[level] == 1) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
@@ -6436,10 +6686,12 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (wc->refs[level] == 1) {
if (level == 0) {
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
- ret = btrfs_dec_ref(trans, root, eb, 1);
+ ret = btrfs_dec_ref(trans, root, eb, 1,
+ wc->for_reloc);
else
- ret = btrfs_dec_ref(trans, root, eb, 0);
- BUG_ON(ret);
+ ret = btrfs_dec_ref(trans, root, eb, 0,
+ wc->for_reloc);
+ BUG_ON(ret); /* -ENOMEM */
}
/* make block locked assertion in clean_tree_block happy */
if (!path->locks[level] &&
@@ -6548,8 +6800,9 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
* also make sure backrefs for the shared block and all lower level
* blocks are properly updated.
*/
-void btrfs_drop_snapshot(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv, int update_ref)
+int btrfs_drop_snapshot(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, int update_ref,
+ int for_reloc)
{
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
@@ -6575,7 +6828,10 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
}
trans = btrfs_start_transaction(tree_root, 0);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_free;
+ }
if (block_rsv)
trans->block_rsv = block_rsv;
@@ -6600,7 +6856,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
path->lowest_level = 0;
if (ret < 0) {
err = ret;
- goto out_free;
+ goto out_end_trans;
}
WARN_ON(ret > 0);
@@ -6620,7 +6876,10 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
path->nodes[level]->len,
&wc->refs[level],
&wc->flags[level]);
- BUG_ON(ret);
+ if (ret < 0) {
+ err = ret;
+ goto out_end_trans;
+ }
BUG_ON(wc->refs[level] == 0);
if (level == root_item->drop_level)
@@ -6637,6 +6896,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
wc->stage = DROP_REFERENCE;
wc->update_ref = update_ref;
wc->keep_locks = 0;
+ wc->for_reloc = for_reloc;
wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
while (1) {
@@ -6670,26 +6930,40 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
ret = btrfs_update_root(trans, tree_root,
&root->root_key,
root_item);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, tree_root, ret);
+ err = ret;
+ goto out_end_trans;
+ }
btrfs_end_transaction_throttle(trans, tree_root);
trans = btrfs_start_transaction(tree_root, 0);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_free;
+ }
if (block_rsv)
trans->block_rsv = block_rsv;
}
}
btrfs_release_path(path);
- BUG_ON(err);
+ if (err)
+ goto out_end_trans;
ret = btrfs_del_root(trans, tree_root, &root->root_key);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, tree_root, ret);
+ goto out_end_trans;
+ }
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
NULL, NULL);
- BUG_ON(ret < 0);
- if (ret > 0) {
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, tree_root, ret);
+ err = ret;
+ goto out_end_trans;
+ } else if (ret > 0) {
/* if we fail to delete the orphan item this time
* around, it'll get picked up the next time.
*
@@ -6707,20 +6981,22 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
free_extent_buffer(root->commit_root);
kfree(root);
}
-out_free:
+out_end_trans:
btrfs_end_transaction_throttle(trans, tree_root);
+out_free:
kfree(wc);
btrfs_free_path(path);
out:
if (err)
btrfs_std_error(root->fs_info, err);
- return;
+ return err;
}
/*
* drop subtree rooted at tree block 'node'.
*
* NOTE: this function will unlock and release tree block 'node'
+ * only used by relocation code
*/
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -6765,6 +7041,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
wc->stage = DROP_REFERENCE;
wc->update_ref = 0;
wc->keep_locks = 1;
+ wc->for_reloc = 1;
wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
while (1) {
@@ -6789,8 +7066,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
{
u64 num_devices;
- u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+ u64 stripped;
+
+ /*
+ * if restripe for this chunk_type is on pick target profile and
+ * return, otherwise do the usual balance
+ */
+ stripped = get_restripe_target(root->fs_info, flags);
+ if (stripped)
+ return extended_to_chunk(stripped);
/*
* we add in the count of missing devices because we want
@@ -6800,6 +7084,9 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
num_devices = root->fs_info->fs_devices->rw_devices +
root->fs_info->fs_devices->missing_devices;
+ stripped = BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+
if (num_devices == 1) {
stripped |= BTRFS_BLOCK_GROUP_DUP;
stripped = flags & ~stripped;
@@ -6812,7 +7099,6 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
return stripped | BTRFS_BLOCK_GROUP_DUP;
- return flags;
} else {
/* they already had raid on here, just return */
if (flags & stripped)
@@ -6825,9 +7111,9 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
if (flags & BTRFS_BLOCK_GROUP_DUP)
return stripped | BTRFS_BLOCK_GROUP_RAID1;
- /* turn single device chunks into raid0 */
- return stripped | BTRFS_BLOCK_GROUP_RAID0;
+ /* this is drive concat, leave it alone */
}
+
return flags;
}
@@ -6886,12 +7172,16 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
BUG_ON(cache->ro);
trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
alloc_flags = update_block_group_flags(root, cache->flags);
- if (alloc_flags != cache->flags)
- do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
- CHUNK_ALLOC_FORCE);
+ if (alloc_flags != cache->flags) {
+ ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+ CHUNK_ALLOC_FORCE);
+ if (ret < 0)
+ goto out;
+ }
ret = set_block_group_ro(cache, 0);
if (!ret)
@@ -6971,7 +7261,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
return free_bytes;
}
-int btrfs_set_block_group_rw(struct btrfs_root *root,
+void btrfs_set_block_group_rw(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
{
struct btrfs_space_info *sinfo = cache->space_info;
@@ -6987,7 +7277,6 @@ int btrfs_set_block_group_rw(struct btrfs_root *root,
cache->ro = 0;
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
- return 0;
}
/*
@@ -7005,6 +7294,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
u64 min_free;
u64 dev_min = 1;
u64 dev_nr = 0;
+ u64 target;
int index;
int full = 0;
int ret = 0;
@@ -7045,13 +7335,11 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
/*
* ok we don't have enough space, but maybe we have free space on our
* devices to allocate new chunks for relocation, so loop through our
- * alloc devices and guess if we have enough space. However, if we
- * were marked as full, then we know there aren't enough chunks, and we
- * can just return.
+ * alloc devices and guess if we have enough space. if this block
+ * group is going to be restriped, run checks against the target
+ * profile instead of the current one.
*/
ret = -1;
- if (full)
- goto out;
/*
* index:
@@ -7061,7 +7349,20 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
* 3: raid0
* 4: single
*/
- index = get_block_group_index(block_group);
+ target = get_restripe_target(root->fs_info, block_group->flags);
+ if (target) {
+ index = __get_block_group_index(extended_to_chunk(target));
+ } else {
+ /*
+ * this is just a balance, so if we were marked as full
+ * we know there is no space for a new chunk
+ */
+ if (full)
+ goto out;
+
+ index = get_block_group_index(block_group);
+ }
+
if (index == 0) {
dev_min = 4;
/* Divide by 2 */
@@ -7085,7 +7386,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
* space to fit our block group in.
*/
if (device->total_bytes > device->bytes_used + min_free) {
- ret = find_free_dev_extent(NULL, device, min_free,
+ ret = find_free_dev_extent(device, min_free,
&dev_offset, NULL);
if (!ret)
dev_nr++;
@@ -7355,7 +7656,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
ret = update_space_info(info, cache->flags, found_key.offset,
btrfs_block_group_used(&cache->item),
&space_info);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
cache->space_info = space_info;
spin_lock(&cache->space_info->lock);
cache->space_info->bytes_readonly += cache->bytes_super;
@@ -7364,7 +7665,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
__link_block_group(space_info, cache);
ret = btrfs_add_block_group_cache(root->fs_info, cache);
- BUG_ON(ret);
+ BUG_ON(ret); /* Logic error */
set_avail_alloc_bits(root->fs_info, cache->flags);
if (btrfs_chunk_readonly(root, cache->key.objectid))
@@ -7446,7 +7747,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
&cache->space_info);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
+ update_global_block_rsv(root->fs_info);
spin_lock(&cache->space_info->lock);
cache->space_info->bytes_readonly += cache->bytes_super;
@@ -7455,17 +7757,33 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
__link_block_group(cache->space_info, cache);
ret = btrfs_add_block_group_cache(root->fs_info, cache);
- BUG_ON(ret);
+ BUG_ON(ret); /* Logic error */
ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
sizeof(cache->item));
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ return ret;
+ }
set_avail_alloc_bits(extent_root->fs_info, type);
return 0;
}
+static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ u64 extra_flags = chunk_to_extended(flags) &
+ BTRFS_EXTENDED_PROFILE_MASK;
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ fs_info->avail_data_alloc_bits &= ~extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ fs_info->avail_metadata_alloc_bits &= ~extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ fs_info->avail_system_alloc_bits &= ~extra_flags;
+}
+
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start)
{
@@ -7476,6 +7794,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_key key;
struct inode *inode;
int ret;
+ int index;
int factor;
root = root->fs_info->extent_root;
@@ -7491,6 +7810,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
free_excluded_extents(root, block_group);
memcpy(&key, &block_group->key, sizeof(key));
+ index = get_block_group_index(block_group);
if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
@@ -7522,7 +7842,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
inode = lookup_free_space_inode(tree_root, block_group, path);
if (!IS_ERR(inode)) {
ret = btrfs_orphan_add(trans, inode);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_add_delayed_iput(inode);
+ goto out;
+ }
clear_nlink(inode);
/* One for the block groups ref */
spin_lock(&block_group->lock);
@@ -7565,6 +7888,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* are still on the list after taking the semaphore
*/
list_del_init(&block_group->list);
+ if (list_empty(&block_group->space_info->block_groups[index]))
+ clear_avail_alloc_bits(root->fs_info, block_group->flags);
up_write(&block_group->space_info->groups_sem);
if (block_group->cached == BTRFS_CACHE_STARTED)
@@ -7654,9 +7979,16 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
u64 start;
u64 end;
u64 trimmed = 0;
+ u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
int ret = 0;
- cache = btrfs_lookup_block_group(fs_info, range->start);
+ /*
+ * try to trim all FS space, our block group may start from non-zero.
+ */
+ if (range->len == total_bytes)
+ cache = btrfs_lookup_first_block_group(fs_info, range->start);
+ else
+ cache = btrfs_lookup_block_group(fs_info, range->start);
while (cache) {
if (cache->key.objectid >= (range->start + range->len)) {
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 49f3c9dc09f..01c21b6c6d4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -18,6 +18,9 @@
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
+#include "check-integrity.h"
+#include "locking.h"
+#include "rcu-string.h"
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
@@ -52,6 +55,13 @@ struct extent_page_data {
unsigned int sync_io:1;
};
+static noinline void flush_write_bio(void *data);
+static inline struct btrfs_fs_info *
+tree_fs_info(struct extent_io_tree *tree)
+{
+ return btrfs_sb(tree->mapping->host->i_sb);
+}
+
int __init extent_io_init(void)
{
extent_state_cache = kmem_cache_create("extent_state",
@@ -135,6 +145,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
#endif
atomic_set(&state->refs, 1);
init_waitqueue_head(&state->wq);
+ trace_alloc_extent_state(state, mask, _RET_IP_);
return state;
}
@@ -152,6 +163,7 @@ void free_extent_state(struct extent_state *state)
list_del(&state->leak_list);
spin_unlock_irqrestore(&leak_lock, flags);
#endif
+ trace_free_extent_state(state, _RET_IP_);
kmem_cache_free(extent_state_cache, state);
}
}
@@ -175,7 +187,6 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
return parent;
}
- entry = rb_entry(node, struct tree_entry, rb_node);
rb_link_node(node, parent, p);
rb_insert_color(node, root);
return NULL;
@@ -391,20 +402,28 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
return 0;
}
+static struct extent_state *next_state(struct extent_state *state)
+{
+ struct rb_node *next = rb_next(&state->rb_node);
+ if (next)
+ return rb_entry(next, struct extent_state, rb_node);
+ else
+ return NULL;
+}
+
/*
* utility function to clear some bits in an extent state struct.
- * it will optionally wake up any one waiting on this state (wake == 1), or
- * forcibly remove the state from the tree (delete == 1).
+ * it will optionally wake up any one waiting on this state (wake == 1).
*
* If no bits are set on the state struct after clearing things, the
* struct is freed and removed from the tree
*/
-static int clear_state_bit(struct extent_io_tree *tree,
- struct extent_state *state,
- int *bits, int wake)
+static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
+ struct extent_state *state,
+ int *bits, int wake)
{
+ struct extent_state *next;
int bits_to_clear = *bits & ~EXTENT_CTLBITS;
- int ret = state->state & bits_to_clear;
if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
@@ -416,6 +435,7 @@ static int clear_state_bit(struct extent_io_tree *tree,
if (wake)
wake_up(&state->wq);
if (state->state == 0) {
+ next = next_state(state);
if (state->tree) {
rb_erase(&state->rb_node, &tree->state);
state->tree = NULL;
@@ -425,8 +445,9 @@ static int clear_state_bit(struct extent_io_tree *tree,
}
} else {
merge_state(tree, state);
+ next = next_state(state);
}
- return ret;
+ return next;
}
static struct extent_state *
@@ -438,6 +459,13 @@ alloc_extent_state_atomic(struct extent_state *prealloc)
return prealloc;
}
+void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+{
+ btrfs_panic(tree_fs_info(tree), err, "Locking error: "
+ "Extent tree was modified by another "
+ "thread while locked.");
+}
+
/*
* clear some bits on a range in the tree. This may require splitting
* or inserting elements in the tree, so the gfp mask is used to
@@ -448,8 +476,7 @@ alloc_extent_state_atomic(struct extent_state *prealloc)
*
* the range [start, end] is inclusive.
*
- * This takes the tree lock, and returns < 0 on error, > 0 if any of the
- * bits were already set, or zero if none of the bits were already set.
+ * This takes the tree lock, and returns 0 on success and < 0 on error.
*/
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int bits, int wake, int delete,
@@ -459,11 +486,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state *state;
struct extent_state *cached;
struct extent_state *prealloc = NULL;
- struct rb_node *next_node;
struct rb_node *node;
u64 last_end;
int err;
- int set = 0;
int clear = 0;
if (delete)
@@ -512,6 +537,12 @@ hit_next:
WARN_ON(state->end < start);
last_end = state->end;
+ /* the state doesn't have the wanted bits, go ahead */
+ if (!(state->state & bits)) {
+ state = next_state(state);
+ goto next;
+ }
+
/*
* | ---- desired range ---- |
* | state | or
@@ -532,15 +563,15 @@ hit_next:
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, start);
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
prealloc = NULL;
if (err)
goto out;
if (state->end <= end) {
- set |= clear_state_bit(tree, state, &bits, wake);
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
+ state = clear_state_bit(tree, state, &bits, wake);
+ goto next;
}
goto search_again;
}
@@ -554,31 +585,25 @@ hit_next:
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, end + 1);
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
if (wake)
wake_up(&state->wq);
- set |= clear_state_bit(tree, prealloc, &bits, wake);
+ clear_state_bit(tree, prealloc, &bits, wake);
prealloc = NULL;
goto out;
}
- if (state->end < end && prealloc && !need_resched())
- next_node = rb_next(&state->rb_node);
- else
- next_node = NULL;
-
- set |= clear_state_bit(tree, state, &bits, wake);
+ state = clear_state_bit(tree, state, &bits, wake);
+next:
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
- if (start <= end && next_node) {
- state = rb_entry(next_node, struct extent_state,
- rb_node);
- if (state->start == start)
- goto hit_next;
- }
+ if (start <= end && state && !need_resched())
+ goto hit_next;
goto search_again;
out:
@@ -586,7 +611,7 @@ out:
if (prealloc)
free_extent_state(prealloc);
- return set;
+ return 0;
search_again:
if (start > end)
@@ -597,8 +622,8 @@ search_again:
goto again;
}
-static int wait_on_state(struct extent_io_tree *tree,
- struct extent_state *state)
+static void wait_on_state(struct extent_io_tree *tree,
+ struct extent_state *state)
__releases(tree->lock)
__acquires(tree->lock)
{
@@ -608,7 +633,6 @@ static int wait_on_state(struct extent_io_tree *tree,
schedule();
spin_lock(&tree->lock);
finish_wait(&state->wq, &wait);
- return 0;
}
/*
@@ -616,7 +640,7 @@ static int wait_on_state(struct extent_io_tree *tree,
* The range [start, end] is inclusive.
* The tree lock is taken by this function
*/
-int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
+void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
{
struct extent_state *state;
struct rb_node *node;
@@ -653,7 +677,6 @@ again:
}
out:
spin_unlock(&tree->lock);
- return 0;
}
static void set_state_bits(struct extent_io_tree *tree,
@@ -701,9 +724,10 @@ static void uncache_state(struct extent_state **cached_ptr)
* [start, end] is inclusive This takes the tree lock.
*/
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int exclusive_bits, u64 *failed_start,
- struct extent_state **cached_state, gfp_t mask)
+static int __must_check
+__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int exclusive_bits, u64 *failed_start,
+ struct extent_state **cached_state, gfp_t mask)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -737,8 +761,10 @@ again:
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
err = insert_state(tree, prealloc, start, end, &bits);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
prealloc = NULL;
- BUG_ON(err == -EEXIST);
goto out;
}
state = rb_entry(node, struct extent_state, rb_node);
@@ -753,7 +779,6 @@ hit_next:
* Just lock what we found and keep going
*/
if (state->start == start && state->end <= end) {
- struct rb_node *next_node;
if (state->state & exclusive_bits) {
*failed_start = state->start;
err = -EEXIST;
@@ -761,20 +786,15 @@ hit_next:
}
set_state_bits(tree, state, &bits);
-
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
goto out;
-
start = last_end + 1;
- next_node = rb_next(&state->rb_node);
- if (next_node && start < end && prealloc && !need_resched()) {
- state = rb_entry(next_node, struct extent_state,
- rb_node);
- if (state->start == start)
- goto hit_next;
- }
+ state = next_state(state);
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
goto search_again;
}
@@ -804,7 +824,9 @@ hit_next:
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, start);
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
prealloc = NULL;
if (err)
goto out;
@@ -815,6 +837,10 @@ hit_next:
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
+ state = next_state(state);
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
}
goto search_again;
}
@@ -841,12 +867,9 @@ hit_next:
*/
err = insert_state(tree, prealloc, start, this_end,
&bits);
- BUG_ON(err == -EEXIST);
- if (err) {
- free_extent_state(prealloc);
- prealloc = NULL;
- goto out;
- }
+ if (err)
+ extent_io_tree_panic(tree, err);
+
cache_state(prealloc, cached_state);
prealloc = NULL;
start = this_end + 1;
@@ -868,7 +891,8 @@ hit_next:
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, end + 1);
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
set_state_bits(tree, prealloc, &bits);
cache_state(prealloc, cached_state);
@@ -895,6 +919,15 @@ search_again:
goto again;
}
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
+ u64 *failed_start, struct extent_state **cached_state,
+ gfp_t mask)
+{
+ return __set_extent_bit(tree, start, end, bits, 0, failed_start,
+ cached_state, mask);
+}
+
+
/**
* convert_extent - convert all bits in a given range from one bit to another
* @tree: the io tree to search
@@ -941,7 +974,8 @@ again:
}
err = insert_state(tree, prealloc, start, end, &bits);
prealloc = NULL;
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
goto out;
}
state = rb_entry(node, struct extent_state, rb_node);
@@ -956,23 +990,14 @@ hit_next:
* Just lock what we found and keep going
*/
if (state->start == start && state->end <= end) {
- struct rb_node *next_node;
-
set_state_bits(tree, state, &bits);
- clear_state_bit(tree, state, &clear_bits, 0);
-
- merge_state(tree, state);
+ state = clear_state_bit(tree, state, &clear_bits, 0);
if (last_end == (u64)-1)
goto out;
-
start = last_end + 1;
- next_node = rb_next(&state->rb_node);
- if (next_node && start < end && prealloc && !need_resched()) {
- state = rb_entry(next_node, struct extent_state,
- rb_node);
- if (state->start == start)
- goto hit_next;
- }
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
goto search_again;
}
@@ -999,17 +1024,20 @@ hit_next:
goto out;
}
err = split_state(tree, state, prealloc, start);
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
prealloc = NULL;
if (err)
goto out;
if (state->end <= end) {
set_state_bits(tree, state, &bits);
- clear_state_bit(tree, state, &clear_bits, 0);
- merge_state(tree, state);
+ state = clear_state_bit(tree, state, &clear_bits, 0);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
}
goto search_again;
}
@@ -1039,12 +1067,8 @@ hit_next:
*/
err = insert_state(tree, prealloc, start, this_end,
&bits);
- BUG_ON(err == -EEXIST);
- if (err) {
- free_extent_state(prealloc);
- prealloc = NULL;
- goto out;
- }
+ if (err)
+ extent_io_tree_panic(tree, err);
prealloc = NULL;
start = this_end + 1;
goto search_again;
@@ -1063,12 +1087,11 @@ hit_next:
}
err = split_state(tree, state, prealloc, end + 1);
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
set_state_bits(tree, prealloc, &bits);
clear_state_bit(tree, prealloc, &clear_bits, 0);
-
- merge_state(tree, prealloc);
prealloc = NULL;
goto out;
}
@@ -1095,14 +1118,14 @@ search_again:
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
+ return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
NULL, mask);
}
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask)
{
- return set_extent_bit(tree, start, end, bits, 0, NULL,
+ return set_extent_bit(tree, start, end, bits, NULL,
NULL, mask);
}
@@ -1117,7 +1140,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
{
return set_extent_bit(tree, start, end,
EXTENT_DELALLOC | EXTENT_UPTODATE,
- 0, NULL, cached_state, mask);
+ NULL, cached_state, mask);
}
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1131,7 +1154,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
+ return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
NULL, mask);
}
@@ -1139,12 +1162,11 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask)
{
return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
- NULL, cached_state, mask);
+ cached_state, mask);
}
-static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state,
- gfp_t mask)
+int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask)
{
return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
cached_state, mask);
@@ -1155,42 +1177,40 @@ static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
* us if waiting is desired.
*/
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, struct extent_state **cached_state, gfp_t mask)
+ int bits, struct extent_state **cached_state)
{
int err;
u64 failed_start;
while (1) {
- err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
- EXTENT_LOCKED, &failed_start,
- cached_state, mask);
- if (err == -EEXIST && (mask & __GFP_WAIT)) {
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
+ EXTENT_LOCKED, &failed_start,
+ cached_state, GFP_NOFS);
+ if (err == -EEXIST) {
wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
start = failed_start;
- } else {
+ } else
break;
- }
WARN_ON(start > end);
}
return err;
}
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
- return lock_extent_bits(tree, start, end, 0, NULL, mask);
+ return lock_extent_bits(tree, start, end, 0, NULL);
}
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
int err;
u64 failed_start;
- err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
- &failed_start, NULL, mask);
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
+ &failed_start, NULL, GFP_NOFS);
if (err == -EEXIST) {
if (failed_start > start)
clear_extent_bit(tree, start, failed_start - 1,
- EXTENT_LOCKED, 1, 0, NULL, mask);
+ EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
return 0;
}
return 1;
@@ -1203,10 +1223,10 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
mask);
}
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
- mask);
+ GFP_NOFS);
}
/*
@@ -1220,7 +1240,7 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
while (index <= end_index) {
page = find_get_page(tree->mapping, index);
- BUG_ON(!page);
+ BUG_ON(!page); /* Pages should be in the extent_io_tree */
set_page_writeback(page);
page_cache_release(page);
index++;
@@ -1264,7 +1284,7 @@ out:
* returned if we find something, and *start_ret and *end_ret are
* set to reflect the state struct that was found.
*
- * If nothing was found, 1 is returned, < 0 on error
+ * If nothing was found, 1 is returned. If found something, return 0.
*/
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, int bits)
@@ -1343,9 +1363,9 @@ out:
return found;
}
-static noinline int __unlock_for_delalloc(struct inode *inode,
- struct page *locked_page,
- u64 start, u64 end)
+static noinline void __unlock_for_delalloc(struct inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end)
{
int ret;
struct page *pages[16];
@@ -1355,7 +1375,7 @@ static noinline int __unlock_for_delalloc(struct inode *inode,
int i;
if (index == locked_page->index && end_index == index)
- return 0;
+ return;
while (nr_pages > 0) {
ret = find_get_pages_contig(inode->i_mapping, index,
@@ -1370,7 +1390,6 @@ static noinline int __unlock_for_delalloc(struct inode *inode,
index += ret;
cond_resched();
}
- return 0;
}
static noinline int lock_delalloc_pages(struct inode *inode,
@@ -1500,11 +1519,10 @@ again:
goto out_failed;
}
}
- BUG_ON(ret);
+ BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
/* step three, lock the state bits for the whole range */
- lock_extent_bits(tree, delalloc_start, delalloc_end,
- 0, &cached_state, GFP_NOFS);
+ lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
@@ -1761,39 +1779,34 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
* helper function to set a given page up to date if all the
* extents in the tree for that page are up to date
*/
-static int check_page_uptodate(struct extent_io_tree *tree,
- struct page *page)
+static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
{
u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 end = start + PAGE_CACHE_SIZE - 1;
if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
SetPageUptodate(page);
- return 0;
}
/*
* helper function to unlock a page if all the extents in the tree
* for that page are unlocked
*/
-static int check_page_locked(struct extent_io_tree *tree,
- struct page *page)
+static void check_page_locked(struct extent_io_tree *tree, struct page *page)
{
u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 end = start + PAGE_CACHE_SIZE - 1;
if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
unlock_page(page);
- return 0;
}
/*
* helper function to end page writeback if all the extents
* in the tree for that page are done with writeback
*/
-static int check_page_writeback(struct extent_io_tree *tree,
- struct page *page)
+static void check_page_writeback(struct extent_io_tree *tree,
+ struct page *page)
{
end_page_writeback(page);
- return 0;
}
/*
@@ -1895,23 +1908,44 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
}
bio->bi_bdev = dev->bdev;
bio_add_page(bio, page, length, start-page_offset(page));
- submit_bio(WRITE_SYNC, bio);
+ btrfsic_submit_bio(WRITE_SYNC, bio);
wait_for_completion(&compl);
if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
/* try to remap that extent elsewhere? */
bio_put(bio);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
return -EIO;
}
- printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
- "sector %llu)\n", page->mapping->host->i_ino, start,
- dev->name, sector);
+ printk_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu "
+ "(dev %s sector %llu)\n", page->mapping->host->i_ino,
+ start, rcu_str_deref(dev->name), sector);
bio_put(bio);
return 0;
}
+int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
+ int mirror_num)
+{
+ struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+ u64 start = eb->start;
+ unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
+ int ret = 0;
+
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = extent_buffer_page(eb, i);
+ ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE,
+ start, p, mirror_num);
+ if (ret)
+ break;
+ start += PAGE_CACHE_SIZE;
+ }
+
+ return ret;
+}
+
/*
* each time an IO finishes, we do a fast check in the IO failure tree
* to see if we need to process or clean up an io_failure_record
@@ -2141,6 +2175,10 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
}
bio = bio_alloc(GFP_NOFS, 1);
+ if (!bio) {
+ free_io_failure(inode, failrec, 0);
+ return -EIO;
+ }
bio->bi_private = state;
bio->bi_end_io = failed_bio->bi_end_io;
bio->bi_sector = failrec->logical >> 9;
@@ -2153,13 +2191,36 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
"this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
failrec->this_mirror, num_copies, failrec->in_validation);
- tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
- failrec->bio_flags, 0);
- return 0;
+ ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
+ failrec->this_mirror,
+ failrec->bio_flags, 0);
+ return ret;
}
/* lots and lots of room for performance fixes in the end_bio funcs */
+int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
+{
+ int uptodate = (err == 0);
+ struct extent_io_tree *tree;
+ int ret;
+
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+
+ if (tree->ops && tree->ops->writepage_end_io_hook) {
+ ret = tree->ops->writepage_end_io_hook(page, start,
+ end, NULL, uptodate);
+ if (ret)
+ uptodate = 0;
+ }
+
+ if (!uptodate) {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ return 0;
+}
+
/*
* after a writepage IO is done, we need to:
* clear the uptodate bits on error
@@ -2171,13 +2232,11 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
*/
static void end_bio_extent_writepage(struct bio *bio, int err)
{
- int uptodate = err == 0;
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
struct extent_io_tree *tree;
u64 start;
u64 end;
int whole_page;
- int ret;
do {
struct page *page = bvec->bv_page;
@@ -2194,28 +2253,9 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
if (--bvec >= bio->bi_io_vec)
prefetchw(&bvec->bv_page->flags);
- if (tree->ops && tree->ops->writepage_end_io_hook) {
- ret = tree->ops->writepage_end_io_hook(page, start,
- end, NULL, uptodate);
- if (ret)
- uptodate = 0;
- }
- if (!uptodate && tree->ops &&
- tree->ops->writepage_io_failed_hook) {
- ret = tree->ops->writepage_io_failed_hook(bio, page,
- start, end, NULL);
- if (ret == 0) {
- uptodate = (err == 0);
- continue;
- }
- }
-
- if (!uptodate) {
- clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
- ClearPageUptodate(page);
- SetPageError(page);
- }
+ if (end_extent_writepage(page, err, start, end))
+ continue;
if (whole_page)
end_page_writeback(page);
@@ -2246,6 +2286,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
u64 start;
u64 end;
int whole_page;
+ int mirror;
int ret;
if (err)
@@ -2284,17 +2325,35 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
}
spin_unlock(&tree->lock);
+ mirror = (int)(unsigned long)bio->bi_bdev;
if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
ret = tree->ops->readpage_end_io_hook(page, start, end,
- state);
- if (ret)
+ state, mirror);
+ if (ret) {
+ /* no IO indicated but software detected errors
+ * in the block, either checksum errors or
+ * issues with the contents */
+ struct btrfs_root *root =
+ BTRFS_I(page->mapping->host)->root;
+ struct btrfs_device *device;
+
uptodate = 0;
- else
+ device = btrfs_find_device_for_logical(
+ root, start, mirror);
+ if (device)
+ btrfs_dev_stat_inc_and_print(device,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ } else {
clean_io_failure(start, page);
+ }
}
- if (!uptodate) {
- int failed_mirror;
- failed_mirror = (int)(unsigned long)bio->bi_bdev;
+
+ if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
+ ret = tree->ops->readpage_io_failed_hook(page, mirror);
+ if (!ret && !err &&
+ test_bit(BIO_UPTODATE, &bio->bi_flags))
+ uptodate = 1;
+ } else if (!uptodate) {
/*
* The generic bio_readpage_error handles errors the
* following way: If possible, new read requests are
@@ -2305,10 +2364,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
* can't handle the error it will return -EIO and we
* remain responsible for that page.
*/
- ret = bio_readpage_error(bio, page, start, end,
- failed_mirror, NULL);
+ ret = bio_readpage_error(bio, page, start, end, mirror, NULL);
if (ret == 0) {
-error_handled:
uptodate =
test_bit(BIO_UPTODATE, &bio->bi_flags);
if (err)
@@ -2316,16 +2373,9 @@ error_handled:
uncache_state(&cached);
continue;
}
- if (tree->ops && tree->ops->readpage_io_failed_hook) {
- ret = tree->ops->readpage_io_failed_hook(
- bio, page, start, end,
- failed_mirror, state);
- if (ret == 0)
- goto error_handled;
- }
}
- if (uptodate) {
+ if (uptodate && tree->track_uptodate) {
set_extent_uptodate(tree, start, end, &cached,
GFP_ATOMIC);
}
@@ -2374,8 +2424,12 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
return bio;
}
-static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
- unsigned long bio_flags)
+/*
+ * Since writes are async, they will only return -ENOMEM.
+ * Reads can return the full range of I/O error conditions.
+ */
+static int __must_check submit_one_bio(int rw, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
{
int ret = 0;
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -2393,7 +2447,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
mirror_num, bio_flags, start);
else
- submit_bio(rw, bio);
+ btrfsic_submit_bio(rw, bio);
if (bio_flagged(bio, BIO_EOPNOTSUPP))
ret = -EOPNOTSUPP;
@@ -2401,6 +2455,19 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
return ret;
}
+static int merge_bio(struct extent_io_tree *tree, struct page *page,
+ unsigned long offset, size_t size, struct bio *bio,
+ unsigned long bio_flags)
+{
+ int ret = 0;
+ if (tree->ops && tree->ops->merge_bio_hook)
+ ret = tree->ops->merge_bio_hook(page, offset, size, bio,
+ bio_flags);
+ BUG_ON(ret < 0);
+ return ret;
+
+}
+
static int submit_extent_page(int rw, struct extent_io_tree *tree,
struct page *page, sector_t sector,
size_t size, unsigned long offset,
@@ -2429,12 +2496,12 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
sector;
if (prev_bio_flags != bio_flags || !contig ||
- (tree->ops && tree->ops->merge_bio_hook &&
- tree->ops->merge_bio_hook(page, offset, page_size, bio,
- bio_flags)) ||
+ merge_bio(tree, page, offset, page_size, bio, bio_flags) ||
bio_add_page(bio, page, page_size, offset) < page_size) {
ret = submit_one_bio(rw, bio, mirror_num,
prev_bio_flags);
+ if (ret < 0)
+ return ret;
bio = NULL;
} else {
return 0;
@@ -2461,25 +2528,31 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
return ret;
}
-void set_page_extent_mapped(struct page *page)
+void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
{
if (!PagePrivate(page)) {
SetPagePrivate(page);
page_cache_get(page);
- set_page_private(page, EXTENT_PAGE_PRIVATE);
+ set_page_private(page, (unsigned long)eb);
+ } else {
+ WARN_ON(page->private != (unsigned long)eb);
}
}
-static void set_page_extent_head(struct page *page, unsigned long len)
+void set_page_extent_mapped(struct page *page)
{
- WARN_ON(!PagePrivate(page));
- set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ page_cache_get(page);
+ set_page_private(page, EXTENT_PAGE_PRIVATE);
+ }
}
/*
* basic readpage implementation. Locked extent state structs are inserted
* into the tree that are removed when the IO is done (by the end_io
* handlers)
+ * XXX JDM: This needs looking at to ensure proper page locking
*/
static int __extent_read_full_page(struct extent_io_tree *tree,
struct page *page,
@@ -2519,11 +2592,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
end = page_end;
while (1) {
- lock_extent(tree, start, end, GFP_NOFS);
+ lock_extent(tree, start, end);
ordered = btrfs_lookup_ordered_extent(inode, start);
if (!ordered)
break;
- unlock_extent(tree, start, end, GFP_NOFS);
+ unlock_extent(tree, start, end);
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
}
@@ -2534,10 +2607,10 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
if (zero_offset) {
iosize = PAGE_CACHE_SIZE - zero_offset;
- userpage = kmap_atomic(page, KM_USER0);
+ userpage = kmap_atomic(page);
memset(userpage + zero_offset, 0, iosize);
flush_dcache_page(page);
- kunmap_atomic(userpage, KM_USER0);
+ kunmap_atomic(userpage);
}
}
while (cur <= end) {
@@ -2546,10 +2619,10 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
struct extent_state *cached = NULL;
iosize = PAGE_CACHE_SIZE - pg_offset;
- userpage = kmap_atomic(page, KM_USER0);
+ userpage = kmap_atomic(page);
memset(userpage + pg_offset, 0, iosize);
flush_dcache_page(page);
- kunmap_atomic(userpage, KM_USER0);
+ kunmap_atomic(userpage);
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
unlock_extent_cached(tree, cur, cur + iosize - 1,
@@ -2560,7 +2633,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
end - cur + 1, 0);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
- unlock_extent(tree, cur, end, GFP_NOFS);
+ unlock_extent(tree, cur, end);
break;
}
extent_offset = cur - em->start;
@@ -2595,10 +2668,10 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
char *userpage;
struct extent_state *cached = NULL;
- userpage = kmap_atomic(page, KM_USER0);
+ userpage = kmap_atomic(page);
memset(userpage + pg_offset, 0, iosize);
flush_dcache_page(page);
- kunmap_atomic(userpage, KM_USER0);
+ kunmap_atomic(userpage);
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
@@ -2612,7 +2685,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
if (test_range_bit(tree, cur, cur_end,
EXTENT_UPTODATE, 1, NULL)) {
check_page_uptodate(tree, page);
- unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+ unlock_extent(tree, cur, cur + iosize - 1);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -2622,7 +2695,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
*/
if (block_start == EXTENT_MAP_INLINE) {
SetPageError(page);
- unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+ unlock_extent(tree, cur, cur + iosize - 1);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -2642,6 +2715,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
end_bio_extent_readpage, mirror_num,
*bio_flags,
this_bio_flag);
+ BUG_ON(ret == -ENOMEM);
nr++;
*bio_flags = this_bio_flag;
}
@@ -2744,10 +2818,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
if (page->index == end_index) {
char *userpage;
- userpage = kmap_atomic(page, KM_USER0);
+ userpage = kmap_atomic(page);
memset(userpage + pg_offset, 0,
PAGE_CACHE_SIZE - pg_offset);
- kunmap_atomic(userpage, KM_USER0);
+ kunmap_atomic(userpage);
flush_dcache_page(page);
}
pg_offset = 0;
@@ -2778,9 +2852,16 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
delalloc_start = delalloc_end + 1;
continue;
}
- tree->ops->fill_delalloc(inode, page, delalloc_start,
- delalloc_end, &page_started,
- &nr_written);
+ ret = tree->ops->fill_delalloc(inode, page,
+ delalloc_start,
+ delalloc_end,
+ &page_started,
+ &nr_written);
+ /* File system has been set read-only */
+ if (ret) {
+ SetPageError(page);
+ goto done;
+ }
/*
* delalloc_end is already one less than the total
* length, so we don't subtract one from
@@ -2817,8 +2898,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
if (tree->ops && tree->ops->writepage_start_hook) {
ret = tree->ops->writepage_start_hook(page, start,
page_end);
- if (ret == -EAGAIN) {
- redirty_page_for_writepage(wbc, page);
+ if (ret) {
+ /* Fixup worker will requeue */
+ if (ret == -EBUSY)
+ wbc->pages_skipped++;
+ else
+ redirty_page_for_writepage(wbc, page);
update_nr_written(page, wbc, nr_written);
unlock_page(page);
ret = 0;
@@ -2949,6 +3034,275 @@ done_unlocked:
return 0;
}
+static int eb_wait(void *word)
+{
+ io_schedule();
+ return 0;
+}
+
+static void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
+{
+ wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait,
+ TASK_UNINTERRUPTIBLE);
+}
+
+static int lock_extent_buffer_for_io(struct extent_buffer *eb,
+ struct btrfs_fs_info *fs_info,
+ struct extent_page_data *epd)
+{
+ unsigned long i, num_pages;
+ int flush = 0;
+ int ret = 0;
+
+ if (!btrfs_try_tree_write_lock(eb)) {
+ flush = 1;
+ flush_write_bio(epd);
+ btrfs_tree_lock(eb);
+ }
+
+ if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
+ btrfs_tree_unlock(eb);
+ if (!epd->sync_io)
+ return 0;
+ if (!flush) {
+ flush_write_bio(epd);
+ flush = 1;
+ }
+ while (1) {
+ wait_on_extent_buffer_writeback(eb);
+ btrfs_tree_lock(eb);
+ if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
+ break;
+ btrfs_tree_unlock(eb);
+ }
+ }
+
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+ set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+ btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+ spin_lock(&fs_info->delalloc_lock);
+ if (fs_info->dirty_metadata_bytes >= eb->len)
+ fs_info->dirty_metadata_bytes -= eb->len;
+ else
+ WARN_ON(1);
+ spin_unlock(&fs_info->delalloc_lock);
+ ret = 1;
+ }
+
+ btrfs_tree_unlock(eb);
+
+ if (!ret)
+ return ret;
+
+ num_pages = num_extent_pages(eb->start, eb->len);
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = extent_buffer_page(eb, i);
+
+ if (!trylock_page(p)) {
+ if (!flush) {
+ flush_write_bio(epd);
+ flush = 1;
+ }
+ lock_page(p);
+ }
+ }
+
+ return ret;
+}
+
+static void end_extent_buffer_writeback(struct extent_buffer *eb)
+{
+ clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
+}
+
+static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
+{
+ int uptodate = err == 0;
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct extent_buffer *eb;
+ int done;
+
+ do {
+ struct page *page = bvec->bv_page;
+
+ bvec--;
+ eb = (struct extent_buffer *)page->private;
+ BUG_ON(!eb);
+ done = atomic_dec_and_test(&eb->io_pages);
+
+ if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
+ set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+
+ end_page_writeback(page);
+
+ if (!done)
+ continue;
+
+ end_extent_buffer_writeback(eb);
+ } while (bvec >= bio->bi_io_vec);
+
+ bio_put(bio);
+
+}
+
+static int write_one_eb(struct extent_buffer *eb,
+ struct btrfs_fs_info *fs_info,
+ struct writeback_control *wbc,
+ struct extent_page_data *epd)
+{
+ struct block_device *bdev = fs_info->fs_devices->latest_bdev;
+ u64 offset = eb->start;
+ unsigned long i, num_pages;
+ int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
+ int ret = 0;
+
+ clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ num_pages = num_extent_pages(eb->start, eb->len);
+ atomic_set(&eb->io_pages, num_pages);
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = extent_buffer_page(eb, i);
+
+ clear_page_dirty_for_io(p);
+ set_page_writeback(p);
+ ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
+ PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
+ -1, end_bio_extent_buffer_writepage,
+ 0, 0, 0);
+ if (ret) {
+ set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ SetPageError(p);
+ if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
+ end_extent_buffer_writeback(eb);
+ ret = -EIO;
+ break;
+ }
+ offset += PAGE_CACHE_SIZE;
+ update_nr_written(p, wbc, 1);
+ unlock_page(p);
+ }
+
+ if (unlikely(ret)) {
+ for (; i < num_pages; i++) {
+ struct page *p = extent_buffer_page(eb, i);
+ unlock_page(p);
+ }
+ }
+
+ return ret;
+}
+
+int btree_write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
+ struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
+ struct extent_buffer *eb, *prev_eb = NULL;
+ struct extent_page_data epd = {
+ .bio = NULL,
+ .tree = tree,
+ .extent_locked = 0,
+ .sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ };
+ int ret = 0;
+ int done = 0;
+ int nr_to_write_done = 0;
+ struct pagevec pvec;
+ int nr_pages;
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
+ int scanned = 0;
+ int tag;
+
+ pagevec_init(&pvec, 0);
+ if (wbc->range_cyclic) {
+ index = mapping->writeback_index; /* Start from prev offset */
+ end = -1;
+ } else {
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ scanned = 1;
+ }
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag = PAGECACHE_TAG_TOWRITE;
+ else
+ tag = PAGECACHE_TAG_DIRTY;
+retry:
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag_pages_for_writeback(mapping, index, end);
+ while (!done && !nr_to_write_done && (index <= end) &&
+ (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+ unsigned i;
+
+ scanned = 1;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (!PagePrivate(page))
+ continue;
+
+ if (!wbc->range_cyclic && page->index > end) {
+ done = 1;
+ break;
+ }
+
+ eb = (struct extent_buffer *)page->private;
+ if (!eb) {
+ WARN_ON(1);
+ continue;
+ }
+
+ if (eb == prev_eb)
+ continue;
+
+ if (!atomic_inc_not_zero(&eb->refs)) {
+ WARN_ON(1);
+ continue;
+ }
+
+ prev_eb = eb;
+ ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
+ if (!ret) {
+ free_extent_buffer(eb);
+ continue;
+ }
+
+ ret = write_one_eb(eb, fs_info, wbc, &epd);
+ if (ret) {
+ done = 1;
+ free_extent_buffer(eb);
+ break;
+ }
+ free_extent_buffer(eb);
+
+ /*
+ * the filesystem may choose to bump up nr_to_write.
+ * We have to make sure to honor the new nr_to_write
+ * at any time
+ */
+ nr_to_write_done = wbc->nr_to_write <= 0;
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ if (!scanned && !done) {
+ /*
+ * We hit the last page and there is more work to be done: wrap
+ * back to the start of the file
+ */
+ scanned = 1;
+ index = 0;
+ goto retry;
+ }
+ flush_write_bio(&epd);
+ return ret;
+}
+
/**
* write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
* @mapping: address space structure to write
@@ -2970,6 +3324,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
writepage_t writepage, void *data,
void (*flush_fn)(void *))
{
+ struct inode *inode = mapping->host;
int ret = 0;
int done = 0;
int nr_to_write_done = 0;
@@ -2980,6 +3335,18 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
int scanned = 0;
int tag;
+ /*
+ * We have to hold onto the inode so that ordered extents can do their
+ * work when the IO finishes. The alternative to this is failing to add
+ * an ordered extent if the igrab() fails there and that is a huge pain
+ * to deal with, so instead just hold onto the inode throughout the
+ * writepages operation. If it fails here we are freeing up the inode
+ * anyway and we'd rather not waste our time writing out stuff that is
+ * going to be truncated anyway.
+ */
+ if (!igrab(inode))
+ return 0;
+
pagevec_init(&pvec, 0);
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
@@ -3074,16 +3441,21 @@ retry:
index = 0;
goto retry;
}
+ btrfs_add_delayed_iput(inode);
return ret;
}
static void flush_epd_write_bio(struct extent_page_data *epd)
{
if (epd->bio) {
+ int rw = WRITE;
+ int ret;
+
if (epd->sync_io)
- submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
- else
- submit_one_bio(WRITE, epd->bio, 0, 0);
+ rw = WRITE_SYNC;
+
+ ret = submit_one_bio(rw, epd->bio, 0, 0);
+ BUG_ON(ret < 0); /* -ENOMEM */
epd->bio = NULL;
}
}
@@ -3200,7 +3572,7 @@ int extent_readpages(struct extent_io_tree *tree,
}
BUG_ON(!list_empty(pages));
if (bio)
- submit_one_bio(READ, bio, 0, bio_flags);
+ return submit_one_bio(READ, bio, 0, bio_flags);
return 0;
}
@@ -3221,7 +3593,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
if (start > end)
return 0;
- lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
+ lock_extent_bits(tree, start, end, 0, &cached_state);
wait_on_page_writeback(page);
clear_extent_bit(tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
@@ -3288,7 +3660,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
len = end - start + 1;
write_lock(&map->lock);
em = lookup_extent_mapping(map, start, len);
- if (IS_ERR_OR_NULL(em)) {
+ if (!em) {
write_unlock(&map->lock);
break;
}
@@ -3435,7 +3807,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
- &cached_state, GFP_NOFS);
+ &cached_state);
em = get_extent_skip_holes(inode, start, last_for_get_extent,
get_extent);
@@ -3529,26 +3901,7 @@ out:
inline struct page *extent_buffer_page(struct extent_buffer *eb,
unsigned long i)
{
- struct page *p;
- struct address_space *mapping;
-
- if (i == 0)
- return eb->first_page;
- i += eb->start >> PAGE_CACHE_SHIFT;
- mapping = eb->first_page->mapping;
- if (!mapping)
- return NULL;
-
- /*
- * extent_buffer_page is only called after pinning the page
- * by increasing the reference count. So we know the page must
- * be in the radix tree.
- */
- rcu_read_lock();
- p = radix_tree_lookup(&mapping->page_tree, i);
- rcu_read_unlock();
-
- return p;
+ return eb->pages[i];
}
inline unsigned long num_extent_pages(u64 start, u64 len)
@@ -3557,6 +3910,19 @@ inline unsigned long num_extent_pages(u64 start, u64 len)
(start >> PAGE_CACHE_SHIFT);
}
+static void __free_extent_buffer(struct extent_buffer *eb)
+{
+#if LEAK_DEBUG
+ unsigned long flags;
+ spin_lock_irqsave(&leak_lock, flags);
+ list_del(&eb->leak_list);
+ spin_unlock_irqrestore(&leak_lock, flags);
+#endif
+ if (eb->pages && eb->pages != eb->inline_pages)
+ kfree(eb->pages);
+ kmem_cache_free(extent_buffer_cache, eb);
+}
+
static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
u64 start,
unsigned long len,
@@ -3572,6 +3938,8 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
return NULL;
eb->start = start;
eb->len = len;
+ eb->tree = tree;
+ eb->bflags = 0;
rwlock_init(&eb->lock);
atomic_set(&eb->write_locks, 0);
atomic_set(&eb->read_locks, 0);
@@ -3579,6 +3947,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
atomic_set(&eb->blocking_writers, 0);
atomic_set(&eb->spinning_readers, 0);
atomic_set(&eb->spinning_writers, 0);
+ eb->lock_nested = 0;
init_waitqueue_head(&eb->write_lock_wq);
init_waitqueue_head(&eb->read_lock_wq);
@@ -3587,20 +3956,86 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
list_add(&eb->leak_list, &buffers);
spin_unlock_irqrestore(&leak_lock, flags);
#endif
+ spin_lock_init(&eb->refs_lock);
atomic_set(&eb->refs, 1);
+ atomic_set(&eb->io_pages, 0);
+
+ if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) {
+ struct page **pages;
+ int num_pages = (len + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ pages = kzalloc(num_pages, mask);
+ if (!pages) {
+ __free_extent_buffer(eb);
+ return NULL;
+ }
+ eb->pages = pages;
+ } else {
+ eb->pages = eb->inline_pages;
+ }
return eb;
}
-static void __free_extent_buffer(struct extent_buffer *eb)
+struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
{
-#if LEAK_DEBUG
- unsigned long flags;
- spin_lock_irqsave(&leak_lock, flags);
- list_del(&eb->leak_list);
- spin_unlock_irqrestore(&leak_lock, flags);
-#endif
- kmem_cache_free(extent_buffer_cache, eb);
+ unsigned long i;
+ struct page *p;
+ struct extent_buffer *new;
+ unsigned long num_pages = num_extent_pages(src->start, src->len);
+
+ new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC);
+ if (new == NULL)
+ return NULL;
+
+ for (i = 0; i < num_pages; i++) {
+ p = alloc_page(GFP_ATOMIC);
+ BUG_ON(!p);
+ attach_extent_buffer_page(new, p);
+ WARN_ON(PageDirty(p));
+ SetPageUptodate(p);
+ new->pages[i] = p;
+ }
+
+ copy_extent_buffer(new, src, 0, 0, src->len);
+ set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
+ set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
+
+ return new;
+}
+
+struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
+{
+ struct extent_buffer *eb;
+ unsigned long num_pages = num_extent_pages(0, len);
+ unsigned long i;
+
+ eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC);
+ if (!eb)
+ return NULL;
+
+ for (i = 0; i < num_pages; i++) {
+ eb->pages[i] = alloc_page(GFP_ATOMIC);
+ if (!eb->pages[i])
+ goto err;
+ }
+ set_extent_buffer_uptodate(eb);
+ btrfs_set_header_nritems(eb, 0);
+ set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
+
+ return eb;
+err:
+ for (i--; i > 0; i--)
+ __free_page(eb->pages[i]);
+ __free_extent_buffer(eb);
+ return NULL;
+}
+
+static int extent_buffer_under_io(struct extent_buffer *eb)
+{
+ return (atomic_read(&eb->io_pages) ||
+ test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
+ test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
}
/*
@@ -3610,20 +4045,50 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
unsigned long start_idx)
{
unsigned long index;
+ unsigned long num_pages;
struct page *page;
+ int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
- if (!eb->first_page)
- return;
+ BUG_ON(extent_buffer_under_io(eb));
- index = num_extent_pages(eb->start, eb->len);
+ num_pages = num_extent_pages(eb->start, eb->len);
+ index = start_idx + num_pages;
if (start_idx >= index)
return;
do {
index--;
page = extent_buffer_page(eb, index);
- if (page)
+ if (page && mapped) {
+ spin_lock(&page->mapping->private_lock);
+ /*
+ * We do this since we'll remove the pages after we've
+ * removed the eb from the radix tree, so we could race
+ * and have this page now attached to the new eb. So
+ * only clear page_private if it's still connected to
+ * this eb.
+ */
+ if (PagePrivate(page) &&
+ page->private == (unsigned long)eb) {
+ BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+ BUG_ON(PageDirty(page));
+ BUG_ON(PageWriteback(page));
+ /*
+ * We need to make sure we haven't be attached
+ * to a new eb.
+ */
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ /* One for the page private */
+ page_cache_release(page);
+ }
+ spin_unlock(&page->mapping->private_lock);
+
+ }
+ if (page) {
+ /* One for when we alloced the page */
page_cache_release(page);
+ }
} while (index != start_idx);
}
@@ -3636,9 +4101,50 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
__free_extent_buffer(eb);
}
+static void check_buffer_tree_ref(struct extent_buffer *eb)
+{
+ /* the ref bit is tricky. We have to make sure it is set
+ * if we have the buffer dirty. Otherwise the
+ * code to free a buffer can end up dropping a dirty
+ * page
+ *
+ * Once the ref bit is set, it won't go away while the
+ * buffer is dirty or in writeback, and it also won't
+ * go away while we have the reference count on the
+ * eb bumped.
+ *
+ * We can't just set the ref bit without bumping the
+ * ref on the eb because free_extent_buffer might
+ * see the ref bit and try to clear it. If this happens
+ * free_extent_buffer might end up dropping our original
+ * ref by mistake and freeing the page before we are able
+ * to add one more ref.
+ *
+ * So bump the ref count first, then set the bit. If someone
+ * beat us to it, drop the ref we added.
+ */
+ if (!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+ atomic_inc(&eb->refs);
+ if (test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+ atomic_dec(&eb->refs);
+ }
+}
+
+static void mark_extent_buffer_accessed(struct extent_buffer *eb)
+{
+ unsigned long num_pages, i;
+
+ check_buffer_tree_ref(eb);
+
+ num_pages = num_extent_pages(eb->start, eb->len);
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = extent_buffer_page(eb, i);
+ mark_page_accessed(p);
+ }
+}
+
struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
- u64 start, unsigned long len,
- struct page *page0)
+ u64 start, unsigned long len)
{
unsigned long num_pages = num_extent_pages(start, len);
unsigned long i;
@@ -3654,7 +4160,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
- mark_page_accessed(eb->first_page);
+ mark_extent_buffer_accessed(eb);
return eb;
}
rcu_read_unlock();
@@ -3663,32 +4169,44 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
if (!eb)
return NULL;
- if (page0) {
- eb->first_page = page0;
- i = 1;
- index++;
- page_cache_get(page0);
- mark_page_accessed(page0);
- set_page_extent_mapped(page0);
- set_page_extent_head(page0, len);
- uptodate = PageUptodate(page0);
- } else {
- i = 0;
- }
- for (; i < num_pages; i++, index++) {
+ for (i = 0; i < num_pages; i++, index++) {
p = find_or_create_page(mapping, index, GFP_NOFS);
if (!p) {
WARN_ON(1);
goto free_eb;
}
- set_page_extent_mapped(p);
- mark_page_accessed(p);
- if (i == 0) {
- eb->first_page = p;
- set_page_extent_head(p, len);
- } else {
- set_page_private(p, EXTENT_PAGE_PRIVATE);
+
+ spin_lock(&mapping->private_lock);
+ if (PagePrivate(p)) {
+ /*
+ * We could have already allocated an eb for this page
+ * and attached one so lets see if we can get a ref on
+ * the existing eb, and if we can we know it's good and
+ * we can just return that one, else we know we can just
+ * overwrite page->private.
+ */
+ exists = (struct extent_buffer *)p->private;
+ if (atomic_inc_not_zero(&exists->refs)) {
+ spin_unlock(&mapping->private_lock);
+ unlock_page(p);
+ page_cache_release(p);
+ mark_extent_buffer_accessed(exists);
+ goto free_eb;
+ }
+
+ /*
+ * Do this so attach doesn't complain and we need to
+ * drop the ref the old guy had.
+ */
+ ClearPagePrivate(p);
+ WARN_ON(PageDirty(p));
+ page_cache_release(p);
}
+ attach_extent_buffer_page(eb, p);
+ spin_unlock(&mapping->private_lock);
+ WARN_ON(PageDirty(p));
+ mark_page_accessed(p);
+ eb->pages[i] = p;
if (!PageUptodate(p))
uptodate = 0;
@@ -3696,12 +4214,10 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
* see below about how we avoid a nasty race with release page
* and why we unlock later
*/
- if (i != 0)
- unlock_page(p);
}
if (uptodate)
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-
+again:
ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
if (ret)
goto free_eb;
@@ -3711,14 +4227,21 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
if (ret == -EEXIST) {
exists = radix_tree_lookup(&tree->buffer,
start >> PAGE_CACHE_SHIFT);
- /* add one reference for the caller */
- atomic_inc(&exists->refs);
+ if (!atomic_inc_not_zero(&exists->refs)) {
+ spin_unlock(&tree->buffer_lock);
+ radix_tree_preload_end();
+ exists = NULL;
+ goto again;
+ }
spin_unlock(&tree->buffer_lock);
radix_tree_preload_end();
+ mark_extent_buffer_accessed(exists);
goto free_eb;
}
/* add one reference for the tree */
- atomic_inc(&eb->refs);
+ spin_lock(&eb->refs_lock);
+ check_buffer_tree_ref(eb);
+ spin_unlock(&eb->refs_lock);
spin_unlock(&tree->buffer_lock);
radix_tree_preload_end();
@@ -3731,18 +4254,22 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
* after the extent buffer is in the radix tree so
* it doesn't get lost
*/
- set_page_extent_mapped(eb->first_page);
- set_page_extent_head(eb->first_page, eb->len);
- if (!page0)
- unlock_page(eb->first_page);
+ SetPageChecked(eb->pages[0]);
+ for (i = 1; i < num_pages; i++) {
+ p = extent_buffer_page(eb, i);
+ ClearPageChecked(p);
+ unlock_page(p);
+ }
+ unlock_page(eb->pages[0]);
return eb;
free_eb:
- if (eb->first_page && !page0)
- unlock_page(eb->first_page);
+ for (i = 0; i < num_pages; i++) {
+ if (eb->pages[i])
+ unlock_page(eb->pages[i]);
+ }
- if (!atomic_dec_and_test(&eb->refs))
- return exists;
+ WARN_ON(!atomic_dec_and_test(&eb->refs));
btrfs_release_extent_buffer(eb);
return exists;
}
@@ -3756,7 +4283,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
- mark_page_accessed(eb->first_page);
+ mark_extent_buffer_accessed(eb);
return eb;
}
rcu_read_unlock();
@@ -3764,19 +4291,79 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
return NULL;
}
+static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
+{
+ struct extent_buffer *eb =
+ container_of(head, struct extent_buffer, rcu_head);
+
+ __free_extent_buffer(eb);
+}
+
+/* Expects to have eb->eb_lock already held */
+static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
+{
+ WARN_ON(atomic_read(&eb->refs) == 0);
+ if (atomic_dec_and_test(&eb->refs)) {
+ if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) {
+ spin_unlock(&eb->refs_lock);
+ } else {
+ struct extent_io_tree *tree = eb->tree;
+
+ spin_unlock(&eb->refs_lock);
+
+ spin_lock(&tree->buffer_lock);
+ radix_tree_delete(&tree->buffer,
+ eb->start >> PAGE_CACHE_SHIFT);
+ spin_unlock(&tree->buffer_lock);
+ }
+
+ /* Should be safe to release our pages at this point */
+ btrfs_release_extent_buffer_page(eb, 0);
+
+ call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
+ return;
+ }
+ spin_unlock(&eb->refs_lock);
+}
+
void free_extent_buffer(struct extent_buffer *eb)
{
if (!eb)
return;
- if (!atomic_dec_and_test(&eb->refs))
+ spin_lock(&eb->refs_lock);
+ if (atomic_read(&eb->refs) == 2 &&
+ test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
+ atomic_dec(&eb->refs);
+
+ if (atomic_read(&eb->refs) == 2 &&
+ test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
+ !extent_buffer_under_io(eb) &&
+ test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+ atomic_dec(&eb->refs);
+
+ /*
+ * I know this is terrible, but it's temporary until we stop tracking
+ * the uptodate bits and such for the extent buffers.
+ */
+ release_extent_buffer(eb, GFP_ATOMIC);
+}
+
+void free_extent_buffer_stale(struct extent_buffer *eb)
+{
+ if (!eb)
return;
- WARN_ON(1);
+ spin_lock(&eb->refs_lock);
+ set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
+
+ if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
+ test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+ atomic_dec(&eb->refs);
+ release_extent_buffer(eb, GFP_NOFS);
}
-int clear_extent_buffer_dirty(struct extent_io_tree *tree,
- struct extent_buffer *eb)
+void clear_extent_buffer_dirty(struct extent_buffer *eb)
{
unsigned long i;
unsigned long num_pages;
@@ -3792,10 +4379,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
lock_page(page);
WARN_ON(!PagePrivate(page));
- set_page_extent_mapped(page);
- if (i == 0)
- set_page_extent_head(page, eb->len);
-
clear_page_dirty_for_io(page);
spin_lock_irq(&page->mapping->tree_lock);
if (!PageDirty(page)) {
@@ -3807,24 +4390,29 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
ClearPageError(page);
unlock_page(page);
}
- return 0;
+ WARN_ON(atomic_read(&eb->refs) == 0);
}
-int set_extent_buffer_dirty(struct extent_io_tree *tree,
- struct extent_buffer *eb)
+int set_extent_buffer_dirty(struct extent_buffer *eb)
{
unsigned long i;
unsigned long num_pages;
int was_dirty = 0;
+ check_buffer_tree_ref(eb);
+
was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+
num_pages = num_extent_pages(eb->start, eb->len);
+ WARN_ON(atomic_read(&eb->refs) == 0);
+ WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
+
for (i = 0; i < num_pages; i++)
- __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
+ set_page_dirty(extent_buffer_page(eb, i));
return was_dirty;
}
-static int __eb_straddles_pages(u64 start, u64 len)
+static int range_straddles_pages(u64 start, u64 len)
{
if (len < PAGE_CACHE_SIZE)
return 1;
@@ -3835,26 +4423,14 @@ static int __eb_straddles_pages(u64 start, u64 len)
return 0;
}
-static int eb_straddles_pages(struct extent_buffer *eb)
-{
- return __eb_straddles_pages(eb->start, eb->len);
-}
-
-int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb,
- struct extent_state **cached_state)
+int clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
unsigned long i;
struct page *page;
unsigned long num_pages;
- num_pages = num_extent_pages(eb->start, eb->len);
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-
- if (eb_straddles_pages(eb)) {
- clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
- cached_state, GFP_NOFS);
- }
+ num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
if (page)
@@ -3863,27 +4439,16 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
return 0;
}
-int set_extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb)
+int set_extent_buffer_uptodate(struct extent_buffer *eb)
{
unsigned long i;
struct page *page;
unsigned long num_pages;
+ set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
num_pages = num_extent_pages(eb->start, eb->len);
-
- if (eb_straddles_pages(eb)) {
- set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
- NULL, GFP_NOFS);
- }
for (i = 0; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
- if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
- ((i == num_pages - 1) &&
- ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
- check_page_uptodate(tree, page);
- continue;
- }
SetPageUptodate(page);
}
return 0;
@@ -3898,7 +4463,7 @@ int extent_range_uptodate(struct extent_io_tree *tree,
int uptodate;
unsigned long index;
- if (__eb_straddles_pages(start, end - start + 1)) {
+ if (range_straddles_pages(start, end - start + 1)) {
ret = test_range_bit(tree, start, end,
EXTENT_UPTODATE, 1, NULL);
if (ret)
@@ -3907,6 +4472,8 @@ int extent_range_uptodate(struct extent_io_tree *tree,
while (start <= end) {
index = start >> PAGE_CACHE_SHIFT;
page = find_get_page(tree->mapping, index);
+ if (!page)
+ return 1;
uptodate = PageUptodate(page);
page_cache_release(page);
if (!uptodate) {
@@ -3918,35 +4485,9 @@ int extent_range_uptodate(struct extent_io_tree *tree,
return pg_uptodate;
}
-int extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb,
- struct extent_state *cached_state)
+int extent_buffer_uptodate(struct extent_buffer *eb)
{
- int ret = 0;
- unsigned long num_pages;
- unsigned long i;
- struct page *page;
- int pg_uptodate = 1;
-
- if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
- return 1;
-
- if (eb_straddles_pages(eb)) {
- ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
- EXTENT_UPTODATE, 1, cached_state);
- if (ret)
- return ret;
- }
-
- num_pages = num_extent_pages(eb->start, eb->len);
- for (i = 0; i < num_pages; i++) {
- page = extent_buffer_page(eb, i);
- if (!PageUptodate(page)) {
- pg_uptodate = 0;
- break;
- }
- }
- return pg_uptodate;
+ return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
}
int read_extent_buffer_pages(struct extent_io_tree *tree,
@@ -3960,21 +4501,14 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
int ret = 0;
int locked_pages = 0;
int all_uptodate = 1;
- int inc_all_pages = 0;
unsigned long num_pages;
+ unsigned long num_reads = 0;
struct bio *bio = NULL;
unsigned long bio_flags = 0;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
- if (eb_straddles_pages(eb)) {
- if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
- EXTENT_UPTODATE, 1, NULL)) {
- return 0;
- }
- }
-
if (start) {
WARN_ON(start < eb->start);
start_i = (start >> PAGE_CACHE_SHIFT) -
@@ -3993,8 +4527,10 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
lock_page(page);
}
locked_pages++;
- if (!PageUptodate(page))
+ if (!PageUptodate(page)) {
+ num_reads++;
all_uptodate = 0;
+ }
}
if (all_uptodate) {
if (start_i == 0)
@@ -4002,20 +4538,12 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
goto unlock_exit;
}
+ clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ eb->read_mirror = 0;
+ atomic_set(&eb->io_pages, num_reads);
for (i = start_i; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
-
- WARN_ON(!PagePrivate(page));
-
- set_page_extent_mapped(page);
- if (i == 0)
- set_page_extent_head(page, eb->len);
-
- if (inc_all_pages)
- page_cache_get(page);
if (!PageUptodate(page)) {
- if (start_i == 0)
- inc_all_pages = 1;
ClearPageError(page);
err = __extent_read_full_page(tree, page,
get_extent, &bio,
@@ -4027,8 +4555,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
}
}
- if (bio)
- submit_one_bio(READ, bio, mirror_num, bio_flags);
+ if (bio) {
+ err = submit_one_bio(READ, bio, mirror_num, bio_flags);
+ if (err)
+ return err;
+ }
if (ret || wait != WAIT_COMPLETE)
return ret;
@@ -4040,8 +4571,6 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
ret = -EIO;
}
- if (!ret)
- set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
return ret;
unlock_exit:
@@ -4283,15 +4812,20 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
{
char *dst_kaddr = page_address(dst_page);
char *src_kaddr;
+ int must_memmove = 0;
if (dst_page != src_page) {
src_kaddr = page_address(src_page);
} else {
src_kaddr = dst_kaddr;
- BUG_ON(areas_overlap(src_off, dst_off, len));
+ if (areas_overlap(src_off, dst_off, len))
+ must_memmove = 1;
}
- memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
+ if (must_memmove)
+ memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
+ else
+ memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
}
void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
@@ -4361,7 +4895,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
"len %lu len %lu\n", dst_offset, len, dst->len);
BUG_ON(1);
}
- if (!areas_overlap(src_offset, dst_offset, len)) {
+ if (dst_offset < src_offset) {
memcpy_extent_buffer(dst, dst_offset, src_offset, len);
return;
}
@@ -4387,47 +4921,48 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
}
}
-static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
-{
- struct extent_buffer *eb =
- container_of(head, struct extent_buffer, rcu_head);
-
- btrfs_release_extent_buffer(eb);
-}
-
-int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
+int try_release_extent_buffer(struct page *page, gfp_t mask)
{
- u64 start = page_offset(page);
struct extent_buffer *eb;
- int ret = 1;
- spin_lock(&tree->buffer_lock);
- eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
- if (!eb) {
- spin_unlock(&tree->buffer_lock);
- return ret;
+ /*
+ * We need to make sure noboody is attaching this page to an eb right
+ * now.
+ */
+ spin_lock(&page->mapping->private_lock);
+ if (!PagePrivate(page)) {
+ spin_unlock(&page->mapping->private_lock);
+ return 1;
}
- if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
- ret = 0;
- goto out;
- }
+ eb = (struct extent_buffer *)page->private;
+ BUG_ON(!eb);
/*
- * set @eb->refs to 0 if it is already 1, and then release the @eb.
- * Or go back.
+ * This is a little awful but should be ok, we need to make sure that
+ * the eb doesn't disappear out from under us while we're looking at
+ * this page.
*/
- if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
- ret = 0;
- goto out;
+ spin_lock(&eb->refs_lock);
+ if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+ spin_unlock(&eb->refs_lock);
+ spin_unlock(&page->mapping->private_lock);
+ return 0;
}
+ spin_unlock(&page->mapping->private_lock);
- radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-out:
- spin_unlock(&tree->buffer_lock);
+ if ((mask & GFP_NOFS) == GFP_NOFS)
+ mask = GFP_NOFS;
- /* at this point we can safely release the extent buffer */
- if (atomic_read(&eb->refs) == 0)
- call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
- return ret;
+ /*
+ * If tree ref isn't set then we know the ref on this eb is a real ref,
+ * so just return, this page will likely be freed soon anyway.
+ */
+ if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+ spin_unlock(&eb->refs_lock);
+ return 0;
+ }
+ release_extent_buffer(eb, mask);
+
+ return 1;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7604c300132..25900af5b15 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -35,6 +35,11 @@
#define EXTENT_BUFFER_DIRTY 2
#define EXTENT_BUFFER_CORRUPT 3
#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
+#define EXTENT_BUFFER_TREE_REF 5
+#define EXTENT_BUFFER_STALE 6
+#define EXTENT_BUFFER_WRITEBACK 7
+#define EXTENT_BUFFER_IOERR 8
+#define EXTENT_BUFFER_DUMMY 9
/* these are flags for extent_clear_unlock_delalloc */
#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -54,6 +59,7 @@
#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
struct extent_state;
+struct btrfs_root;
typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
struct bio *bio, int mirror_num,
@@ -69,14 +75,9 @@ struct extent_io_ops {
size_t size, struct bio *bio,
unsigned long bio_flags);
int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
- int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
- u64 start, u64 end, int failed_mirror,
- struct extent_state *state);
- int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
- u64 start, u64 end,
- struct extent_state *state);
+ int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
- struct extent_state *state);
+ struct extent_state *state, int mirror);
int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
struct extent_state *state, int uptodate);
void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
@@ -97,6 +98,7 @@ struct extent_io_tree {
struct radix_tree_root buffer;
struct address_space *mapping;
u64 dirty_bytes;
+ int track_uptodate;
spinlock_t lock;
spinlock_t buffer_lock;
struct extent_io_ops *ops;
@@ -119,16 +121,22 @@ struct extent_state {
struct list_head leak_list;
};
+#define INLINE_EXTENT_BUFFER_PAGES 16
+#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_CACHE_SIZE)
struct extent_buffer {
u64 start;
unsigned long len;
unsigned long map_start;
unsigned long map_len;
- struct page *first_page;
unsigned long bflags;
+ struct extent_io_tree *tree;
+ spinlock_t refs_lock;
+ atomic_t refs;
+ atomic_t io_pages;
+ int read_mirror;
struct list_head leak_list;
struct rcu_head rcu_head;
- atomic_t refs;
+ pid_t lock_owner;
/* count of read lock holders on the extent buffer */
atomic_t write_locks;
@@ -137,6 +145,7 @@ struct extent_buffer {
atomic_t blocking_readers;
atomic_t spinning_readers;
atomic_t spinning_writers;
+ int lock_nested;
/* protects write locks */
rwlock_t lock;
@@ -150,6 +159,9 @@ struct extent_buffer {
* to unlock
*/
wait_queue_head_t read_lock_wq;
+ wait_queue_head_t lock_wq;
+ struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES];
+ struct page **pages;
};
static inline void extent_set_compress_type(unsigned long *bio_flags,
@@ -176,18 +188,17 @@ void extent_io_tree_init(struct extent_io_tree *tree,
int try_release_extent_mapping(struct extent_map_tree *map,
struct extent_io_tree *tree, struct page *page,
gfp_t mask);
-int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page);
+int try_release_extent_buffer(struct page *page, gfp_t mask);
int try_release_extent_state(struct extent_map_tree *map,
struct extent_io_tree *tree, struct page *page,
gfp_t mask);
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, struct extent_state **cached, gfp_t mask);
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+ int bits, struct extent_state **cached);
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached, gfp_t mask);
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask);
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
get_extent_t *get_extent, int mirror_num);
int __init extent_io_init(void);
@@ -208,10 +219,12 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int exclusive_bits, u64 *failed_start,
+ int bits, u64 *failed_start,
struct extent_state **cached_state, gfp_t mask);
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
+int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask);
int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -238,6 +251,8 @@ int extent_writepages(struct extent_io_tree *tree,
struct address_space *mapping,
get_extent_t *get_extent,
struct writeback_control *wbc);
+int btree_write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc);
int extent_readpages(struct extent_io_tree *tree,
struct address_space *mapping,
struct list_head *pages, unsigned nr_pages,
@@ -249,11 +264,13 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
void set_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
- u64 start, unsigned long len,
- struct page *page0);
+ u64 start, unsigned long len);
+struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len);
+struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
u64 start, unsigned long len);
void free_extent_buffer(struct extent_buffer *eb);
+void free_extent_buffer_stale(struct extent_buffer *eb);
#define WAIT_NONE 0
#define WAIT_COMPLETE 1
#define WAIT_PAGE_LOCK 2
@@ -285,19 +302,12 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_offset, unsigned long len);
void memset_extent_buffer(struct extent_buffer *eb, char c,
unsigned long start, unsigned long len);
-int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
-int clear_extent_buffer_dirty(struct extent_io_tree *tree,
- struct extent_buffer *eb);
-int set_extent_buffer_dirty(struct extent_io_tree *tree,
- struct extent_buffer *eb);
-int set_extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb);
-int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb,
- struct extent_state **cached_state);
-int extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb,
- struct extent_state *cached_state);
+void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
+void clear_extent_buffer_dirty(struct extent_buffer *eb);
+int set_extent_buffer_dirty(struct extent_buffer *eb);
+int set_extent_buffer_uptodate(struct extent_buffer *eb);
+int clear_extent_buffer_uptodate(struct extent_buffer *eb);
+int extent_buffer_uptodate(struct extent_buffer *eb);
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
unsigned long min_len, char **map,
unsigned long *map_start,
@@ -317,4 +327,7 @@ struct btrfs_mapping_tree;
int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
u64 length, u64 logical, struct page *page,
int mirror_num);
+int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
+int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
+ int mirror_num);
#endif
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 33a7890b1f4..1195f09761f 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -26,8 +26,8 @@ struct extent_map {
unsigned long flags;
struct block_device *bdev;
atomic_t refs;
- unsigned int in_tree:1;
- unsigned int compress_type:4;
+ unsigned int in_tree;
+ unsigned int compress_type;
};
struct extent_map_tree {
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index c7fb3a4247d..5d158d32023 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -25,10 +25,12 @@
#include "transaction.h"
#include "print-tree.h"
-#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
+#define __MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
sizeof(struct btrfs_item) * 2) / \
size) - 1))
+#define MAX_CSUM_ITEMS(r, size) (min(__MAX_CSUM_ITEMS(r, size), PAGE_CACHE_SIZE))
+
#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
sizeof(struct btrfs_ordered_sum)) / \
sizeof(struct btrfs_sector_sum) * \
@@ -59,7 +61,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
sizeof(*item));
if (ret < 0)
goto out;
- BUG_ON(ret);
+ BUG_ON(ret); /* Can't happen */
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -284,6 +286,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct btrfs_ordered_sum *sums;
struct btrfs_sector_sum *sector_sum;
struct btrfs_csum_item *item;
+ LIST_HEAD(tmplist);
unsigned long offset;
int ret;
size_t size;
@@ -358,7 +361,10 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
MAX_ORDERED_SUM_BYTES(root));
sums = kzalloc(btrfs_ordered_sum_size(root, size),
GFP_NOFS);
- BUG_ON(!sums);
+ if (!sums) {
+ ret = -ENOMEM;
+ goto fail;
+ }
sector_sum = sums->sums;
sums->bytenr = start;
@@ -380,12 +386,19 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
offset += csum_size;
sector_sum++;
}
- list_add_tail(&sums->list, list);
+ list_add_tail(&sums->list, &tmplist);
}
path->slots[0]++;
}
ret = 0;
fail:
+ while (ret < 0 && !list_empty(&tmplist)) {
+ sums = list_entry(&tmplist, struct btrfs_ordered_sum, list);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+ list_splice_tail(&tmplist, list);
+
btrfs_free_path(path);
return ret;
}
@@ -420,7 +433,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
offset = page_offset(bvec->bv_page) + bvec->bv_offset;
ordered = btrfs_lookup_ordered_extent(inode, offset);
- BUG_ON(!ordered);
+ BUG_ON(!ordered); /* Logic error */
sums->bytenr = ordered->start;
while (bio_index < bio->bi_vcnt) {
@@ -439,21 +452,21 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
GFP_NOFS);
- BUG_ON(!sums);
+ BUG_ON(!sums); /* -ENOMEM */
sector_sum = sums->sums;
sums->len = bytes_left;
ordered = btrfs_lookup_ordered_extent(inode, offset);
- BUG_ON(!ordered);
+ BUG_ON(!ordered); /* Logic error */
sums->bytenr = ordered->start;
}
- data = kmap_atomic(bvec->bv_page, KM_USER0);
+ data = kmap_atomic(bvec->bv_page);
sector_sum->sum = ~(u32)0;
sector_sum->sum = btrfs_csum_data(root,
data + bvec->bv_offset,
sector_sum->sum,
bvec->bv_len);
- kunmap_atomic(data, KM_USER0);
+ kunmap_atomic(data);
btrfs_csum_final(sector_sum->sum,
(char *)&sector_sum->sum);
sector_sum->bytenr = disk_bytenr;
@@ -483,18 +496,17 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
* This calls btrfs_truncate_item with the correct args based on the
* overlap, and fixes up the key as required.
*/
-static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_key *key,
- u64 bytenr, u64 len)
+static noinline void truncate_one_csum(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *key,
+ u64 bytenr, u64 len)
{
struct extent_buffer *leaf;
u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
u64 csum_end;
u64 end_byte = bytenr + len;
u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
- int ret;
leaf = path->nodes[0];
csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
@@ -510,7 +522,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
*/
u32 new_size = (bytenr - key->offset) >> blocksize_bits;
new_size *= csum_size;
- ret = btrfs_truncate_item(trans, root, path, new_size, 1);
+ btrfs_truncate_item(trans, root, path, new_size, 1);
} else if (key->offset >= bytenr && csum_end > end_byte &&
end_byte > key->offset) {
/*
@@ -522,15 +534,13 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
u32 new_size = (csum_end - end_byte) >> blocksize_bits;
new_size *= csum_size;
- ret = btrfs_truncate_item(trans, root, path, new_size, 0);
+ btrfs_truncate_item(trans, root, path, new_size, 0);
key->offset = end_byte;
- ret = btrfs_set_item_key_safe(trans, root, path, key);
- BUG_ON(ret);
+ btrfs_set_item_key_safe(trans, root, path, key);
} else {
BUG();
}
- return 0;
}
/*
@@ -635,13 +645,14 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
* item changed size or key
*/
ret = btrfs_split_item(trans, root, path, &key, offset);
- BUG_ON(ret && ret != -EAGAIN);
+ if (ret && ret != -EAGAIN) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
key.offset = end_byte - 1;
} else {
- ret = truncate_one_csum(trans, root, path,
- &key, bytenr, len);
- BUG_ON(ret);
+ truncate_one_csum(trans, root, path, &key, bytenr, len);
if (key.offset < bytenr)
break;
}
@@ -772,7 +783,7 @@ again:
if (diff != csum_size)
goto insert;
- ret = btrfs_extend_item(trans, root, path, diff);
+ btrfs_extend_item(trans, root, path, diff);
goto csum;
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 97fbe939c05..9aa01ec2138 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -65,6 +65,21 @@ struct inode_defrag {
int cycled;
};
+static int __compare_inode_defrag(struct inode_defrag *defrag1,
+ struct inode_defrag *defrag2)
+{
+ if (defrag1->root > defrag2->root)
+ return 1;
+ else if (defrag1->root < defrag2->root)
+ return -1;
+ else if (defrag1->ino > defrag2->ino)
+ return 1;
+ else if (defrag1->ino < defrag2->ino)
+ return -1;
+ else
+ return 0;
+}
+
/* pop a record for an inode into the defrag tree. The lock
* must be held already
*
@@ -81,15 +96,17 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
struct inode_defrag *entry;
struct rb_node **p;
struct rb_node *parent = NULL;
+ int ret;
p = &root->fs_info->defrag_inodes.rb_node;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct inode_defrag, rb_node);
- if (defrag->ino < entry->ino)
+ ret = __compare_inode_defrag(defrag, entry);
+ if (ret < 0)
p = &parent->rb_left;
- else if (defrag->ino > entry->ino)
+ else if (ret > 0)
p = &parent->rb_right;
else {
/* if we're reinserting an entry for
@@ -103,7 +120,7 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
goto exists;
}
}
- BTRFS_I(inode)->in_defrag = 1;
+ set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
rb_link_node(&defrag->rb_node, parent, p);
rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
return;
@@ -131,7 +148,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
if (btrfs_fs_closing(root->fs_info))
return 0;
- if (BTRFS_I(inode)->in_defrag)
+ if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
return 0;
if (trans)
@@ -148,7 +165,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
defrag->root = root->root_key.objectid;
spin_lock(&root->fs_info->defrag_inodes_lock);
- if (!BTRFS_I(inode)->in_defrag)
+ if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
__btrfs_add_inode_defrag(inode, defrag);
else
kfree(defrag);
@@ -159,28 +176,35 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
/*
* must be called with the defrag_inodes lock held
*/
-struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino,
+struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
+ u64 root, u64 ino,
struct rb_node **next)
{
struct inode_defrag *entry = NULL;
+ struct inode_defrag tmp;
struct rb_node *p;
struct rb_node *parent = NULL;
+ int ret;
+
+ tmp.ino = ino;
+ tmp.root = root;
p = info->defrag_inodes.rb_node;
while (p) {
parent = p;
entry = rb_entry(parent, struct inode_defrag, rb_node);
- if (ino < entry->ino)
+ ret = __compare_inode_defrag(&tmp, entry);
+ if (ret < 0)
p = parent->rb_left;
- else if (ino > entry->ino)
+ else if (ret > 0)
p = parent->rb_right;
else
return entry;
}
if (next) {
- while (parent && ino > entry->ino) {
+ while (parent && __compare_inode_defrag(&tmp, entry) > 0) {
parent = rb_next(parent);
entry = rb_entry(parent, struct inode_defrag, rb_node);
}
@@ -202,6 +226,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
struct btrfs_key key;
struct btrfs_ioctl_defrag_range_args range;
u64 first_ino = 0;
+ u64 root_objectid = 0;
int num_defrag;
int defrag_batch = 1024;
@@ -214,11 +239,14 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
n = NULL;
/* find an inode to defrag */
- defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n);
+ defrag = btrfs_find_defrag_inode(fs_info, root_objectid,
+ first_ino, &n);
if (!defrag) {
- if (n)
- defrag = rb_entry(n, struct inode_defrag, rb_node);
- else if (first_ino) {
+ if (n) {
+ defrag = rb_entry(n, struct inode_defrag,
+ rb_node);
+ } else if (root_objectid || first_ino) {
+ root_objectid = 0;
first_ino = 0;
continue;
} else {
@@ -228,6 +256,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
/* remove it from the rbtree */
first_ino = defrag->ino + 1;
+ root_objectid = defrag->root;
rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
if (btrfs_fs_closing(fs_info))
@@ -252,7 +281,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
goto next;
/* do a chunk of defrag */
- BTRFS_I(inode)->in_defrag = 0;
+ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
range.start = defrag->last_offset;
num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
defrag_batch);
@@ -452,7 +481,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split = alloc_extent_map();
if (!split2)
split2 = alloc_extent_map();
- BUG_ON(!split || !split2);
+ BUG_ON(!split || !split2); /* -ENOMEM */
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
@@ -494,7 +523,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->flags = flags;
split->compress_type = em->compress_type;
ret = add_extent_mapping(em_tree, split);
- BUG_ON(ret);
+ BUG_ON(ret); /* Logic error */
free_extent_map(split);
split = split2;
split2 = NULL;
@@ -520,7 +549,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
}
ret = add_extent_mapping(em_tree, split);
- BUG_ON(ret);
+ BUG_ON(ret); /* Logic error */
free_extent_map(split);
split = NULL;
}
@@ -567,6 +596,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
int extent_type;
int recow;
int ret;
+ int modify_tree = -1;
if (drop_cache)
btrfs_drop_extent_cache(inode, start, end - 1, 0);
@@ -575,10 +605,13 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
if (!path)
return -ENOMEM;
+ if (start >= BTRFS_I(inode)->disk_i_size)
+ modify_tree = 0;
+
while (1) {
recow = 0;
ret = btrfs_lookup_file_extent(trans, root, path, ino,
- search_start, -1);
+ search_start, modify_tree);
if (ret < 0)
break;
if (ret > 0 && path->slots[0] > 0 && search_start == start) {
@@ -634,7 +667,8 @@ next_slot:
}
search_start = max(key.offset, start);
- if (recow) {
+ if (recow || !modify_tree) {
+ modify_tree = -1;
btrfs_release_path(path);
continue;
}
@@ -678,8 +712,8 @@ next_slot:
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
new_key.objectid,
- start - extent_offset);
- BUG_ON(ret);
+ start - extent_offset, 0);
+ BUG_ON(ret); /* -ENOMEM */
*hint_byte = disk_bytenr;
}
key.offset = start;
@@ -753,8 +787,8 @@ next_slot:
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
key.objectid, key.offset -
- extent_offset);
- BUG_ON(ret);
+ extent_offset, 0);
+ BUG_ON(ret); /* -ENOMEM */
inode_sub_bytes(inode,
extent_end - key.offset);
*hint_byte = disk_bytenr;
@@ -770,7 +804,10 @@ next_slot:
ret = btrfs_del_items(trans, root, path, del_slot,
del_nr);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
del_nr = 0;
del_slot = 0;
@@ -782,11 +819,13 @@ next_slot:
BUG_ON(1);
}
- if (del_nr > 0) {
+ if (!ret && del_nr > 0) {
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
- BUG_ON(ret);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
}
+out:
btrfs_free_path(path);
return ret;
}
@@ -944,7 +983,10 @@ again:
btrfs_release_path(path);
goto again;
}
- BUG_ON(ret < 0);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
leaf = path->nodes[0];
fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
@@ -962,8 +1004,8 @@ again:
ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
root->root_key.objectid,
- ino, orig_offset);
- BUG_ON(ret);
+ ino, orig_offset, 0);
+ BUG_ON(ret); /* -ENOMEM */
if (split == start) {
key.offset = start;
@@ -989,8 +1031,8 @@ again:
del_nr++;
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
- ino, orig_offset);
- BUG_ON(ret);
+ ino, orig_offset, 0);
+ BUG_ON(ret); /* -ENOMEM */
}
other_start = 0;
other_end = start;
@@ -1006,8 +1048,8 @@ again:
del_nr++;
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
- ino, orig_offset);
- BUG_ON(ret);
+ ino, orig_offset, 0);
+ BUG_ON(ret); /* -ENOMEM */
}
if (del_nr == 0) {
fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -1025,7 +1067,10 @@ again:
btrfs_mark_buffer_dirty(leaf);
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
- BUG_ON(ret);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
}
out:
btrfs_free_path(path);
@@ -1081,7 +1126,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
again:
for (i = 0; i < num_pages; i++) {
pages[i] = find_or_create_page(inode->i_mapping, index + i,
- mask);
+ mask | __GFP_WRITE);
if (!pages[i]) {
faili = i - 1;
err = -ENOMEM;
@@ -1105,8 +1150,7 @@ again:
if (start_pos < inode->i_size) {
struct btrfs_ordered_extent *ordered;
lock_extent_bits(&BTRFS_I(inode)->io_tree,
- start_pos, last_pos - 1, 0, &cached_state,
- GFP_NOFS);
+ start_pos, last_pos - 1, 0, &cached_state);
ordered = btrfs_lookup_first_ordered_extent(inode,
last_pos - 1);
if (ordered &&
@@ -1136,7 +1180,8 @@ again:
GFP_NOFS);
}
for (i = 0; i < num_pages; i++) {
- clear_page_dirty_for_io(pages[i]);
+ if (clear_page_dirty_for_io(pages[i]))
+ account_page_redirty(pages[i]);
set_page_extent_mapped(pages[i]);
WARN_ON(!PageLocked(pages[i]));
}
@@ -1273,7 +1318,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
dirty_pages);
if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
btrfs_btree_balance_dirty(root, 1);
- btrfs_throttle(root);
pos += copied;
num_written += copied;
@@ -1290,7 +1334,6 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
loff_t *ppos, size_t count, size_t ocount)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = fdentry(file)->d_inode;
struct iov_iter i;
ssize_t written;
ssize_t written_buffered;
@@ -1300,18 +1343,6 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
count, ocount);
- /*
- * the generic O_DIRECT will update in-memory i_size after the
- * DIOs are done. But our endio handlers that update the on
- * disk i_size never update past the in memory i_size. So we
- * need one more update here to catch any additions to the
- * file
- */
- if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
- btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
- mark_inode_dirty(inode);
- }
-
if (written < 0 || written == count)
return written;
@@ -1389,12 +1420,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
goto out;
}
- err = btrfs_update_time(file);
+ err = file_update_time(file);
if (err) {
mutex_unlock(&inode->i_mutex);
goto out;
}
- BTRFS_I(inode)->sequence++;
start_pos = round_down(pos, root->sectorsize);
if (start_pos > i_size_read(inode)) {
@@ -1451,8 +1481,8 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
* flush down new bytes that may have been written if the
* application were using truncate to replace a file in place.
*/
- if (BTRFS_I(inode)->ordered_data_close) {
- BTRFS_I(inode)->ordered_data_close = 0;
+ if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ &BTRFS_I(inode)->runtime_flags)) {
btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
filemap_flush(inode->i_mapping);
@@ -1483,14 +1513,15 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trace_btrfs_sync_file(file, datasync);
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (ret)
- return ret;
mutex_lock(&inode->i_mutex);
- /* we wait first, since the writeback may change the inode */
+ /*
+ * we wait first, since the writeback may change the inode, also wait
+ * ordered range does a filemape_write_and_wait_range which is why we
+ * don't do it above like other file systems.
+ */
root->log_batch++;
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ btrfs_wait_ordered_range(inode, start, end);
root->log_batch++;
/*
@@ -1508,7 +1539,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* syncing
*/
smp_mb();
- if (BTRFS_I(inode)->last_trans <=
+ if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
+ BTRFS_I(inode)->last_trans <=
root->fs_info->last_trans_committed) {
BTRFS_I(inode)->last_trans = 0;
mutex_unlock(&inode->i_mutex);
@@ -1605,6 +1637,14 @@ static long btrfs_fallocate(struct file *file, int mode,
return -EOPNOTSUPP;
/*
+ * Make sure we have enough space before we do the
+ * allocation.
+ */
+ ret = btrfs_check_data_free_space(inode, len);
+ if (ret)
+ return ret;
+
+ /*
* wait for ordered IO before we have any locks. We'll loop again
* below with the locks held.
*/
@@ -1630,7 +1670,7 @@ static long btrfs_fallocate(struct file *file, int mode,
* transaction
*/
lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
- locked_end, 0, &cached_state, GFP_NOFS);
+ locked_end, 0, &cached_state);
ordered = btrfs_lookup_first_ordered_extent(inode,
alloc_end - 1);
if (ordered &&
@@ -1659,7 +1699,13 @@ static long btrfs_fallocate(struct file *file, int mode,
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
alloc_end - cur_offset, 0);
- BUG_ON(IS_ERR_OR_NULL(em));
+ if (IS_ERR_OR_NULL(em)) {
+ if (!em)
+ ret = -ENOMEM;
+ else
+ ret = PTR_ERR(em);
+ break;
+ }
last_byte = min(extent_map_end(em), alloc_end);
actual_end = min_t(u64, extent_map_end(em), offset + len);
last_byte = (last_byte + mask) & ~mask;
@@ -1667,27 +1713,12 @@ static long btrfs_fallocate(struct file *file, int mode,
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-
- /*
- * Make sure we have enough space before we do the
- * allocation.
- */
- ret = btrfs_check_data_free_space(inode, last_byte -
- cur_offset);
- if (ret) {
- free_extent_map(em);
- break;
- }
-
ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
last_byte - cur_offset,
1 << inode->i_blkbits,
offset + len,
&alloc_hint);
- /* Let go of our reservation. */
- btrfs_free_reserved_data_space(inode, last_byte -
- cur_offset);
if (ret < 0) {
free_extent_map(em);
break;
@@ -1715,6 +1746,8 @@ static long btrfs_fallocate(struct file *file, int mode,
&cached_state, GFP_NOFS);
out:
mutex_unlock(&inode->i_mutex);
+ /* Let go of our reservation. */
+ btrfs_free_reserved_data_space(inode, len);
return ret;
}
@@ -1742,7 +1775,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
return -ENXIO;
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
- &cached_state, GFP_NOFS);
+ &cached_state);
/*
* Delalloc is such a pain. If we have a hole and we have pending
@@ -1761,7 +1794,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
start - root->sectorsize,
root->sectorsize, 0);
if (IS_ERR(em)) {
- ret = -ENXIO;
+ ret = PTR_ERR(em);
goto out;
}
last_end = em->start + em->len;
@@ -1773,7 +1806,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
while (1) {
em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
if (IS_ERR(em)) {
- ret = -ENXIO;
+ ret = PTR_ERR(em);
break;
}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index ec23d43d0c3..6c4e2baa929 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -33,6 +33,8 @@
static int link_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info);
+static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info);
static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_path *path,
@@ -75,7 +77,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
return ERR_PTR(-ENOENT);
}
- inode->i_mapping->flags &= ~__GFP_FS;
+ mapping_set_gfp_mask(inode->i_mapping,
+ mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
return inode;
}
@@ -230,11 +233,13 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
if (ret) {
trans->block_rsv = rsv;
- WARN_ON(1);
+ btrfs_abort_transaction(trans, root, ret);
return ret;
}
ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
trans->block_rsv = rsv;
return ret;
@@ -319,9 +324,11 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl)
io_ctl_unmap_page(io_ctl);
for (i = 0; i < io_ctl->num_pages; i++) {
- ClearPageChecked(io_ctl->pages[i]);
- unlock_page(io_ctl->pages[i]);
- page_cache_release(io_ctl->pages[i]);
+ if (io_ctl->pages[i]) {
+ ClearPageChecked(io_ctl->pages[i]);
+ unlock_page(io_ctl->pages[i]);
+ page_cache_release(io_ctl->pages[i]);
+ }
}
}
@@ -361,7 +368,7 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
{
- u64 *val;
+ __le64 *val;
io_ctl_map_page(io_ctl, 1);
@@ -384,7 +391,7 @@ static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
{
- u64 *gen;
+ __le64 *gen;
/*
* Skip the crc area. If we don't check crcs then we just have a 64bit
@@ -423,7 +430,7 @@ static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
}
if (index == 0)
- offset = sizeof(u32) * io_ctl->num_pages;;
+ offset = sizeof(u32) * io_ctl->num_pages;
crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
PAGE_CACHE_SIZE - offset);
@@ -580,6 +587,44 @@ static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
return 0;
}
+/*
+ * Since we attach pinned extents after the fact we can have contiguous sections
+ * of free space that are split up in entries. This poses a problem with the
+ * tree logging stuff since it could have allocated across what appears to be 2
+ * entries since we would have merged the entries when adding the pinned extents
+ * back to the free space cache. So run through the space cache that we just
+ * loaded and merge contiguous entries. This will make the log replay stuff not
+ * blow up and it will make for nicer allocator behavior.
+ */
+static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
+{
+ struct btrfs_free_space *e, *prev = NULL;
+ struct rb_node *n;
+
+again:
+ spin_lock(&ctl->tree_lock);
+ for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
+ e = rb_entry(n, struct btrfs_free_space, offset_index);
+ if (!prev)
+ goto next;
+ if (e->bitmap || prev->bitmap)
+ goto next;
+ if (prev->offset + prev->bytes == e->offset) {
+ unlink_free_space(ctl, prev);
+ unlink_free_space(ctl, e);
+ prev->bytes += e->bytes;
+ kmem_cache_free(btrfs_free_space_cachep, e);
+ link_free_space(ctl, prev);
+ prev = NULL;
+ spin_unlock(&ctl->tree_lock);
+ goto again;
+ }
+next:
+ prev = e;
+ }
+ spin_unlock(&ctl->tree_lock);
+}
+
int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
struct btrfs_free_space_ctl *ctl,
struct btrfs_path *path, u64 offset)
@@ -635,7 +680,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
if (!num_entries)
return 0;
- io_ctl_init(&io_ctl, inode, root);
+ ret = io_ctl_init(&io_ctl, inode, root);
+ if (ret)
+ return ret;
+
ret = readahead_cache(inode);
if (ret)
goto out;
@@ -719,6 +767,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
}
io_ctl_drop_pages(&io_ctl);
+ merge_space_tree(ctl);
ret = 1;
out:
io_ctl_free(&io_ctl);
@@ -741,13 +790,6 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
u64 used = btrfs_block_group_used(&block_group->item);
/*
- * If we're unmounting then just return, since this does a search on the
- * normal root and not the commit root and we could deadlock.
- */
- if (btrfs_fs_closing(fs_info))
- return 0;
-
- /*
* If this block group has been marked to be cleared for one reason or
* another then we can't trust the on disk cache, so just return.
*/
@@ -761,6 +803,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
path = btrfs_alloc_path();
if (!path)
return 0;
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
inode = lookup_free_space_inode(root, block_group, path);
if (IS_ERR(inode)) {
@@ -772,6 +816,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
spin_lock(&block_group->lock);
if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
spin_unlock(&block_group->lock);
+ btrfs_free_path(path);
goto out;
}
spin_unlock(&block_group->lock);
@@ -838,7 +883,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
struct io_ctl io_ctl;
struct list_head bitmap_list;
struct btrfs_key key;
- u64 start, end, len;
+ u64 start, extent_start, extent_end, len;
int entries = 0;
int bitmaps = 0;
int ret;
@@ -849,7 +894,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
if (!i_size_read(inode))
return -1;
- io_ctl_init(&io_ctl, inode, root);
+ ret = io_ctl_init(&io_ctl, inode, root);
+ if (ret)
+ return -1;
/* Get the cluster for this block_group if it exists */
if (block_group && !list_empty(&block_group->cluster_list))
@@ -857,24 +904,11 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
struct btrfs_free_cluster,
block_group_list);
- /*
- * We shouldn't have switched the pinned extents yet so this is the
- * right one
- */
- unpin = root->fs_info->pinned_extents;
-
/* Lock all pages first so we can lock the extent safely. */
io_ctl_prepare_pages(&io_ctl, inode, 0);
lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
- 0, &cached_state, GFP_NOFS);
-
- /*
- * When searching for pinned extents, we need to start at our start
- * offset.
- */
- if (block_group)
- start = block_group->key.objectid;
+ 0, &cached_state);
node = rb_first(&ctl->free_space_offset);
if (!node && cluster) {
@@ -918,9 +952,20 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
* We want to add any pinned extents to our free space cache
* so we don't leak the space
*/
+
+ /*
+ * We shouldn't have switched the pinned extents yet so this is the
+ * right one
+ */
+ unpin = root->fs_info->pinned_extents;
+
+ if (block_group)
+ start = block_group->key.objectid;
+
while (block_group && (start < block_group->key.objectid +
block_group->key.offset)) {
- ret = find_first_extent_bit(unpin, start, &start, &end,
+ ret = find_first_extent_bit(unpin, start,
+ &extent_start, &extent_end,
EXTENT_DIRTY);
if (ret) {
ret = 0;
@@ -928,20 +973,21 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
}
/* This pinned extent is out of our range */
- if (start >= block_group->key.objectid +
+ if (extent_start >= block_group->key.objectid +
block_group->key.offset)
break;
- len = block_group->key.objectid +
- block_group->key.offset - start;
- len = min(len, end + 1 - start);
+ extent_start = max(extent_start, start);
+ extent_end = min(block_group->key.objectid +
+ block_group->key.offset, extent_end + 1);
+ len = extent_end - extent_start;
entries++;
- ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
+ ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL);
if (ret)
goto out_nospc;
- start = end + 1;
+ start = extent_end;
}
/* Write out the bitmaps */
@@ -968,9 +1014,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
goto out;
- ret = filemap_write_and_wait(inode->i_mapping);
- if (ret)
- goto out;
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
key.offset = offset;
@@ -1061,7 +1105,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
spin_unlock(&block_group->lock);
ret = 0;
#ifdef DEBUG
- printk(KERN_ERR "btrfs: failed to write free space cace "
+ printk(KERN_ERR "btrfs: failed to write free space cache "
"for block group %llu\n", block_group->key.objectid);
#endif
}
@@ -1499,29 +1543,26 @@ again:
end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
/*
- * XXX - this can go away after a few releases.
- *
- * since the only user of btrfs_remove_free_space is the tree logging
- * stuff, and the only way to test that is under crash conditions, we
- * want to have this debug stuff here just in case somethings not
- * working. Search the bitmap for the space we are trying to use to
- * make sure its actually there. If its not there then we need to stop
- * because something has gone wrong.
+ * We need to search for bits in this bitmap. We could only cover some
+ * of the extent in this bitmap thanks to how we add space, so we need
+ * to search for as much as it as we can and clear that amount, and then
+ * go searching for the next bit.
*/
search_start = *offset;
- search_bytes = *bytes;
+ search_bytes = ctl->unit;
search_bytes = min(search_bytes, end - search_start + 1);
ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
BUG_ON(ret < 0 || search_start != *offset);
- if (*offset > bitmap_info->offset && *offset + *bytes > end) {
- bitmap_clear_bits(ctl, bitmap_info, *offset, end - *offset + 1);
- *bytes -= end - *offset + 1;
- *offset = end + 1;
- } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
- bitmap_clear_bits(ctl, bitmap_info, *offset, *bytes);
- *bytes = 0;
- }
+ /* We may have found more bits than what we need */
+ search_bytes = min(search_bytes, *bytes);
+
+ /* Cannot clear past the end of the bitmap */
+ search_bytes = min(search_bytes, end - search_start + 1);
+
+ bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
+ *offset += search_bytes;
+ *bytes -= search_bytes;
if (*bytes) {
struct rb_node *next = rb_next(&bitmap_info->offset_index);
@@ -1552,7 +1593,7 @@ again:
* everything over again.
*/
search_start = *offset;
- search_bytes = *bytes;
+ search_bytes = ctl->unit;
ret = search_bitmap(ctl, bitmap_info, &search_start,
&search_bytes);
if (ret < 0 || search_start != *offset)
@@ -1835,12 +1876,14 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *info;
- struct btrfs_free_space *next_info = NULL;
int ret = 0;
spin_lock(&ctl->tree_lock);
again:
+ if (!bytes)
+ goto out_lock;
+
info = tree_search_offset(ctl, offset, 0, 0);
if (!info) {
/*
@@ -1861,94 +1904,54 @@ again:
}
}
- if (info->bytes < bytes && rb_next(&info->offset_index)) {
- u64 end;
- next_info = rb_entry(rb_next(&info->offset_index),
- struct btrfs_free_space,
- offset_index);
-
- if (next_info->bitmap)
- end = next_info->offset +
- BITS_PER_BITMAP * ctl->unit - 1;
- else
- end = next_info->offset + next_info->bytes;
-
- if (next_info->bytes < bytes ||
- next_info->offset > offset || offset > end) {
- printk(KERN_CRIT "Found free space at %llu, size %llu,"
- " trying to use %llu\n",
- (unsigned long long)info->offset,
- (unsigned long long)info->bytes,
- (unsigned long long)bytes);
- WARN_ON(1);
- ret = -EINVAL;
- goto out_lock;
- }
-
- info = next_info;
- }
-
- if (info->bytes == bytes) {
- unlink_free_space(ctl, info);
- if (info->bitmap) {
- kfree(info->bitmap);
- ctl->total_bitmaps--;
- }
- kmem_cache_free(btrfs_free_space_cachep, info);
- ret = 0;
- goto out_lock;
- }
-
- if (!info->bitmap && info->offset == offset) {
+ if (!info->bitmap) {
unlink_free_space(ctl, info);
- info->offset += bytes;
- info->bytes -= bytes;
- ret = link_free_space(ctl, info);
- WARN_ON(ret);
- goto out_lock;
- }
+ if (offset == info->offset) {
+ u64 to_free = min(bytes, info->bytes);
+
+ info->bytes -= to_free;
+ info->offset += to_free;
+ if (info->bytes) {
+ ret = link_free_space(ctl, info);
+ WARN_ON(ret);
+ } else {
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ }
- if (!info->bitmap && info->offset <= offset &&
- info->offset + info->bytes >= offset + bytes) {
- u64 old_start = info->offset;
- /*
- * we're freeing space in the middle of the info,
- * this can happen during tree log replay
- *
- * first unlink the old info and then
- * insert it again after the hole we're creating
- */
- unlink_free_space(ctl, info);
- if (offset + bytes < info->offset + info->bytes) {
- u64 old_end = info->offset + info->bytes;
+ offset += to_free;
+ bytes -= to_free;
+ goto again;
+ } else {
+ u64 old_end = info->bytes + info->offset;
- info->offset = offset + bytes;
- info->bytes = old_end - info->offset;
+ info->bytes = offset - info->offset;
ret = link_free_space(ctl, info);
WARN_ON(ret);
if (ret)
goto out_lock;
- } else {
- /* the hole we're creating ends at the end
- * of the info struct, just free the info
- */
- kmem_cache_free(btrfs_free_space_cachep, info);
- }
- spin_unlock(&ctl->tree_lock);
- /* step two, insert a new info struct to cover
- * anything before the hole
- */
- ret = btrfs_add_free_space(block_group, old_start,
- offset - old_start);
- WARN_ON(ret);
- goto out;
+ /* Not enough bytes in this entry to satisfy us */
+ if (old_end < offset + bytes) {
+ bytes -= old_end - offset;
+ offset = old_end;
+ goto again;
+ } else if (old_end == offset + bytes) {
+ /* all done */
+ goto out_lock;
+ }
+ spin_unlock(&ctl->tree_lock);
+
+ ret = btrfs_add_free_space(block_group, offset + bytes,
+ old_end - (offset + bytes));
+ WARN_ON(ret);
+ goto out;
+ }
}
ret = remove_from_bitmap(ctl, info, &offset, &bytes);
if (ret == -EAGAIN)
goto again;
- BUG_ON(ret);
+ BUG_ON(ret); /* logic error */
out_lock:
spin_unlock(&ctl->tree_lock);
out:
@@ -2236,7 +2239,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
if (entry->bitmap) {
ret = btrfs_alloc_from_bitmap(block_group,
cluster, entry, bytes,
- min_start);
+ cluster->window_start);
if (ret == 0) {
node = rb_next(&entry->offset_index);
if (!node)
@@ -2245,6 +2248,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
offset_index);
continue;
}
+ cluster->window_start += bytes;
} else {
ret = entry->offset;
@@ -2283,23 +2287,23 @@ out:
static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
struct btrfs_free_space *entry,
struct btrfs_free_cluster *cluster,
- u64 offset, u64 bytes, u64 min_bytes)
+ u64 offset, u64 bytes,
+ u64 cont1_bytes, u64 min_bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
unsigned long next_zero;
unsigned long i;
- unsigned long search_bits;
- unsigned long total_bits;
+ unsigned long want_bits;
+ unsigned long min_bits;
unsigned long found_bits;
unsigned long start = 0;
unsigned long total_found = 0;
int ret;
- bool found = false;
i = offset_to_bit(entry->offset, block_group->sectorsize,
max_t(u64, offset, entry->offset));
- search_bits = bytes_to_bits(bytes, block_group->sectorsize);
- total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
+ want_bits = bytes_to_bits(bytes, block_group->sectorsize);
+ min_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
again:
found_bits = 0;
@@ -2308,7 +2312,7 @@ again:
i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
next_zero = find_next_zero_bit(entry->bitmap,
BITS_PER_BITMAP, i);
- if (next_zero - i >= search_bits) {
+ if (next_zero - i >= min_bits) {
found_bits = next_zero - i;
break;
}
@@ -2318,10 +2322,9 @@ again:
if (!found_bits)
return -ENOSPC;
- if (!found) {
+ if (!total_found) {
start = i;
cluster->max_size = 0;
- found = true;
}
total_found += found_bits;
@@ -2329,13 +2332,8 @@ again:
if (cluster->max_size < found_bits * block_group->sectorsize)
cluster->max_size = found_bits * block_group->sectorsize;
- if (total_found < total_bits) {
- i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
- if (i - start > total_bits * 2) {
- total_found = 0;
- cluster->max_size = 0;
- found = false;
- }
+ if (total_found < want_bits || cluster->max_size < cont1_bytes) {
+ i = next_zero + 1;
goto again;
}
@@ -2344,30 +2342,33 @@ again:
rb_erase(&entry->offset_index, &ctl->free_space_offset);
ret = tree_insert_offset(&cluster->root, entry->offset,
&entry->offset_index, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -EEXIST; Logic error */
+ trace_btrfs_setup_cluster(block_group, cluster,
+ total_found * block_group->sectorsize, 1);
return 0;
}
/*
* This searches the block group for just extents to fill the cluster with.
+ * Try to find a cluster with at least bytes total bytes, at least one
+ * extent of cont1_bytes, and other clusters of at least min_bytes.
*/
static noinline int
setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster,
struct list_head *bitmaps, u64 offset, u64 bytes,
- u64 min_bytes)
+ u64 cont1_bytes, u64 min_bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *first = NULL;
struct btrfs_free_space *entry = NULL;
- struct btrfs_free_space *prev = NULL;
struct btrfs_free_space *last;
struct rb_node *node;
u64 window_start;
u64 window_free;
u64 max_extent;
- u64 max_gap = 128 * 1024;
+ u64 total_size = 0;
entry = tree_search_offset(ctl, offset, 0, 1);
if (!entry)
@@ -2377,8 +2378,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
* We don't want bitmaps, so just move along until we find a normal
* extent entry.
*/
- while (entry->bitmap) {
- if (list_empty(&entry->list))
+ while (entry->bitmap || entry->bytes < min_bytes) {
+ if (entry->bitmap && list_empty(&entry->list))
list_add_tail(&entry->list, bitmaps);
node = rb_next(&entry->offset_index);
if (!node)
@@ -2391,12 +2392,9 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
max_extent = entry->bytes;
first = entry;
last = entry;
- prev = entry;
- while (window_free <= min_bytes) {
- node = rb_next(&entry->offset_index);
- if (!node)
- return -ENOSPC;
+ for (node = rb_next(&entry->offset_index); node;
+ node = rb_next(&entry->offset_index)) {
entry = rb_entry(node, struct btrfs_free_space, offset_index);
if (entry->bitmap) {
@@ -2405,26 +2403,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
continue;
}
- /*
- * we haven't filled the empty size and the window is
- * very large. reset and try again
- */
- if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
- entry->offset - window_start > (min_bytes * 2)) {
- first = entry;
- window_start = entry->offset;
- window_free = entry->bytes;
- last = entry;
+ if (entry->bytes < min_bytes)
+ continue;
+
+ last = entry;
+ window_free += entry->bytes;
+ if (entry->bytes > max_extent)
max_extent = entry->bytes;
- } else {
- last = entry;
- window_free += entry->bytes;
- if (entry->bytes > max_extent)
- max_extent = entry->bytes;
- }
- prev = entry;
}
+ if (window_free < bytes || max_extent < cont1_bytes)
+ return -ENOSPC;
+
cluster->window_start = first->offset;
node = &first->offset_index;
@@ -2438,17 +2428,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
entry = rb_entry(node, struct btrfs_free_space, offset_index);
node = rb_next(&entry->offset_index);
- if (entry->bitmap)
+ if (entry->bitmap || entry->bytes < min_bytes)
continue;
rb_erase(&entry->offset_index, &ctl->free_space_offset);
ret = tree_insert_offset(&cluster->root, entry->offset,
&entry->offset_index, 0);
- BUG_ON(ret);
+ total_size += entry->bytes;
+ BUG_ON(ret); /* -EEXIST; Logic error */
} while (node && entry != last);
cluster->max_size = max_extent;
-
+ trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
return 0;
}
@@ -2460,7 +2451,7 @@ static noinline int
setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster,
struct list_head *bitmaps, u64 offset, u64 bytes,
- u64 min_bytes)
+ u64 cont1_bytes, u64 min_bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *entry;
@@ -2482,10 +2473,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
}
list_for_each_entry(entry, bitmaps, list) {
- if (entry->bytes < min_bytes)
+ if (entry->bytes < bytes)
continue;
ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
- bytes, min_bytes);
+ bytes, cont1_bytes, min_bytes);
if (!ret)
return 0;
}
@@ -2499,7 +2490,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
/*
* here we try to find a cluster of blocks in a block group. The goal
- * is to find at least bytes free and up to empty_size + bytes free.
+ * is to find at least bytes+empty_size.
* We might not find them all in one contiguous area.
*
* returns zero and sets up cluster if things worked out, otherwise
@@ -2515,23 +2506,24 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
struct btrfs_free_space *entry, *tmp;
LIST_HEAD(bitmaps);
u64 min_bytes;
+ u64 cont1_bytes;
int ret;
- /* for metadata, allow allocates with more holes */
+ /*
+ * Choose the minimum extent size we'll require for this
+ * cluster. For SSD_SPREAD, don't allow any fragmentation.
+ * For metadata, allow allocates with smaller extents. For
+ * data, keep it dense.
+ */
if (btrfs_test_opt(root, SSD_SPREAD)) {
- min_bytes = bytes + empty_size;
+ cont1_bytes = min_bytes = bytes + empty_size;
} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
- /*
- * we want to do larger allocations when we are
- * flushing out the delayed refs, it helps prevent
- * making more work as we go along.
- */
- if (trans->transaction->delayed_refs.flushing)
- min_bytes = max(bytes, (bytes + empty_size) >> 1);
- else
- min_bytes = max(bytes, (bytes + empty_size) >> 4);
- } else
- min_bytes = max(bytes, (bytes + empty_size) >> 2);
+ cont1_bytes = bytes;
+ min_bytes = block_group->sectorsize;
+ } else {
+ cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
+ min_bytes = block_group->sectorsize;
+ }
spin_lock(&ctl->tree_lock);
@@ -2539,7 +2531,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
* If we know we don't have enough space to make a cluster don't even
* bother doing all the work to try and find one.
*/
- if (ctl->free_space < min_bytes) {
+ if (ctl->free_space < bytes) {
spin_unlock(&ctl->tree_lock);
return -ENOSPC;
}
@@ -2552,11 +2544,17 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
goto out;
}
+ trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
+ min_bytes);
+
+ INIT_LIST_HEAD(&bitmaps);
ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
- bytes, min_bytes);
+ bytes + empty_size,
+ cont1_bytes, min_bytes);
if (ret)
ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
- offset, bytes, min_bytes);
+ offset, bytes + empty_size,
+ cont1_bytes, min_bytes);
/* Clear our temporary list */
list_for_each_entry_safe(entry, tmp, &bitmaps, list)
@@ -2567,6 +2565,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
list_add_tail(&cluster->block_group_list,
&block_group->cluster_list);
cluster->block_group = block_group;
+ } else {
+ trace_btrfs_failed_cluster_setup(block_group);
}
out:
spin_unlock(&cluster->lock);
@@ -2588,17 +2588,57 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
cluster->block_group = NULL;
}
-int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
- u64 *trimmed, u64 start, u64 end, u64 minlen)
+static int do_trimming(struct btrfs_block_group_cache *block_group,
+ u64 *total_trimmed, u64 start, u64 bytes,
+ u64 reserved_start, u64 reserved_bytes)
{
- struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
- struct btrfs_free_space *entry = NULL;
+ struct btrfs_space_info *space_info = block_group->space_info;
struct btrfs_fs_info *fs_info = block_group->fs_info;
- u64 bytes = 0;
- u64 actually_trimmed;
- int ret = 0;
+ int ret;
+ int update = 0;
+ u64 trimmed = 0;
- *trimmed = 0;
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ if (!block_group->ro) {
+ block_group->reserved += reserved_bytes;
+ space_info->bytes_reserved += reserved_bytes;
+ update = 1;
+ }
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+
+ ret = btrfs_error_discard_extent(fs_info->extent_root,
+ start, bytes, &trimmed);
+ if (!ret)
+ *total_trimmed += trimmed;
+
+ btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
+
+ if (update) {
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ if (block_group->ro)
+ space_info->bytes_readonly += reserved_bytes;
+ block_group->reserved -= reserved_bytes;
+ space_info->bytes_reserved -= reserved_bytes;
+ spin_unlock(&space_info->lock);
+ spin_unlock(&block_group->lock);
+ }
+
+ return ret;
+}
+
+static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
+ u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *entry;
+ struct rb_node *node;
+ int ret = 0;
+ u64 extent_start;
+ u64 extent_bytes;
+ u64 bytes;
while (start < end) {
spin_lock(&ctl->tree_lock);
@@ -2609,81 +2649,118 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
}
entry = tree_search_offset(ctl, start, 0, 1);
- if (!entry)
- entry = tree_search_offset(ctl,
- offset_to_bitmap(ctl, start),
- 1, 1);
-
- if (!entry || entry->offset >= end) {
+ if (!entry) {
spin_unlock(&ctl->tree_lock);
break;
}
- if (entry->bitmap) {
- ret = search_bitmap(ctl, entry, &start, &bytes);
- if (!ret) {
- if (start >= end) {
- spin_unlock(&ctl->tree_lock);
- break;
- }
- bytes = min(bytes, end - start);
- bitmap_clear_bits(ctl, entry, start, bytes);
- if (entry->bytes == 0)
- free_bitmap(ctl, entry);
- } else {
- start = entry->offset + BITS_PER_BITMAP *
- block_group->sectorsize;
+ /* skip bitmaps */
+ while (entry->bitmap) {
+ node = rb_next(&entry->offset_index);
+ if (!node) {
spin_unlock(&ctl->tree_lock);
- ret = 0;
- continue;
+ goto out;
}
- } else {
- start = entry->offset;
- bytes = min(entry->bytes, end - start);
- unlink_free_space(ctl, entry);
- kmem_cache_free(btrfs_free_space_cachep, entry);
+ entry = rb_entry(node, struct btrfs_free_space,
+ offset_index);
+ }
+
+ if (entry->offset >= end) {
+ spin_unlock(&ctl->tree_lock);
+ break;
}
+ extent_start = entry->offset;
+ extent_bytes = entry->bytes;
+ start = max(start, extent_start);
+ bytes = min(extent_start + extent_bytes, end) - start;
+ if (bytes < minlen) {
+ spin_unlock(&ctl->tree_lock);
+ goto next;
+ }
+
+ unlink_free_space(ctl, entry);
+ kmem_cache_free(btrfs_free_space_cachep, entry);
+
spin_unlock(&ctl->tree_lock);
- if (bytes >= minlen) {
- struct btrfs_space_info *space_info;
- int update = 0;
-
- space_info = block_group->space_info;
- spin_lock(&space_info->lock);
- spin_lock(&block_group->lock);
- if (!block_group->ro) {
- block_group->reserved += bytes;
- space_info->bytes_reserved += bytes;
- update = 1;
- }
- spin_unlock(&block_group->lock);
- spin_unlock(&space_info->lock);
-
- ret = btrfs_error_discard_extent(fs_info->extent_root,
- start,
- bytes,
- &actually_trimmed);
-
- btrfs_add_free_space(block_group, start, bytes);
- if (update) {
- spin_lock(&space_info->lock);
- spin_lock(&block_group->lock);
- if (block_group->ro)
- space_info->bytes_readonly += bytes;
- block_group->reserved -= bytes;
- space_info->bytes_reserved -= bytes;
- spin_unlock(&space_info->lock);
- spin_unlock(&block_group->lock);
- }
+ ret = do_trimming(block_group, total_trimmed, start, bytes,
+ extent_start, extent_bytes);
+ if (ret)
+ break;
+next:
+ start += bytes;
- if (ret)
- break;
- *trimmed += actually_trimmed;
+ if (fatal_signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ cond_resched();
+ }
+out:
+ return ret;
+}
+
+static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
+ u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *entry;
+ int ret = 0;
+ int ret2;
+ u64 bytes;
+ u64 offset = offset_to_bitmap(ctl, start);
+
+ while (offset < end) {
+ bool next_bitmap = false;
+
+ spin_lock(&ctl->tree_lock);
+
+ if (ctl->free_space < minlen) {
+ spin_unlock(&ctl->tree_lock);
+ break;
+ }
+
+ entry = tree_search_offset(ctl, offset, 1, 0);
+ if (!entry) {
+ spin_unlock(&ctl->tree_lock);
+ next_bitmap = true;
+ goto next;
+ }
+
+ bytes = minlen;
+ ret2 = search_bitmap(ctl, entry, &start, &bytes);
+ if (ret2 || start >= end) {
+ spin_unlock(&ctl->tree_lock);
+ next_bitmap = true;
+ goto next;
+ }
+
+ bytes = min(bytes, end - start);
+ if (bytes < minlen) {
+ spin_unlock(&ctl->tree_lock);
+ goto next;
+ }
+
+ bitmap_clear_bits(ctl, entry, start, bytes);
+ if (entry->bytes == 0)
+ free_bitmap(ctl, entry);
+
+ spin_unlock(&ctl->tree_lock);
+
+ ret = do_trimming(block_group, total_trimmed, start, bytes,
+ start, bytes);
+ if (ret)
+ break;
+next:
+ if (next_bitmap) {
+ offset += BITS_PER_BITMAP * ctl->unit;
+ } else {
+ start += bytes;
+ if (start >= offset + BITS_PER_BITMAP * ctl->unit)
+ offset += BITS_PER_BITMAP * ctl->unit;
}
- start += bytes;
- bytes = 0;
if (fatal_signal_pending(current)) {
ret = -ERESTARTSYS;
@@ -2696,6 +2773,22 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
return ret;
}
+int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen)
+{
+ int ret;
+
+ *trimmed = 0;
+
+ ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
+ if (ret)
+ return ret;
+
+ ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
+
+ return ret;
+}
+
/*
* Find the left-most item in the cache tree, and then return the
* smallest inode number in the item.
@@ -2733,6 +2826,7 @@ u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
int ret;
ret = search_bitmap(ctl, entry, &offset, &count);
+ /* Logic error; Should be empty if it can't find anything */
BUG_ON(ret);
ino = offset;
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index baa74f3db69..a13cf1a96c7 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -19,6 +19,7 @@
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
+#include "print-tree.h"
static int find_name_in_backref(struct btrfs_path *path, const char *name,
int name_len, struct btrfs_inode_ref **ref_ret)
@@ -128,13 +129,14 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_size - (ptr + sub_item_len - item_start));
- ret = btrfs_truncate_item(trans, root, path,
+ btrfs_truncate_item(trans, root, path,
item_size - sub_item_len, 1);
out:
btrfs_free_path(path);
return ret;
}
+/* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */
int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
@@ -165,7 +167,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
goto out;
old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
- ret = btrfs_extend_item(trans, root, path, ins_len);
+ btrfs_extend_item(trans, root, path, ins_len);
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_ref);
ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index f8962a957d6..b1a1c929ba8 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -178,7 +178,7 @@ static void start_caching(struct btrfs_root *root)
tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n",
root->root_key.objectid);
- BUG_ON(IS_ERR(tsk));
+ BUG_ON(IS_ERR(tsk)); /* -ENOMEM */
}
int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
@@ -271,7 +271,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
break;
info = rb_entry(n, struct btrfs_free_space, offset_index);
- BUG_ON(info->bitmap);
+ BUG_ON(info->bitmap); /* Logic error */
if (info->offset > root->cache_progress)
goto free;
@@ -438,15 +438,17 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
trans->bytes_reserved);
if (ret)
goto out;
+ trace_btrfs_space_reservation(root->fs_info, "ino_cache",
+ trans->transid, trans->bytes_reserved, 1);
again:
inode = lookup_free_ino_inode(root, path);
- if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
+ if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) {
ret = PTR_ERR(inode);
goto out_release;
}
if (IS_ERR(inode)) {
- BUG_ON(retry);
+ BUG_ON(retry); /* Logic error */
retry = true;
ret = create_free_ino_inode(root, trans, path);
@@ -457,12 +459,17 @@ again:
BTRFS_I(inode)->generation = 0;
ret = btrfs_update_inode(trans, root, inode);
- WARN_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_put;
+ }
if (i_size_read(inode) > 0) {
ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
goto out_put;
+ }
}
spin_lock(&root->cache_lock);
@@ -498,6 +505,8 @@ again:
out_put:
iput(inode);
out_release:
+ trace_btrfs_space_reservation(root->fs_info, "ino_cache",
+ trans->transid, trans->bytes_reserved, 0);
btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
out:
trans->block_rsv = rsv;
@@ -526,7 +535,7 @@ static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
goto error;
- BUG_ON(ret == 0);
+ BUG_ON(ret == 0); /* Corruption */
if (path->slots[0] > 0) {
slot = path->slots[0] - 1;
l = path->nodes[0];
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fd1a06df5bc..a7d1921ac76 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -89,7 +89,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
static int btrfs_setsize(struct inode *inode, loff_t newsize);
static int btrfs_truncate(struct inode *inode);
-static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
+static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
@@ -150,7 +150,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
inode_add_bytes(inode, size);
ret = btrfs_insert_empty_item(trans, root, path, &key,
datasize);
- BUG_ON(ret);
if (ret) {
err = ret;
goto fail;
@@ -173,9 +172,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
cur_size = min_t(unsigned long, compressed_size,
PAGE_CACHE_SIZE);
- kaddr = kmap_atomic(cpage, KM_USER0);
+ kaddr = kmap_atomic(cpage);
write_extent_buffer(leaf, kaddr, ptr, cur_size);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
i++;
ptr += cur_size;
@@ -187,10 +186,10 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
page = find_get_page(inode->i_mapping,
start >> PAGE_CACHE_SHIFT);
btrfs_set_file_extent_compression(leaf, ei, 0);
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
offset = start & (PAGE_CACHE_SIZE - 1);
write_extent_buffer(leaf, kaddr + offset, ptr, size);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
page_cache_release(page);
}
btrfs_mark_buffer_dirty(leaf);
@@ -206,9 +205,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
* could end up racing with unlink.
*/
BTRFS_I(inode)->disk_i_size = inode->i_size;
- btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, inode);
- return 0;
+ return ret;
fail:
btrfs_free_path(path);
return err;
@@ -250,14 +249,21 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
ret = btrfs_drop_extents(trans, inode, start, aligned_end,
&hint_byte, 1);
- BUG_ON(ret);
+ if (ret)
+ return ret;
if (isize > actual_end)
inline_len = min_t(u64, isize, actual_end);
ret = insert_inline_extent(trans, root, inode, start,
inline_len, compressed_size,
compress_type, compressed_pages);
- BUG_ON(ret);
+ if (ret && ret != -ENOSPC) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ } else if (ret == -ENOSPC) {
+ return 1;
+ }
+
btrfs_delalloc_release_metadata(inode, end + 1 - start);
btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
return 0;
@@ -293,7 +299,7 @@ static noinline int add_async_extent(struct async_cow *cow,
struct async_extent *async_extent;
async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
- BUG_ON(!async_extent);
+ BUG_ON(!async_extent); /* -ENOMEM */
async_extent->start = start;
async_extent->ram_size = ram_size;
async_extent->compressed_size = compressed_size;
@@ -344,8 +350,9 @@ static noinline int compress_file_range(struct inode *inode,
int will_compress;
int compress_type = root->fs_info->compress_type;
- /* if this is a small write inside eof, kick off a defragbot */
- if (end <= BTRFS_I(inode)->disk_i_size && (end - start + 1) < 16 * 1024)
+ /* if this is a small write inside eof, kick off a defrag */
+ if ((end - start + 1) < 16 * 1024 &&
+ (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
btrfs_add_inode_defrag(NULL, inode);
actual_end = min_t(u64, isize, end + 1);
@@ -422,10 +429,10 @@ again:
* sending it down to disk
*/
if (offset) {
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
memset(kaddr + offset, 0,
PAGE_CACHE_SIZE - offset);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
}
will_compress = 1;
}
@@ -433,7 +440,11 @@ again:
cont:
if (start == 0) {
trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto cleanup_and_out;
+ }
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
/* lets try to make an inline extent */
@@ -450,11 +461,11 @@ cont:
total_compressed,
compress_type, pages);
}
- if (ret == 0) {
+ if (ret <= 0) {
/*
- * inline extent creation worked, we don't need
- * to create any more async work items. Unlock
- * and free up our temp pages.
+ * inline extent creation worked or returned error,
+ * we don't need to create any more async work items.
+ * Unlock and free up our temp pages.
*/
extent_clear_unlock_delalloc(inode,
&BTRFS_I(inode)->io_tree,
@@ -547,7 +558,7 @@ cleanup_and_bail_uncompressed:
}
out:
- return 0;
+ return ret;
free_pages_out:
for (i = 0; i < nr_pages_ret; i++) {
@@ -557,6 +568,20 @@ free_pages_out:
kfree(pages);
goto out;
+
+cleanup_and_out:
+ extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+ start, end, NULL,
+ EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_DIRTY |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_SET_WRITEBACK |
+ EXTENT_END_WRITEBACK);
+ if (!trans || IS_ERR(trans))
+ btrfs_error(root->fs_info, ret, "Failed to join transaction");
+ else
+ btrfs_abort_transaction(trans, root, ret);
+ goto free_pages_out;
}
/*
@@ -597,7 +622,7 @@ retry:
lock_extent(io_tree, async_extent->start,
async_extent->start +
- async_extent->ram_size - 1, GFP_NOFS);
+ async_extent->ram_size - 1);
/* allocate blocks */
ret = cow_file_range(inode, async_cow->locked_page,
@@ -606,6 +631,8 @@ retry:
async_extent->ram_size - 1,
&page_started, &nr_written, 0);
+ /* JDM XXX */
+
/*
* if page_started, cow_file_range inserted an
* inline extent and took care of all the unlocking
@@ -625,18 +652,21 @@ retry:
}
lock_extent(io_tree, async_extent->start,
- async_extent->start + async_extent->ram_size - 1,
- GFP_NOFS);
+ async_extent->start + async_extent->ram_size - 1);
trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
- trans->block_rsv = &root->fs_info->delalloc_block_rsv;
- ret = btrfs_reserve_extent(trans, root,
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ } else {
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+ ret = btrfs_reserve_extent(trans, root,
async_extent->compressed_size,
async_extent->compressed_size,
- 0, alloc_hint,
- (u64)-1, &ins, 1);
- btrfs_end_transaction(trans, root);
+ 0, alloc_hint, &ins, 1);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_end_transaction(trans, root);
+ }
if (ret) {
int i;
@@ -649,8 +679,10 @@ retry:
async_extent->pages = NULL;
unlock_extent(io_tree, async_extent->start,
async_extent->start +
- async_extent->ram_size - 1, GFP_NOFS);
- goto retry;
+ async_extent->ram_size - 1);
+ if (ret == -ENOSPC)
+ goto retry;
+ goto out_free; /* JDM: Requeue? */
}
/*
@@ -662,7 +694,7 @@ retry:
async_extent->ram_size - 1, 0);
em = alloc_extent_map();
- BUG_ON(!em);
+ BUG_ON(!em); /* -ENOMEM */
em->start = async_extent->start;
em->len = async_extent->ram_size;
em->orig_start = em->start;
@@ -694,7 +726,7 @@ retry:
ins.offset,
BTRFS_ORDERED_COMPRESSED,
async_extent->compress_type);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
/*
* clear dirty, set writeback and unlock the pages.
@@ -716,13 +748,17 @@ retry:
ins.offset, async_extent->pages,
async_extent->nr_pages);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
alloc_hint = ins.objectid + ins.offset;
kfree(async_extent);
cond_resched();
}
-
- return 0;
+ ret = 0;
+out:
+ return ret;
+out_free:
+ kfree(async_extent);
+ goto out;
}
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
@@ -791,7 +827,18 @@ static noinline int cow_file_range(struct inode *inode,
BUG_ON(btrfs_is_free_space_inode(root, inode));
trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ extent_clear_unlock_delalloc(inode,
+ &BTRFS_I(inode)->io_tree,
+ start, end, locked_page,
+ EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_DIRTY |
+ EXTENT_SET_WRITEBACK |
+ EXTENT_END_WRITEBACK);
+ return PTR_ERR(trans);
+ }
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
num_bytes = (end - start + blocksize) & ~(blocksize - 1);
@@ -800,7 +847,8 @@ static noinline int cow_file_range(struct inode *inode,
ret = 0;
/* if this is a small write inside eof, kick off defrag */
- if (end <= BTRFS_I(inode)->disk_i_size && num_bytes < 64 * 1024)
+ if (num_bytes < 64 * 1024 &&
+ (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
btrfs_add_inode_defrag(trans, inode);
if (start == 0) {
@@ -821,8 +869,10 @@ static noinline int cow_file_range(struct inode *inode,
*nr_written = *nr_written +
(end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
*page_started