aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/acl.c10
-rw-r--r--fs/btrfs/async-thread.c9
-rw-r--r--fs/btrfs/backref.c355
-rw-r--r--fs/btrfs/backref.h21
-rw-r--r--fs/btrfs/btrfs_inode.h33
-rw-r--r--fs/btrfs/check-integrity.c54
-rw-r--r--fs/btrfs/compression.c20
-rw-r--r--fs/btrfs/ctree.c1197
-rw-r--r--fs/btrfs/ctree.h678
-rw-r--r--fs/btrfs/delayed-inode.c60
-rw-r--r--fs/btrfs/delayed-inode.h2
-rw-r--r--fs/btrfs/delayed-ref.c203
-rw-r--r--fs/btrfs/delayed-ref.h68
-rw-r--r--fs/btrfs/dev-replace.c856
-rw-r--r--fs/btrfs/dev-replace.h44
-rw-r--r--fs/btrfs/dir-item.c59
-rw-r--r--fs/btrfs/disk-io.c570
-rw-r--r--fs/btrfs/disk-io.h14
-rw-r--r--fs/btrfs/extent-tree.c994
-rw-r--r--fs/btrfs/extent_io.c251
-rw-r--r--fs/btrfs/extent_io.h27
-rw-r--r--fs/btrfs/extent_map.c64
-rw-r--r--fs/btrfs/extent_map.h11
-rw-r--r--fs/btrfs/file-item.c38
-rw-r--r--fs/btrfs/file.c850
-rw-r--r--fs/btrfs/free-space-cache.c83
-rw-r--r--fs/btrfs/hash.h10
-rw-r--r--fs/btrfs/inode-item.c285
-rw-r--r--fs/btrfs/inode-map.c5
-rw-r--r--fs/btrfs/inode.c1378
-rw-r--r--fs/btrfs/ioctl.c976
-rw-r--r--fs/btrfs/ioctl.h143
-rw-r--r--fs/btrfs/locking.c16
-rw-r--r--fs/btrfs/math.h44
-rw-r--r--fs/btrfs/ordered-data.c200
-rw-r--r--fs/btrfs/ordered-data.h21
-rw-r--r--fs/btrfs/print-tree.c3
-rw-r--r--fs/btrfs/qgroup.c1608
-rw-r--r--fs/btrfs/reada.c49
-rw-r--r--fs/btrfs/relocation.c56
-rw-r--r--fs/btrfs/root-tree.c104
-rw-r--r--fs/btrfs/scrub.c1883
-rw-r--r--fs/btrfs/send.c4695
-rw-r--r--fs/btrfs/send.h134
-rw-r--r--fs/btrfs/struct-funcs.c196
-rw-r--r--fs/btrfs/super.c200
-rw-r--r--fs/btrfs/transaction.c526
-rw-r--r--fs/btrfs/transaction.h30
-rw-r--r--fs/btrfs/tree-log.c1004
-rw-r--r--fs/btrfs/ulist.c7
-rw-r--r--fs/btrfs/ulist.h9
-rw-r--r--fs/btrfs/volumes.c1122
-rw-r--r--fs/btrfs/volumes.h41
-rw-r--r--fs/btrfs/xattr.c13
-rw-r--r--fs/btrfs/zlib.c8
56 files changed, 17636 insertions, 3703 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 0c4fa2befae..7df3e0f0ee5 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
- reada.o backref.o ulist.o
+ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 761e2cd8fed..e15d2b0d8d3 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -61,7 +61,7 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
size = __btrfs_getxattr(inode, name, value, size);
}
if (size > 0) {
- acl = posix_acl_from_xattr(value, size);
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
} else if (size == -ENOENT || size == -ENODATA || size == 0) {
/* FIXME, who returns -ENOENT? I think nobody */
acl = NULL;
@@ -91,7 +91,7 @@ static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name,
return PTR_ERR(acl);
if (acl == NULL)
return -ENODATA;
- ret = posix_acl_to_xattr(acl, value, size);
+ ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
posix_acl_release(acl);
return ret;
@@ -121,6 +121,8 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
ret = posix_acl_equiv_mode(acl, &inode->i_mode);
if (ret < 0)
return ret;
+ if (ret == 0)
+ acl = NULL;
}
ret = 0;
break;
@@ -141,7 +143,7 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
goto out;
}
- ret = posix_acl_to_xattr(acl, value, size);
+ ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
if (ret < 0)
goto out;
}
@@ -169,7 +171,7 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
return -EOPNOTSUPP;
if (value) {
- acl = posix_acl_from_xattr(value, size);
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
if (IS_ERR(acl))
return PTR_ERR(acl);
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 42704149b72..58b7d14b08e 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -206,10 +206,17 @@ static noinline void run_ordered_completions(struct btrfs_workers *workers,
work->ordered_func(work);
- /* now take the lock again and call the freeing code */
+ /* now take the lock again and drop our item from the list */
spin_lock(&workers->order_lock);
list_del(&work->order_list);
+ spin_unlock(&workers->order_lock);
+
+ /*
+ * we don't want to call the ordered free functions
+ * with the lock held though
+ */
work->ordered_free(work);
+ spin_lock(&workers->order_lock);
}
spin_unlock(&workers->order_lock);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index a383c18e74e..04edf69be87 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -16,6 +16,7 @@
* Boston, MA 021110-1307, USA.
*/
+#include <linux/vmalloc.h>
#include "ctree.h"
#include "disk-io.h"
#include "backref.h"
@@ -231,7 +232,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
}
if (!ret) {
ret = ulist_add(parents, eb->start,
- (unsigned long)eie, GFP_NOFS);
+ (uintptr_t)eie, GFP_NOFS);
if (ret < 0)
break;
if (!extent_item_pos) {
@@ -282,9 +283,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
goto out;
}
- rcu_read_lock();
- root_level = btrfs_header_level(root->node);
- rcu_read_unlock();
+ root_level = btrfs_old_root_level(root, time_seq);
if (root_level + 1 == level)
goto out;
@@ -363,8 +362,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
ULIST_ITER_INIT(&uiter);
node = ulist_next(parents, &uiter);
ref->parent = node ? node->val : 0;
- ref->inode_list =
- node ? (struct extent_inode_elem *)node->aux : 0;
+ ref->inode_list = node ?
+ (struct extent_inode_elem *)(uintptr_t)node->aux : 0;
/* additional parents require new refs being added here */
while ((node = ulist_next(parents, &uiter))) {
@@ -375,8 +374,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
}
memcpy(new_ref, ref, sizeof(*ref));
new_ref->parent = node->val;
- new_ref->inode_list =
- (struct extent_inode_elem *)node->aux;
+ new_ref->inode_list = (struct extent_inode_elem *)
+ (uintptr_t)node->aux;
list_add(&new_ref->list, &ref->list);
}
ulist_reinit(parents);
@@ -462,6 +461,7 @@ static int __merge_refs(struct list_head *head, int mode)
pos2 = n2, n2 = pos2->next) {
struct __prelim_ref *ref2;
struct __prelim_ref *xchg;
+ struct extent_inode_elem *eie;
ref2 = list_entry(pos2, struct __prelim_ref, list);
@@ -473,12 +473,20 @@ static int __merge_refs(struct list_head *head, int mode)
ref1 = ref2;
ref2 = xchg;
}
- ref1->count += ref2->count;
} else {
if (ref1->parent != ref2->parent)
continue;
- ref1->count += ref2->count;
}
+
+ eie = ref1->inode_list;
+ while (eie && eie->next)
+ eie = eie->next;
+ if (eie)
+ eie->next = ref2->inode_list;
+ else
+ ref1->inode_list = ref2->inode_list;
+ ref1->count += ref2->count;
+
list_del(&ref2->list);
kfree(ref2);
}
@@ -773,9 +781,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
*/
static int find_parent_nodes(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 delayed_ref_seq, u64 time_seq,
- struct ulist *refs, struct ulist *roots,
- const u64 *extent_item_pos)
+ u64 time_seq, struct ulist *refs,
+ struct ulist *roots, const u64 *extent_item_pos)
{
struct btrfs_key key;
struct btrfs_path *path;
@@ -837,7 +844,7 @@ again:
btrfs_put_delayed_ref(&head->node);
goto again;
}
- ret = __add_delayed_refs(head, delayed_ref_seq,
+ ret = __add_delayed_refs(head, time_seq,
&prefs_delayed);
mutex_unlock(&head->mutex);
if (ret) {
@@ -892,8 +899,7 @@ again:
while (!list_empty(&prefs)) {
ref = list_first_entry(&prefs, struct __prelim_ref, list);
list_del(&ref->list);
- if (ref->count < 0)
- WARN_ON(1);
+ WARN_ON(ref->count < 0);
if (ref->count && ref->root_id && ref->parent == 0) {
/* no parent == root of tree */
ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
@@ -915,8 +921,8 @@ again:
free_extent_buffer(eb);
}
ret = ulist_add_merge(refs, ref->parent,
- (unsigned long)ref->inode_list,
- (unsigned long *)&eie, GFP_NOFS);
+ (uintptr_t)ref->inode_list,
+ (u64 *)&eie, GFP_NOFS);
if (!ret && extent_item_pos) {
/*
* we've recorded that parent, so we must extend
@@ -960,7 +966,7 @@ static void free_leaf_list(struct ulist *blocks)
while ((node = ulist_next(blocks, &uiter))) {
if (!node->aux)
continue;
- eie = (struct extent_inode_elem *)node->aux;
+ eie = (struct extent_inode_elem *)(uintptr_t)node->aux;
for (; eie; eie = eie_next) {
eie_next = eie->next;
kfree(eie);
@@ -981,8 +987,7 @@ static void free_leaf_list(struct ulist *blocks)
*/
static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 delayed_ref_seq, u64 time_seq,
- struct ulist **leafs,
+ u64 time_seq, struct ulist **leafs,
const u64 *extent_item_pos)
{
struct ulist *tmp;
@@ -997,7 +1002,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
- ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+ ret = find_parent_nodes(trans, fs_info, bytenr,
time_seq, *leafs, tmp, extent_item_pos);
ulist_free(tmp);
@@ -1024,8 +1029,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
*/
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 delayed_ref_seq, u64 time_seq,
- struct ulist **roots)
+ u64 time_seq, struct ulist **roots)
{
struct ulist *tmp;
struct ulist_node *node = NULL;
@@ -1043,7 +1047,7 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
ULIST_ITER_INIT(&uiter);
while (1) {
- ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+ ret = find_parent_nodes(trans, fs_info, bytenr,
time_seq, tmp, *roots, NULL);
if (ret < 0 && ret != -ENOENT) {
ulist_free(tmp);
@@ -1111,44 +1115,97 @@ static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
found_key);
}
-/*
- * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
- * of the path are separated by '/' and the path is guaranteed to be
- * 0-terminated. the path is only given within the current file system.
- * Therefore, it never starts with a '/'. the caller is responsible to provide
- * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
- * the start point of the resulting string is returned. this pointer is within
- * dest, normally.
- * in case the path buffer would overflow, the pointer is decremented further
- * as if output was written to the buffer, though no more output is actually
- * generated. that way, the caller can determine how much space would be
- * required for the path to fit into the buffer. in that case, the returned
- * value will be smaller than dest. callers must check this!
- */
-static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
- struct btrfs_inode_ref *iref,
- struct extent_buffer *eb_in, u64 parent,
- char *dest, u32 size)
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+ u64 start_off, struct btrfs_path *path,
+ struct btrfs_inode_extref **ret_extref,
+ u64 *found_off)
+{
+ int ret, slot;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_inode_extref *extref;
+ struct extent_buffer *leaf;
+ unsigned long ptr;
+
+ key.objectid = inode_objectid;
+ btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+ key.offset = start_off;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ while (1) {
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(leaf)) {
+ /*
+ * If the item at offset is not found,
+ * btrfs_search_slot will point us to the slot
+ * where it should be inserted. In our case
+ * that will be the slot directly before the
+ * next INODE_REF_KEY_V2 item. In the case
+ * that we're pointing to the last slot in a
+ * leaf, we must move one leaf over.
+ */
+ ret = btrfs_next_leaf(root, path);
+ if (ret) {
+ if (ret >= 1)
+ ret = -ENOENT;
+ break;
+ }
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ /*
+ * Check that we're still looking at an extended ref key for
+ * this particular objectid. If we have different
+ * objectid or type then there are no more to be found
+ * in the tree and we can exit.
+ */
+ ret = -ENOENT;
+ if (found_key.objectid != inode_objectid)
+ break;
+ if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY)
+ break;
+
+ ret = 0;
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ extref = (struct btrfs_inode_extref *)ptr;
+ *ret_extref = extref;
+ if (found_off)
+ *found_off = found_key.offset;
+ break;
+ }
+
+ return ret;
+}
+
+char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+ u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb_in, u64 parent,
+ char *dest, u32 size)
{
- u32 len;
int slot;
u64 next_inum;
int ret;
- s64 bytes_left = size - 1;
+ s64 bytes_left = ((s64)size) - 1;
struct extent_buffer *eb = eb_in;
struct btrfs_key found_key;
int leave_spinning = path->leave_spinning;
+ struct btrfs_inode_ref *iref;
if (bytes_left >= 0)
dest[bytes_left] = '\0';
path->leave_spinning = 1;
while (1) {
- len = btrfs_inode_ref_name_len(eb, iref);
- bytes_left -= len;
+ bytes_left -= name_len;
if (bytes_left >= 0)
read_extent_buffer(eb, dest + bytes_left,
- (unsigned long)(iref + 1), len);
+ name_off, name_len);
if (eb != eb_in) {
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
@@ -1158,6 +1215,7 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
ret = -ENOENT;
if (ret)
break;
+
next_inum = found_key.offset;
/* regular exit ahead */
@@ -1173,8 +1231,11 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
}
btrfs_release_path(path);
-
iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
+
+ name_len = btrfs_inode_ref_name_len(eb, iref);
+ name_off = (unsigned long)(iref + 1);
+
parent = next_inum;
--bytes_left;
if (bytes_left >= 0)
@@ -1191,12 +1252,39 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
}
/*
+ * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
+ * of the path are separated by '/' and the path is guaranteed to be
+ * 0-terminated. the path is only given within the current file system.
+ * Therefore, it never starts with a '/'. the caller is responsible to provide
+ * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
+ * the start point of the resulting string is returned. this pointer is within
+ * dest, normally.
+ * in case the path buffer would overflow, the pointer is decremented further
+ * as if output was written to the buffer, though no more output is actually
+ * generated. that way, the caller can determine how much space would be
+ * required for the path to fit into the buffer. in that case, the returned
+ * value will be smaller than dest. callers must check this!
+ */
+char *btrfs_iref_to_path(struct btrfs_root *fs_root,
+ struct btrfs_path *path,
+ struct btrfs_inode_ref *iref,
+ struct extent_buffer *eb_in, u64 parent,
+ char *dest, u32 size)
+{
+ return btrfs_ref_to_path(fs_root, path,
+ btrfs_inode_ref_name_len(eb_in, iref),
+ (unsigned long)(iref + 1),
+ eb_in, parent, dest, size);
+}
+
+/*
* this makes the path point to (logical EXTENT_ITEM *)
* returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
* tree blocks and <0 on error.
*/
int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
- struct btrfs_path *path, struct btrfs_key *found_key)
+ struct btrfs_path *path, struct btrfs_key *found_key,
+ u64 *flags_ret)
{
int ret;
u64 flags;
@@ -1240,10 +1328,17 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
(unsigned long long)found_key->objectid,
(unsigned long long)found_key->offset,
(unsigned long long)flags, item_size);
- if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
- return BTRFS_EXTENT_FLAG_TREE_BLOCK;
- if (flags & BTRFS_EXTENT_FLAG_DATA)
- return BTRFS_EXTENT_FLAG_DATA;
+
+ WARN_ON(!flags_ret);
+ if (flags_ret) {
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ *flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK;
+ else if (flags & BTRFS_EXTENT_FLAG_DATA)
+ *flags_ret = BTRFS_EXTENT_FLAG_DATA;
+ else
+ BUG_ON(1);
+ return 0;
+ }
return -EIO;
}
@@ -1376,11 +1471,9 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
struct ulist *roots = NULL;
struct ulist_node *ref_node = NULL;
struct ulist_node *root_node = NULL;
- struct seq_list seq_elem = {};
struct seq_list tree_mod_seq_elem = {};
struct ulist_iterator ref_uiter;
struct ulist_iterator root_uiter;
- struct btrfs_delayed_ref_root *delayed_refs = NULL;
pr_debug("resolving all inodes for extent %llu\n",
extent_item_objectid);
@@ -1391,16 +1484,11 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
trans = btrfs_join_transaction(fs_info->extent_root);
if (IS_ERR(trans))
return PTR_ERR(trans);
-
- delayed_refs = &trans->transaction->delayed_refs;
- spin_lock(&delayed_refs->lock);
- btrfs_get_delayed_seq(delayed_refs, &seq_elem);
- spin_unlock(&delayed_refs->lock);
btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
}
ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
- seq_elem.seq, tree_mod_seq_elem.seq, &refs,
+ tree_mod_seq_elem.seq, &refs,
&extent_item_pos);
if (ret)
goto out;
@@ -1408,19 +1496,19 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
ULIST_ITER_INIT(&ref_uiter);
while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
ret = btrfs_find_all_roots(trans, fs_info, ref_node->val,
- seq_elem.seq,
- tree_mod_seq_elem.seq, &roots);
+ tree_mod_seq_elem.seq, &roots);
if (ret)
break;
ULIST_ITER_INIT(&root_uiter);
while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
pr_debug("root %llu references leaf %llu, data list "
- "%#lx\n", root_node->val, ref_node->val,
- ref_node->aux);
- ret = iterate_leaf_refs(
- (struct extent_inode_elem *)ref_node->aux,
- root_node->val, extent_item_objectid,
- iterate, ctx);
+ "%#llx\n", root_node->val, ref_node->val,
+ (long long)ref_node->aux);
+ ret = iterate_leaf_refs((struct extent_inode_elem *)
+ (uintptr_t)ref_node->aux,
+ root_node->val,
+ extent_item_objectid,
+ iterate, ctx);
}
ulist_free(roots);
roots = NULL;
@@ -1431,7 +1519,6 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
out:
if (!search_commit_root) {
btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
- btrfs_put_delayed_seq(delayed_refs, &seq_elem);
btrfs_end_transaction(trans, fs_info->extent_root);
}
@@ -1444,16 +1531,16 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
{
int ret;
u64 extent_item_pos;
+ u64 flags = 0;
struct btrfs_key found_key;
int search_commit_root = path->search_commit_root;
- ret = extent_from_logical(fs_info, logical, path,
- &found_key);
+ ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
btrfs_release_path(path);
- if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
- ret = -EINVAL;
if (ret < 0)
return ret;
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ return -EINVAL;
extent_item_pos = logical - found_key.objectid;
ret = iterate_extent_inodes(fs_info, found_key.objectid,
@@ -1463,9 +1550,12 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
return ret;
}
-static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
- struct btrfs_path *path,
- iterate_irefs_t *iterate, void *ctx)
+typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb, void *ctx);
+
+static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
+ struct btrfs_path *path,
+ iterate_irefs_t *iterate, void *ctx)
{
int ret = 0;
int slot;
@@ -1482,7 +1572,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
while (!ret) {
path->leave_spinning = 1;
ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
- &found_key);
+ &found_key);
if (ret < 0)
break;
if (ret) {
@@ -1510,7 +1600,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
"tree %llu\n", cur,
(unsigned long long)found_key.objectid,
(unsigned long long)fs_root->objectid);
- ret = iterate(parent, iref, eb, ctx);
+ ret = iterate(parent, name_len,
+ (unsigned long)(iref + 1), eb, ctx);
if (ret)
break;
len = sizeof(*iref) + name_len;
@@ -1525,12 +1616,98 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
return ret;
}
+static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
+ struct btrfs_path *path,
+ iterate_irefs_t *iterate, void *ctx)
+{
+ int ret;
+ int slot;
+ u64 offset = 0;
+ u64 parent;
+ int found = 0;
+ struct extent_buffer *eb;
+ struct btrfs_inode_extref *extref;
+ struct extent_buffer *leaf;
+ u32 item_size;
+ u32 cur_offset;
+ unsigned long ptr;
+
+ while (1) {
+ ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref,
+ &offset);
+ if (ret < 0)
+ break;
+ if (ret) {
+ ret = found ? 0 : -ENOENT;
+ break;
+ }
+ ++found;
+
+ slot = path->slots[0];
+ eb = path->nodes[0];
+ /* make sure we can use eb after releasing the path */
+ atomic_inc(&eb->refs);
+
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_release_path(path);
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ cur_offset = 0;
+
+ while (cur_offset < item_size) {
+ u32 name_len;
+
+ extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
+ parent = btrfs_inode_extref_parent(eb, extref);
+ name_len = btrfs_inode_extref_name_len(eb, extref);
+ ret = iterate(parent, name_len,
+ (unsigned long)&extref->name, eb, ctx);
+ if (ret)
+ break;
+
+ cur_offset += btrfs_inode_extref_name_len(leaf, extref);
+ cur_offset += sizeof(*extref);
+ }
+ btrfs_tree_read_unlock_blocking(eb);
+ free_extent_buffer(eb);
+
+ offset++;
+ }
+
+ btrfs_release_path(path);
+
+ return ret;
+}
+
+static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
+ struct btrfs_path *path, iterate_irefs_t *iterate,
+ void *ctx)
+{
+ int ret;
+ int found_refs = 0;
+
+ ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx);
+ if (!ret)
+ ++found_refs;
+ else if (ret != -ENOENT)
+ return ret;
+
+ ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx);
+ if (ret == -ENOENT && found_refs)
+ return 0;
+
+ return ret;
+}
+
/*
* returns 0 if the path could be dumped (probably truncated)
* returns <0 in case of an error
*/
-static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
- struct extent_buffer *eb, void *ctx)
+static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb, void *ctx)
{
struct inode_fs_paths *ipath = ctx;
char *fspath;
@@ -1543,20 +1720,16 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
ipath->fspath->bytes_left - s_ptr : 0;
fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
- fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
- inum, fspath_min, bytes_left);
+ fspath = btrfs_ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len,
+ name_off, eb, inum, fspath_min, bytes_left);
if (IS_ERR(fspath))
return PTR_ERR(fspath);
if (fspath > fspath_min) {
- pr_debug("path resolved: %s\n", fspath);
ipath->fspath->val[i] = (u64)(unsigned long)fspath;
++ipath->fspath->elem_cnt;
ipath->fspath->bytes_left = fspath - fspath_min;
} else {
- pr_debug("missed path, not enough space. missing bytes: %lu, "
- "constructed so far: %s\n",
- (unsigned long)(fspath_min - fspath), fspath_min);
++ipath->fspath->elem_missed;
ipath->fspath->bytes_missing += fspath_min - fspath;
ipath->fspath->bytes_left = 0;
@@ -1578,7 +1751,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
{
return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
- inode_to_path, ipath);
+ inode_to_path, ipath);
}
struct btrfs_data_container *init_data_container(u32 total_bytes)
@@ -1587,7 +1760,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes)
size_t alloc_bytes;
alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
- data = kmalloc(alloc_bytes, GFP_NOFS);
+ data = vmalloc(alloc_bytes);
if (!data)
return ERR_PTR(-ENOMEM);
@@ -1638,6 +1811,6 @@ void free_ipath(struct inode_fs_paths *ipath)
{
if (!ipath)
return;
- kfree(ipath->fspath);
+ vfree(ipath->fspath);
kfree(ipath);
}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index c18d8ac7b79..d61feca7945 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -21,6 +21,7 @@
#include "ioctl.h"
#include "ulist.h"
+#include "extent_io.h"
#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0)
@@ -32,14 +33,13 @@ struct inode_fs_paths {
typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
void *ctx);
-typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
- struct extent_buffer *eb, void *ctx);
int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
struct btrfs_path *path);
int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
- struct btrfs_path *path, struct btrfs_key *found_key);
+ struct btrfs_path *path, struct btrfs_key *found_key,
+ u64 *flags);
int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
struct btrfs_extent_item *ei, u32 item_size,
@@ -58,12 +58,23 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 delayed_ref_seq, u64 time_seq,
- struct ulist **roots);
+ u64 time_seq, struct ulist **roots);
+char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+ struct btrfs_inode_ref *iref, struct extent_buffer *eb,
+ u64 parent, char *dest, u32 size);
+char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+ u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb_in, u64 parent,
+ char *dest, u32 size);
struct btrfs_data_container *init_data_container(u32 total_bytes);
struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
struct btrfs_path *path);
void free_ipath(struct inode_fs_paths *ipath);
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+ u64 start_off, struct btrfs_path *path,
+ struct btrfs_inode_extref **ret_extref,
+ u64 *found_off);
+
#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 12394a90d60..2a8c242bc4f 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -38,6 +38,8 @@
#define BTRFS_INODE_DELALLOC_META_RESERVED 4
#define BTRFS_INODE_HAS_ORPHAN_ITEM 5
#define BTRFS_INODE_HAS_ASYNC_EXTENT 6
+#define BTRFS_INODE_NEEDS_FULL_SYNC 7
+#define BTRFS_INODE_COPY_EVERYTHING 8
/* in memory btrfs inode */
struct btrfs_inode {
@@ -87,11 +89,11 @@ struct btrfs_inode {
/* node for the red-black tree that links inodes in subvolume root */
struct rb_node rb_node;
- /* the space_info for where this inode's data allocations are done */
- struct btrfs_space_info *space_info;
-
unsigned long runtime_flags;
+ /* Keep track of who's O_SYNC/fsycing currently */
+ atomic_t sync_writers;
+
/* full 64 bit generation number, struct vfs_inode doesn't have a big
* enough field for this.
*/
@@ -146,6 +148,9 @@ struct btrfs_inode {
/* flags field from the on disk inode */
u32 flags;
+ /* a local copy of root's last_log_commit */
+ unsigned long last_log_commit;
+
/*
* Counters to keep track of the number of extent item's we may use due
* to delalloc and such. outstanding_extents is the number of extent
@@ -191,26 +196,24 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size)
BTRFS_I(inode)->disk_i_size = size;
}
-static inline bool btrfs_is_free_space_inode(struct btrfs_root *root,
- struct inode *inode)
+static inline bool btrfs_is_free_space_inode(struct inode *inode)
{
- if (root == root->fs_info->tree_root ||
- BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ if (root == root->fs_info->tree_root &&
+ btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID)
+ return true;
+ if (BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
return true;
return false;
}
static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- int ret = 0;
-
- mutex_lock(&root->log_mutex);
if (BTRFS_I(inode)->logged_trans == generation &&
- BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
- ret = 1;
- mutex_unlock(&root->log_mutex);
- return ret;
+ BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit)
+ return 1;
+ return 0;
}
#endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index da6e9364a5e..11d47bfb62b 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -37,8 +37,9 @@
* the file system was mounted, (i.e., they have been
* referenced by the super block) or they have been
* written since then and the write completion callback
- * was called and a FLUSH request to the device where
- * these blocks are located was received and completed.
+ * was called and no write error was indicated and a
+ * FLUSH request to the device where these blocks are
+ * located was received and completed.
* 2b. All referenced blocks need to have a generation
* number which is equal to the parent's number.
*
@@ -136,7 +137,7 @@ struct btrfsic_block {
unsigned int never_written:1; /* block was added because it was
* referenced, not because it was
* written */
- unsigned int mirror_num:2; /* large enough to hold
+ unsigned int mirror_num; /* large enough to hold
* BTRFS_SUPER_MIRROR_MAX */
struct btrfsic_dev_state *dev_state;
u64 dev_bytenr; /* key, physical byte num on disk */
@@ -722,7 +723,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
}
num_copies =
- btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ btrfs_num_copies(state->root->fs_info,
next_bytenr, state->metablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -902,7 +903,7 @@ static int btrfsic_process_superblock_dev_mirror(
}
num_copies =
- btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ btrfs_num_copies(state->root->fs_info,
next_bytenr, state->metablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1032,6 +1033,7 @@ continue_with_current_leaf_stack_frame:
struct btrfs_disk_key *disk_key;
u8 type;
u32 item_offset;
+ u32 item_size;
if (disk_item_offset + sizeof(struct btrfs_item) >
sf->block_ctx->len) {
@@ -1047,6 +1049,7 @@ leaf_item_out_of_bounce_error:
disk_item_offset,
sizeof(struct btrfs_item));
item_offset = le32_to_cpu(disk_item.offset);
+ item_size = le32_to_cpu(disk_item.size);
disk_key = &disk_item.key;
type = disk_key->type;
@@ -1057,14 +1060,13 @@ leaf_item_out_of_bounce_error:
root_item_offset = item_offset +
offsetof(struct btrfs_leaf, items);
- if (root_item_offset +
- sizeof(struct btrfs_root_item) >
+ if (root_item_offset + item_size >
sf->block_ctx->len)
goto leaf_item_out_of_bounce_error;
btrfsic_read_from_block_data(
sf->block_ctx, &root_item,
root_item_offset,
- sizeof(struct btrfs_root_item));
+ item_size);
next_bytenr = le64_to_cpu(root_item.bytenr);
sf->error =
@@ -1285,7 +1287,7 @@ static int btrfsic_create_link_to_next_block(
*next_blockp = NULL;
if (0 == *num_copiesp) {
*num_copiesp =
- btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ btrfs_num_copies(state->root->fs_info,
next_bytenr, state->metablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1487,7 +1489,7 @@ static int btrfsic_handle_extent_data(
chunk_len = num_bytes;
num_copies =
- btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ btrfs_num_copies(state->root->fs_info,
next_bytenr, state->datablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1580,9 +1582,21 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
struct btrfs_device *device;
length = len;
- ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ,
+ ret = btrfs_map_block(state->root->fs_info, READ,
bytenr, &length, &multi, mirror_num);
+ if (ret) {
+ block_ctx_out->start = 0;
+ block_ctx_out->dev_bytenr = 0;
+ block_ctx_out->len = 0;
+ block_ctx_out->dev = NULL;
+ block_ctx_out->datav = NULL;
+ block_ctx_out->pagev = NULL;
+ block_ctx_out->mem_to_free = NULL;
+
+ return ret;
+ }
+
device = multi->stripes[0].dev;
block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
block_ctx_out->dev_bytenr = multi->stripes[0].physical;
@@ -1592,8 +1606,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
block_ctx_out->pagev = NULL;
block_ctx_out->mem_to_free = NULL;
- if (0 == ret)
- kfree(multi);
+ kfree(multi);
if (NULL == block_ctx_out->dev) {
ret = -ENXIO;
printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
@@ -2461,7 +2474,7 @@ static int btrfsic_process_written_superblock(
}
num_copies =
- btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ btrfs_num_copies(state->root->fs_info,
next_bytenr, BTRFS_SUPER_INFO_SIZE);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -2600,6 +2613,17 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
(unsigned long long)l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
ret = -1;
+ } else if (l->block_ref_to->iodone_w_error) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " which has write error!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ ret = -1;
} else if (l->parent_generation !=
l->block_ref_to->generation &&
BTRFSIC_GENERATION_UNKNOWN !=
@@ -2947,7 +2971,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
struct btrfsic_block_data_ctx block_ctx;
int match = 0;
- num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ num_copies = btrfs_num_copies(state->root->fs_info,
bytenr, state->metablock_size);
for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 86eff48dab7..94ab2f80e7e 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -577,6 +577,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
u64 em_start;
struct extent_map *em;
int ret = -ENOMEM;
+ int faili = 0;
u32 *sums;
tree = &BTRFS_I(inode)->io_tree;
@@ -626,9 +627,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
for (pg_index = 0; pg_index < nr_pages; pg_index++) {
cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
__GFP_HIGHMEM);
- if (!cb->compressed_pages[pg_index])
+ if (!cb->compressed_pages[pg_index]) {
+ faili = pg_index - 1;
+ ret = -ENOMEM;
goto fail2;
+ }
}
+ faili = nr_pages - 1;
cb->nr_pages = nr_pages;
add_ra_bio_pages(inode, em_start + em_len, cb);
@@ -682,7 +687,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ret = btrfs_map_bio(root, READ, comp_bio,
mirror_num, 0);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret)
+ bio_endio(comp_bio, ret);
bio_put(comp_bio);
@@ -707,14 +713,17 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
}
ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret)
+ bio_endio(comp_bio, ret);
bio_put(comp_bio);
return 0;
fail2:
- for (pg_index = 0; pg_index < nr_pages; pg_index++)
- free_page((unsigned long)cb->compressed_pages[pg_index]);
+ while (faili >= 0) {
+ __free_page(cb->compressed_pages[faili]);
+ faili--;
+ }
kfree(cb->compressed_pages);
fail1:
@@ -818,6 +827,7 @@ static void free_workspace(int type, struct list_head *workspace)
btrfs_compress_op[idx]->free_workspace(workspace);
atomic_dec(alloc_workspace);
wake:
+ smp_mb();
if (waitqueue_active(workspace_wait))
wake_up(workspace_wait);
}
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 8206b390058..eea5da7a2b9 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,8 +38,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
struct extent_buffer *dst_buf,
struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct btrfs_path *path, int level, int slot,
- int tree_mod_log);
+ struct btrfs_path *path, int level, int slot);
static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb);
struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -321,7 +320,7 @@ struct tree_mod_root {
struct tree_mod_elem {
struct rb_node node;
u64 index; /* shifted logical */
- struct seq_list elem;
+ u64 seq;
enum mod_log_op op;
/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
@@ -341,20 +340,50 @@ struct tree_mod_elem {
struct tree_mod_root old_root;
};
-static inline void
-__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem)
+static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
{
- elem->seq = atomic_inc_return(&fs_info->tree_mod_seq);
- list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+ read_lock(&fs_info->tree_mod_log_lock);
}
-void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
- struct seq_list *elem)
+static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
+{
+ read_unlock(&fs_info->tree_mod_log_lock);
+}
+
+static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
+{
+ write_lock(&fs_info->tree_mod_log_lock);
+}
+
+static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
+{
+ write_unlock(&fs_info->tree_mod_log_lock);
+}
+
+/*
+ * This adds a new blocker to the tree mod log's blocker list if the @elem
+ * passed does not already have a sequence number set. So when a caller expects
+ * to record tree modifications, it should ensure to set elem->seq to zero
+ * before calling btrfs_get_tree_mod_seq.
+ * Returns a fresh, unused tree log modification sequence number, even if no new
+ * blocker was added.
+ */
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem)
{
- elem->flags = 1;
+ u64 seq;
+
+ tree_mod_log_write_lock(fs_info);
spin_lock(&fs_info->tree_mod_seq_lock);
- __get_tree_mod_seq(fs_info, elem);
+ if (!elem->seq) {
+ elem->seq = btrfs_inc_tree_mod_seq(fs_info);
+ list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+ }
+ seq = btrfs_inc_tree_mod_seq(fs_info);
spin_unlock(&fs_info->tree_mod_seq_lock);
+ tree_mod_log_write_unlock(fs_info);
+
+ return seq;
}
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
@@ -371,41 +400,40 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
if (!seq_putting)
return;
- BUG_ON(!(elem->flags & 1));
spin_lock(&fs_info->tree_mod_seq_lock);
list_del(&elem->list);
+ elem->seq = 0;
list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
- if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) {
+ if (cur_elem->seq < min_seq) {
if (seq_putting > cur_elem->seq) {
/*
* blocker with lower sequence number exists, we
* cannot remove anything from the log
*/
- goto out;
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+ return;
}
min_seq = cur_elem->seq;
}
}
+ spin_unlock(&fs_info->tree_mod_seq_lock);
/*
* anything that's lower than the lowest existing (read: blocked)
* sequence number can be removed from the tree.
*/
- write_lock(&fs_info->tree_mod_log_lock);
+ tree_mod_log_write_lock(fs_info);
tm_root = &fs_info->tree_mod_log;
for (node = rb_first(tm_root); node; node = next) {
next = rb_next(node);
tm = container_of(node, struct tree_mod_elem, node);
- if (tm->elem.seq > min_seq)
+ if (tm->seq > min_seq)
continue;
rb_erase(node, tm_root);
- list_del(&tm->elem.list);
kfree(tm);
}
- write_unlock(&fs_info->tree_mod_log_lock);
-out:
- spin_unlock(&fs_info->tree_mod_seq_lock);
+ tree_mod_log_write_unlock(fs_info);
}
/*
@@ -423,11 +451,9 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
struct rb_node **new;
struct rb_node *parent = NULL;
struct tree_mod_elem *cur;
- int ret = 0;
- BUG_ON(!tm || !tm->elem.seq);
+ BUG_ON(!tm || !tm->seq);
- write_lock(&fs_info->tree_mod_log_lock);
tm_root = &fs_info->tree_mod_log;
new = &tm_root->rb_node;
while (*new) {
@@ -437,88 +463,81 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
new = &((*new)->rb_left);
else if (cur->index > tm->index)
new = &((*new)->rb_right);
- else if (cur->elem.seq < tm->elem.seq)
+ else if (cur->seq < tm->seq)
new = &((*new)->rb_left);
- else if (cur->elem.seq > tm->elem.seq)
+ else if (cur->seq > tm->seq)
new = &((*new)->rb_right);
else {
kfree(tm);
- ret = -EEXIST;
- goto unlock;
+ return -EEXIST;
}
}
rb_link_node(&tm->node, parent, new);
rb_insert_color(&tm->node, tm_root);
-unlock:
- write_unlock(&fs_info->tree_mod_log_lock);
- return ret;
+ return 0;
}
+/*
+ * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
+ * returns zero with the tree_mod_log_lock acquired. The caller must hold
+ * this until all tree mod log insertions are recorded in the rb tree and then
+ * call tree_mod_log_write_unlock() to release.
+ */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb) {
smp_mb();
if (list_empty(&(fs_info)->tree_mod_seq_list))
return 1;
- if (!eb)
- return 0;
- if (btrfs_header_level(eb) == 0)
+ if (eb && btrfs_header_level(eb) == 0)
+ return 1;
+
+ tree_mod_log_write_lock(fs_info);
+ if (list_empty(&fs_info->tree_mod_seq_list)) {
+ /*
+ * someone emptied the list while we were waiting for the lock.
+ * we must not add to the list when no blocker exists.
+ */
+ tree_mod_log_write_unlock(fs_info);
return 1;
+ }
+
return 0;
}
/*
- * This allocates memory and gets a tree modification sequence number when
- * needed.
+ * This allocates memory and gets a tree modification sequence number.
*
- * Returns 0 when no sequence number is needed, < 0 on error.
- * Returns 1 when a sequence number was added. In this case,
- * fs_info->tree_mod_seq_lock was acquired and must be released by the caller
- * after inserting into the rb tree.
+ * Returns <0 on error.
+ * Returns >0 (the added sequence number) on success.
*/
static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
struct tree_mod_elem **tm_ret)
{
struct tree_mod_elem *tm;
- int seq;
-
- if (tree_mod_dont_log(fs_info, NULL))
- return 0;
- tm = *tm_ret = kzalloc(sizeof(*tm), flags);
+ /*
+ * once we switch from spin locks to something different, we should
+ * honor the flags parameter here.
+ */
+ tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC);
if (!tm)
return -ENOMEM;
- tm->elem.flags = 0;
- spin_lock(&fs_info->tree_mod_seq_lock);
- if (list_empty(&fs_info->tree_mod_seq_list)) {
- /*
- * someone emptied the list while we were waiting for the lock.
- * we must not add to the list, because no blocker exists. items
- * are removed from the list only when the existing blocker is
- * removed from the list.
- */
- kfree(tm);
- seq = 0;
- spin_unlock(&fs_info->tree_mod_seq_lock);
- } else {
- __get_tree_mod_seq(fs_info, &tm->elem);
- seq = tm->elem.seq;
- }
-
- return seq;
+ tm->seq = btrfs_inc_tree_mod_seq(fs_info);
+ return tm->seq;
}
-static noinline int
-tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb, int slot,
- enum mod_log_op op, gfp_t flags)
+static inline int
+__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb, int slot,
+ enum mod_log_op op, gfp_t flags)
{
- struct tree_mod_elem *tm;
int ret;
+ struct tree_mod_elem *tm;
ret = tree_mod_alloc(fs_info, flags, &tm);
- if (ret <= 0)
+ if (ret < 0)
return ret;
tm->index = eb->start >> PAGE_CACHE_SHIFT;
@@ -530,8 +549,22 @@ tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
tm->slot = slot;
tm->generation = btrfs_node_ptr_generation(eb, slot);
- ret = __tree_mod_log_insert(fs_info, tm);
- spin_unlock(&fs_info->tree_mod_seq_lock);
+ return __tree_mod_log_insert(fs_info, tm);
+}
+
+static noinline int
+tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb, int slot,
+ enum mod_log_op op, gfp_t flags)
+{
+ int ret;
+
+ if (tree_mod_dont_log(fs_info, eb))
+ return 0;
+
+ ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags);
+
+ tree_mod_log_write_unlock(fs_info);
return ret;
}
@@ -543,6 +576,14 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
}
static noinline int
+tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb, int slot,
+ enum mod_log_op op)
+{
+ return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS);
+}
+
+static noinline int
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb, int dst_slot, int src_slot,
int nr_items, gfp_t flags)
@@ -554,15 +595,20 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
if (tree_mod_dont_log(fs_info, eb))
return 0;
+ /*
+ * When we override something during the move, we log these removals.
+ * This can only happen when we move towards the beginning of the
+ * buffer, i.e. dst_slot < src_slot.
+ */
for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
- ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot,
+ ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot,
MOD_LOG_KEY_REMOVE_WHILE_MOVING);
BUG_ON(ret < 0);
}
ret = tree_mod_alloc(fs_info, flags, &tm);
- if (ret <= 0)
- return ret;
+ if (ret < 0)
+ goto out;
tm->index = eb->start >> PAGE_CACHE_SHIFT;
tm->slot = src_slot;
@@ -571,10 +617,29 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
tm->op = MOD_LOG_MOVE_KEYS;
ret = __tree_mod_log_insert(fs_info, tm);
- spin_unlock(&fs_info->tree_mod_seq_lock);
+out:
+ tree_mod_log_write_unlock(fs_info);
return ret;
}
+static inline void
+__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
+{
+ int i;
+ u32 nritems;
+ int ret;
+
+ if (btrfs_header_level(eb) == 0)
+ return;
+
+ nritems = btrfs_header_nritems(eb);
+ for (i = nritems - 1; i >= 0; i--) {
+ ret = tree_mod_log_insert_key_locked(fs_info, eb, i,
+ MOD_LOG_KEY_REMOVE_WHILE_FREEING);
+ BUG_ON(ret < 0);
+ }
+}
+
static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
struct extent_buffer *old_root,
@@ -583,9 +648,12 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
struct tree_mod_elem *tm;
int ret;
+ if (tree_mod_dont_log(fs_info, NULL))
+ return 0;
+
ret = tree_mod_alloc(fs_info, flags, &tm);
- if (ret <= 0)
- return ret;
+ if (ret < 0)
+ goto out;
tm->index = new_root->start >> PAGE_CACHE_SHIFT;
tm->old_root.logical = old_root->start;
@@ -594,7 +662,8 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
tm->op = MOD_LOG_ROOT_REPLACE;
ret = __tree_mod_log_insert(fs_info, tm);
- spin_unlock(&fs_info->tree_mod_seq_lock);
+out:
+ tree_mod_log_write_unlock(fs_info);
return ret;
}
@@ -608,7 +677,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
struct tree_mod_elem *found = NULL;
u64 index = start >> PAGE_CACHE_SHIFT;
- read_lock(&fs_info->tree_mod_log_lock);
+ tree_mod_log_read_lock(fs_info);
tm_root = &fs_info->tree_mod_log;
node = tm_root->rb_node;
while (node) {
@@ -617,18 +686,18 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
node = node->rb_left;
} else if (cur->index > index) {
node = node->rb_right;
- } else if (cur->elem.seq < min_seq) {
+ } else if (cur->seq < min_seq) {
node = node->rb_left;
} else if (!smallest) {
/* we want the node with the highest seq */
if (found)
- BUG_ON(found->elem.seq > cur->elem.seq);
+ BUG_ON(found->seq > cur->seq);
found = cur;
node = node->rb_left;
- } else if (cur->elem.seq > min_seq) {
+ } else if (cur->seq > min_seq) {
/* we want the node with the smallest seq */
if (found)
- BUG_ON(found->elem.seq < cur->elem.seq);
+ BUG_ON(found->seq < cur->seq);
found = cur;
node = node->rb_right;
} else {
@@ -636,7 +705,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
break;
}
}
- read_unlock(&fs_info->tree_mod_log_lock);
+ tree_mod_log_read_unlock(fs_info);
return found;
}
@@ -664,7 +733,7 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
-static inline void
+static noinline void
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
struct extent_buffer *src, unsigned long dst_offset,
unsigned long src_offset, int nr_items)
@@ -675,18 +744,23 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
if (tree_mod_dont_log(fs_info, NULL))
return;
- if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
+ if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) {
+ tree_mod_log_write_unlock(fs_info);
return;
+ }
- /* speed this up by single seq for all operations? */
for (i = 0; i < nr_items; i++) {
- ret = tree_mod_log_insert_key(fs_info, src, i + src_offset,
- MOD_LOG_KEY_REMOVE);
+ ret = tree_mod_log_insert_key_locked(fs_info, src,
+ i + src_offset,
+ MOD_LOG_KEY_REMOVE);
BUG_ON(ret < 0);
- ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset,
- MOD_LOG_KEY_ADD);
+ ret = tree_mod_log_insert_key_locked(fs_info, dst,
+ i + dst_offset,
+ MOD_LOG_KEY_ADD);
BUG_ON(ret < 0);
}
+
+ tree_mod_log_write_unlock(fs_info);
}
static inline void
@@ -699,10 +773,9 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
BUG_ON(ret < 0);
}
-static inline void
+static noinline void
tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb,
- struct btrfs_disk_key *disk_key, int slot, int atomic)
+ struct extent_buffer *eb, int slot, int atomic)
{
int ret;
@@ -712,30 +785,22 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
BUG_ON(ret < 0);
}
-static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb)
+static noinline void
+tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
- int i;
- int ret;
- u32 nritems;
-
if (tree_mod_dont_log(fs_info, eb))
return;
- nritems = btrfs_header_nritems(eb);
- for (i = nritems - 1; i >= 0; i--) {
- ret = tree_mod_log_insert_key(fs_info, eb, i,
- MOD_LOG_KEY_REMOVE_WHILE_FREEING);
- BUG_ON(ret < 0);
- }
+ __tree_mod_log_free_eb(fs_info, eb);
+
+ tree_mod_log_write_unlock(fs_info);
}
-static inline void
+static noinline void
tree_mod_log_set_root_pointer(struct btrfs_root *root,
struct extent_buffer *new_root_node)
{
int ret;
- tree_mod_log_free_eb(root->fs_info, root->node);
ret = tree_mod_log_insert_root(root->fs_info, root->node,
new_root_node, GFP_NOFS);
BUG_ON(ret < 0);
@@ -862,12 +927,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
ret = btrfs_dec_ref(trans, root, buf, 1, 1);
BUG_ON(ret); /* -ENOMEM */
}
- /*
- * don't log freeing in case we're freeing the root node, this
- * is done by tree_mod_log_set_root_pointer later
- */
- if (buf != root->node && btrfs_header_level(buf) != 0)
- tree_mod_log_free_eb(root->fs_info, buf);
+ tree_mod_log_free_eb(root->fs_info, buf);
clean_tree_block(trans, root, buf);
*last_ref = 1;
}
@@ -1069,7 +1129,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
unsigned long p_size = sizeof(struct btrfs_key_ptr);
n = btrfs_header_nritems(eb);
- while (tm && tm->elem.seq >= time_seq) {
+ while (tm && tm->seq >= time_seq) {
/*
* all the operations are recorded with the operator used for
* the modification. as we're going backwards, we do the
@@ -1161,6 +1221,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
free_extent_buffer(eb);
__tree_mod_log_rewind(eb_rewin, time_seq, tm);
+ WARN_ON(btrfs_header_nritems(eb_rewin) >
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info->fs_root));
return eb_rewin;
}
@@ -1177,9 +1239,11 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
{
struct tree_mod_elem *tm;
struct extent_buffer *eb;
+ struct extent_buffer *old;
struct tree_mod_root *old_root = NULL;
u64 old_generation = 0;
u64 logical;
+ u32 blocksize;
eb = btrfs_read_lock_root_node(root);
tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
@@ -1195,14 +1259,32 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
}
tm = tree_mod_log_search(root->fs_info, logical, time_seq);
- if (old_root)
+ if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+ btrfs_tree_read_unlock(root->node);
+ free_extent_buffer(root->node);
+ blocksize = btrfs_level_size(root, old_root->level);
+ old = read_tree_block(root, logical, blocksize, 0);
+ if (!old) {
+ pr_warn("btrfs: failed to read tree block %llu from get_old_root\n",
+ logical);
+ WARN_ON(1);
+ } else {
+ eb = btrfs_clone_extent_buffer(old);
+ free_extent_buffer(old);
+ }
+ } else if (old_root) {
+ btrfs_tree_read_unlock(root->node);
+ free_extent_buffer(root->node);
eb = alloc_dummy_extent_buffer(logical, root->nodesize);
- else
+ } else {
eb = btrfs_clone_extent_buffer(root->node);
- btrfs_tree_read_unlock(root->node);
- free_extent_buffer(root->node);
+ btrfs_tree_read_unlock(root->node);
+ free_extent_buffer(root->node);
+ }
+
if (!eb)
return NULL;
+ extent_buffer_get(eb);
btrfs_tree_read_lock(eb);
if (old_root) {
btrfs_set_header_bytenr(eb, eb->start);
@@ -1215,11 +1297,28 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
__tree_mod_log_rewind(eb, time_seq, tm);
else
WARN_ON(btrfs_header_level(eb) != 0);
- extent_buffer_get(eb);
+ WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
return eb;
}
+int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
+{
+ struct tree_mod_elem *tm;
+ int level;
+
+ tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
+ if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
+ level = tm->old_root.level;
+ } else {
+ rcu_read_lock();
+ level = btrfs_header_level(root->node);
+ rcu_read_unlock();
+ }
+
+ return level;
+}
+
static inline int should_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf)
@@ -1260,19 +1359,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
u64 search_start;
int ret;
- if (trans->transaction != root->fs_info->running_transaction) {
- printk(KERN_CRIT "trans %llu running %llu\n",
+ if (trans->transaction != root->fs_info->running_transaction)
+ WARN(1, KERN_CRIT "trans %llu running %llu\n",
(unsigned long long)trans->transid,
(unsigned long long)
root->fs_info->running_transaction->transid);
- WARN_ON(1);
- }
- if (trans->transid != root->fs_info->generation) {
- printk(KERN_CRIT "trans %llu running %llu\n",
+
+ if (trans->transid != root->fs_info->generation)
+ WARN(1, KERN_CRIT "trans %llu running %llu\n",
(unsigned long long)trans->transid,
(unsigned long long)root->fs_info->generation);
- WARN_ON(1);
- }
if (!should_cow_block(trans, root, buf)) {
*cow_ret = buf;
@@ -1368,10 +1464,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (cache_only && parent_level != 1)
return 0;
- if (trans->transaction != root->fs_info->running_transaction)
- WARN_ON(1);
- if (trans->transid != root->fs_info->generation)
- WARN_ON(1);
+ WARN_ON(trans->transaction != root->fs_info->running_transaction);
+ WARN_ON(trans->transid != root->fs_info->generation);
parent_nritems = btrfs_header_nritems(parent);
blocksize = btrfs_level_size(root, parent_level - 1);
@@ -1661,6 +1755,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto enospc;
}
+ tree_mod_log_free_eb(root->fs_info, root->node);
tree_mod_log_set_root_pointer(root, child);
rcu_assign_pointer(root->node, child);
@@ -1725,7 +1820,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (btrfs_header_nritems(right) == 0) {
clean_tree_block(trans, root, right);
btrfs_tree_unlock(right);
- del_ptr(trans, root, path, level + 1, pslot + 1, 1);
+ del_ptr(trans, root, path, level + 1, pslot + 1);
root_sub_used(root, right->len);
btrfs_free_tree_block(trans, root, right, 0, 1);
free_extent_buffer_stale(right);
@@ -1734,7 +1829,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
tree_mod_log_set_node_key(root->fs_info, parent,
- &right_key, pslot + 1, 0);
+ pslot + 1, 0);
btrfs_set_node_key(parent, &right_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
}
@@ -1769,7 +1864,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (btrfs_header_nritems(mid) == 0) {
clean_tree_block(trans, root, mid);
btrfs_tree_unlock(mid);
- del_ptr(trans, root, path, level + 1, pslot, 1);
+ del_ptr(trans, root, path, level + 1, pslot);
root_sub_used(root, mid->len);
btrfs_free_tree_block(trans, root, mid, 0, 1);
free_extent_buffer_stale(mid);
@@ -1778,7 +1873,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* update the parent key to reflect our changes */
struct btrfs_disk_key mid_key;
btrfs_node_key(mid, &mid_key, 0);
- tree_mod_log_set_node_key(root->fs_info, parent, &mid_key,
+ tree_mod_log_set_node_key(root->fs_info, parent,
pslot, 0);
btrfs_set_node_key(parent, &mid_key, pslot);
btrfs_mark_buffer_dirty(parent);
@@ -1878,7 +1973,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
orig_slot += left_nr;
btrfs_node_key(mid, &disk_key, 0);
tree_mod_log_set_node_key(root->fs_info, parent,
- &disk_key, pslot, 0);
+ pslot, 0);
btrfs_set_node_key(parent, &disk_key, pslot);
btrfs_mark_buffer_dirty(parent);
if (btrfs_header_nritems(left) > orig_slot) {
@@ -1931,7 +2026,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
btrfs_node_key(right, &disk_key, 0);
tree_mod_log_set_node_key(root->fs_info, parent,
- &disk_key, pslot + 1, 0);
+ pslot + 1, 0);
btrfs_set_node_key(parent, &disk_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@@ -2117,6 +2212,9 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
int no_skips = 0;
struct extent_buffer *t;
+ if (path->really_keep_locks)
+ return;
+
for (i = level; i < BTRFS_MAX_LEVEL; i++) {
if (!path->nodes[i])
break;
@@ -2164,7 +2262,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
{
int i;
- if (path->keep_locks)
+ if (path->keep_locks || path->really_keep_locks)
return;
for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -2397,7 +2495,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
if (!cow)
write_lock_level = -1;
- if (cow && (p->keep_locks || p->lowest_level))
+ if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level))
write_lock_level = BTRFS_MAX_LEVEL;
min_write_lock_level = write_lock_level;
@@ -2466,7 +2564,10 @@ again:
* must have write locks on this node and the
* parent
*/
- if (level + 1 > write_lock_level) {
+ if (level > write_lock_level ||
+ (level + 1 > write_lock_level &&
+ level + 1 < BTRFS_MAX_LEVEL &&
+ p->nodes[level + 1])) {
write_lock_level = level + 1;
btrfs_release_path(p);
goto again;
@@ -2722,6 +2823,80 @@ done:
}
/*
+ * helper to use instead of search slot if no exact match is needed but
+ * instead the next or previous item should be returned.
+ * When find_higher is true, the next higher item is returned, the next lower
+ * otherwise.
+ * When return_any and find_higher are both true, and no higher item is found,
+ * return the next lower instead.
+ * When return_any is true and find_higher is false, and no lower item is found,
+ * return the next higher instead.
+ * It returns 0 if any item is found, 1 if none is found (tree empty), and
+ * < 0 on error
+ */
+int btrfs_search_slot_for_read(struct btrfs_root *root,
+ struct btrfs_key *key, struct btrfs_path *p,
+ int find_higher, int return_any)
+{
+ int ret;
+ struct extent_buffer *leaf;
+
+again:
+ ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
+ if (ret <= 0)
+ return ret;
+ /*
+ * a return value of 1 means the path is at the position where the
+ * item should be inserted. Normally this is the next bigger item,
+ * but in case the previous item is the last in a leaf, path points
+ * to the first free slot in the previous leaf, i.e. at an invalid
+ * item.
+ */
+ leaf = p->nodes[0];
+
+ if (find_higher) {
+ if (p->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, p);
+ if (ret <= 0)
+ return ret;
+ if (!return_any)
+ return 1;
+ /*
+ * no higher item found, return the next
+ * lower instead
+ */
+ return_any = 0;
+ find_higher = 0;
+ btrfs_release_path(p);
+ goto again;
+ }
+ } else {
+ if (p->slots[0] == 0) {
+ ret = btrfs_prev_leaf(root, p);
+ if (ret < 0)
+ return ret;
+ if (!ret) {
+ p->slots[0] = btrfs_header_nritems(leaf) - 1;
+ return 0;
+ }
+ if (!return_any)
+ return 1;
+ /*
+ * no lower item found, return the next
+ * higher instead
+ */
+ return_any = 0;
+ find_higher = 1;
+ btrfs_release_path(p);
+ goto again;
+ } else {
+ --p->slots[0];
+ }
+ }
+ return 0;
+}
+
+/*
* adjust the pointers going up the tree, starting at level
* making sure the right key of each node is points to 'key'.
* This is used after shifting pointers to the left, so it stops
@@ -2741,7 +2916,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
if (!path->nodes[i])
break;
t = path->nodes[i];
- tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1);
+ tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
btrfs_set_node_key(t, key, tslot);
btrfs_mark_buffer_dirty(path->nodes[i]);
if (tslot != 0)
@@ -2832,8 +3007,10 @@ static int push_node_left(struct btrfs_trans_handle *trans,
push_items * sizeof(struct btrfs_key_ptr));
if (push_items < src_nritems) {
- tree_mod_log_eb_move(root->fs_info, src, 0, push_items,
- src_nritems - push_items);
+ /*
+ * don't call tree_mod_log_eb_move here, key removal was already
+ * fully logged by tree_mod_log_eb_copy above.
+ */
memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(push_items),
(src_nritems - push_items) *
@@ -3124,14 +3301,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
*/
static int leaf_space_used(struct extent_buffer *l, int start, int nr)
{
+ struct btrfs_item *start_item;
+ struct btrfs_item *end_item;
+ struct btrfs_map_token token;
int data_len;
int nritems = btrfs_header_nritems(l);
int end = min(nritems, start + nr) - 1;
if (!nr)
return 0;
- data_len = btrfs_item_end_nr(l, start);
- data_len = data_len - btrfs_item_offset_nr(l, end);
+ btrfs_init_map_token(&token);
+ start_item = btrfs_item_nr(l, start);
+ end_item = btrfs_item_nr(l, end);
+ data_len = btrfs_token_item_offset(l, start_item, &token) +
+ btrfs_token_item_size(l, start_item, &token);
+ data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
data_len += sizeof(struct btrfs_item) * nr;
WARN_ON(data_len < 0);
return data_len;
@@ -3225,8 +3409,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
if (push_items == 0)
goto out_unlock;
- if (!empty && push_items == left_nritems)
- WARN_ON(1);
+ WARN_ON(!empty && push_items == left_nritems);
/* push left to right */
right_nritems = btrfs_header_nritems(right);
@@ -3464,11 +3647,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(left, old_left_nritems + push_items);
/* fixup right node */
- if (push_items > right_nritems) {
- printk(KERN_CRIT "push items %d nr %u\n", push_items,
+ if (push_items > right_nritems)
+ WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
right_nritems);
- WARN_ON(1);
- }
if (push_items < right_nritems) {
push_space = btrfs_item_offset_nr(right, push_items - 1) -
@@ -4264,149 +4445,6 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
}
/*
- * Given a key and some data, insert items into the tree.
- * This does all the path init required, making room in the tree if needed.
- * Returns the number of keys that were inserted.
- */
-int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_key *cpu_key, u32 *data_size,
- int nr)
-{
- struct extent_buffer *leaf;
- struct btrfs_item *item;
- int ret = 0;
- int slot;
- int i;
- u32 nritems;
- u32 total_data = 0;
- u32 total_size = 0;
- unsigned int data_end;
- struct btrfs_disk_key disk_key;
- struct btrfs_key found_key;
- struct btrfs_map_token token;
-
- btrfs_init_map_token(&token);
-
- for (i = 0; i < nr; i++) {
- if (total_size + data_size[i] + sizeof(struct btrfs_item) >
- BTRFS_LEAF_DATA_SIZE(root)) {
- break;
- nr = i;
- }
- total_data += data_size[i];
- total_size += data_size[i] + sizeof(struct btrfs_item);
- }
- BUG_ON(nr == 0);
-
- ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
- if (ret == 0)
- return -EEXIST;
- if (ret < 0)
- goto out;
-
- leaf = path->nodes[0];
-
- nritems = btrfs_header_nritems(leaf);
- data_end = leaf_data_end(root, leaf);
-
- if (btrfs_leaf_free_space(root, leaf) < total_size) {
- for (i = nr; i >= 0; i--) {
- total_data -= data_size[i];
- total_size -= data_size[i] + sizeof(struct btrfs_item);
- if (total_size < btrfs_leaf_free_space(root, leaf))
- break;
- }
- nr = i;
- }
-
- slot = path->slots[0];
- BUG_ON(slot < 0);
-
- if (slot != nritems) {
- unsigned int old_data = btrfs_item_end_nr(leaf, slot);
-
- item = btrfs_item_nr(leaf, slot);
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
-
- /* figure out how many keys we can insert in here */
- total_data = data_size[0];
- for (i = 1; i < nr; i++) {
- if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
- break;
- total_data += data_size[i];
- }
- nr = i;
-
- if (old_data < data_end) {
- btrfs_print_leaf(root, leaf);
- printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
- slot, old_data, data_end);
- BUG_ON(1);
- }
- /*
- * item0..itemN ... dataN.offset..dataN.size .. data0.size
- */
- /* first correct the data pointers */
- for (i = slot; i < nritems; i++) {
- u32 ioff;
-
- item = btrfs_item_nr(leaf, i);
- ioff = btrfs_token_item_offset(leaf, item, &token);
- btrfs_set_token_item_offset(leaf, item,
- ioff - total_data, &token);
- }
- /* shift the items */
- memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
- btrfs_item_nr_offset(slot),
- (nritems - slot) * sizeof(struct btrfs_item));
-
- /* shift the data */
- memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
- data_end - total_data, btrfs_leaf_data(leaf) +
- data_end, old_data - data_end);
- data_end = old_data;
- } else {
- /*
- * this sucks but it has to be done, if we are inserting at
- * the end of the leaf only insert 1 of the items, since we
- * have no way of knowing whats on the next leaf and we'd have
- * to drop our current locks to figure it out
- */
- nr = 1;
- }
-
- /* setup the item for the new data */
- for (i = 0; i < nr; i++) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
- btrfs_set_item_key(leaf, &disk_key, slot + i);
- item = btrfs_item_nr(leaf, slot + i);
- btrfs_set_token_item_offset(leaf, item,
- data_end - data_size[i], &token);
- data_end -= data_size[i];
- btrfs_set_token_item_size(leaf, item, data_size[i], &token);
- }
- btrfs_set_header_nritems(leaf, nritems + nr);
- btrfs_mark_buffer_dirty(leaf);
-
- ret = 0;
- if (slot == 0) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key);
- fixup_low_keys(trans, root, path, &disk_key, 1);
- }
-
- if (btrfs_leaf_free_space(root, leaf) < 0) {
- btrfs_print_leaf(root, leaf);
- BUG();
- }
-out:
- if (!ret)
- ret = nr;
- return ret;
-}
-
-/*
* this is a helper for btrfs_insert_empty_items, the main goal here is
* to save stack depth by doing the bulk of the work in a function
* that doesn't call btrfs_search_slot
@@ -4567,8 +4605,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
* empty a node.
*/
static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct btrfs_path *path, int level, int slot,
- int tree_mod_log)
+ struct btrfs_path *path, int level, int slot)
{
struct extent_buffer *parent = path->nodes[level];
u32 nritems;
@@ -4576,7 +4613,7 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
nritems = btrfs_header_nritems(parent);
if (slot != nritems - 1) {
- if (tree_mod_log && level)
+ if (level)
tree_mod_log_eb_move(root->fs_info, parent, slot,
slot + 1, nritems - slot - 1);
memmove_extent_buffer(parent,
@@ -4584,7 +4621,7 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_node_key_ptr_offset(slot + 1),
sizeof(struct btrfs_key_ptr) *
(nritems - slot - 1));
- } else if (tree_mod_log && level) {
+ } else if (level) {
ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
MOD_LOG_KEY_REMOVE);
BUG_ON(ret < 0);
@@ -4621,7 +4658,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf)
{
WARN_ON(btrfs_header_generation(leaf) != trans->transid);
- del_ptr(trans, root, path, 1, path->slots[1], 1);
+ del_ptr(trans, root, path, 1, path->slots[1]);
/*
* btrfs_free_extent is expensive, we want to make sure we
@@ -4931,6 +4968,434 @@ out:
return ret;
}
+static void tree_move_down(struct btrfs_root *root,
+ struct btrfs_path *path,
+ int *level, int root_level)
+{
+ BUG_ON(*level == 0);
+ path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
+ path->slots[*level]);
+ path->slots[*level - 1] = 0;
+ (*level)--;
+}
+
+static int tree_move_next_or_upnext(struct btrfs_root *root,
+ struct btrfs_path *path,
+ int *level, int root_level)
+{
+ int ret = 0;
+ int nritems;
+ nritems = btrfs_header_nritems(path->nodes[*level]);
+
+ path->slots[*level]++;
+
+ while (path->slots[*level] >= nritems) {
+ if (*level == root_level)
+ return -1;
+
+ /* move upnext */
+ path->slots[*level] = 0;
+ free_extent_buffer(path->nodes[*level]);
+ path->nodes[*level] = NULL;
+ (*level)++;
+ path->slots[*level]++;
+
+ nritems = btrfs_header_nritems(path->nodes[*level]);
+ ret = 1;
+ }
+ return ret;
+}
+
+/*
+ * Returns 1 if it had to move up and next. 0 is returned if it moved only next
+ * or down.
+ */
+static int tree_advance(struct btrfs_root *root,
+ struct btrfs_path *path,
+ int *level, int root_level,
+ int allow_down,
+ struct btrfs_key *key)
+{
+ int ret;
+
+ if (*level == 0 || !allow_down) {
+ ret = tree_move_next_or_upnext(root, path, level, root_level);
+ } else {
+ tree_move_down(root, path, level, root_level);
+ ret = 0;
+ }
+ if (ret >= 0) {
+ if (*level == 0)
+ btrfs_item_key_to_cpu(path->nodes[*level], key,
+ path->slots[*level]);
+ else
+ btrfs_node_key_to_cpu(path->nodes[*level], key,
+ path->slots[*level]);
+ }
+ return ret;
+}
+
+static int tree_compare_item(struct btrfs_root *left_root,
+ struct btrfs_path *left_path,
+ struct btrfs_path *right_path,
+ char *tmp_buf)
+{
+ int cmp;
+ int len1, len2;
+ unsigned long off1, off2;
+
+ len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
+ len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
+ if (len1 != len2)
+ return 1;
+
+ off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
+ off2 = btrfs_item_ptr_offset(right_path->nodes[0],
+ right_path->slots[0]);
+
+ read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
+
+ cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
+ if (cmp)
+ return 1;
+ return 0;
+}
+
+#define ADVANCE 1
+#define ADVANCE_ONLY_NEXT -1
+
+/*
+ * This function compares two trees and calls the provided callback for
+ * every changed/new/deleted item it finds.
+ * If shared tree blocks are encountered, whole subtrees are skipped, making
+ * the compare pretty fast on snapshotted subvolumes.
+ *
+ * This currently works on commit roots only. As commit roots are read only,
+ * we don't do any locking. The commit roots are protected with transactions.
+ * Transactions are ended and rejoined when a commit is tried in between.
+ *
+ * This function checks for modifications done to the trees while comparing.
+ * If it detects a change, it aborts immediately.
+ */
+int btrfs_compare_trees(struct btrfs_root *left_root,
+ struct btrfs_root *right_root,
+ btrfs_changed_cb_t changed_cb, void *ctx)
+{
+ int ret;
+ int cmp;
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_path *left_path = NULL;
+ struct btrfs_path *right_path = NULL;
+ struct btrfs_key left_key;
+ struct btrfs_key right_key;
+ char *tmp_buf = NULL;
+ int left_root_level;
+ int right_root_level;
+ int left_level;
+ int right_level;
+ int left_end_reached;
+ int right_end_reached;
+ int advance_left;
+ int advance_right;
+ u64 left_blockptr;
+ u64 right_blockptr;
+ u64 left_start_ctransid;
+ u64 right_start_ctransid;
+ u64 ctransid;
+
+ left_path = btrfs_alloc_path();
+ if (!left_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ right_path = btrfs_alloc_path();
+ if (!right_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS);
+ if (!tmp_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ left_path->search_commit_root = 1;
+ left_path->skip_locking = 1;
+ right_path->search_commit_root = 1;
+ right_path->skip_locking = 1;
+
+ spin_lock(&left_root->root_item_lock);
+ left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
+ spin_unlock(&left_root->root_item_lock);
+
+ spin_lock(&right_root->root_item_lock);
+ right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
+ spin_unlock(&right_root->root_item_lock);
+
+ trans = btrfs_join_transaction(left_root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
+ }
+
+ /*
+ * Strategy: Go to the first items of both trees. Then do
+ *
+ * If both trees are at level 0
+ * Compare keys of current items
+ * If left < right treat left item as new, advance left tree
+ * and repeat
+ * If left > right treat right item as deleted, advance right tree
+ * and repeat
+ * If left == right do deep compare of items, treat as changed if
+ * needed, advance both trees and repeat
+ * If both trees are at the same level but not at level 0
+ * Compare keys of current nodes/leafs
+ * If left < right advance left tree and repeat
+ * If left > right advance right tree and repeat
+ * If left == right compare blockptrs of the next nodes/leafs
+ * If they match advance both trees but stay at the same level
+ * and repeat
+ * If they don't match advance both trees while allowing to go
+ * deeper and repeat
+ * If tree levels are different
+ * Advance the tree that needs it and repeat
+ *
+ * Advancing a tree means:
+ * If we are at level 0, try to go to the next slot. If that's not
+ * possible, go one level up and repeat. Stop when we found a level
+ * where we could go to the next slot. We may at this point be on a
+ * node or a leaf.
+ *
+ * If we are not at level 0 and not on shared tree blocks, go one
+ * level deeper.
+ *
+ * If we are not at level 0 and on shared tree blocks, go one slot to
+ * the right if possible or go up and right.
+ */
+
+ left_level = btrfs_header_level(left_root->commit_root);
+ left_root_level = left_level;
+ left_path->nodes[left_level] = left_root->commit_root;
+ extent_buffer_get(left_path->nodes[left_level]);
+
+ right_level = btrfs_header_level(right_root->commit_root);
+ right_root_level = right_level;
+ right_path->nodes[right_level] = right_root->commit_root;
+ extent_buffer_get(right_path->nodes[right_level]);
+
+ if (left_level == 0)
+ btrfs_item_key_to_cpu(left_path->nodes[left_level],
+ &left_key, left_path->slots[left_level]);
+ else
+ btrfs_node_key_to_cpu(left_path->nodes[left_level],
+ &left_key, left_path->slots[left_level]);
+ if (right_level == 0)
+ btrfs_item_key_to_cpu(right_path->nodes[right_level],
+ &right_key, right_path->slots[right_level]);
+ else
+ btrfs_node_key_to_cpu(right_path->nodes[right_level],
+ &right_key, right_path->slots[right_level]);
+
+ left_end_reached = right_end_reached = 0;
+ advance_left = advance_right = 0;
+
+ while (1) {
+ /*
+ * We need to make sure the transaction does not get committed
+ * while we do anything on commit roots. This means, we need to
+ * join and leave transactions for every item that we process.
+ */
+ if (trans && btrfs_should_end_transaction(trans, left_root)) {
+ btrfs_release_path(left_path);
+ btrfs_release_path(right_path);
+
+ ret = btrfs_end_transaction(trans, left_root);
+ trans = NULL;
+ if (ret < 0)
+ goto out;
+ }
+ /* now rejoin the transaction */
+ if (!trans) {
+ trans = btrfs_join_transaction(left_root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
+ }
+
+ spin_lock(&left_root->root_item_lock);
+ ctransid = btrfs_root_ctransid(&left_root->root_item);
+ spin_unlock(&left_root->root_item_lock);
+ if (ctransid != left_start_ctransid)
+ left_start_ctransid = 0;
+
+ spin_lock(&right_root->root_item_lock);
+ ctransid = btrfs_root_ctransid(&right_root->root_item);
+ spin_unlock(&right_root->root_item_lock);
+ if (ctransid != right_start_ctransid)
+ right_start_ctransid = 0;
+
+ if (!left_start_ctransid || !right_start_ctransid) {
+ WARN(1, KERN_WARNING
+ "btrfs: btrfs_compare_tree detected "
+ "a change in one of the trees while "
+ "iterating. This is probably a "
+ "bug.\n");
+ ret = -EIO;
+ goto out;
+ }
+
+ /*
+ * the commit root may have changed, so start again
+ * where we stopped
+ */
+ left_path->lowest_level = left_level;
+ right_path->lowest_level = right_level;
+ ret = btrfs_search_slot(NULL, left_root,
+ &left_key, left_path, 0, 0);
+ if (ret < 0)
+ goto out;
+ ret = btrfs_search_slot(NULL, right_root,
+ &right_key, right_path, 0, 0);
+ if (ret < 0)
+ goto out;
+ }
+
+ if (advance_left && !left_end_reached) {
+ ret = tree_advance(left_root, left_path, &left_level,
+ left_root_level,
+ advance_left != ADVANCE_ONLY_NEXT,
+ &left_key);
+ if (ret < 0)
+ left_end_reached = ADVANCE;
+ advance_left = 0;
+ }
+ if (advance_right && !right_end_reached) {
+ ret = tree_advance(right_root, right_path, &right_level,
+ right_root_level,
+ advance_right != ADVANCE_ONLY_NEXT,
+ &right_key);
+ if (ret < 0)
+ right_end_reached = ADVANCE;
+ advance_right = 0;
+ }
+
+ if (left_end_reached && right_end_reached) {
+ ret = 0;
+ goto out;
+ } else if (left_end_reached) {
+ if (right_level == 0) {
+ ret = changed_cb(left_root, right_root,
+ left_path, right_path,
+ &right_key,
+ BTRFS_COMPARE_TREE_DELETED,
+ ctx);
+ if (ret < 0)
+ goto out;
+ }
+ advance_right = ADVANCE;
+ continue;
+ } else if (right_end_reached) {
+ if (left_level == 0) {
+ ret = changed_cb(left_root, right_root,
+ left_path, right_path,
+ &left_key,
+ BTRFS_COMPARE_TREE_NEW,
+ ctx);
+ if (ret < 0)
+ goto out;
+ }
+ advance_left = ADVANCE;
+ continue;
+ }
+
+ if (left_level == 0 && right_level == 0) {
+ cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
+ if (cmp < 0) {
+ ret = changed_cb(left_root, right_root,
+ left_path, right_path,
+ &left_key,
+ BTRFS_COMPARE_TREE_NEW,
+ ctx);
+ if (ret < 0)
+ goto out;
+ advance_left = ADVANCE;
+ } else if (cmp > 0) {
+ ret = changed_cb(left_root, right_root,
+ left_path, right_path,
+ &right_key,
+ BTRFS_COMPARE_TREE_DELETED,
+ ctx);
+ if (ret < 0)
+ goto out;
+ advance_right = ADVANCE;
+ } else {
+ WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
+ ret = tree_compare_item(left_root, left_path,
+ right_path, tmp_buf);
+ if (ret) {
+ WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
+ ret = changed_cb(left_root, right_root,
+ left_path, right_path,
+ &left_key,
+ BTRFS_COMPARE_TREE_CHANGED,
+ ctx);
+ if (ret < 0)
+ goto out;
+ }
+ advance_left = ADVANCE;
+ advance_right = ADVANCE;
+ }
+ } else if (left_level == right_level) {
+ cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
+ if (cmp < 0) {
+ advance_left = ADVANCE;
+ } else if (cmp > 0) {
+ advance_right = ADVANCE;
+ } else {
+ left_blockptr = btrfs_node_blockptr(
+ left_path->nodes[left_level],
+ left_path->slots[left_level]);
+ right_blockptr = btrfs_node_blockptr(
+ right_path->nodes[right_level],
+ right_path->slots[right_level]);
+ if (left_blockptr == right_blockptr) {
+ /*
+ * As we're on a shared block, don't
+ * allow to go deeper.
+ */
+ advance_left = ADVANCE_ONLY_NEXT;
+ advance_right = ADVANCE_ONLY_NEXT;
+ } else {
+ advance_left = ADVANCE;
+ advance_right = ADVANCE;
+ }
+ }
+ } else if (left_level < right_level) {
+ advance_right = ADVANCE;
+ } else {
+ advance_left = ADVANCE;
+ }
+ }
+
+out:
+ btrfs_free_path(left_path);
+ btrfs_free_path(right_path);
+ kfree(tmp_buf);
+
+ if (trans) {
+ if (!ret)
+ ret = btrfs_end_transaction(trans, left_root);
+ else
+ btrfs_end_transaction(trans, left_root);
+ }
+
+ return ret;
+}
+
/*
* this is similar to btrfs_next_leaf, but does not try to preserve
* and fixup the path. It looks for and returns the next key in the
@@ -5033,6 +5498,139 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
return btrfs_next_old_leaf(root, path, 0);
}
+/* Release the path up to but not including the given level */
+static void btrfs_release_level(struct btrfs_path *path, int level)
+{
+ int i;
+
+ for (i = 0; i < level; i++) {
+ path->slots[i] = 0;
+ if (!path->nodes[i])
+ continue;
+ if (path->locks[i]) {
+ btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
+ path->locks[i] = 0;
+ }
+ free_extent_buffer(path->nodes[i]);
+ path->nodes[i] = NULL;
+ }
+}
+
+/*
+ * This function assumes 2 things
+ *
+ * 1) You are using path->keep_locks
+ * 2) You are not inserting items.
+ *
+ * If either of these are not true do not use this function. If you need a next
+ * leaf with either of these not being true then this function can be easily
+ * adapted to do that, but at the moment these are the limitations.
+ */
+int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ int del)
+{
+ struct extent_buffer *b;
+ struct btrfs_key key;
+ u32 nritems;
+ int level = 1;
+ int slot;
+ int ret = 1;
+ int write_lock_level = BTRFS_MAX_LEVEL;
+ int ins_len = del ? -1 : 0;
+
+ WARN_ON(!(path->keep_locks || path->really_keep_locks));
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+
+ while (path->nodes[level]) {
+ nritems = btrfs_header_nritems(path->nodes[level]);
+ if (!(path->locks[level] & BTRFS_WRITE_LOCK)) {
+search:
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(trans, root, &key, path,
+ ins_len, 1);
+ if (ret < 0)
+ goto out;
+ level = 1;
+ continue;
+ }
+
+ if (path->slots[level] >= nritems - 1) {
+ level++;
+ continue;
+ }
+
+ btrfs_release_level(path, level);
+ break;
+ }
+
+ if (!path->nodes[level]) {
+ ret = 1;
+ goto out;
+ }
+
+ path->slots[level]++;
+ b = path->nodes[level];
+
+ while (b) {
+ level = btrfs_header_level(b);
+
+ if (!should_cow_block(trans, root, b))
+ goto cow_done;
+
+ btrfs_set_path_blocking(path);
+ ret = btrfs_cow_block(trans, root, b,
+ path->nodes[level + 1],
+ path->slots[level + 1], &b);
+ if (ret)
+ goto out;
+cow_done:
+ path->nodes[level] = b;
+ btrfs_clear_path_blocking(path, NULL, 0);
+ if (level != 0) {
+ ret = setup_nodes_for_search(trans, root, path, b,
+ level, ins_len,
+ &write_lock_level);
+ if (ret == -EAGAIN)
+ goto search;
+ if (ret)
+ goto out;
+
+ b = path->nodes[level];
+ slot = path->slots[level];
+
+ ret = read_block_for_search(trans, root, path,
+ &b, level, slot, &key, 0);
+ if (ret == -EAGAIN)
+ goto search;
+ if (ret)
+ goto out;
+ level = btrfs_header_level(b);
+ if (!btrfs_try_tree_write_lock(b)) {
+ btrfs_set_path_blocking(path);
+ btrfs_tree_lock(b);
+ btrfs_clear_path_blocking(path, b,
+ BTRFS_WRITE_LOCK);
+ }
+ path->locks[level] = BTRFS_WRITE_LOCK;
+ path->nodes[level] = b;
+ path->slots[level] = 0;
+ } else {
+ path->slots[level] = 0;
+ ret = 0;
+ break;
+ }
+ }
+
+out:
+ if (ret)
+ btrfs_release_path(path);
+
+ return ret;
+}
+
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
u64 time_seq)
{
@@ -5127,6 +5725,7 @@ again:
* locked. To solve this situation, we give up
* on our lock and cycle.
*/
+ free_extent_buffer(next);
btrfs_release_path(path);
cond_resched();
goto again;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fa5c45b3907..547b7b05727 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -48,7 +48,7 @@ struct btrfs_ordered_sum;
#define BTRFS_MAGIC "_BHRfS_M"
-#define BTRFS_MAX_MIRRORS 2
+#define BTRFS_MAX_MIRRORS 3
#define BTRFS_MAX_LEVEL 8
@@ -91,6 +91,9 @@ struct btrfs_ordered_sum;
/* for storing balance parameters in the root tree */
#define BTRFS_BALANCE_OBJECTID -4ULL
+/* holds quota configuration and tracking */
+#define BTRFS_QUOTA_TREE_OBJECTID 8ULL
+
/* orhpan objectid for tracking unlinked/truncated files */
#define BTRFS_ORPHAN_OBJECTID -5ULL
@@ -113,7 +116,7 @@ struct btrfs_ordered_sum;
#define BTRFS_FREE_SPACE_OBJECTID -11ULL
/*
- * The inode number assigned to the special inode for sotring
+ * The inode number assigned to the special inode for storing
* free ino cache
*/
#define BTRFS_FREE_INO_OBJECTID -12ULL
@@ -139,6 +142,8 @@ struct btrfs_ordered_sum;
#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
+#define BTRFS_DEV_REPLACE_DEVID 0
+
/*
* the max metadata block size. This limit is somewhat artificial,
* but the memmove costs go through the roof for larger blocks.
@@ -151,6 +156,13 @@ struct btrfs_ordered_sum;
*/
#define BTRFS_NAME_LEN 255
+/*
+ * Theoretical limit is larger, but we keep this down to a sane
+ * value. That should limit greatly the possibility of collisions on
+ * inode ref items.
+ */
+#define BTRFS_LINK_MAX 65535U
+
/* 32 bytes in various csum fields */
#define BTRFS_CSUM_SIZE 32
@@ -162,6 +174,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };
/* four bytes for CRC32 */
#define BTRFS_EMPTY_DIR_SIZE 0
+/* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */
+#define REQ_GET_READ_MIRRORS (1 << 30)
+
#define BTRFS_FT_UNKNOWN 0
#define BTRFS_FT_REG_FILE 1
#define BTRFS_FT_DIR 2
@@ -403,7 +418,7 @@ struct btrfs_root_backup {
__le64 bytes_used;
__le64 num_devices;
/* future */
- __le64 unsed_64[4];
+ __le64 unused_64[4];
u8 tree_root_level;
u8 chunk_root_level;
@@ -486,6 +501,8 @@ struct btrfs_super_block {
*/
#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
+#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
+
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
#define BTRFS_FEATURE_INCOMPAT_SUPP \
@@ -493,7 +510,8 @@ struct btrfs_super_block {
BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
- BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
+ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
/*
* A leaf is full of items. offset and size tell us where to find
@@ -558,6 +576,7 @@ struct btrfs_path {
unsigned int skip_locking:1;
unsigned int leave_spinning:1;
unsigned int search_commit_root:1;
+ unsigned int really_keep_locks:1;
};
/*
@@ -640,6 +659,14 @@ struct btrfs_inode_ref {
/* name goes here */
} __attribute__ ((__packed__));
+struct btrfs_inode_extref {
+ __le64 parent_objectid;
+ __le64 index;
+ __le16 name_len;
+ __u8 name[0];
+ /* name goes here */
+} __attribute__ ((__packed__));
+
struct btrfs_timespec {
__le64 sec;
__le32 nsec;
@@ -709,6 +736,36 @@ struct btrfs_root_item {
struct btrfs_disk_key drop_progress;
u8 drop_level;
u8 level;
+
+ /*
+ * The following fields appear after subvol_uuids+subvol_times
+ * were introduced.
+ */
+
+ /*
+ * This generation number is used to test if the new fields are valid
+ * and up to date while reading the root item. Everytime the root item
+ * is written out, the "generation" field is copied into this field. If
+ * anyone ever mounted the fs with an older kernel, we will have
+ * mismatching generation values here and thus must invalidate the
+ * new fields. See btrfs_update_root and btrfs_find_last_root for
+ * details.
+ * the offset of generation_v2 is also used as the start for the memset
+ * when invalidating the fields.
+ */
+ __le64 generation_v2;
+ u8 uuid[BTRFS_UUID_SIZE];
+ u8 parent_uuid[BTRFS_UUID_SIZE];
+ u8 received_uuid[BTRFS_UUID_SIZE];
+ __le64 ctransid; /* updated when an inode changes */
+ __le64 otransid; /* trans when created */
+ __le64 stransid; /* trans when sent. non-zero for received subvol */
+ __le64 rtransid; /* trans when received. non-zero for received subvol */
+ struct btrfs_timespec ctime;
+ struct btrfs_timespec otime;
+ struct btrfs_timespec stime;
+ struct btrfs_timespec rtime;
+ __le64 reserved[8]; /* for future */
} __attribute__ ((__packed__));
/*
@@ -834,6 +891,59 @@ struct btrfs_dev_stats_item {
__le64 values[BTRFS_DEV_STAT_VALUES_MAX];
} __attribute__ ((__packed__));
+#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
+#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
+#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0
+#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1
+#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2
+#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3
+#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4
+
+struct btrfs_dev_replace {
+ u64 replace_state; /* see #define above */
+ u64 time_started; /* seconds since 1-Jan-1970 */
+ u64 time_stopped; /* seconds since 1-Jan-1970 */
+ atomic64_t num_write_errors;
+ atomic64_t num_uncorrectable_read_errors;
+
+ u64 cursor_left;
+ u64 committed_cursor_left;
+ u64 cursor_left_last_write_of_item;
+ u64 cursor_right;
+
+ u64 cont_reading_from_srcdev_mode; /* see #define above */
+
+ int is_valid;
+ int item_needs_writeback;
+ struct btrfs_device *srcdev;
+ struct btrfs_device *tgtdev;
+
+ pid_t lock_owner;
+ atomic_t nesting_level;
+ struct mutex lock_finishing_cancel_unmount;
+ struct mutex lock_management_lock;
+ struct mutex lock;
+
+ struct btrfs_scrub_progress scrub_progress;
+};
+
+struct btrfs_dev_replace_item {
+ /*
+ * grow this item struct at the end for future enhancements and keep
+ * the existing values unchanged
+ */
+ __le64 src_devid;
+ __le64 cursor_left;
+ __le64 cursor_right;
+ __le64 cont_reading_from_srcdev_mode;
+
+ __le64 replace_state;
+ __le64 time_started;
+ __le64 time_stopped;
+ __le64 num_write_errors;
+ __le64 num_uncorrectable_read_errors;
+} __attribute__ ((__packed__));
+
/* different types of block groups (and chunks) */
#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
@@ -883,6 +993,72 @@ struct btrfs_block_group_item {
__le64 flags;
} __attribute__ ((__packed__));
+/*
+ * is subvolume quota turned on?
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0)
+/*
+ * SCANNING is set during the initialization phase
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_SCANNING (1ULL << 1)
+/*
+ * Some qgroup entries are known to be out of date,
+ * either because the configuration has changed in a way that
+ * makes a rescan necessary, or because the fs has been mounted
+ * with a non-qgroup-aware version.
+ * Turning qouta off and on again makes it inconsistent, too.
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2)
+
+#define BTRFS_QGROUP_STATUS_VERSION 1
+
+struct btrfs_qgroup_status_item {
+ __le64 version;
+ /*
+ * the generation is updated during every commit. As older
+ * versions of btrfs are not aware of qgroups, it will be
+ * possible to detect inconsistencies by checking the
+ * generation on mount time
+ */
+ __le64 generation;
+
+ /* flag definitions see above */
+ __le64 flags;
+
+ /*
+ * only used during scanning to record the progress
+ * of the scan. It contains a logical address
+ */
+ __le64 scan;
+} __attribute__ ((__packed__));
+
+struct btrfs_qgroup_info_item {
+ __le64 generation;
+ __le64 rfer;
+ __le64 rfer_cmpr;
+ __le64 excl;
+ __le64 excl_cmpr;
+} __attribute__ ((__packed__));
+
+/* flags definition for qgroup limits */
+#define BTRFS_QGROUP_LIMIT_MAX_RFER (1ULL << 0)
+#define BTRFS_QGROUP_LIMIT_MAX_EXCL (1ULL << 1)
+#define BTRFS_QGROUP_LIMIT_RSV_RFER (1ULL << 2)
+#define BTRFS_QGROUP_LIMIT_RSV_EXCL (1ULL << 3)
+#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4)
+#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5)
+
+struct btrfs_qgroup_limit_item {
+ /*
+ * only updated when any of the other values change
+ */
+ __le64 flags;
+ __le64 max_rfer;
+ __le64 max_excl;
+ __le64 rsv_rfer;
+ __le64 rsv_excl;
+} __attribute__ ((__packed__));
+
struct btrfs_space_info {
u64 flags;
@@ -929,12 +1105,22 @@ struct btrfs_space_info {
wait_queue_head_t wait;
};
+#define BTRFS_BLOCK_RSV_GLOBAL 1
+#define BTRFS_BLOCK_RSV_DELALLOC 2
+#define BTRFS_BLOCK_RSV_TRANS 3
+#define BTRFS_BLOCK_RSV_CHUNK 4
+#define BTRFS_BLOCK_RSV_DELOPS 5
+#define BTRFS_BLOCK_RSV_EMPTY 6
+#define BTRFS_BLOCK_RSV_TEMP 7
+
struct btrfs_block_rsv {
u64 size;
u64 reserved;
struct btrfs_space_info *space_info;
spinlock_t lock;
- unsigned int full;
+ unsigned short full;
+ unsigned short type;
+ unsigned short failfast;
};
/*
@@ -1028,8 +1214,18 @@ struct btrfs_block_group_cache {
* Today it will only have one thing on it, but that may change
*/
struct list_head cluster_list;
+
+ /* For delayed block group creation */
+ struct list_head new_bg_list;
};
+/* delayed seq elem */
+struct seq_list {
+ struct list_head list;
+ u64 seq;
+};
+
+/* fs_info */
struct reloc_control;
struct btrfs_device;
struct btrfs_fs_devices;
@@ -1044,6 +1240,7 @@ struct btrfs_fs_info {
struct btrfs_root *dev_root;
struct btrfs_root *fs_root;
struct btrfs_root *csum_root;
+ struct btrfs_root *quota_root;
/* the log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
@@ -1133,7 +1330,6 @@ struct btrfs_fs_info {
struct mutex reloc_mutex;
struct list_head trans_list;
- struct list_head hashers;
struct list_head dead_roots;
struct list_head caching_block_groups;
@@ -1144,6 +1340,7 @@ struct btrfs_fs_info {
spinlock_t tree_mod_seq_lock;
atomic_t tree_mod_seq;
struct list_head tree_mod_seq_list;
+ struct seq_list tree_mod_seq_elem;
/* this protects tree_mod_log */
rwlock_t tree_mod_log_lock;
@@ -1195,6 +1392,7 @@ struct btrfs_fs_info {
struct btrfs_workers generic_worker;
struct btrfs_workers workers;
struct btrfs_workers delalloc_workers;
+ struct btrfs_workers flush_workers;
struct btrfs_workers endio_workers;
struct btrfs_workers endio_meta_workers;
struct btrfs_workers endio_meta_write_workers;
@@ -1240,6 +1438,8 @@ struct btrfs_fs_info {
*/
struct list_head space_info;
+ struct btrfs_space_info *data_sinfo;
+
struct reloc_control *reloc_ctl;
spinlock_t delalloc_lock;
@@ -1256,9 +1456,6 @@ struct btrfs_fs_info {
struct rb_root defrag_inodes;
atomic_t defrag_running;
- spinlock_t ref_cache_lock;
- u64 total_ref_cache_size;
-
/*
* these three are in extended format (availability of single
* chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
@@ -1292,10 +1489,35 @@ struct btrfs_fs_info {
struct rw_semaphore scrub_super_lock;
int scrub_workers_refcnt;
struct btrfs_workers scrub_workers;
+ struct btrfs_workers scrub_wr_completion_workers;
+ struct btrfs_workers scrub_nocow_workers;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
u32 check_integrity_print_mask;
#endif
+ /*
+ * quota information
+ */
+ unsigned int quota_enabled:1;
+
+ /*
+ * quota_enabled only changes state after a commit. This holds the
+ * next state.
+ */
+ unsigned int pending_quota_state:1;
+
+ /* is qgroup tracking in a consistent state? */
+ u64 qgroup_flags;
+
+ /* holds configuration and tracking. Protected by qgroup_lock */
+ struct rb_root qgroup_tree;
+ spinlock_t qgroup_lock;
+
+ /* list of dirty qgroups to be written at next commit */
+ struct list_head dirty_qgroups;
+
+ /* used by btrfs_qgroup_record_ref for an efficient tree traversal */
+ u64 qgroup_seq;
/* filesystem state */
u64 fs_state;
@@ -1308,6 +1530,13 @@ struct btrfs_fs_info {
/* next backup root to be overwritten */
int backup_root_index;
+
+ int num_tolerated_disk_barrier_failures;
+
+ /* device replace state */
+ struct btrfs_dev_replace dev_replace;
+
+ atomic_t mutually_exclusive_operation_running;
};
/*
@@ -1348,9 +1577,9 @@ struct btrfs_root {
wait_queue_head_t log_commit_wait[2];
atomic_t log_writers;
atomic_t log_commit[2];
+ atomic_t log_batch;
unsigned long log_transid;
unsigned long last_log_commit;
- unsigned long log_batch;
pid_t log_start_pid;
bool log_multiple_pids;
@@ -1416,6 +1645,8 @@ struct btrfs_root {
dev_t anon_dev;
int force_cow;
+
+ spinlock_t root_item_lock;
};
struct btrfs_ioctl_defrag_range_args {
@@ -1457,6 +1688,7 @@ struct btrfs_ioctl_defrag_range_args {
*/
#define BTRFS_INODE_ITEM_KEY 1
#define BTRFS_INODE_REF_KEY 12
+#define BTRFS_INODE_EXTREF_KEY 13
#define BTRFS_XATTR_ITEM_KEY 24
#define BTRFS_ORPHAN_ITEM_KEY 48
/* reserve 2-15 close to the inode for later flexibility */
@@ -1525,6 +1757,30 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_DEV_ITEM_KEY 216
#define BTRFS_CHUNK_ITEM_KEY 228
+/*
+ * Records the overall state of the qgroups.
+ * There's only one instance of this key present,
+ * (0, BTRFS_QGROUP_STATUS_KEY, 0)
+ */
+#define BTRFS_QGROUP_STATUS_KEY 240
+/*
+ * Records the currently used space of the qgroup.
+ * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid).
+ */
+#define BTRFS_QGROUP_INFO_KEY 242
+/*
+ * Contains the user configured limits for the qgroup.
+ * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid).
+ */
+#define BTRFS_QGROUP_LIMIT_KEY 244
+/*
+ * Records the child-parent relationship of qgroups. For
+ * each relation, 2 keys are present:
+ * (childid, BTRFS_QGROUP_RELATION_KEY, parentid)
+ * (parentid, BTRFS_QGROUP_RELATION_KEY, childid)
+ */
+#define BTRFS_QGROUP_RELATION_KEY 246
+
#define BTRFS_BALANCE_ITEM_KEY 248
/*
@@ -1534,6 +1790,12 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_DEV_STATS_KEY 249
/*
+ * Persistantly stores the device replace state in the device tree.
+ * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0).
+ */
+#define BTRFS_DEV_REPLACE_KEY 250
+
+/*
* string items are for debugging. They just store a short string of
* data in the FS
*/
@@ -1598,7 +1860,7 @@ struct btrfs_map_token {
static inline void btrfs_init_map_token (struct btrfs_map_token *token)
{
- memset(token, 0, sizeof(*token));
+ token->kaddr = NULL;
}
/* some macros to generate set/get funcs for the struct fields. This
@@ -1621,13 +1883,54 @@ static inline void btrfs_init_map_token (struct btrfs_map_token *token)
offsetof(type, member), \
sizeof(((type *)0)->member)))
-#ifndef BTRFS_SETGET_FUNCS
+#define DECLARE_BTRFS_SETGET_BITS(bits) \
+u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \
+ unsigned long off, \
+ struct btrfs_map_token *token); \
+void btrfs_set_token_##bits(struct extent_buffer *eb, void *ptr, \
+ unsigned long off, u##bits val, \
+ struct btrfs_map_token *token); \
+static inline u##bits btrfs_get_##bits(struct extent_buffer *eb, void *ptr, \
+ unsigned long off) \
+{ \
+ return btrfs_get_token_##bits(eb, ptr, off, NULL); \
+} \
+static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \
+ unsigned long off, u##bits val) \
+{ \
+ btrfs_set_token_##bits(eb, ptr, off, val, NULL); \
+}
+
+DECLARE_BTRFS_SETGET_BITS(8)
+DECLARE_BTRFS_SETGET_BITS(16)
+DECLARE_BTRFS_SETGET_BITS(32)
+DECLARE_BTRFS_SETGET_BITS(64)
+
#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
-u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
-u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, struct btrfs_map_token *token); \
-void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token);\
-void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
-#endif
+static inline u##bits btrfs_##name(struct extent_buffer *eb, type *s) \
+{ \
+ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ return btrfs_get_##bits(eb, s, offsetof(type, member)); \
+} \
+static inline void btrfs_set_##name(struct extent_buffer *eb, type *s, \
+ u##bits val) \
+{ \
+ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ btrfs_set_##bits(eb, s, offsetof(type, member), val); \
+} \
+static inline u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, \
+ struct btrfs_map_token *token) \
+{ \
+ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ return btrfs_get_token_##bits(eb, s, offsetof(type, member), token); \
+} \
+static inline void btrfs_set_token_##name(struct extent_buffer *eb, \
+ type *s, u##bits val, \
+ struct btrfs_map_token *token) \
+{ \
+ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ btrfs_set_token_##bits(eb, s, offsetof(type, member), val, token); \
+}
#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
static inline u##bits btrfs_##name(struct extent_buffer *eb) \
@@ -1778,6 +2081,13 @@ BTRFS_SETGET_STACK_FUNCS(block_group_flags,
BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
+/* struct btrfs_inode_extref */
+BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref,
+ parent_objectid, 64);
+BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref,
+ name_len, 16);
+BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64);
+
/* struct btrfs_inode_item */
BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
@@ -2189,6 +2499,16 @@ BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
last_snapshot, 64);
+BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item,
+ generation_v2, 64);
+BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item,
+ ctransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item,
+ otransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item,
+ stransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item,
+ rtransid, 64);
static inline bool btrfs_root_readonly(struct btrfs_root *root)
{
@@ -2465,6 +2785,92 @@ static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb,
sizeof(val));
}
+/* btrfs_qgroup_status_item */
+BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item,
+ version, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
+ flags, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_scan, struct btrfs_qgroup_status_item,
+ scan, 64);
+
+/* btrfs_qgroup_info_item */
+BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item,
+ rfer_cmpr, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item,
+ excl_cmpr, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation,
+ struct btrfs_qgroup_info_item, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item,
+ rfer, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr,
+ struct btrfs_qgroup_info_item, rfer_cmpr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item,
+ excl, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr,
+ struct btrfs_qgroup_info_item, excl_cmpr, 64);
+
+/* btrfs_qgroup_limit_item */
+BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item,
+ flags, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item,
+ max_rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item,
+ max_excl, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
+ rsv_rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
+ rsv_excl, 64);
+
+/* btrfs_dev_replace_item */
+BTRFS_SETGET_FUNCS(dev_replace_src_devid,
+ struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode,
+ struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode,
+ 64);
+BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item,
+ replace_state, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item,
+ time_started, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item,
+ time_stopped, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item,
+ num_write_errors, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors,
+ struct btrfs_dev_replace_item, num_uncorrectable_read_errors,
+ 64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item,
+ cursor_left, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item,
+ cursor_right, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid,
+ struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode,
+ struct btrfs_dev_replace_item,
+ cont_reading_from_srcdev_mode, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state,
+ struct btrfs_dev_replace_item, replace_state, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started,
+ struct btrfs_dev_replace_item, time_started, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped,
+ struct btrfs_dev_replace_item, time_stopped, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors,
+ struct btrfs_dev_replace_item, num_write_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors,
+ struct btrfs_dev_replace_item,
+ num_uncorrectable_read_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left,
+ struct btrfs_dev_replace_item, cursor_left, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
+ struct btrfs_dev_replace_item, cursor_right, 64);
+
static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
{
return sb->s_fs_info;
@@ -2605,10 +3011,23 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
u64 size);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start);
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
-void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+
+enum btrfs_reserve_flush_enum {
+ /* If we are in the transaction, we can't flush anything.*/
+ BTRFS_RESERVE_NO_FLUSH,
+ /*
+ * Flushing delalloc may cause deadlock somewhere, in this
+ * case, use FLUSH LIMIT
+ */
+ BTRFS_RESERVE_FLUSH_LIMIT,
+ BTRFS_RESERVE_FLUSH_ALL,
+};
+
int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -2622,24 +3041,19 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+ unsigned short type);
void btrfs_free_block_rsv(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
int btrfs_block_rsv_add(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes);
-int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes);
+ struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+ enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_check(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, int min_factor);
int btrfs_block_rsv_refill(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 min_reserved);
-int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 min_reserved);
+ struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+ enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv,
u64 num_bytes);
@@ -2661,6 +3075,9 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
+int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int __get_raid_index(u64 flags);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -2680,6 +3097,21 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
struct btrfs_key *max_key,
struct btrfs_path *path, int cache_only,
u64 min_trans);
+enum btrfs_compare_tree_result {
+ BTRFS_COMPARE_TREE_NEW,
+ BTRFS_COMPARE_TREE_DELETED,
+ BTRFS_COMPARE_TREE_CHANGED,
+};
+typedef int (*btrfs_changed_cb_t)(struct btrfs_root *left_root,
+ struct btrfs_root *right_root,
+ struct btrfs_path *left_path,
+ struct btrfs_path *right_path,
+ struct btrfs_key *key,
+ enum btrfs_compare_tree_result result,
+ void *ctx);
+int btrfs_compare_trees(struct btrfs_root *left_root,
+ struct btrfs_root *right_root,
+ btrfs_changed_cb_t cb, void *ctx);
int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
@@ -2711,6 +3143,9 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
ins_len, int cow);
int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
struct btrfs_path *p, u64 time_seq);
+int btrfs_search_slot_for_read(struct btrfs_root *root,
+ struct btrfs_key *key, struct btrfs_path *p,
+ int find_higher, int return_any);
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *parent,
int start_slot, int cache_only, u64 *last_ret,
@@ -2753,6 +3188,9 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
}
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ int del);
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
u64 time_seq);
static inline int btrfs_next_old_item(struct btrfs_root *root,
@@ -2793,11 +3231,23 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
kfree(fs_info->chunk_root);
kfree(fs_info->dev_root);
kfree(fs_info->csum_root);
+ kfree(fs_info->quota_root);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
kfree(fs_info);
}
+/* tree mod log functions from ctree.c */
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem);
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem);
+static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
+{
+ return atomic_inc_return(&fs_info->tree_mod_seq);
+}
+int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
+
/* root-item.c */
int btrfs_find_root_ref(struct btrfs_root *tree_root,
struct btrfs_path *path,
@@ -2819,6 +3269,9 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_key *key,
struct btrfs_root_item *item);
+void btrfs_read_root_item(struct btrfs_root *root,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_root_item *item);
int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
btrfs_root_item *item, struct btrfs_key *key);
int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
@@ -2826,8 +3279,12 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
void btrfs_set_root_node(struct btrfs_root_item *item,
struct extent_buffer *node);
void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
+void btrfs_update_root_times(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
/* dir-item.c */
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+ const char *name, int name_len);
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, const char *name,
int name_len, struct inode *dir,
@@ -2884,12 +3341,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
u64 inode_objectid, u64 ref_objectid, u64 *index);
-struct btrfs_inode_ref *
-btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, int mod);
+int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int mod,
+ u64 *ret_index);
int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid);
@@ -2897,13 +3354,26 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path,
struct btrfs_key *location, int mod);
+struct btrfs_inode_extref *
+btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int ins_len,
+ int cow);
+
+int btrfs_find_name_in_ext_backref(struct btrfs_path *path,
+ u64 ref_objectid, const char *name,
+ int name_len,
+ struct btrfs_inode_extref **extref_ret);
+
/* file-item.c */
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u32 *dst);
int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
- struct bio *bio, u64 logical_offset, u32 *dst);
+ struct bio *bio, u64 logical_offset);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 pos,
@@ -2914,6 +3384,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid,
u64 bytenr, int mod);
+u64 btrfs_file_extent_length(struct btrfs_path *path);
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums);
@@ -2929,6 +3400,19 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit);
/* inode.c */
+struct btrfs_delalloc_work {
+ struct inode *inode;
+ int wait;
+ int delay_iput;
+ struct completion completion;
+ struct list_head list;
+ struct btrfs_work work;
+};
+
+struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
+ int wait, int delay_iput);
+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
+
struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
size_t pg_offset, u64 start, u64 len,
int create);
@@ -2961,6 +3445,8 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *dir, u64 objectid,
const char *name, int name_len);
+int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+ int front);
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode, u64 new_size,
@@ -2995,6 +3481,8 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
+int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode);
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
int btrfs_orphan_cleanup(struct btrfs_root *root);
@@ -3020,16 +3508,30 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
int btrfs_defrag_file(struct inode *inode, struct file *file,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_pages);
+void btrfs_get_block_group_info(struct list_head *groups_list,
+ struct btrfs_ioctl_space_info *space);
+
/* file.c */
+int btrfs_auto_defrag_init(void);
+void btrfs_auto_defrag_exit(void);
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct inode *inode);
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
-int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
- int skip_pinned);
+void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+ int skip_pinned);
+int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace,
+ u64 start, u64 end, int skip_pinned,
+ int modified);
extern const struct file_operations btrfs_file_operations;
-int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
- u64 start, u64 end, u64 *hint_byte, int drop_cache);
+int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct btrfs_path *path, u64 start, u64 end,
+ u64 *drop_end, int drop_cache);
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode, u64 start,
+ u64 end, int drop_cache);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct inode *inode, u64 start, u64 end);
int btrfs_release_file(struct inode *inode, struct file *file);
@@ -3053,14 +3555,48 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
/* super.c */
int btrfs_parse_options(struct btrfs_root *root, char *options);
int btrfs_sync_fs(struct super_block *sb, int wait);
+
+#ifdef CONFIG_PRINTK
+__printf(2, 3)
void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...);
+#else
+static inline __printf(2, 3)
+void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
+{
+}
+#endif
+
+__printf(5, 6)
void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
+
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root, const char *function,
unsigned int line, int errno);
+#define btrfs_set_fs_incompat(__fs_info, opt) \
+ __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+
+static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
+ u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_incompat_flags(disk_super);
+ if (!(features & flag)) {
+ features |= flag;
+ btrfs_set_super_incompat_flags(disk_super, features);
+ }
+}
+
+/*
+ * Call btrfs_abort_transaction as early as possible when an error condition is
+ * detected, that way the exact line number is reported.
+ */
+
#define btrfs_abort_transaction(trans, root, errno) \
do { \
__btrfs_abort_transaction(trans, root, __func__, \
@@ -3080,6 +3616,7 @@ do { \
(errno), fmt, ##args); \
} while (0)
+__printf(5, 6)
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
@@ -3127,15 +3664,16 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending);
/* scrub.c */
-int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
- struct btrfs_scrub_progress *progress, int readonly);
+int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
+ u64 end, struct btrfs_scrub_progress *progress,
+ int readonly, int is_dev_replace);
void btrfs_scrub_pause(struct btrfs_root *root);
void btrfs_scrub_pause_super(struct btrfs_root *root);
void btrfs_scrub_continue(struct btrfs_root *root);
void btrfs_scrub_continue_super(struct btrfs_root *root);
-int __btrfs_scrub_cancel(struct btrfs_fs_info *info);
-int btrfs_scrub_cancel(struct btrfs_root *root);
-int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev);
+int btrfs_scrub_cancel(struct btrfs_fs_info *info);
+int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
+ struct btrfs_device *dev);
int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
struct btrfs_scrub_progress *progress);
@@ -3156,17 +3694,49 @@ void btrfs_reada_detach(void *handle);
int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
u64 start, int err);
-/* delayed seq elem */
-struct seq_list {
+/* qgroup.c */
+struct qgroup_update {
struct list_head list;
- u64 seq;
- u32 flags;
+ struct btrfs_delayed_ref_node *node;
+ struct btrfs_delayed_extent_op *extent_op;
};
-void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
- struct seq_list *elem);
-void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
- struct seq_list *elem);
+int btrfs_quota_enable(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_quota_disable(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_quota_rescan(struct btrfs_fs_info *fs_info);
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid,
+ char *name);
+int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid);
+int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid,
+ struct btrfs_qgroup_limit *limit);
+int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
+void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
+struct btrfs_delayed_extent_op;
+int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op);
+int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op);
+int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
+ struct btrfs_qgroup_inherit *inherit);
+int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
+void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
+
+void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
static inline int is_fstree(u64 rootid)
{
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 2399f408691..34836036f01 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -29,7 +29,7 @@ static struct kmem_cache *delayed_node_cache;
int __init btrfs_delayed_inode_init(void)
{
- delayed_node_cache = kmem_cache_create("delayed_node",
+ delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
sizeof(struct btrfs_delayed_node),
0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
@@ -62,6 +62,7 @@ static inline void btrfs_init_delayed_node(
INIT_LIST_HEAD(&delayed_node->n_list);
INIT_LIST_HEAD(&delayed_node->p_list);
delayed_node->bytes_reserved = 0;
+ memset(&delayed_node->inode_item, 0, sizeof(delayed_node->inode_item));
}
static inline int btrfs_is_continuous_delayed_item(
@@ -511,8 +512,8 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
rb_erase(&delayed_item->rb_node, root);
delayed_item->delayed_node->count--;
- atomic_dec(&delayed_root->items);
- if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND &&
+ if (atomic_dec_return(&delayed_root->items) <
+ BTRFS_DELAYED_BACKGROUND &&
waitqueue_active(&delayed_root->wait))
wake_up(&delayed_root->wait);
}
@@ -649,8 +650,9 @@ static int btrfs_delayed_inode_reserve_metadata(
* we're accounted for.
*/
if (!src_rsv || (!trans->bytes_reserved &&
- src_rsv != &root->fs_info->delalloc_block_rsv)) {
- ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+ src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
+ ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+ BTRFS_RESERVE_NO_FLUSH);
/*
* Since we're under a transaction reserve_metadata_bytes could
* try to commit the transaction which will make it return
@@ -667,7 +669,7 @@ static int btrfs_delayed_inode_reserve_metadata(
num_bytes, 1);
}
return ret;
- } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
+ } else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
spin_lock(&BTRFS_I(inode)->lock);
if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
&BTRFS_I(inode)->runtime_flags)) {
@@ -685,7 +687,8 @@ static int btrfs_delayed_inode_reserve_metadata(
* reserve something strictly for us. If not be a pain and try
* to steal from the delalloc block rsv.
*/
- ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+ ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+ BTRFS_RESERVE_NO_FLUSH);
if (!ret)
goto out;
@@ -1027,9 +1030,10 @@ do_again:
btrfs_release_delayed_item(prev);
ret = 0;
btrfs_release_path(path);
- if (curr)
+ if (curr) {
+ mutex_unlock(&node->mutex);
goto do_again;
- else
+ } else
goto delete_fail;
}
@@ -1054,8 +1058,7 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
delayed_node->count--;
delayed_root = delayed_node->root->fs_info->delayed_root;
- atomic_dec(&delayed_root->items);
- if (atomic_read(&delayed_root->items) <
+ if (atomic_dec_return(&delayed_root->items) <
BTRFS_DELAYED_BACKGROUND &&
waitqueue_active(&delayed_root->wait))
wake_up(&delayed_root->wait);
@@ -1113,8 +1116,8 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
* Returns < 0 on error and returns with an aborted transaction with any
* outstanding delayed items cleaned up.
*/
-int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int nr)
{
struct btrfs_root *curr_root = root;
struct btrfs_delayed_root *delayed_root;
@@ -1122,6 +1125,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct btrfs_block_rsv *block_rsv;
int ret = 0;
+ bool count = (nr > 0);
if (trans->aborted)
return -EIO;
@@ -1137,7 +1141,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
delayed_root = btrfs_get_delayed_root(root);
curr_node = btrfs_first_delayed_node(delayed_root);
- while (curr_node) {
+ while (curr_node && (!count || (count && nr--))) {
curr_root = curr_node->root;
ret = btrfs_insert_delayed_items(trans, path, curr_root,
curr_node);
@@ -1149,6 +1153,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
path, curr_node);
if (ret) {
btrfs_release_delayed_node(curr_node);
+ curr_node = NULL;
btrfs_abort_transaction(trans, root, ret);
break;
}
@@ -1158,12 +1163,26 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
btrfs_release_delayed_node(prev_node);
}
+ if (curr_node)
+ btrfs_release_delayed_node(curr_node);
btrfs_free_path(path);
trans->block_rsv = block_rsv;
return ret;
}
+int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ return __btrfs_run_delayed_items(trans, root, -1);
+}
+
+int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int nr)
+{
+ return __btrfs_run_delayed_items(trans, root, nr);
+}
+
static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_delayed_node *node)
{
@@ -1238,7 +1257,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
struct btrfs_delayed_node *delayed_node = NULL;
struct btrfs_root *root;
struct btrfs_block_rsv *block_rsv;
- unsigned long nr = 0;
int need_requeue = 0;
int ret;
@@ -1299,11 +1317,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
delayed_node);
mutex_unlock(&delayed_node->mutex);
- nr = trans->blocks_used;
-
trans->block_rsv = block_rsv;
btrfs_end_transaction_dmeta(trans, root);
- __btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty_nodelay(root);
free_path:
btrfs_free_path(path);
out:
@@ -1698,8 +1714,8 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *inode_item,
struct inode *inode)
{
- btrfs_set_stack_inode_uid(inode_item, inode->i_uid);
- btrfs_set_stack_inode_gid(inode_item, inode->i_gid);
+ btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode));
+ btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode));
btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
btrfs_set_stack_inode_mode(inode_item, inode->i_mode);
btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink);
@@ -1747,8 +1763,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
inode_item = &delayed_node->inode_item;
- inode->i_uid = btrfs_stack_inode_uid(inode_item);
- inode->i_gid = btrfs_stack_inode_gid(inode_item);
+ i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
+ i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
inode->i_mode = btrfs_stack_inode_mode(inode_item);
set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index f5aa4023d3e..4f808e1baee 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -107,6 +107,8 @@ int btrfs_inode_delayed_dir_index_count(struct inode *inode);
int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
+int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int nr);
void btrfs_balance_delayed_items(struct btrfs_root *root);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 13ae7b04790..ae941177339 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -38,17 +38,14 @@
static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
struct btrfs_delayed_tree_ref *ref1)
{
- if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
- if (ref1->root < ref2->root)
- return -1;
- if (ref1->root > ref2->root)
- return 1;
- } else {
- if (ref1->parent < ref2->parent)
- return -1;
- if (ref1->parent > ref2->parent)
- return 1;
- }
+ if (ref1->root < ref2->root)
+ return -1;
+ if (ref1->root > ref2->root)
+ return 1;
+ if (ref1->parent < ref2->parent)
+ return -1;
+ if (ref1->parent > ref2->parent)
+ return 1;
return 0;
}
@@ -85,7 +82,8 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
* type of the delayed backrefs and content of delayed backrefs.
*/
static int comp_entry(struct btrfs_delayed_ref_node *ref2,
- struct btrfs_delayed_ref_node *ref1)
+ struct btrfs_delayed_ref_node *ref1,
+ bool compare_seq)
{
if (ref1->bytenr < ref2->bytenr)
return -1;
@@ -102,10 +100,12 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
if (ref1->type > ref2->type)
return 1;
/* merging of sequenced refs is not allowed */
- if (ref1->seq < ref2->seq)
- return -1;
- if (ref1->seq > ref2->seq)
- return 1;
+ if (compare_seq) {
+ if (ref1->seq < ref2->seq)
+ return -1;
+ if (ref1->seq > ref2->seq)
+ return 1;
+ }
if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
@@ -139,7 +139,7 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
rb_node);
- cmp = comp_entry(entry, ins);
+ cmp = comp_entry(entry, ins, 1);
if (cmp < 0)
p = &(*p)->rb_left;
else if (cmp > 0)
@@ -233,22 +233,134 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
return 0;
}
-int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
- u64 seq)
+static void inline drop_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_node *ref)
{
- struct seq_list *elem;
+ rb_erase(&ref->rb_node, &delayed_refs->root);
+ ref->in_tree = 0;
+ btrfs_put_delayed_ref(ref);
+ delayed_refs->num_entries--;
+ if (trans->delayed_ref_updates)
+ trans->delayed_ref_updates--;
+}
- assert_spin_locked(&delayed_refs->lock);
- if (list_empty(&delayed_refs->seq_head))
- return 0;
+static int merge_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_node *ref, u64 seq)
+{
+ struct rb_node *node;
+ int merged = 0;
+ int mod = 0;
+ int done = 0;
+
+ node = rb_prev(&ref->rb_node);
+ while (node) {
+ struct btrfs_delayed_ref_node *next;
+
+ next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ node = rb_prev(node);
+ if (next->bytenr != ref->bytenr)
+ break;
+ if (seq && next->seq >= seq)
+ break;
+ if (comp_entry(ref, next, 0))
+ continue;
+
+ if (ref->action == next->action) {
+ mod = next->ref_mod;
+ } else {
+ if (ref->ref_mod < next->ref_mod) {
+ struct btrfs_delayed_ref_node *tmp;
- elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list);
- if (seq >= elem->seq) {
- pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n",
- seq, elem->seq, delayed_refs);
- return 1;
+ tmp = ref;
+ ref = next;
+ next = tmp;
+ done = 1;
+ }
+ mod = -next->ref_mod;
+ }
+
+ merged++;
+ drop_delayed_ref(trans, delayed_refs, next);
+ ref->ref_mod += mod;
+ if (ref->ref_mod == 0) {
+ drop_delayed_ref(trans, delayed_refs, ref);
+ break;
+ } else {
+ /*
+ * You can't have multiples of the same ref on a tree
+ * block.
+ */
+ WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
+ }
+
+ if (done)
+ break;
+ node = rb_prev(&ref->rb_node);
}
- return 0;
+
+ return merged;
+}
+
+void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
+{
+ struct rb_node *node;
+ u64 seq = 0;
+
+ spin_lock(&fs_info->tree_mod_seq_lock);
+ if (!list_empty(&fs_info->tree_mod_seq_list)) {
+ struct seq_list *elem;
+
+ elem = list_first_entry(&fs_info->tree_mod_seq_list,
+ struct seq_list, list);
+ seq = elem->seq;
+ }
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+
+ node = rb_prev(&head->node.rb_node);
+ while (node) {
+ struct btrfs_delayed_ref_node *ref;
+
+ ref = rb_entry(node, struct btrfs_delayed_ref_node,
+ rb_node);
+ if (ref->bytenr != head->node.bytenr)
+ break;
+
+ /* We can't merge refs that are outside of our seq count */
+ if (seq && ref->seq >= seq)
+ break;
+ if (merge_ref(trans, delayed_refs, ref, seq))
+ node = rb_prev(&head->node.rb_node);
+ else
+ node = rb_prev(node);
+ }
+}
+
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ u64 seq)
+{
+ struct seq_list *elem;
+ int ret = 0;
+
+ spin_lock(&fs_info->tree_mod_seq_lock);
+ if (!list_empty(&fs_info->tree_mod_seq_list)) {
+ elem = list_first_entry(&fs_info->tree_mod_seq_list,
+ struct seq_list, list);
+ if (seq >= elem->seq) {
+ pr_debug("holding back delayed_ref %llu, lowest is "
+ "%llu (%p)\n", seq, elem->seq, delayed_refs);
+ ret = 1;
+ }
+ }
+
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+ return ret;
}
int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
@@ -332,18 +444,11 @@ update_existing_ref(struct btrfs_trans_handle *trans,
* every changing the extent allocation tree.
*/
existing->ref_mod--;
- if (existing->ref_mod == 0) {
- rb_erase(&existing->rb_node,
- &delayed_refs->root);
- existing->in_tree = 0;
- btrfs_put_delayed_ref(existing);
- delayed_refs->num_entries--;
- if (trans->delayed_ref_updates)
- trans->delayed_ref_updates--;
- } else {
+ if (existing->ref_mod == 0)
+ drop_delayed_ref(trans, delayed_refs, existing);
+ else
WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
- }
} else {
WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
@@ -525,8 +630,8 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
ref->is_head = 0;
ref->in_tree = 1;
- if (is_fstree(ref_root))
- seq = inc_delayed_seq(delayed_refs);
+ if (need_ref_seq(for_cow, ref_root))
+ seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
ref->seq = seq;
full_ref = btrfs_delayed_node_to_tree_ref(ref);
@@ -584,8 +689,8 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
ref->is_head = 0;
ref->in_tree = 1;
- if (is_fstree(ref_root))
- seq = inc_delayed_seq(delayed_refs);
+ if (need_ref_seq(for_cow, ref_root))
+ seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
ref->seq = seq;
full_ref = btrfs_delayed_node_to_data_ref(ref);
@@ -658,10 +763,9 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
num_bytes, parent, ref_root, level, action,
for_cow);
- if (!is_fstree(ref_root) &&
- waitqueue_active(&delayed_refs->seq_wait))
- wake_up(&delayed_refs->seq_wait);
spin_unlock(&delayed_refs->lock);
+ if (need_ref_seq(for_cow, ref_root))
+ btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
return 0;
}
@@ -707,10 +811,9 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
num_bytes, parent, ref_root, owner, offset,
action, for_cow);
- if (!is_fstree(ref_root) &&
- waitqueue_active(&delayed_refs->seq_wait))
- wake_up(&delayed_refs->seq_wait);
spin_unlock(&delayed_refs->lock);
+ if (need_ref_seq(for_cow, ref_root))
+ btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
return 0;
}
@@ -736,8 +839,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
extent_op->is_data);
- if (waitqueue_active(&delayed_refs->seq_wait))
- wake_up(&delayed_refs->seq_wait);
spin_unlock(&delayed_refs->lock);
return 0;
}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 413927fb995..c9d703693df 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -18,7 +18,7 @@
#ifndef __DELAYED_REF__
#define __DELAYED_REF__
-/* these are the possible values of struct btrfs_delayed_ref->action */
+/* these are the possible values of struct btrfs_delayed_ref_node->action */
#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
@@ -139,26 +139,6 @@ struct btrfs_delayed_ref_root {
int flushing;
u64 run_delayed_start;
-
- /*
- * seq number of delayed refs. We need to know if a backref was being
- * added before the currently processed ref or afterwards.
- */
- u64 seq;
-
- /*
- * seq_list holds a list of all seq numbers that are currently being
- * added to the list. While walking backrefs (btrfs_find_all_roots,
- * qgroups), which might take some time, no newer ref must be processed,
- * as it might influence the outcome of the walk.
- */
- struct list_head seq_head;
-
- /*
- * when the only refs we have in the list must not be processed, we want
- * to wait for more refs to show up or for the end of backref walking.
- */
- wait_queue_head_t seq_wait;
};
static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
@@ -187,6 +167,10 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
struct btrfs_delayed_extent_op *extent_op);
+void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head);
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
@@ -195,34 +179,28 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
struct list_head *cluster, u64 search_start);
-static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
-{
- assert_spin_locked(&delayed_refs->lock);
- ++delayed_refs->seq;
- return delayed_refs->seq;
-}
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ u64 seq);
-static inline void
-btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
- struct seq_list *elem)
+/*
+ * delayed refs with a ref_seq > 0 must be held back during backref walking.
+ * this only applies to items in one of the fs-trees. for_cow items never need
+ * to be held back, so they won't get a ref_seq number.
+ */
+static inline int need_ref_seq(int for_cow, u64 rootid)
{
- assert_spin_locked(&delayed_refs->lock);
- elem->seq = delayed_refs->seq;
- list_add_tail(&elem->list, &delayed_refs->seq_head);
-}
+ if (for_cow)
+ return 0;
-static inline void
-btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
- struct seq_list *elem)
-{
- spin_lock(&delayed_refs->lock);
- list_del(&elem->list);
- wake_up(&delayed_refs->seq_wait);
- spin_unlock(&delayed_refs->lock);
-}
+ if (rootid == BTRFS_FS_TREE_OBJECTID)
+ return 1;
-int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
- u64 seq);
+ if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
+ return 1;
+
+ return 0;
+}
/*
* a node might live in a head or a regular ref, this lets you
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
new file mode 100644
index 00000000000..66dbc8dbddf
--- /dev/null
+++ b/fs/btrfs/dev-replace.c
@@ -0,0 +1,856 @@
+/*
+ * Copyright (C) STRATO AG 2012. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/random.h>
+#include <linux/iocontext.h>
+#include <linux/capability.h>
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <asm/div64.h>
+#include "compat.h"
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "async-thread.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "dev-replace.h"
+
+static u64 btrfs_get_seconds_since_1970(void);
+static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
+ int scrub_ret);
+static void btrfs_dev_replace_update_device_in_mapping_tree(
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev);
+static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
+ char *srcdev_name,
+ struct btrfs_device **device);
+static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
+static int btrfs_dev_replace_kthread(void *data);
+static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
+
+
+int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_key key;
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct extent_buffer *eb;
+ int slot;
+ int ret = 0;
+ struct btrfs_path *path = NULL;
+ int item_size;
+ struct btrfs_dev_replace_item *ptr;
+ u64 src_devid;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = 0;
+ key.type = BTRFS_DEV_REPLACE_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
+ if (ret) {
+no_valid_dev_replace_entry_found:
+ ret = 0;
+ dev_replace->replace_state =
+ BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
+ dev_replace->cont_reading_from_srcdev_mode =
+ BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
+ dev_replace->replace_state = 0;
+ dev_replace->time_started = 0;
+ dev_replace->time_stopped = 0;
+ atomic64_set(&dev_replace->num_write_errors, 0);
+ atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
+ dev_replace->cursor_left = 0;
+ dev_replace->committed_cursor_left = 0;
+ dev_replace->cursor_left_last_write_of_item = 0;
+ dev_replace->cursor_right = 0;
+ dev_replace->srcdev = NULL;
+ dev_replace->tgtdev = NULL;
+ dev_replace->is_valid = 0;
+ dev_replace->item_needs_writeback = 0;
+ goto out;
+ }
+ slot = path->slots[0];
+ eb = path->nodes[0];
+ item_size = btrfs_item_size_nr(eb, slot);
+ ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
+
+ if (item_size != sizeof(struct btrfs_dev_replace_item)) {
+ pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n");
+ goto no_valid_dev_replace_entry_found;
+ }
+
+ src_devid = btrfs_dev_replace_src_devid(eb, ptr);
+ dev_replace->cont_reading_from_srcdev_mode =
+ btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
+ dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
+ dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
+ dev_replace->time_stopped =
+ btrfs_dev_replace_time_stopped(eb, ptr);
+ atomic64_set(&dev_replace->num_write_errors,
+ btrfs_dev_replace_num_write_errors(eb, ptr));
+ atomic64_set(&dev_replace->num_uncorrectable_read_errors,
+ btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
+ dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
+ dev_replace->committed_cursor_left = dev_replace->cursor_left;
+ dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
+ dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
+ dev_replace->is_valid = 1;
+
+ dev_replace->item_needs_writeback = 0;
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ dev_replace->srcdev = NULL;
+ dev_replace->tgtdev = NULL;
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
+ NULL, NULL);
+ dev_replace->tgtdev = btrfs_find_device(fs_info,
+ BTRFS_DEV_REPLACE_DEVID,
+ NULL, NULL);
+ /*
+ * allow 'btrfs dev replace_cancel' if src/tgt device is
+ * missing
+ */
+ if (!dev_replace->srcdev &&
+ !btrfs_test_opt(dev_root, DEGRADED)) {
+ ret = -EIO;
+ pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n",
+ (unsigned long long)src_devid);
+ }
+ if (!dev_replace->tgtdev &&
+ !btrfs_test_opt(dev_root, DEGRADED)) {
+ ret = -EIO;
+ pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n",
+ (unsigned long long)BTRFS_DEV_REPLACE_DEVID);
+ }
+ if (dev_replace->tgtdev) {
+ if (dev_replace->srcdev) {
+ dev_replace->tgtdev->total_bytes =
+ dev_replace->srcdev->total_bytes;
+ dev_replace->tgtdev->disk_total_bytes =
+ dev_replace->srcdev->disk_total_bytes;
+ dev_replace->tgtdev->bytes_used =
+ dev_replace->srcdev->bytes_used;
+ }
+ dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
+ btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
+ dev_replace->tgtdev);
+ }
+ break;
+ }
+
+out:
+ if (path)
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * called from commit_transaction. Writes changed device replace state to
+ * disk.
+ */
+int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ int ret;
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+ struct btrfs_dev_replace_item *ptr;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ btrfs_dev_replace_lock(dev_replace);
+ if (!dev_replace->is_valid ||
+ !dev_replace->item_needs_writeback) {
+ btrfs_dev_replace_unlock(dev_replace);
+ return 0;
+ }
+ btrfs_dev_replace_unlock(dev_replace);
+
+ key.objectid = 0;
+ key.type = BTRFS_DEV_REPLACE_KEY;
+ key.offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
+ if (ret < 0) {
+ pr_warn("btrfs: error %d while searching for dev_replace item!\n",
+ ret);
+ goto out;
+ }
+
+ if (ret == 0 &&
+ btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+ /*
+ * need to delete old one and insert a new one.
+ * Since no attempt is made to recover any old state, if the
+ * dev_replace state is 'running', the data on the target
+ * drive is lost.
+ * It would be possible to recover the state: just make sure
+ * that the beginning of the item is never changed and always
+ * contains all the essential information. Then read this
+ * minimal set of information and use it as a base for the
+ * new state.
+ */
+ ret = btrfs_del_item(trans, dev_root, path);
+ if (ret != 0) {
+ pr_warn("btrfs: delete too small dev_replace item failed %d!\n",
+ ret);
+ goto out;
+ }
+ ret = 1;
+ }
+
+ if (ret == 1) {
+ /* need to insert a new item */
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, dev_root, path,
+ &key, sizeof(*ptr));
+ if (ret < 0) {
+ pr_warn("btrfs: insert dev_replace item failed %d!\n",
+ ret);
+ goto out;
+ }
+ }
+
+ eb = path->nodes[0];
+ ptr = btrfs_item_ptr(eb, path->slots[0],
+ struct btrfs_dev_replace_item);
+
+ btrfs_dev_replace_lock(dev_replace);
+ if (dev_replace->srcdev)
+ btrfs_set_dev_replace_src_devid(eb, ptr,
+ dev_replace->srcdev->devid);
+ else
+ btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
+ btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
+ dev_replace->cont_reading_from_srcdev_mode);
+ btrfs_set_dev_replace_replace_state(eb, ptr,
+ dev_replace->replace_state);
+ btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
+ btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
+ btrfs_set_dev_replace_num_write_errors(eb, ptr,
+ atomic64_read(&dev_replace->num_write_errors));
+ btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
+ atomic64_read(&dev_replace->num_uncorrectable_read_errors));
+ dev_replace->cursor_left_last_write_of_item =
+ dev_replace->cursor_left;
+ btrfs_set_dev_replace_cursor_left(eb, ptr,
+ dev_replace->cursor_left_last_write_of_item);
+ btrfs_set_dev_replace_cursor_right(eb, ptr,
+ dev_replace->cursor_right);
+ dev_replace->item_needs_writeback = 0;
+ btrfs_dev_replace_unlock(dev_replace);
+
+ btrfs_mark_buffer_dirty(eb);
+
+out:
+ btrfs_free_path(path);
+
+ return ret;
+}
+
+void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ dev_replace->committed_cursor_left =
+ dev_replace->cursor_left_last_write_of_item;
+}
+
+static u64 btrfs_get_seconds_since_1970(void)
+{
+ struct timespec t = CURRENT_TIME_SEC;
+
+ return t.tv_sec;
+}
+
+int btrfs_dev_replace_start(struct btrfs_root *root,
+ struct btrfs_ioctl_dev_replace_args *args)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ int ret;
+ struct btrfs_device *tgt_device = NULL;
+ struct btrfs_device *src_device = NULL;
+
+ switch (args->start.cont_reading_from_srcdev_mode) {
+ case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
+ case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
+ args->start.tgtdev_name[0] == '\0')
+ return -EINVAL;
+
+ mutex_lock(&fs_info->volume_mutex);
+ ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
+ &tgt_device);
+ if (ret) {
+ pr_err("btrfs: target device %s is invalid!\n",
+ args->start.tgtdev_name);
+ mutex_unlock(&fs_info->volume_mutex);
+ return -EINVAL;
+ }
+
+ ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
+ args->start.srcdev_name,
+ &src_device);
+ mutex_unlock(&fs_info->volume_mutex);
+ if (ret) {
+ ret = -EINVAL;
+ goto leave_no_lock;
+ }
+
+ if (tgt_device->total_bytes < src_device->total_bytes) {
+ pr_err("btrfs: target device is smaller than source device!\n");
+ ret = -EINVAL;
+ goto leave_no_lock;
+ }
+
+ btrfs_dev_replace_lock(dev_replace);
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
+ goto leave;
+ }
+
+ dev_replace->cont_reading_from_srcdev_mode =
+ args->start.cont_reading_from_srcdev_mode;
+ WARN_ON(!src_device);
+ dev_replace->srcdev = src_device;
+ WARN_ON(!tgt_device);
+ dev_replace->tgtdev = tgt_device;
+
+ printk_in_rcu(KERN_INFO
+ "btrfs: dev_replace from %s (devid %llu) to %s) started\n",
+ src_device->missing ? "<missing disk>" :
+ rcu_str_deref(src_device->name),
+ src_device->devid,
+ rcu_str_deref(tgt_device->name));
+
+ tgt_device->total_bytes = src_device->total_bytes;
+ tgt_device->disk_total_bytes = src_device->disk_total_bytes;
+ tgt_device->bytes_used = src_device->bytes_used;
+
+ /*
+ * from now on, the writes to the srcdev are all duplicated to
+ * go to the tgtdev as well (refer to btrfs_map_block()).
+ */
+ dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
+ dev_replace->time_started = btrfs_get_seconds_since_1970();
+ dev_replace->cursor_left = 0;
+ dev_replace->committed_cursor_left = 0;
+ dev_replace->cursor_left_last_write_of_item = 0;
+ dev_replace->cursor_right = 0;
+ dev_replace->is_valid = 1;
+ dev_replace->item_needs_writeback = 1;
+ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+ btrfs_dev_replace_unlock(dev_replace);
+
+ btrfs_wait_ordered_extents(root, 0);
+
+ /* force writing the updated state information to disk */
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_dev_replace_lock(dev_replace);
+ goto leave;
+ }
+
+ ret = btrfs_commit_transaction(trans, root);
+ WARN_ON(ret);
+
+ /* the disk copy procedure reuses the scrub code */
+ ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
+ src_device->total_bytes,
+ &dev_replace->scrub_progress, 0, 1);
+
+ ret = btrfs_dev_replace_finishing(root->fs_info, ret);
+ WARN_ON(ret);
+
+ return 0;
+
+leave:
+ dev_replace->srcdev = NULL;
+ dev_replace->tgtdev = NULL;
+ btrfs_dev_replace_unlock(dev_replace);
+leave_no_lock:
+ if (tgt_device)
+ btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+ return ret;
+}
+
+static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
+ int scrub_ret)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_device *tgt_device;
+ struct btrfs_device *src_device;
+ struct btrfs_root *root = fs_info->tree_root;
+ u8 uuid_tmp[BTRFS_UUID_SIZE];
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+
+ /* don't allow cancel or unmount to disturb the finishing procedure */
+ mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+
+ btrfs_dev_replace_lock(dev_replace);
+ /* was the operation canceled, or is it finished? */
+ if (dev_replace->replace_state !=
+ BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
+ btrfs_dev_replace_unlock(dev_replace);
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return 0;
+ }
+
+ tgt_device = dev_replace->tgtdev;
+ src_device = dev_replace->srcdev;
+ btrfs_dev_replace_unlock(dev_replace);
+
+ /* replace old device with new one in mapping tree */
+ if (!scrub_ret)
+ btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
+ src_device,
+ tgt_device);
+
+ /*
+ * flush all outstanding I/O and inode extent mappings before the
+ * copy operation is declared as being finished
+ */
+ btrfs_start_delalloc_inodes(root, 0);
+ btrfs_wait_ordered_extents(root, 0);
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans, root);
+ WARN_ON(ret);
+
+ /* keep away write_all_supers() during the finishing procedure */
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ btrfs_dev_replace_lock(dev_replace);
+ dev_replace->replace_state =
+ scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
+ : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
+ dev_replace->tgtdev = NULL;
+ dev_replace->srcdev = NULL;
+ dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+ dev_replace->item_needs_writeback = 1;
+
+ if (scrub_ret) {
+ printk_in_rcu(KERN_ERR
+ "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
+ src_device->missing ? "<missing disk>" :
+ rcu_str_deref(src_device->name),
+ src_device->devid,
+ rcu_str_deref(tgt_device->name), scrub_ret);
+ btrfs_dev_replace_unlock(dev_replace);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ if (tgt_device)
+ btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+
+ return 0;
+ }
+
+ printk_in_rcu(KERN_INFO
+ "btrfs: dev_replace from %s (devid %llu) to %s) finished\n",
+ src_device->missing ? "<missing disk>" :
+ rcu_str_deref(src_device->name),
+ src_device->devid,
+ rcu_str_deref(tgt_device->name));
+ tgt_device->is_tgtdev_for_dev_replace = 0;
+ tgt_device->devid = src_device->devid;
+ src_device->devid = BTRFS_DEV_REPLACE_DEVID;
+ tgt_device->bytes_used = src_device->bytes_used;
+ memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
+ memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
+ memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
+ tgt_device->total_bytes = src_device->total_bytes;
+ tgt_device->disk_total_bytes = src_device->disk_total_bytes;
+ tgt_device->bytes_used = src_device->bytes_used;
+ if (fs_info->sb->s_bdev == src_device->bdev)
+ fs_info->sb->s_bdev = tgt_device->bdev;
+ if (fs_info->fs_devices->latest_bdev == src_device->bdev)
+ fs_info->fs_devices->latest_bdev = tgt_device->bdev;
+ list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
+
+ btrfs_rm_dev_replace_srcdev(fs_info, src_device);
+ if (src_device->bdev) {
+ /* zero out the old super */
+ btrfs_scratch_superblock(src_device);
+ }
+ /*
+ * this is again a consistent state where no dev_replace procedure
+ * is running, the target device is part of the filesystem, the
+ * source device is not part of the filesystem anymore and its 1st
+ * superblock is scratched out so that it is no longer marked to
+ * belong to this filesystem.
+ */
+ btrfs_dev_replace_unlock(dev_replace);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+ /* write back the superblocks */
+ trans = btrfs_start_transaction(root, 0);
+ if (!IS_ERR(trans))
+ btrfs_commit_transaction(trans, root);
+
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+
+ return 0;
+}
+
+static void btrfs_dev_replace_update_device_in_mapping_tree(
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 start = 0;
+ int i;
+
+ write_lock(&em_tree->lock);
+ do {
+ em = lookup_extent_mapping(em_tree, start, (u64)-1);
+ if (!em)
+ break;
+ map = (struct map_lookup *)em->bdev;
+ for (i = 0; i < map->num_stripes; i++)
+ if (srcdev == map->stripes[i].dev)
+ map->stripes[i].dev = tgtdev;
+ start = em->start + em->len;
+ free_extent_map(em);
+ } while (start);
+ write_unlock(&em_tree->lock);
+}
+
+static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
+ char *srcdev_name,
+ struct btrfs_device **device)
+{
+ int ret;
+
+ if (srcdevid) {
+ ret = 0;
+ *device = btrfs_find_device(root->fs_info, srcdevid, NULL,
+ NULL);
+ if (!*device)
+ ret = -ENOENT;
+ } else {
+ ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
+ device);
+ }
+ return ret;
+}
+
+void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dev_replace_args *args)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ btrfs_dev_replace_lock(dev_replace);
+ /* even if !dev_replace_is_valid, the values are good enough for
+ * the replace_status ioctl */
+ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+ args->status.replace_state = dev_replace->replace_state;
+ args->status.time_started = dev_replace->time_started;
+ args->status.time_stopped = dev_replace->time_stopped;
+ args->status.num_write_errors =
+ atomic64_read(&dev_replace->num_write_errors);
+ args->status.num_uncorrectable_read_errors =
+ atomic64_read(&dev_replace->num_uncorrectable_read_errors);
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ args->status.progress_1000 = 0;
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ args->status.progress_1000 = 1000;
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
+ div64_u64(dev_replace->srcdev->total_bytes, 1000));
+ break;
+ }
+ btrfs_dev_replace_unlock(dev_replace);
+}
+
+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dev_replace_args *args)
+{
+ args->result = __btrfs_dev_replace_cancel(fs_info);
+ return 0;
+}
+
+static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_device *tgt_device = NULL;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = fs_info->tree_root;
+ u64 result;
+ int ret;
+
+ mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+ btrfs_dev_replace_lock(dev_replace);
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
+ btrfs_dev_replace_unlock(dev_replace);
+ goto leave;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+ tgt_device = dev_replace->tgtdev;
+ dev_replace->tgtdev = NULL;
+ dev_replace->srcdev = NULL;
+ break;
+ }
+ dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
+ dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+ dev_replace->item_needs_writeback = 1;
+ btrfs_dev_replace_unlock(dev_replace);
+ btrfs_scrub_cancel(fs_info);
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans, root);
+ WARN_ON(ret);
+ if (tgt_device)
+ btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+
+leave:
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return result;
+}
+
+void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+ btrfs_dev_replace_lock(dev_replace);
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+ dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+ dev_replace->item_needs_writeback = 1;
+ pr_info("btrfs: suspending dev_replace for unmount\n");
+ break;
+ }
+
+ btrfs_dev_replace_unlock(dev_replace);
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+}
+
+/* resume dev_replace procedure that was interrupted by unmount */
+int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
+{
+ struct task_struct *task;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ btrfs_dev_replace_lock(dev_replace);
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ btrfs_dev_replace_unlock(dev_replace);
+ return 0;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
+ break;
+ }
+ if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
+ pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n"
+ "btrfs: you may cancel the operation after 'mount -o degraded'\n");
+ btrfs_dev_replace_unlock(dev_replace);
+ return 0;
+ }
+ btrfs_dev_replace_unlock(dev_replace);
+
+ WARN_ON(atomic_xchg(
+ &fs_info->mutually_exclusive_operation_running, 1));
+ task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
+ return PTR_RET(task);
+}
+
+static int btrfs_dev_replace_kthread(void *data)
+{
+ struct btrfs_fs_info *fs_info = data;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_ioctl_dev_replace_args *status_args;
+ u64 progress;
+
+ status_args = kzalloc(sizeof(*status_args), GFP_NOFS);
+ if (status_args) {
+ btrfs_dev_replace_status(fs_info, status_args);
+ progress = status_args->status.progress_1000;
+ kfree(status_args);
+ do_div(progress, 10);
+ printk_in_rcu(KERN_INFO
+ "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
+ dev_replace->srcdev->missing ? "<missing disk>" :
+ rcu_str_deref(dev_replace->srcdev->name),
+ dev_replace->srcdev->devid,
+ dev_replace->tgtdev ?
+ rcu_str_deref(dev_replace->tgtdev->name) :
+ "<missing target disk>",
+ (unsigned int)progress);
+ }
+ btrfs_dev_replace_continue_on_mount(fs_info);
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+
+ return 0;
+}
+
+static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ int ret;
+
+ ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
+ dev_replace->committed_cursor_left,
+ dev_replace->srcdev->total_bytes,
+ &dev_replace->scrub_progress, 0, 1);
+ ret = btrfs_dev_replace_finishing(fs_info, ret);
+ WARN_ON(ret);
+ return 0;
+}
+
+int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
+{
+ if (!dev_replace->is_valid)
+ return 0;
+
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ return 0;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ /*
+ * return true even if tgtdev is missing (this is
+ * something that can happen if the dev_replace
+ * procedure is suspended by an umount and then
+ * the tgtdev is missing (or "btrfs dev scan") was
+ * not called and the the filesystem is remounted
+ * in degraded state. This does not stop the
+ * dev_replace procedure. It needs to be canceled
+ * manually if the cancelation is wanted.
+ */
+ break;
+ }
+ return 1;
+}
+
+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
+{
+ /* the beginning is just an optimization for the typical case */
+ if (atomic_read(&dev_replace->nesting_level) == 0) {
+acquire_lock:
+ /* this is not a nested case where the same thread
+ * is trying to acqurire the same lock twice */
+ mutex_lock(&dev_replace->lock);
+ mutex_lock(&dev_replace->lock_management_lock);
+ dev_replace->lock_owner = current->pid;
+ atomic_inc(&dev_replace->nesting_level);
+ mutex_unlock(&dev_replace->lock_management_lock);
+ return;
+ }
+
+ mutex_lock(&dev_replace->lock_management_lock);
+ if (atomic_read(&dev_replace->nesting_level) > 0 &&
+ dev_replace->lock_owner == current->pid) {
+ WARN_ON(!mutex_is_locked(&dev_replace->lock));
+ atomic_inc(&dev_replace->nesting_level);
+ mutex_unlock(&dev_replace->lock_management_lock);
+ return;
+ }
+
+ mutex_unlock(&dev_replace->lock_management_lock);
+ goto acquire_lock;
+}
+
+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
+{
+ WARN_ON(!mutex_is_locked(&dev_replace->lock));
+ mutex_lock(&dev_replace->lock_management_lock);
+ WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
+ WARN_ON(dev_replace->lock_owner != current->pid);
+ atomic_dec(&dev_replace->nesting_level);
+ if (atomic_read(&dev_replace->nesting_level) == 0) {
+ dev_replace->lock_owner = 0;
+ mutex_unlock(&dev_replace->lock_management_lock);
+ mutex_unlock(&dev_replace->lock);
+ } else {
+ mutex_unlock(&dev_replace->lock_management_lock);
+ }
+}
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
new file mode 100644
index 00000000000..20035cbbf02
--- /dev/null
+++ b/fs/btrfs/dev-replace.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) STRATO AG 2012. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#if !defined(__BTRFS_DEV_REPLACE__)
+#define __BTRFS_DEV_REPLACE__
+
+struct btrfs_ioctl_dev_replace_args;
+
+int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
+int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
+int btrfs_dev_replace_start(struct btrfs_root *root,
+ struct btrfs_ioctl_dev_replace_args *args);
+void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dev_replace_args *args);
+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dev_replace_args *args);
+void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
+int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
+int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace);
+
+static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
+{
+ atomic64_inc(stat_value);
+}
+#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index c1a074d0696..502c2158167 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -213,6 +213,65 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
return btrfs_match_dir_item_name(root, path, name, name_len);
}
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+ const char *name, int name_len)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_dir_item *di;
+ int data_size;
+ struct extent_buffer *leaf;
+ int slot;
+ struct btrfs_path *path;
+
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = dir;
+ btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+ key.offset = btrfs_name_hash(name, name_len);
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+
+ /* return back any errors */
+ if (ret < 0)
+ goto out;
+
+ /* nothing found, we're safe */
+ if (ret > 0) {
+ ret = 0;
+ goto out;
+ }
+
+ /* we found an item, look for our name in the item */
+ di = btrfs_match_dir_item_name(root, path, name, name_len);
+ if (di) {
+ /* our exact name was found */
+ ret = -EEXIST;
+ goto out;
+ }
+
+ /*
+ * see if there is room in the item to insert this
+ * name
+ */
+ data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item);
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (data_size + btrfs_item_size_nr(leaf, slot) +
+ sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) {
+ ret = -EOVERFLOW;
+ } else {
+ /* plenty of insertion room */
+ ret = 0;
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
/*
* lookup a directory item based on index. 'dir' is the objectid
* we're searching in, and 'mod' tells us if you plan on deleting the
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2936ca49b3b..a8f652dc940 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -45,6 +45,11 @@
#include "inode-map.h"
#include "check-integrity.h"
#include "rcu-string.h"
+#include "dev-replace.h"
+
+#ifdef CONFIG_X86
+#include <asm/cpufeature.h>
+#endif
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
@@ -217,26 +222,16 @@ static struct extent_map *btree_get_extent(struct inode *inode,
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
if (ret == -EEXIST) {
- u64 failed_start = em->start;
- u64 failed_len = em->len;
-
free_extent_map(em);
em = lookup_extent_mapping(em_tree, start, len);
- if (em) {
- ret = 0;
- } else {
- em = lookup_extent_mapping(em_tree, failed_start,
- failed_len);
- ret = -EIO;
- }
+ if (!em)
+ em = ERR_PTR(-EIO);
} else if (ret) {
free_extent_map(em);
- em = NULL;
+ em = ERR_PTR(ret);
}
write_unlock(&em_tree->lock);
- if (ret)
- em = ERR_PTR(ret);
out:
return em;
}
@@ -377,9 +372,13 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
ret = read_extent_buffer_pages(io_tree, eb, start,
WAIT_COMPLETE,
btree_get_extent, mirror_num);
- if (!ret && !verify_parent_transid(io_tree, eb,
+ if (!ret) {
+ if (!verify_parent_transid(io_tree, eb,
parent_transid, 0))
- break;
+ break;
+ else
+ ret = -EIO;
+ }
/*
* This buffer's crc is fine, but its contents are corrupted, so
@@ -389,7 +388,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
break;
- num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
+ num_copies = btrfs_num_copies(root->fs_info,
eb->start, eb->len);
if (num_copies == 1)
break;
@@ -407,7 +406,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
break;
}
- if (failed && !ret)
+ if (failed && !ret && failed_mirror)
repair_eb_io_failure(root, eb, failed_mirror);
return ret;
@@ -435,10 +434,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
WARN_ON(1);
return 0;
}
- if (eb->pages[0] != page) {
- WARN_ON(1);
- return 0;
- }
if (!PageUptodate(page)) {
WARN_ON(1);
return 0;
@@ -754,9 +749,7 @@ static void run_one_async_done(struct btrfs_work *work)
limit = btrfs_async_submit_limit(fs_info);
limit = limit * 2 / 3;
- atomic_dec(&fs_info->nr_async_submits);
-
- if (atomic_read(&fs_info->nr_async_submits) < limit &&
+ if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
waitqueue_active(&fs_info->async_submit_wait))
wake_up(&fs_info->async_submit_wait);
@@ -860,21 +853,37 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset)
{
+ int ret;
+
/*
* when we're called for a write, we're already in the async
* submission context. Just jump into btrfs_map_bio
*/
- return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+ ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+ if (ret)
+ bio_endio(bio, ret);
+ return ret;
+}
+
+static int check_async_write(struct inode *inode, unsigned long bio_flags)
+{
+ if (bio_flags & EXTENT_BIO_TREE_LOG)
+ return 0;
+#ifdef CONFIG_X86
+ if (cpu_has_xmm4_2)
+ return 0;
+#endif
+ return 1;
}
static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset)
{
+ int async = check_async_write(inode, bio_flags);
int ret;
if (!(rw & REQ_WRITE)) {
-
/*
* called for a read, do the setup so that checksum validation
* can happen in the async kernel threads
@@ -882,20 +891,32 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
bio, 1);
if (ret)
- return ret;
- return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
- mirror_num, 0);
+ goto out_w_error;
+ ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+ mirror_num, 0);
+ } else if (!async) {
+ ret = btree_csum_one_bio(bio);
+ if (ret)
+ goto out_w_error;
+ ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+ mirror_num, 0);
+ } else {
+ /*
+ * kthread helpers are used to submit writes so that
+ * checksumming can happen in parallel across all CPUs
+ */
+ ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+ inode, rw, bio, mirror_num, 0,
+ bio_offset,
+ __btree_submit_bio_start,
+ __btree_submit_bio_done);
}
- /*
- * kthread helpers are used to submit writes so that checksumming
- * can happen in parallel across all CPUs
- */
- return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
- inode, rw, bio, mirror_num, 0,
- bio_offset,
- __btree_submit_bio_start,
- __btree_submit_bio_done);
+ if (ret) {
+out_w_error:
+ bio_endio(bio, ret);
+ }
+ return ret;
}
#ifdef CONFIG_MIGRATION
@@ -980,6 +1001,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
static int btree_set_page_dirty(struct page *page)
{
+#ifdef DEBUG
struct extent_buffer *eb;
BUG_ON(!PagePrivate(page));
@@ -988,6 +1010,7 @@ static int btree_set_page_dirty(struct page *page)
BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
BUG_ON(!atomic_read(&eb->refs));
btrfs_assert_tree_locked(eb);
+#endif
return __set_page_dirty_nobuffers(page);
}
@@ -1114,16 +1137,16 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
spin_unlock(&root->fs_info->delalloc_lock);
btrfs_panic(root->fs_info, -EOVERFLOW,
"Can't clear %lu bytes from "
- " dirty_mdatadata_bytes (%lu)",
+ " dirty_mdatadata_bytes (%llu)",
buf->len,
root->fs_info->dirty_metadata_bytes);
}
spin_unlock(&root->fs_info->delalloc_lock);
- }
- /* ugh, clear_extent_buffer_dirty needs to lock the page */
- btrfs_set_lock_blocking(buf);
- clear_extent_buffer_dirty(buf);
+ /* ugh, clear_extent_buffer_dirty needs to lock the page */
+ btrfs_set_lock_blocking(buf);
+ clear_extent_buffer_dirty(buf);
+ }
}
}
@@ -1166,8 +1189,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
atomic_set(&root->log_commit[0], 0);
atomic_set(&root->log_commit[1], 0);
atomic_set(&root->log_writers, 0);
+ atomic_set(&root->log_batch, 0);
atomic_set(&root->orphan_inodes, 0);
- root->log_batch = 0;
root->log_transid = 0;
root->last_log_commit = 0;
extent_io_tree_init(&root->dirty_log_pages,
@@ -1182,6 +1205,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->defrag_running = 0;
root->root_key.objectid = objectid;
root->anon_dev = 0;
+
+ spin_lock_init(&root->root_item_lock);
}
static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
@@ -1225,6 +1250,82 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
return root;
}
+struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 objectid)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ int ret = 0;
+ u64 bytenr;
+
+ root = btrfs_alloc_root(fs_info);
+ if (!root)
+ return ERR_PTR(-ENOMEM);
+
+ __setup_root(tree_root->nodesize, tree_root->leafsize,
+ tree_root->sectorsize, tree_root->stripesize,
+ root, fs_info, objectid);
+ root->root_key.objectid = objectid;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = 0;
+
+ leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+ 0, objectid, NULL, 0, 0, 0);
+ if (IS_ERR(leaf)) {
+ ret = PTR_ERR(leaf);
+ goto fail;
+ }
+
+ bytenr = leaf->start;
+ memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
+ btrfs_set_header_bytenr(leaf, leaf->start);
+ btrfs_set_header_generation(leaf, trans->transid);
+ btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
+ btrfs_set_header_owner(leaf, objectid);
+ root->node = leaf;
+
+ write_extent_buffer(leaf, fs_info->fsid,
+ (unsigned long)btrfs_header_fsid(leaf),
+ BTRFS_FSID_SIZE);
+ write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
+ (unsigned long)btrfs_header_chunk_tree_uuid(leaf),
+ BTRFS_UUID_SIZE);
+ btrfs_mark_buffer_dirty(leaf);
+
+ root->commit_root = btrfs_root_node(root);
+ root->track_dirty = 1;
+
+
+ root->root_item.flags = 0;
+ root->root_item.byte_limit = 0;
+ btrfs_set_root_bytenr(&root->root_item, leaf->start);
+ btrfs_set_root_generation(&root->root_item, trans->transid);
+ btrfs_set_root_level(&root->root_item, 0);
+ btrfs_set_root_refs(&root->root_item, 1);
+ btrfs_set_root_used(&root->root_item, leaf->len);
+ btrfs_set_root_last_snapshot(&root->root_item, 0);
+ btrfs_set_root_dirid(&root->root_item, 0);
+ root->root_item.drop_level = 0;
+
+ key.objectid = objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = 0;
+ ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
+ if (ret)
+ goto fail;
+
+ btrfs_tree_unlock(leaf);
+
+fail:
+ if (ret)
+ return ERR_PTR(ret);
+
+ return root;
+}
+
static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
@@ -1326,6 +1427,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
u64 generation;
u32 blocksize;
int ret = 0;
+ int slot;
root = btrfs_alloc_root(fs_info);
if (!root)
@@ -1352,9 +1454,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
if (ret == 0) {
l = path->nodes[0];
- read_extent_buffer(l, &root->root_item,
- btrfs_item_ptr_offset(l, path->slots[0]),
- sizeof(root->root_item));
+ slot = path->slots[0];
+ btrfs_read_root_item(tree_root, l, slot, &root->root_item);
memcpy(&root->root_key, location, sizeof(*location));
}
btrfs_free_path(path);
@@ -1396,6 +1497,9 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
return fs_info->dev_root;
if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
return fs_info->csum_root;
+ if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
+ return fs_info->quota_root ? fs_info->quota_root :
+ ERR_PTR(-ENOENT);
again:
spin_lock(&fs_info->fs_roots_radix_lock);
root = radix_tree_lookup(&fs_info->fs_roots_radix,
@@ -1533,8 +1637,6 @@ static int cleaner_kthread(void *arg)
struct btrfs_root *root = arg;
do {
- vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
-
if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
mutex_trylock(&root->fs_info->cleaner_mutex)) {
btrfs_run_delayed_iputs(root);
@@ -1566,7 +1668,6 @@ static int transaction_kthread(void *arg)
do {
cannot_commit = false;
delay = HZ * 30;
- vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
mutex_lock(&root->fs_info->transaction_kthread_mutex);
spin_lock(&root->fs_info->trans_lock);
@@ -1587,9 +1688,10 @@ static int transaction_kthread(void *arg)
spin_unlock(&root->fs_info->trans_lock);
/* If the file system is aborted, this will always fail. */
- trans = btrfs_join_transaction(root);
+ trans = btrfs_attach_transaction(root);
if (IS_ERR(trans)) {
- cannot_commit = true;
+ if (PTR_ERR(trans) != -ENOENT)
+ cannot_commit = true;
goto sleep;
}
if (transid == trans->transid) {
@@ -1823,6 +1925,10 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
free_extent_buffer(info->extent_root->commit_root);
free_extent_buffer(info->csum_root->node);
free_extent_buffer(info->csum_root->commit_root);
+ if (info->quota_root) {
+ free_extent_buffer(info->quota_root->node);
+ free_extent_buffer(info->quota_root->commit_root);
+ }
info->tree_root->node = NULL;
info->tree_root->commit_root = NULL;
@@ -1832,6 +1938,10 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
info->extent_root->commit_root = NULL;
info->csum_root->node = NULL;
info->csum_root->commit_root = NULL;
+ if (info->quota_root) {
+ info->quota_root->node = NULL;
+ info->quota_root->commit_root = NULL;
+ }
if (chunk_root) {
free_extent_buffer(info->chunk_root->node);
@@ -1862,6 +1972,7 @@ int open_ctree(struct super_block *sb,
struct btrfs_root *csum_root;
struct btrfs_root *chunk_root;
struct btrfs_root *dev_root;
+ struct btrfs_root *quota_root;
struct btrfs_root *log_tree_root;
int ret;
int err = -EINVAL;
@@ -1873,9 +1984,10 @@ int open_ctree(struct super_block *sb,
csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
+ quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info);
if (!tree_root || !extent_root || !csum_root ||
- !chunk_root || !dev_root) {
+ !chunk_root || !dev_root || !quota_root) {
err = -ENOMEM;
goto fail;
}
@@ -1904,13 +2016,11 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->delayed_iputs);
- INIT_LIST_HEAD(&fs_info->hashers);
INIT_LIST_HEAD(&fs_info->delalloc_inodes);
INIT_LIST_HEAD(&fs_info->ordered_operations);
INIT_LIST_HEAD(&fs_info->caching_block_groups);
spin_lock_init(&fs_info->delalloc_lock);
spin_lock_init(&fs_info->trans_lock);
- spin_lock_init(&fs_info->ref_cache_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
@@ -1924,12 +2034,15 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
btrfs_mapping_init(&fs_info->mapping_tree);
- btrfs_init_block_rsv(&fs_info->global_block_rsv);
- btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
- btrfs_init_block_rsv(&fs_info->trans_block_rsv);
- btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
- btrfs_init_block_rsv(&fs_info->empty_block_rsv);
- btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
+ btrfs_init_block_rsv(&fs_info->global_block_rsv,
+ BTRFS_BLOCK_RSV_GLOBAL);
+ btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
+ BTRFS_BLOCK_RSV_DELALLOC);
+ btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
+ btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
+ btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
+ btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
+ BTRFS_BLOCK_RSV_DELOPS);
atomic_set(&fs_info->nr_async_submits, 0);
atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->async_submit_draining, 0);
@@ -2031,6 +2144,18 @@ int open_ctree(struct super_block *sb,
init_rwsem(&fs_info->extent_commit_sem);
init_rwsem(&fs_info->cleanup_work_sem);
init_rwsem(&fs_info->subvol_sem);
+ fs_info->dev_replace.lock_owner = 0;
+ atomic_set(&fs_info->dev_replace.nesting_level, 0);
+ mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
+ mutex_init(&fs_info->dev_replace.lock_management_lock);
+ mutex_init(&fs_info->dev_replace.lock);
+
+ spin_lock_init(&fs_info->qgroup_lock);
+ fs_info->qgroup_tree = RB_ROOT;
+ INIT_LIST_HEAD(&fs_info->dirty_qgroups);
+ fs_info->qgroup_seq = 1;
+ fs_info->quota_enabled = 0;
+ fs_info->pending_quota_state = 0;
btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -2172,6 +2297,10 @@ int open_ctree(struct super_block *sb,
fs_info->thread_pool_size,
&fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
+
btrfs_init_workers(&fs_info->submit_workers, "submit",
min_t(u64, fs_devices->num_devices,
fs_info->thread_pool_size),
@@ -2243,8 +2372,9 @@ int open_ctree(struct super_block *sb,
ret |= btrfs_start_workers(&fs_info->delayed_workers);
ret |= btrfs_start_workers(&fs_info->caching_workers);
ret |= btrfs_start_workers(&fs_info->readahead_workers);
+ ret |= btrfs_start_workers(&fs_info->flush_workers);
if (ret) {
- ret = -ENOMEM;
+ err = -ENOMEM;
goto fail_sb_buffer;
}
@@ -2311,7 +2441,11 @@ int open_ctree(struct super_block *sb,
goto fail_tree_roots;
}
- btrfs_close_extra_devices(fs_devices);
+ /*
+ * keep the device that is marked to be the target device for the
+ * dev_replace procedure
+ */
+ btrfs_close_extra_devices(fs_info, fs_devices, 0);
if (!fs_devices->latest_bdev) {
printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
@@ -2356,6 +2490,17 @@ retry_root_backup:
goto recovery_tree_root;
csum_root->track_dirty = 1;
+ ret = find_and_setup_root(tree_root, fs_info,
+ BTRFS_QUOTA_TREE_OBJECTID, quota_root);
+ if (ret) {
+ kfree(quota_root);
+ quota_root = fs_info->quota_root = NULL;
+ } else {
+ quota_root->track_dirty = 1;
+ fs_info->quota_enabled = 1;
+ fs_info->pending_quota_state = 1;
+ }
+
fs_info->generation = generation;
fs_info->last_trans_committed = generation;
@@ -2372,6 +2517,14 @@ retry_root_backup:
goto fail_block_groups;
}
+ ret = btrfs_init_dev_replace(fs_info);
+ if (ret) {
+ pr_err("btrfs: failed to init dev_replace: %d\n", ret);
+ goto fail_block_groups;
+ }
+
+ btrfs_close_extra_devices(fs_info, fs_devices, 1);
+
ret = btrfs_init_space_info(fs_info);
if (ret) {
printk(KERN_ERR "Failed to initial space info: %d\n", ret);
@@ -2383,6 +2536,15 @@ retry_root_backup:
printk(KERN_ERR "Failed to read block groups: %d\n", ret);
goto fail_block_groups;
}
+ fs_info->num_tolerated_disk_barrier_failures =
+ btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+ if (fs_info->fs_devices->missing_devices >
+ fs_info->num_tolerated_disk_barrier_failures &&
+ !(sb->s_flags & MS_RDONLY)) {
+ printk(KERN_WARNING
+ "Btrfs: too many missing devices, writeable mount is not allowed\n");
+ goto fail_block_groups;
+ }
fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
"btrfs-cleaner");
@@ -2415,17 +2577,19 @@ retry_root_backup:
" integrity check module %s\n", sb->s_id);
}
#endif
+ ret = btrfs_read_qgroup_config(fs_info);
+ if (ret)
+ goto fail_trans_kthread;
/* do not make disk changes in broken FS */
- if (btrfs_super_log_root(disk_super) != 0 &&
- !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
+ if (btrfs_super_log_root(disk_super) != 0) {
u64 bytenr = btrfs_super_log_root(disk_super);
if (fs_devices->rw_devices == 0) {
printk(KERN_WARNING "Btrfs log replay required "
"on RO media\n");
err = -EIO;
- goto fail_trans_kthread;
+ goto fail_qgroup;
}
blocksize =
btrfs_level_size(tree_root,
@@ -2434,7 +2598,7 @@ retry_root_backup:
log_tree_root = btrfs_alloc_root(fs_info);
if (!log_tree_root) {
err = -ENOMEM;
- goto fail_trans_kthread;
+ goto fail_qgroup;
}
__setup_root(nodesize, leafsize, sectorsize, stripesize,
@@ -2466,15 +2630,15 @@ retry_root_backup:
if (!(sb->s_flags & MS_RDONLY)) {
ret = btrfs_cleanup_fs_roots(fs_info);
- if (ret) {
- }
+ if (ret)
+ goto fail_trans_kthread;
ret = btrfs_recover_relocation(tree_root);
if (ret < 0) {
printk(KERN_WARNING
"btrfs: failed to recover relocation\n");
err = -EINVAL;
- goto fail_trans_kthread;
+ goto fail_qgroup;
}
}
@@ -2484,10 +2648,10 @@ retry_root_backup:
fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
if (!fs_info->fs_root)
- goto fail_trans_kthread;
+ goto fail_qgroup;
if (IS_ERR(fs_info->fs_root)) {
err = PTR_ERR(fs_info->fs_root);
- goto fail_trans_kthread;
+ goto fail_qgroup;
}
if (sb->s_flags & MS_RDONLY)
@@ -2509,8 +2673,17 @@ retry_root_backup:
return ret;
}
+ ret = btrfs_resume_dev_replace_async(fs_info);
+ if (ret) {
+ pr_warn("btrfs: failed to resume dev_replace\n");
+ close_ctree(tree_root);
+ return ret;
+ }
+
return 0;
+fail_qgroup:
+ btrfs_free_qgroup_config(fs_info);
fail_trans_kthread:
kthread_stop(fs_info->transaction_kthread);
fail_cleaner:
@@ -2543,6 +2716,7 @@ fail_sb_buffer:
btrfs_stop_workers(&fs_info->submit_workers);
btrfs_stop_workers(&fs_info->delayed_workers);
btrfs_stop_workers(&fs_info->caching_workers);
+ btrfs_stop_workers(&fs_info->flush_workers);
fail_alloc:
fail_iput:
btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2762,12 +2936,10 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
printk_in_rcu("btrfs: disabling barriers on dev %s\n",
rcu_str_deref(device->name));
device->nobarriers = 1;
- }
- if (!bio_flagged(bio, BIO_UPTODATE)) {
+ } else if (!bio_flagged(bio, BIO_UPTODATE)) {
ret = -EIO;
- if (!bio_flagged(bio, BIO_EOPNOTSUPP))
- btrfs_dev_stat_inc_and_print(device,
- BTRFS_DEV_STAT_FLUSH_ERRS);
+ btrfs_dev_stat_inc_and_print(device,
+ BTRFS_DEV_STAT_FLUSH_ERRS);
}
/* drop the reference from the wait == 0 run */
@@ -2806,14 +2978,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
{
struct list_head *head;
struct btrfs_device *dev;
- int errors = 0;
+ int errors_send = 0;
+ int errors_wait = 0;
int ret;
/* send down all the barriers */
head = &info->fs_devices->devices;
list_for_each_entry_rcu(dev, head, dev_list) {
if (!dev->bdev) {
- errors++;
+ errors_send++;
continue;
}
if (!dev->in_fs_metadata || !dev->writeable)
@@ -2821,13 +2994,13 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
ret = write_dev_flush(dev, 0);
if (ret)
- errors++;
+ errors_send++;
}
/* wait for all the barriers */
list_for_each_entry_rcu(dev, head, dev_list) {
if (!dev->bdev) {
- errors++;
+ errors_wait++;
continue;
}
if (!dev->in_fs_metadata || !dev->writeable)
@@ -2835,13 +3008,87 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
ret = write_dev_flush(dev, 1);
if (ret)
- errors++;
+ errors_wait++;
}
- if (errors)
+ if (errors_send > info->num_tolerated_disk_barrier_failures ||
+ errors_wait > info->num_tolerated_disk_barrier_failures)
return -EIO;
return 0;
}
+int btrfs_calc_num_tolerated_disk_barrier_failures(
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_ioctl_space_info space;
+ struct btrfs_space_info *sinfo;
+ u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
+ BTRFS_BLOCK_GROUP_SYSTEM,
+ BTRFS_BLOCK_GROUP_METADATA,
+ BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
+ int num_types = 4;
+ int i;
+ int c;
+ int num_tolerated_disk_barrier_failures =
+ (int)fs_info->fs_devices->num_devices;
+
+ for (i = 0; i < num_types; i++) {
+ struct btrfs_space_info *tmp;
+
+ sinfo = NULL;
+ rcu_read_lock();
+ list_for_each_entry_rcu(tmp, &fs_info->space_info, list) {
+ if (tmp->flags == types[i]) {
+ sinfo = tmp;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ if (!sinfo)
+ continue;
+
+ down_read(&sinfo->groups_sem);
+ for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+ if (!list_empty(&sinfo->block_groups[c])) {
+ u64 flags;
+
+ btrfs_get_block_group_info(
+ &sinfo->block_groups[c], &space);
+ if (space.total_bytes == 0 ||
+ space.used_bytes == 0)
+ continue;
+ flags = space.flags;
+ /*
+ * return
+ * 0: if dup, single or RAID0 is configured for
+ * any of metadata, system or data, else
+ * 1: if RAID5 is configured, or if RAID1 or
+ * RAID10 is configured and only two mirrors
+ * are used, else
+ * 2: if RAID6 is configured, else
+ * num_mirrors - 1: if RAID1 or RAID10 is
+ * configured and more than
+ * 2 mirrors are used.
+ */
+ if (num_tolerated_disk_barrier_failures > 0 &&
+ ((flags & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID0)) ||
+ ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
+ == 0)))
+ num_tolerated_disk_barrier_failures = 0;
+ else if (num_tolerated_disk_barrier_failures > 1
+ &&
+ (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10)))
+ num_tolerated_disk_barrier_failures = 1;
+ }
+ }
+ up_read(&sinfo->groups_sem);
+ }
+
+ return num_tolerated_disk_barrier_failures;
+}
+
int write_all_supers(struct btrfs_root *root, int max_mirrors)
{
struct list_head *head;
@@ -2864,8 +3111,16 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
head = &root->fs_info->fs_devices->devices;
- if (do_barriers)
- barrier_all_devices(root->fs_info);
+ if (do_barriers) {
+ ret = barrier_all_devices(root->fs_info);
+ if (ret) {
+ mutex_unlock(
+ &root->fs_info->fs_devices->device_list_mutex);
+ btrfs_error(root->fs_info, ret,
+ "errors while submitting device barriers.");
+ return ret;
+ }
+ }
list_for_each_entry_rcu(dev, head, dev_list) {
if (!dev->bdev) {
@@ -3065,41 +3320,27 @@ int close_ctree(struct btrfs_root *root)
smp_mb();
/* pause restriper - we want to resume on mount */
- btrfs_pause_balance(root->fs_info);
+ btrfs_pause_balance(fs_info);
+
+ btrfs_dev_replace_suspend_for_unmount(fs_info);
- btrfs_scrub_cancel(root);
+ btrfs_scrub_cancel(fs_info);
/* wait for any defraggers to finish */
wait_event(fs_info->transaction_wait,
(atomic_read(&fs_info->defrag_running) == 0));
/* clear out the rbtree of defraggable inodes */
- btrfs_run_defrag_inodes(fs_info);
+ btrfs_cleanup_defrag_inodes(fs_info);
- /*
- * Here come 2 situations when btrfs is broken to flip readonly:
- *
- * 1. when btrfs flips readonly somewhere else before
- * btrfs_commit_super, sb->s_flags has MS_RDONLY flag,
- * and btrfs will skip to write sb directly to keep
- * ERROR state on disk.
- *
- * 2. when btrfs flips readonly just in btrfs_commit_super,
- * and in such case, btrfs cannot write sb via btrfs_commit_super,
- * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
- * btrfs will cleanup all FS resources first and write sb then.
- */
if (!(fs_info->sb->s_flags & MS_RDONLY)) {
ret = btrfs_commit_super(root);
if (ret)
printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
}
- if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
- ret = btrfs_error_commit_super(root);
- if (ret)
- printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
- }
+ if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+ btrfs_error_commit_super(root);
btrfs_put_block_group_cache(fs_info);
@@ -3109,14 +3350,12 @@ int close_ctree(struct btrfs_root *root)
fs_info->closing = 2;
smp_mb();
+ btrfs_free_qgroup_config(root->fs_info);
+
if (fs_info->delalloc_bytes) {
printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
(unsigned long long)fs_info->delalloc_bytes);
}
- if (fs_info->total_ref_cache_size) {
- printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
- (unsigned long long)fs_info->total_ref_cache_size);
- }
free_extent_buffer(fs_info->extent_root->node);
free_extent_buffer(fs_info->extent_root->commit_root);
@@ -3128,6 +3367,10 @@ int close_ctree(struct btrfs_root *root)
free_extent_buffer(fs_info->dev_root->commit_root);
free_extent_buffer(fs_info->csum_root->node);
free_extent_buffer(fs_info->csum_root->commit_root);
+ if (fs_info->quota_root) {
+ free_extent_buffer(fs_info->quota_root->node);
+ free_extent_buffer(fs_info->quota_root->commit_root);
+ }
btrfs_free_block_groups(fs_info);
@@ -3148,6 +3391,7 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(&fs_info->delayed_workers);
btrfs_stop_workers(&fs_info->caching_workers);
btrfs_stop_workers(&fs_info->readahead_workers);
+ btrfs_stop_workers(&fs_info->flush_workers);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
if (btrfs_test_opt(root, CHECK_INTEGRITY))
@@ -3192,14 +3436,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
int was_dirty;
btrfs_assert_tree_locked(buf);
- if (transid != root->fs_info->generation) {
- printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
+ if (transid != root->fs_info->generation)
+ WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
"found %llu running %llu\n",
(unsigned long long)buf->start,
(unsigned long long)transid,
(unsigned long long)root->fs_info->generation);
- WARN_ON(1);
- }
was_dirty = set_extent_buffer_dirty(buf);
if (!was_dirty) {
spin_lock(&root->fs_info->delalloc_lock);
@@ -3208,7 +3450,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
}
}
-void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
+ int flush_delayed)
{
/*
* looks as though older kernels can get into trouble with
@@ -3220,36 +3463,26 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
if (current->flags & PF_MEMALLOC)
return;
- btrfs_balance_delayed_items(root);
+ if (flush_delayed)
+ btrfs_balance_delayed_items(root);
num_dirty = root->fs_info->dirty_metadata_bytes;
if (num_dirty > thresh) {
- balance_dirty_pages_ratelimited_nr(
- root->fs_info->btree_inode->i_mapping, 1);
+ balance_dirty_pages_ratelimited(
+ root->fs_info->btree_inode->i_mapping);
}
return;
}
-void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+void btrfs_btree_balance_dirty(struct btrfs_root *root)
{
- /*
- * looks as though older kernels can get into trouble with
- * this code, they end up stuck in balance_dirty_pages forever
- */
- u64 num_dirty;
- unsigned long thresh = 32 * 1024 * 1024;
-
- if (current->flags & PF_MEMALLOC)
- return;
-
- num_dirty = root->fs_info->dirty_metadata_bytes;
+ __btrfs_btree_balance_dirty(root, 1);
+}
- if (num_dirty > thresh) {
- balance_dirty_pages_ratelimited_nr(
- root->fs_info->btree_inode->i_mapping, 1);
- }
- return;
+void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
+{
+ __btrfs_btree_balance_dirty(root, 0);
}
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
@@ -3258,52 +3491,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
}
-static int btree_lock_page_hook(struct page *page, void *data,
- void (*flush_fn)(void *))
-{
- struct inode *inode = page->mapping->host;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_buffer *eb;
-
- /*
- * We culled this eb but the page is still hanging out on the mapping,
- * carry on.
- */
- if (!PagePrivate(page))
- goto out;
-
- eb = (struct extent_buffer *)page->private;
- if (!eb) {
- WARN_ON(1);
- goto out;
- }
- if (page != eb->pages[0])
- goto out;
-
- if (!btrfs_try_tree_write_lock(eb)) {
- flush_fn(data);
- btrfs_tree_lock(eb);
- }
- btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
-
- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
- spin_lock(&root->fs_info->delalloc_lock);
- if (root->fs_info->dirty_metadata_bytes >= eb->len)
- root->fs_info->dirty_metadata_bytes -= eb->len;
- else
- WARN_ON(1);
- spin_unlock(&root->fs_info->delalloc_lock);
- }
-
- btrfs_tree_unlock(eb);
-out:
- if (!trylock_page(page)) {
- flush_fn(data);
- lock_page(page);
- }
- return 0;
-}
-
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only)
{
@@ -3315,18 +3502,11 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
if (read_only)
return 0;
- if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
- printk(KERN_WARNING "warning: mount fs with errors, "
- "running btrfsck is recommended\n");
- }
-
return 0;
}
-int btrfs_error_commit_super(struct btrfs_root *root)
+void btrfs_error_commit_super(struct btrfs_root *root)
{
- int ret;
-
mutex_lock(&root->fs_info->cleaner_mutex);
btrfs_run_delayed_iputs(root);
mutex_unlock(&root->fs_info->cleaner_mutex);
@@ -3336,10 +3516,6 @@ int btrfs_error_commit_super(struct btrfs_root *root)
/* cleanup FS via transaction */
btrfs_cleanup_transaction(root);
-
- ret = write_ctree_super(NULL, root, 0);
-
- return ret;
}
static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
@@ -3517,7 +3693,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
while (1) {
ret = find_first_extent_bit(dirty_pages, start, &start, &end,
- mark);
+ mark, NULL);
if (ret)
break;
@@ -3572,7 +3748,7 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
again:
while (1) {
ret = find_first_extent_bit(unpin, 0, &start, &end,
- EXTENT_DIRTY);
+ EXTENT_DIRTY, NULL);
if (ret)
break;
@@ -3663,14 +3839,17 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
/* FIXME: cleanup wait for commit */
t->in_commit = 1;
t->blocked = 1;
+ smp_mb();
if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
wake_up(&root->fs_info->transaction_blocked_wait);
t->blocked = 0;
+ smp_mb();
if (waitqueue_active(&root->fs_info->transaction_wait))
wake_up(&root->fs_info->transaction_wait);
t->commit_done = 1;
+ smp_mb();
if (waitqueue_active(&t->commit_wait))
wake_up(&t->commit_wait);
@@ -3706,7 +3885,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
}
static struct extent_io_ops btree_extent_io_ops = {
- .write_cache_pages_lock_hook = btree_lock_page_hook,
.readpage_end_io_hook = btree_readpage_end_io_hook,
.readpage_io_failed_hook = btree_io_failed_hook,
.submit_bio_hook = btree_submit_bio_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 05b3fab39f7..305c33efb0e 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -54,7 +54,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int max_mirrors);
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
int btrfs_commit_super(struct btrfs_root *root);
-int btrfs_error_commit_super(struct btrfs_root *root);
+void btrfs_error_commit_super(struct btrfs_root *root);
struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize);
struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
@@ -62,8 +62,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
struct btrfs_key *location);
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
-void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
-void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
+void btrfs_btree_balance_dirty(struct btrfs_root *root);
+void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -89,6 +89,14 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
int btrfs_cleanup_transaction(struct btrfs_root *root);
void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
struct btrfs_root *root);
+void btrfs_abort_devices(struct btrfs_root *root);
+struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 objectid);
+int btree_lock_page_hook(struct page *page, void *data,
+ void (*flush_fn)(void *));
+int btrfs_calc_num_tolerated_disk_barrier_failures(
+ struct btrfs_fs_info *fs_info);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6e1d36702ff..5a3327b8f90 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,9 @@
#include "volumes.h"
#include "locking.h"
#include "free-space-cache.h"
+#include "math.h"
+
+#undef SCRAMBLE_DELAYED_REFS
/*
* control flags for do_chunk_alloc's force field
@@ -92,8 +95,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
u64 flags, struct btrfs_disk_key *key,
int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, u64 alloc_bytes,
- u64 flags, int force);
+ struct btrfs_root *extent_root, u64 flags,
+ int force);
static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -310,7 +313,8 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
while (start < end) {
ret = find_first_extent_bit(info->pinned_extents, start,
&extent_start, &extent_end,
- EXTENT_DIRTY | EXTENT_UPTODATE);
+ EXTENT_DIRTY | EXTENT_UPTODATE,
+ NULL);
if (ret)
break;
@@ -646,24 +650,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
rcu_read_unlock();
}
-static u64 div_factor(u64 num, int factor)
-{
- if (factor == 10)
- return num;
- num *= factor;
- do_div(num, 10);
- return num;
-}
-
-static u64 div_factor_fine(u64 num, int factor)
-{
- if (factor == 100)
- return num;
- num *= factor;
- do_div(num, 100);
- return num;
-}
-
u64 btrfs_find_block_group(struct btrfs_root *root,
u64 search_start, u64 search_hint, int owner)
{
@@ -1832,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
/* Tell the block device(s) that the sectors can be discarded */
- ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
+ ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
bytenr, &num_bytes, &bbio, 0);
/* Error condition is -ENOMEM */
if (!ret) {
@@ -2217,6 +2203,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref;
struct btrfs_delayed_ref_head *locked_ref = NULL;
struct btrfs_delayed_extent_op *extent_op;
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
int count = 0;
int must_insert_reserved = 0;
@@ -2249,13 +2236,23 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
}
/*
+ * We need to try and merge add/drops of the same ref since we
+ * can run into issues with relocate dropping the implicit ref
+ * and then it being added back again before the drop can
+ * finish. If we merged anything we need to re-loop so we can
+ * get a good ref.
+ */
+ btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
+ locked_ref);
+
+ /*
* locked_ref is the head node, so we have to go one
* node back for any delayed ref updates
*/
ref = select_delayed_ref(locked_ref);
if (ref && ref->seq &&
- btrfs_check_delayed_seq(delayed_refs, ref->seq)) {
+ btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
/*
* there are still refs with lower seq numbers in the
* process of being added. Don't run this ref yet.
@@ -2300,6 +2297,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
kfree(extent_op);
if (ret) {
+ list_del_init(&locked_ref->cluster);
+ mutex_unlock(&locked_ref->mutex);
+
printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
spin_lock(&delayed_refs->lock);
return ret;
@@ -2315,12 +2315,23 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
ref->in_tree = 0;
rb_erase(&ref->rb_node, &delayed_refs->root);
delayed_refs->num_entries--;
- /*
- * we modified num_entries, but as we're currently running
- * delayed refs, skip
- * wake_up(&delayed_refs->seq_wait);
- * here.
- */
+ if (locked_ref) {
+ /*
+ * when we play the delayed ref, also correct the
+ * ref_mod on head
+ */
+ switch (ref->action) {
+ case BTRFS_ADD_DELAYED_REF:
+ case BTRFS_ADD_DELAYED_EXTENT:
+ locked_ref->node.ref_mod -= ref->ref_mod;
+ break;
+ case BTRFS_DROP_DELAYED_REF:
+ locked_ref->node.ref_mod += ref->ref_mod;
+ break;
+ default:
+ WARN_ON(1);
+ }
+ }
spin_unlock(&delayed_refs->lock);
ret = run_one_delayed_ref(trans, root, ref, extent_op,
@@ -2331,35 +2342,97 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
count++;
if (ret) {
+ if (locked_ref) {
+ list_del_init(&locked_ref->cluster);
+ mutex_unlock(&locked_ref->mutex);
+ }
printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
spin_lock(&delayed_refs->lock);
return ret;
}
next:
- do_chunk_alloc(trans, root->fs_info->extent_root,
- 2 * 1024 * 1024,
- btrfs_get_alloc_profile(root, 0),
- CHUNK_ALLOC_NO_FORCE);
cond_resched();
spin_lock(&delayed_refs->lock);
}
return count;
}
-static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
- unsigned long num_refs,
- struct list_head *first_seq)
+#ifdef SCRAMBLE_DELAYED_REFS
+/*
+ * Normally delayed refs get processed in ascending bytenr order. This
+ * correlates in most cases to the order added. To expose dependencies on this
+ * order, we start to process the tree in the middle instead of the beginning
+ */
+static u64 find_middle(struct rb_root *root)
+{
+ struct rb_node *n = root->rb_node;
+ struct btrfs_delayed_ref_node *entry;
+ int alt = 1;
+ u64 middle;
+ u64 first = 0, last = 0;
+
+ n = rb_first(root);
+ if (n) {
+ entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+ first = entry->bytenr;
+ }
+ n = rb_last(root);
+ if (n) {
+ entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+ last = entry->bytenr;
+ }
+ n = root->rb_node;
+
+ while (n) {
+ entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+ WARN_ON(!entry->in_tree);
+
+ middle = entry->bytenr;
+
+ if (alt)
+ n = n->rb_left;
+ else
+ n = n->rb_right;
+
+ alt = 1 - alt;
+ }
+ return middle;
+}
+#endif
+
+int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
{
- spin_unlock(&delayed_refs->lock);
- pr_debug("waiting for more refs (num %ld, first %p)\n",
- num_refs, first_seq);
- wait_event(delayed_refs->seq_wait,
- num_refs != delayed_refs->num_entries ||
- delayed_refs->seq_head.next != first_seq);
- pr_debug("done waiting for more refs (num %ld, first %p)\n",
- delayed_refs->num_entries, delayed_refs->seq_head.next);
- spin_lock(&delayed_refs->lock);
+ struct qgroup_update *qgroup_update;
+ int ret = 0;
+
+ if (list_empty(&trans->qgroup_ref_list) !=
+ !trans->delayed_ref_elem.seq) {
+ /* list without seq or seq without list */
+ printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n",
+ list_empty(&trans->qgroup_ref_list) ? "" : " not",
+ trans->delayed_ref_elem.seq);
+ BUG();
+ }
+
+ if (!trans->delayed_ref_elem.seq)
+ return 0;
+
+ while (!list_empty(&trans->qgroup_ref_list)) {
+ qgroup_update = list_first_entry(&trans->qgroup_ref_list,
+ struct qgroup_update, list);
+ list_del(&qgroup_update->list);
+ if (!ret)
+ ret = btrfs_qgroup_account_ref(
+ trans, fs_info, qgroup_update->node,
+ qgroup_update->extent_op);
+ kfree(qgroup_update);
+ }
+
+ btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+
+ return ret;
}
/*
@@ -2379,13 +2452,11 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_node *ref;
struct list_head cluster;
- struct list_head *first_seq = NULL;
int ret;
u64 delayed_start;
int run_all = count == (unsigned long)-1;
int run_most = 0;
- unsigned long num_refs = 0;
- int consider_waiting;
+ int loops;
/* We'll clean this up in btrfs_cleanup_transaction */
if (trans->aborted)
@@ -2394,15 +2465,18 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
if (root == root->fs_info->extent_root)
root = root->fs_info->tree_root;
- do_chunk_alloc(trans, root->fs_info->extent_root,
- 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
- CHUNK_ALLOC_NO_FORCE);
+ btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
delayed_refs = &trans->transaction->delayed_refs;
INIT_LIST_HEAD(&cluster);
again:
- consider_waiting = 0;
+ loops = 0;
spin_lock(&delayed_refs->lock);
+
+#ifdef SCRAMBLE_DELAYED_REFS
+ delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
+#endif
+
if (count == 0) {
count = delayed_refs->num_entries * 2;
run_most = 1;
@@ -2424,31 +2498,6 @@ again:
if (ret)
break;
- if (delayed_start >= delayed_refs->run_delayed_start) {
- if (consider_waiting == 0) {
- /*
- * btrfs_find_ref_cluster looped. let's do one
- * more cycle. if we don't run any delayed ref
- * during that cycle (because we can't because
- * all of them are blocked) and if the number of
- * refs doesn't change, we avoid busy waiting.
- */
- consider_waiting = 1;
- num_refs = delayed_refs->num_entries;
- first_seq = root->fs_info->tree_mod_seq_list.next;
- } else {
- wait_for_more_refs(delayed_refs,
- num_refs, first_seq);
- /*
- * after waiting, things have changed. we
- * dropped the lock and someone else might have
- * run some refs, built new clusters and so on.
- * therefore, we restart staleness detection.
- */
- consider_waiting = 0;
- }
- }
-
ret = run_clustered_refs(trans, root, &cluster);
if (ret < 0) {
spin_unlock(&delayed_refs->lock);
@@ -2461,13 +2510,36 @@ again:
if (count == 0)
break;
- if (ret || delayed_refs->run_delayed_start == 0) {
+ if (delayed_start >= delayed_refs->run_delayed_start) {
+ if (loops == 0) {
+ /*
+ * btrfs_find_ref_cluster looped. let's do one
+ * more cycle. if we don't run any delayed ref
+ * during that cycle (because we can't because
+ * all of them are blocked), bail out.
+ */
+ loops = 1;
+ } else {
+ /*
+ * no runnable refs left, stop trying
+ */
+ BUG_ON(run_all);
+ break;
+ }
+ }
+ if (ret) {
/* refs were run, let's reset staleness detection */
- consider_waiting = 0;
+ loops = 0;
}
}
if (run_all) {
+ if (!list_empty(&trans->new_bgs)) {
+ spin_unlock(&delayed_refs->lock);
+ btrfs_create_pending_block_groups(trans, root);
+ spin_lock(&delayed_refs->lock);
+ }
+
node = rb_first(&delayed_refs->root);
if (!node)
goto out;
@@ -2502,6 +2574,7 @@ again:
}
out:
spin_unlock(&delayed_refs->lock);
+ assert_qgroups_uptodate(trans);
return 0;
}
@@ -2581,8 +2654,10 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
node = rb_prev(node);
if (node) {
+ int seq = ref->seq;
+
ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- if (ref->bytenr == bytenr)
+ if (ref->bytenr == bytenr && ref->seq == seq)
goto out_unlock;
}
@@ -2903,25 +2978,29 @@ again:
}
spin_lock(&block_group->lock);
- if (block_group->cached != BTRFS_CACHE_FINISHED) {
- /* We're not cached, don't bother trying to write stuff out */
+ if (block_group->cached != BTRFS_CACHE_FINISHED ||
+ !btrfs_test_opt(root, SPACE_CACHE)) {
+ /*
+ * don't bother trying to write stuff out _if_
+ * a) we're not cached,
+ * b) we're with nospace_cache mount option.
+ */
dcs = BTRFS_DC_WRITTEN;
spin_unlock(&block_group->lock);
goto out_put;
}
spin_unlock(&block_group->lock);
- num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
+ /*
+ * Try to preallocate enough space based on how big the block group is.
+ * Keep in mind this has to include any pinned space which could end up
+ * taking up quite a bit since it's not folded into the other space
+ * cache.
+ */
+ num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
if (!num_pages)
num_pages = 1;
- /*
- * Just to make absolutely sure we have enough space, we're going to
- * preallocate 12 pages worth of space for each block group. In
- * practice we ought to use at most 8, but we need extra space so we can
- * add our header and have a terminator between the extents and the
- * bitmaps.
- */
num_pages *= 16;
num_pages *= PAGE_CACHE_SIZE;
@@ -3134,6 +3213,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
init_waitqueue_head(&found->wait);
*space_info = found;
list_add_rcu(&found->list, &info->space_info);
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ info->data_sinfo = found;
return 0;
}
@@ -3263,12 +3344,6 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
return get_alloc_profile(root, flags);
}
-void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
-{
- BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
- BTRFS_BLOCK_GROUP_DATA);
-}
-
/*
* This will check the space that the inode allocates from to make sure we have
* enough space for bytes.
@@ -3277,6 +3352,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
{
struct btrfs_space_info *data_sinfo;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 used;
int ret = 0, committed = 0, alloc_chunk = 1;
@@ -3289,7 +3365,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
committed = 1;
}
- data_sinfo = BTRFS_I(inode)->space_info;
+ data_sinfo = fs_info->data_sinfo;
if (!data_sinfo)
goto alloc;
@@ -3319,7 +3395,6 @@ alloc:
return PTR_ERR(trans);
ret = do_chunk_alloc(trans, root->fs_info->extent_root,
- bytes + 2 * 1024 * 1024,
alloc_target,
CHUNK_ALLOC_NO_FORCE);
btrfs_end_transaction(trans, root);
@@ -3330,10 +3405,9 @@ alloc:
goto commit_trans;
}
- if (!data_sinfo) {
- btrfs_set_inode_space_info(root, inode);
- data_sinfo = BTRFS_I(inode)->space_info;
- }
+ if (!data_sinfo)
+ data_sinfo = fs_info->data_sinfo;
+
goto again;
}
@@ -3380,7 +3454,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
/* make sure bytes are sectorsize aligned */
bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
- data_sinfo = BTRFS_I(inode)->space_info;
+ data_sinfo = root->fs_info->data_sinfo;
spin_lock(&data_sinfo->lock);
data_sinfo->bytes_may_use -= bytes;
trace_btrfs_space_reservation(root->fs_info, "space_info",
@@ -3402,8 +3476,7 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
}
static int should_alloc_chunk(struct btrfs_root *root,
- struct btrfs_space_info *sinfo, u64 alloc_bytes,
- int force)
+ struct btrfs_space_info *sinfo, int force)
{
struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
@@ -3418,7 +3491,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
* and purposes it's used space. Don't worry about locking the
* global_rsv, it doesn't change except when the transaction commits.
*/
- num_allocated += global_rsv->size;
+ if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
+ num_allocated += global_rsv->size;
/*
* in limited mode, we want to have some free space up to
@@ -3432,15 +3506,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
if (num_bytes - num_allocated < thresh)
return 1;
}
- thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
-
- /* 256MB or 2% of the FS */
- thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
- /* system chunks need a much small threshold */
- if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
- thresh = 32 * 1024 * 1024;
- if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
+ if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
return 0;
return 1;
}
@@ -3490,8 +3557,7 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
}
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, u64 alloc_bytes,
- u64 flags, int force)
+ struct btrfs_root *extent_root, u64 flags, int force)
{
struct btrfs_space_info *space_info;
struct btrfs_fs_info *fs_info = extent_root->fs_info;
@@ -3515,7 +3581,7 @@ again:
return 0;
}
- if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
+ if (!should_alloc_chunk(extent_root, space_info, force)) {
spin_unlock(&space_info->lock);
return 0;
} else if (space_info->chunk_alloc) {
@@ -3583,92 +3649,125 @@ out:
return ret;
}
+static int can_overcommit(struct btrfs_root *root,
+ struct btrfs_space_info *space_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ u64 profile = btrfs_get_alloc_profile(root, 0);
+ u64 avail;
+ u64 used;
+
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ space_info->bytes_may_use;
+
+ spin_lock(&root->fs_info->free_chunk_lock);
+ avail = root->fs_info->free_chunk_space;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+
+ /*
+ * If we have dup, raid1 or raid10 then only half of the free
+ * space is actually useable.
+ */
+ if (profile & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ avail >>= 1;
+
+ /*
+ * If we aren't flushing all things, let us overcommit up to
+ * 1/2th of the space. If we can flush, don't let us overcommit
+ * too much, let it overcommit up to 1/8 of the space.
+ */
+ if (flush == BTRFS_RESERVE_FLUSH_ALL)
+ avail >>= 3;
+ else
+ avail >>= 1;
+
+ if (used + bytes < space_info->total_bytes + avail)
+ return 1;
+ return 0;
+}
+
+static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
+ unsigned long nr_pages,
+ enum wb_reason reason)
+{
+ if (!writeback_in_progress(sb->s_bdi) &&
+ down_read_trylock(&sb->s_umount)) {
+ writeback_inodes_sb_nr(sb, nr_pages, reason);
+ up_read(&sb->s_umount);
+ return 1;
+ }
+
+ return 0;
+}
+
/*
* shrink metadata reservation for delalloc
*/
-static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
- bool wait_ordered)
+static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
+ bool wait_ordered)
{
struct btrfs_block_rsv *block_rsv;
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
- u64 reserved;
+ u64 delalloc_bytes;
u64 max_reclaim;
- u64 reclaimed = 0;
long time_left;
unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
int loops = 0;
- unsigned long progress;
+ enum btrfs_reserve_flush_enum flush;
trans = (struct btrfs_trans_handle *)current->journal_info;
block_rsv = &root->fs_info->delalloc_block_rsv;
space_info = block_rsv->space_info;
smp_mb();
- reserved = space_info->bytes_may_use;
- progress = space_info->reservation_progress;
-
- if (reserved == 0)
- return 0;
-
- smp_mb();
- if (root->fs_info->delalloc_bytes == 0) {
+ delalloc_bytes = root->fs_info->delalloc_bytes;
+ if (delalloc_bytes == 0) {
if (trans)
- return 0;
- btrfs_wait_ordered_extents(root, 0, 0);
- return 0;
+ return;
+ btrfs_wait_ordered_extents(root, 0);
+ return;
}
- max_reclaim = min(reserved, to_reclaim);
- nr_pages = max_t(unsigned long, nr_pages,
- max_reclaim >> PAGE_CACHE_SHIFT);
- while (loops < 1024) {
- /* have the flusher threads jump in and do some IO */
- smp_mb();
- nr_pages = min_t(unsigned long, nr_pages,
- root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
- writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
- WB_REASON_FS_FREE_SPACE);
+ while (delalloc_bytes && loops < 3) {
+ max_reclaim = min(delalloc_bytes, to_reclaim);
+ nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
+ writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb,
+ nr_pages,
+ WB_REASON_FS_FREE_SPACE);
+
+ /*
+ * We need to wait for the async pages to actually start before
+ * we do anything.
+ */
+ wait_event(root->fs_info->async_submit_wait,
+ !atomic_read(&root->fs_info->async_delalloc_pages));
+ if (!trans)
+ flush = BTRFS_RESERVE_FLUSH_ALL;
+ else
+ flush = BTRFS_RESERVE_NO_FLUSH;
spin_lock(&space_info->lock);
- if (reserved > space_info->bytes_may_use)
- reclaimed += reserved - space_info->bytes_may_use;
- reserved = space_info->bytes_may_use;
+ if (can_overcommit(root, space_info, orig, flush)) {
+ spin_unlock(&space_info->lock);
+ break;
+ }
spin_unlock(&space_info->lock);
loops++;
-
- if (reserved == 0 || reclaimed >= max_reclaim)
- break;
-
- if (trans && trans->transaction->blocked)
- return -EAGAIN;
-
if (wait_ordered && !trans) {
- btrfs_wait_ordered_extents(root, 0, 0);
+ btrfs_wait_ordered_extents(root, 0);
} else {
- time_left = schedule_timeout_interruptible(1);
-
- /* We were interrupted, exit */
+ time_left = schedule_timeout_killable(1);
if (time_left)
break;
}
-
- /* we've kicked the IO a few times, if anything has been freed,
- * exit. There is no sense in looping here for a long time
- * when we really need to commit the transaction, or there are
- * just too many writers without enough free space
- */
-
- if (loops > 3) {
- smp_mb();
- if (progress != space_info->reservation_progress)
- break;
- }
-
+ smp_mb();
+ delalloc_bytes = root->fs_info->delalloc_bytes;
}
-
- return reclaimed >= to_reclaim;
}
/**
@@ -3728,12 +3827,78 @@ commit:
return btrfs_commit_transaction(trans, root);
}
+enum flush_state {
+ FLUSH_DELAYED_ITEMS_NR = 1,
+ FLUSH_DELAYED_ITEMS = 2,
+ FLUSH_DELALLOC = 3,
+ FLUSH_DELALLOC_WAIT = 4,
+ ALLOC_CHUNK = 5,
+ COMMIT_TRANS = 6,
+};
+
+static int flush_space(struct btrfs_root *root,
+ struct btrfs_space_info *space_info, u64 num_bytes,
+ u64 orig_bytes, int state)
+{
+ struct btrfs_trans_handle *trans;
+ int nr;
+ int ret = 0;
+
+ switch (state) {
+ case FLUSH_DELAYED_ITEMS_NR:
+ case FLUSH_DELAYED_ITEMS:
+ if (state == FLUSH_DELAYED_ITEMS_NR) {
+ u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
+
+ nr = (int)div64_u64(num_bytes, bytes);
+ if (!nr)
+ nr = 1;
+ nr *= 2;
+ } else {
+ nr = -1;
+ }
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ ret = btrfs_run_delayed_items_nr(trans, root, nr);
+ btrfs_end_transaction(trans, root);
+ break;
+ case FLUSH_DELALLOC:
+ case FLUSH_DELALLOC_WAIT:
+ shrink_delalloc(root, num_bytes, orig_bytes,
+ state == FLUSH_DELALLOC_WAIT);
+ break;
+ case ALLOC_CHUNK:
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+ btrfs_get_alloc_profile(root, 0),
+ CHUNK_ALLOC_NO_FORCE);
+ btrfs_end_transaction(trans, root);
+ if (ret == -ENOSPC)
+ ret = 0;
+ break;
+ case COMMIT_TRANS:
+ ret = may_commit_transaction(root, space_info, orig_bytes, 0);
+ break;
+ default:
+ ret = -ENOSPC;
+ break;
+ }
+
+ return ret;
+}
/**
* reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
* @root - the root we're allocating for
* @block_rsv - the block_rsv we're allocating for
* @orig_bytes - the number of bytes we want
- * @flush - wether or not we can flush to make our reservation
+ * @flush - whether or not we can flush to make our reservation
*
* This will reserve orgi_bytes number of bytes from the space info associated
* with the block_rsv. If there is not enough space it will make an attempt to
@@ -3744,25 +3909,25 @@ commit:
*/
static int reserve_metadata_bytes(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
- u64 orig_bytes, int flush)
+ u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
{
struct btrfs_space_info *space_info = block_rsv->space_info;
u64 used;
u64 num_bytes = orig_bytes;
- int retries = 0;
+ int flush_state = FLUSH_DELAYED_ITEMS_NR;
int ret = 0;
- bool committed = false;
bool flushing = false;
- bool wait_ordered = false;
again:
ret = 0;
spin_lock(&space_info->lock);
/*
- * We only want to wait if somebody other than us is flushing and we are
- * actually alloed to flush.
+ * We only want to wait if somebody other than us is flushing and we
+ * are actually allowed to flush all things.
*/
- while (flush && !flushing && space_info->flush) {
+ while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
+ space_info->flush) {
spin_unlock(&space_info->lock);
/*
* If we have a trans handle we can't wait because the flusher
@@ -3812,111 +3977,57 @@ again:
* amount plus the amount of bytes that we need for this
* reservation.
*/
- wait_ordered = true;
num_bytes = used - space_info->total_bytes +
- (orig_bytes * (retries + 1));
+ (orig_bytes * 2);
}
- if (ret) {
- u64 profile = btrfs_get_alloc_profile(root, 0);
- u64 avail;
-
- /*
- * If we have a lot of space that's pinned, don't bother doing
- * the overcommit dance yet and just commit the transaction.
- */
- avail = (space_info->total_bytes - space_info->bytes_used) * 8;
- do_div(avail, 10);
- if (space_info->bytes_pinned >= avail && flush && !committed) {
- space_info->flush = 1;
- flushing = true;
- spin_unlock(&space_info->lock);
- ret = may_commit_transaction(root, space_info,
- orig_bytes, 1);
- if (ret)
- goto out;
- committed = true;
- goto again;
- }
-
- spin_lock(&root->fs_info->free_chunk_lock);
- avail = root->fs_info->free_chunk_space;
-
- /*
- * If we have dup, raid1 or raid10 then only half of the free
- * space is actually useable.
- */
- if (profile & (BTRFS_BLOCK_GROUP_DUP |
- BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10))
- avail >>= 1;
-
- /*
- * If we aren't flushing don't let us overcommit too much, say
- * 1/8th of the space. If we can flush, let it overcommit up to
- * 1/2 of the space.
- */
- if (flush)
- avail >>= 3;
- else
- avail >>= 1;
- spin_unlock(&root->fs_info->free_chunk_lock);
-
- if (used + num_bytes < space_info->total_bytes + avail) {
- space_info->bytes_may_use += orig_bytes;
- trace_btrfs_space_reservation(root->fs_info,
- "space_info", space_info->flags, orig_bytes, 1);
- ret = 0;
- } else {
- wait_ordered = true;
- }
+ if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
+ space_info->bytes_may_use += orig_bytes;
+ trace_btrfs_space_reservation(root->fs_info, "space_info",
+ space_info->flags, orig_bytes,
+ 1);
+ ret = 0;
}
/*
* Couldn't make our reservation, save our place so while we're trying
* to reclaim space we can actually use it instead of somebody else
* stealing it from us.
+ *
+ * We make the other tasks wait for the flush only when we can flush
+ * all things.
*/
- if (ret && flush) {
+ if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
flushing = true;
space_info->flush = 1;
}
spin_unlock(&space_info->lock);
- if (!ret || !flush)
- goto out;
-
- /*
- * We do synchronous shrinking since we don't actually unreserve
- * metadata until after the IO is completed.
- */
- ret = shrink_delalloc(root, num_bytes, wait_ordered);
- if (ret < 0)
+ if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
goto out;
- ret = 0;
+ ret = flush_space(root, space_info, num_bytes, orig_bytes,
+ flush_state);
+ flush_state++;
/*
- * So if we were overcommitted it's possible that somebody else flushed
- * out enough space and we simply didn't have enough space to reclaim,
- * so go back around and try again.
+ * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
+ * would happen. So skip delalloc flush.
*/
- if (retries < 2) {
- wait_ordered = true;
- retries++;
- goto again;
- }
+ if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+ (flush_state == FLUSH_DELALLOC ||
+ flush_state == FLUSH_DELALLOC_WAIT))
+ flush_state = ALLOC_CHUNK;
- ret = -ENOSPC;
- if (committed)
- goto out;
-
- ret = may_commit_transaction(root, space_info, orig_bytes, 0);
- if (!ret) {
- committed = true;
+ if (!ret)
+ goto again;
+ else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+ flush_state < COMMIT_TRANS)
+ goto again;
+ else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
+ flush_state <= COMMIT_TRANS)
goto again;
- }
out:
if (flushing) {
@@ -3934,7 +4045,10 @@ static struct btrfs_block_rsv *get_block_rsv(
{
struct btrfs_block_rsv *block_rsv = NULL;
- if (root->ref_cows || root == root->fs_info->csum_root)
+ if (root->ref_cows)
+ block_rsv = trans->block_rsv;
+
+ if (root == root->fs_info->csum_root && trans->adding_csums)
block_rsv = trans->block_rsv;
if (!block_rsv)
@@ -4031,13 +4145,15 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
return 0;
}
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
{
memset(rsv, 0, sizeof(*rsv));
spin_lock_init(&rsv->lock);
+ rsv->type = type;
}
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+ unsigned short type)
{
struct btrfs_block_rsv *block_rsv;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4046,7 +4162,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
if (!block_rsv)
return NULL;
- btrfs_init_block_rsv(block_rsv);
+ btrfs_init_block_rsv(block_rsv, type);
block_rsv->space_info = __find_space_info(fs_info,
BTRFS_BLOCK_GROUP_METADATA);
return block_rsv;
@@ -4055,13 +4171,15 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
void btrfs_free_block_rsv(struct btrfs_root *root,
struct btrfs_block_rsv *rsv)
{
+ if (!rsv)
+ return;
btrfs_block_rsv_release(root, rsv, (u64)-1);
kfree(rsv);
}
-static inline int __block_rsv_add(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes, int flush)
+int btrfs_block_rsv_add(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+ enum btrfs_reserve_flush_enum flush)
{
int ret;
@@ -4077,20 +4195,6 @@ static inline int __block_rsv_add(struct btrfs_root *root,
return ret;
}
-int btrfs_block_rsv_add(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes)
-{
- return __block_rsv_add(root, block_rsv, num_bytes, 1);
-}
-
-int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes)
-{
- return __block_rsv_add(root, block_rsv, num_bytes, 0);
-}
-
int btrfs_block_rsv_check(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, int min_factor)
{
@@ -4109,9 +4213,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
return ret;
}
-static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 min_reserved, int flush)
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+ enum btrfs_reserve_flush_enum flush)
{
u64 num_bytes = 0;
int ret = -ENOSPC;
@@ -4139,20 +4243,6 @@ static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
return ret;
}
-int btrfs_block_rsv_refill(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 min_reserved)
-{
- return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
-}
-
-int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 min_reserved)
-{
- return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
-}
-
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv,
u64 num_bytes)
@@ -4286,6 +4376,9 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
+ if (!trans->block_rsv)
+ return;
+
if (!trans->bytes_reserved)
return;
@@ -4330,10 +4423,10 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
/*
- * two for root back/forward refs, two for directory entries
- * and one for root of the snapshot.
+ * two for root back/forward refs, two for directory entries,
+ * one for root of the snapshot and one for parent inode.
*/
- u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
+ u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6);
dst_rsv->space_info = src_rsv->space_info;
return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
}
@@ -4440,17 +4533,27 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
u64 csum_bytes;
unsigned nr_extents = 0;
int extra_reserve = 0;
- int flush = 1;
- int ret;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
+ int ret = 0;
+ bool delalloc_lock = true;
- /* Need to be holding the i_mutex here if we aren't free space cache */
- if (btrfs_is_free_space_inode(root, inode))
- flush = 0;
+ /* If we are a free space inode we need to not flush since we will be in
+ * the middle of a transaction commit. We also don't need the delalloc
+ * mutex since we won't race with anybody. We need this mostly to make
+ * lockdep shut its filthy mouth.
+ */
+ if (btrfs_is_free_space_inode(inode)) {
+ flush = BTRFS_RESERVE_NO_FLUSH;
+ delalloc_lock = false;
+ }
- if (flush && btrfs_transaction_in_commit(root->fs_info))
+ if (flush != BTRFS_RESERVE_NO_FLUSH &&
+ btrfs_transaction_in_commit(root->fs_info))
schedule_timeout(1);
- mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
+ if (delalloc_lock)
+ mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
+
num_bytes = ALIGN(num_bytes, root->sectorsize);
spin_lock(&BTRFS_I(inode)->lock);
@@ -4476,7 +4579,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
csum_bytes = BTRFS_I(inode)->csum_bytes;
spin_unlock(&BTRFS_I(inode)->lock);
- ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
+ if (root->fs_info->quota_enabled)
+ ret = btrfs_qgroup_reserve(root, num_bytes +
+ nr_extents * root->leafsize);
+
+ /*
+ * ret != 0 here means the qgroup reservation failed, we go straight to
+ * the shared error handling then.
+ */
+ if (ret == 0)
+ ret = reserve_metadata_bytes(root, block_rsv,
+ to_reserve, flush);
+
if (ret) {
u64 to_free = 0;
unsigned dropped;
@@ -4506,7 +4620,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
btrfs_ino(inode),
to_free, 0);
}
- mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+ if (root->fs_info->quota_enabled) {
+ btrfs_qgroup_free(root, num_bytes +
+ nr_extents * root->leafsize);
+ }
+ if (delalloc_lock)
+ mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
return ret;
}
@@ -4518,7 +4637,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
}
BTRFS_I(inode)->reserved_extents += nr_extents;
spin_unlock(&BTRFS_I(inode)->lock);
- mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+
+ if (delalloc_lock)
+ mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
if (to_reserve)
trace_btrfs_space_reservation(root->fs_info,"delalloc",
@@ -4554,6 +4675,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
trace_btrfs_space_reservation(root->fs_info, "delalloc",
btrfs_ino(inode), to_free, 0);
+ if (root->fs_info->quota_enabled) {
+ btrfs_qgroup_free(root, num_bytes +
+ dropped * root->leafsize);
+ }
+
btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
to_free);
}
@@ -4863,9 +4989,13 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_space_info *space_info;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
u64 len;
+ bool readonly;
while (start <= end) {
+ readonly = false;
if (!cache ||
start >= cache->key.objectid + cache->key.offset) {
if (cache)
@@ -4883,15 +5013,30 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
}
start += len;
+ space_info = cache->space_info;
- spin_lock(&cache->space_info->lock);
+ spin_lock(&space_info->lock);
spin_lock(&cache->lock);
cache->pinned -= len;
- cache->space_info->bytes_pinned -= len;
- if (cache->ro)
- cache->space_info->bytes_readonly += len;
+ space_info->bytes_pinned -= len;
+ if (cache->ro) {
+ space_info->bytes_readonly += len;
+ readonly = true;
+ }
spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
+ if (!readonly && global_rsv->space_info == space_info) {
+ spin_lock(&global_rsv->lock);
+ if (!global_rsv->full) {
+ len = min(len, global_rsv->size -
+ global_rsv->reserved);
+ global_rsv->reserved += len;
+ space_info->bytes_may_use += len;
+ if (global_rsv->reserved >= global_rsv->size)
+ global_rsv->full = 1;
+ }
+ spin_unlock(&global_rsv->lock);
+ }
+ spin_unlock(&space_info->lock);
}
if (cache)
@@ -4918,7 +5063,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
while (1) {
ret = find_first_extent_bit(unpin, 0, &start, &end,
- EXTENT_DIRTY);
+ EXTENT_DIRTY, NULL);
if (ret)
break;
@@ -4996,8 +5141,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = remove_extent_backref(trans, extent_root, path,
NULL, refs_to_drop,
is_data);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
btrfs_release_path(path);
path->leave_spinning = 1;
@@ -5015,8 +5162,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_print_leaf(extent_root,
path->nodes[0]);
}
- if (ret < 0)
- goto abort;
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
extent_slot = path->slots[0];
}
} else if (ret == -ENOENT) {
@@ -5030,7 +5179,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
(unsigned long long)owner_objectid,
(unsigned long long)owner_offset);
} else {
- goto abort;
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
}
leaf = path->nodes[0];
@@ -5040,8 +5190,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
BUG_ON(found_extent || extent_slot != path->slots[0]);
ret = convert_extent_item_v0(trans, extent_root, path,
owner_objectid, 0);
- if (ret < 0)
- goto abort;
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
btrfs_release_path(path);
path->leave_spinning = 1;
@@ -5058,8 +5210,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
(unsigned long long)bytenr);
btrfs_print_leaf(extent_root, path->nodes[0]);
}
- if (ret < 0)
- goto abort;
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
+
extent_slot = path->slots[0];
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, extent_slot);
@@ -5096,8 +5251,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = remove_extent_backref(trans, extent_root, path,
iref, refs_to_drop,
is_data);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
}
} else {
if (found_extent) {
@@ -5114,27 +5271,29 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
btrfs_release_path(path);
if (is_data) {
ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
}
ret = update_block_group(trans, root, bytenr, num_bytes, 0);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
}
out:
btrfs_free_path(path);
return ret;
-
-abort:
- btrfs_abort_transaction(trans, extent_root, ret);
- goto out;
}
/*
@@ -5190,8 +5349,6 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
rb_erase(&head->node.rb_node, &delayed_refs->root);
delayed_refs->num_entries--;
- if (waitqueue_active(&delayed_refs->seq_wait))
- wake_up(&delayed_refs->seq_wait);
/*
* we don't take a ref on the node because we're removing it from the
@@ -5348,7 +5505,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
return 0;
}
-static int __get_block_group_index(u64 flags)
+int __get_raid_index(u64 flags)
{
int index;
@@ -5368,7 +5525,7 @@ static int __get_block_group_index(u64 flags)
static int get_block_group_index(struct btrfs_block_group_cache *cache)
{
- return __get_block_group_index(cache->flags);
+ return __get_raid_index(cache->flags);
}
enum btrfs_loop_type {
@@ -5399,11 +5556,9 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *used_block_group;
u64 search_start = 0;
int empty_cluster = 2 * 1024 * 1024;
- int allowed_chunk_alloc = 0;
- int done_chunk_alloc = 0;
struct btrfs_space_info *space_info;
int loop = 0;
- int index = 0;
+ int index = __get_raid_index(data);
int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
bool found_uncached_bg = false;
@@ -5432,9 +5587,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
if (btrfs_mixed_space_info(space_info))
use_cluster = false;
- if (orig_root->ref_cows || empty_size)
- allowed_chunk_alloc = 1;
-
if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
last_ptr = &root->fs_info->meta_alloc_cluster;
if (!btrfs_test_opt(root, SSD))
@@ -5708,10 +5860,6 @@ checks:
trace_btrfs_reserve_extent(orig_root, block_group,
search_start, num_bytes);
- if (offset < search_start)
- btrfs_add_free_space(used_block_group, offset,
- search_start - offset);
- BUG_ON(offset > search_start);
if (used_block_group != block_group)
btrfs_put_block_group(used_block_group);
btrfs_put_block_group(block_group);
@@ -5744,30 +5892,17 @@ loop:
index = 0;
loop++;
if (loop == LOOP_ALLOC_CHUNK) {
- if (allowed_chunk_alloc) {
- ret = do_chunk_alloc(trans, root, num_bytes +
- 2 * 1024 * 1024, data,
- CHUNK_ALLOC_LIMITED);
- if (ret < 0) {
- btrfs_abort_transaction(trans,
- root, ret);
- goto out;
- }
- allowed_chunk_alloc = 0;
- if (ret == 1)
- done_chunk_alloc = 1;
- } else if (!done_chunk_alloc &&
- space_info->force_alloc ==
- CHUNK_ALLOC_NO_FORCE) {
- space_info->force_alloc = CHUNK_ALLOC_LIMITED;
+ ret = do_chunk_alloc(trans, root, data,
+ CHUNK_ALLOC_FORCE);
+ /*
+ * Do not bail out on ENOSPC since we
+ * can do more things.
+ */
+ if (ret < 0 && ret != -ENOSPC) {
+ btrfs_abort_transaction(trans,
+ root, ret);
+ goto out;
}
-
- /*
- * We didn't allocate a chunk, go ahead and drop the
- * empty size and loop again.
- */
- if (!done_chunk_alloc)
- loop = LOOP_NO_EMPTY_SIZE;
}
if (loop == LOOP_NO_EMPTY_SIZE) {
@@ -5816,13 +5951,13 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
again:
list_for_each_entry(cache, &info->block_groups[index], list) {
spin_lock(&cache->lock);
- printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
- "%llu pinned %llu reserved\n",
+ printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n",
(unsigned long long)cache->key.objectid,
(unsigned long long)cache->key.offset,
(unsigned long long)btrfs_block_group_used(&cache->item),
(unsigned long long)cache->pinned,
- (unsigned long long)cache->reserved);
+ (unsigned long long)cache->reserved,
+ cache->ro ? "[readonly]" : "");
btrfs_dump_free_space(cache, bytes);
spin_unlock(&cache->lock);
}
@@ -5842,20 +5977,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
data = btrfs_get_alloc_profile(root, data);
again:
- /*
- * the only place that sets empty_size is btrfs_realloc_node, which
- * is not called recursively on allocations
- */
- if (empty_size || root->ref_cows) {
- ret = do_chunk_alloc(trans, root->fs_info->extent_root,
- num_bytes + 2 * 1024 * 1024, data,
- CHUNK_ALLOC_NO_FORCE);
- if (ret < 0 && ret != -ENOSPC) {
- btrfs_abort_transaction(trans, root, ret);
- return ret;
- }
- }
-
WARN_ON(num_bytes < root->sectorsize);
ret = find_free_extent(trans, root, num_bytes, empty_size,
hint_byte, ins, data);
@@ -5865,12 +5986,6 @@ again:
num_bytes = num_bytes >> 1;
num_bytes = num_bytes & ~(root->sectorsize - 1);
num_bytes = max(num_bytes, min_alloc_size);
- ret = do_chunk_alloc(trans, root->fs_info->extent_root,
- num_bytes, data, CHUNK_ALLOC_FORCE);
- if (ret < 0 && ret != -ENOSPC) {
- btrfs_abort_transaction(trans, root, ret);
- return ret;
- }
if (num_bytes == min_alloc_size)
final_tried = true;
goto again;
@@ -6193,7 +6308,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
block_rsv = get_block_rsv(trans, root);
if (block_rsv->size == 0) {
- ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+ ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+ BTRFS_RESERVE_NO_FLUSH);
/*
* If we couldn't reserve metadata bytes try and use some from
* the global reserve.
@@ -6212,15 +6328,15 @@ use_block_rsv(struct btrfs_trans_handle *trans,
ret = block_rsv_use_bytes(block_rsv, blocksize);
if (!ret)
return block_rsv;
- if (ret) {
+ if (ret && !block_rsv->failfast) {
static DEFINE_RATELIMIT_STATE(_rs,
DEFAULT_RATELIMIT_INTERVAL,
/*DEFAULT_RATELIMIT_BURST*/ 2);
- if (__ratelimit(&_rs)) {
- printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
- WARN_ON(1);
- }
- ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+ if (__ratelimit(&_rs))
+ WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
+ ret);
+ ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+ BTRFS_RESERVE_NO_FLUSH);
if (!ret) {
return block_rsv;
} else if (ret && block_rsv != global_rsv) {
@@ -6670,11 +6786,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
&wc->flags[level]);
if (ret < 0) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
return ret;
}
BUG_ON(wc->refs[level] == 0);
if (wc->refs[level] == 1) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
return 1;
}
}
@@ -7177,7 +7295,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
alloc_flags = update_block_group_flags(root, cache->flags);
if (alloc_flags != cache->flags) {
- ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+ ret = do_chunk_alloc(trans, root, alloc_flags,
CHUNK_ALLOC_FORCE);
if (ret < 0)
goto out;
@@ -7187,7 +7305,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
if (!ret)
goto out;
alloc_flags = get_alloc_profile(root, cache->space_info->flags);
- ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+ ret = do_chunk_alloc(trans, root, alloc_flags,
CHUNK_ALLOC_FORCE);
if (ret < 0)
goto out;
@@ -7201,7 +7319,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 type)
{
u64 alloc_flags = get_alloc_profile(root, type);
- return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+ return do_chunk_alloc(trans, root, alloc_flags,
CHUNK_ALLOC_FORCE);
}
@@ -7351,7 +7469,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
*/
target = get_restripe_target(root->fs_info, block_group->flags);
if (target) {
- index = __get_block_group_index(extended_to_chunk(target));
+ index = __get_raid_index(extended_to_chunk(target));
} else {
/*
* this is just a balance, so if we were marked as full
@@ -7385,7 +7503,8 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
* check to make sure we can actually find a chunk with enough
* space to fit our block group in.
*/
- if (device->total_bytes > device->bytes_used + min_free) {
+ if (device->total_bytes > device->bytes_used + min_free &&
+ !device->is_tgtdev_for_dev_replace) {
ret = find_free_dev_extent(device, min_free,
&dev_offset, NULL);
if (!ret)
@@ -7610,8 +7729,21 @@ int btrfs_read_block_groups(struct btrfs_root *root)
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
- if (need_clear)
+ if (need_clear) {
+ /*
+ * When we mount with old space cache, we need to
+ * set BTRFS_DC_CLEAR and set dirty flag.
+ *
+ * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
+ * truncate the old free space cache inode and
+ * setup a new one.
+ * b) Setting 'dirty flag' makes sure that we flush
+ * the new space cache info onto disk.
+ */
cache->disk_cache_state = BTRFS_DC_CLEAR;
+ if (btrfs_test_opt(root, SPACE_CACHE))
+ cache->dirty = 1;
+ }
read_extent_buffer(leaf, &cache->item,
btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -7695,6 +7827,34 @@ error:
return ret;
}
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_block_group_cache *block_group, *tmp;
+ struct btrfs_root *extent_root = root->fs_info->extent_root;
+ struct btrfs_block_group_item item;
+ struct btrfs_key key;
+ int ret = 0;
+
+ list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
+ new_bg_list) {
+ list_del_init(&block_group->new_bg_list);
+
+ if (ret)
+ continue;
+
+ spin_lock(&block_group->lock);
+ memcpy(&item, &block_group->item, sizeof(item));
+ memcpy(&key, &block_group->key, sizeof(key));
+ spin_unlock(&block_group->lock);
+
+ ret = btrfs_insert_item(trans, extent_root, &key, &item,
+ sizeof(item));
+ if (ret)
+ btrfs_abort_transaction(trans, extent_root, ret);
+ }
+}
+
int btrfs_make_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytes_used,
u64 type, u64 chunk_objectid, u64 chunk_offset,
@@ -7728,6 +7888,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
spin_lock_init(&cache->lock);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
+ INIT_LIST_HEAD(&cache->new_bg_list);
btrfs_init_free_space_ctl(cache);
@@ -7759,12 +7920,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
ret = btrfs_add_block_group_cache(root->fs_info, cache);
BUG_ON(ret); /* Logic error */
- ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
- sizeof(cache->item));
- if (ret) {
- btrfs_abort_transaction(trans, extent_root, ret);
- return ret;
- }
+ list_add_tail(&cache->new_bg_list, &trans->new_bgs);
set_avail_alloc_bits(extent_root->fs_info, type);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 01c21b6c6d4..1b319df29ee 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -45,6 +45,7 @@ struct extent_page_data {
struct bio *bio;
struct extent_io_tree *tree;
get_extent_t *get_extent;
+ unsigned long bio_flags;
/* tells writepage not to lock the state bits for this range
* it still does the unlocking
@@ -64,13 +65,13 @@ tree_fs_info(struct extent_io_tree *tree)
int __init extent_io_init(void)
{
- extent_state_cache = kmem_cache_create("extent_state",
+ extent_state_cache = kmem_cache_create("btrfs_extent_state",
sizeof(struct extent_state), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!extent_state_cache)
return -ENOMEM;
- extent_buffer_cache = kmem_cache_create("extent_buffers",
+ extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
sizeof(struct extent_buffer), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!extent_buffer_cache)
@@ -107,6 +108,12 @@ void extent_io_exit(void)
list_del(&eb->leak_list);
kmem_cache_free(extent_buffer_cache, eb);
}
+
+ /*
+ * Make sure all delayed rcu free are flushed before we
+ * destroy caches.
+ */
+ rcu_barrier();
if (extent_state_cache)
kmem_cache_destroy(extent_state_cache);
if (extent_buffer_cache)
@@ -334,12 +341,10 @@ static int insert_state(struct extent_io_tree *tree,
{
struct rb_node *node;
- if (end < start) {
- printk(KERN_ERR "btrfs end < start %llu %llu\n",
+ if (end < start)
+ WARN(1, KERN_ERR "btrfs end < start %llu %llu\n",
(unsigned long long)end,
(unsigned long long)start);
- WARN_ON(1);
- }
state->start = start;
state->end = end;
@@ -929,12 +934,14 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
/**
- * convert_extent - convert all bits in a given range from one bit to another
+ * convert_extent_bit - convert all bits in a given range from one bit to
+ * another
* @tree: the io tree to search
* @start: the start offset in bytes
* @end: the end offset in bytes (inclusive)
* @bits: the bits to set in this range
* @clear_bits: the bits to clear in this range
+ * @cached_state: state that we're going to cache
* @mask: the allocation mask
*
* This will go through and set bits for the given range. If any states exist
@@ -944,7 +951,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
* boundary bits like LOCK.
*/
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int clear_bits, gfp_t mask)
+ int bits, int clear_bits,
+ struct extent_state **cached_state, gfp_t mask)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -961,6 +969,15 @@ again:
}
spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->start <= start && state->end > start &&
+ state->tree) {
+ node = &state->rb_node;
+ goto hit_next;
+ }
+ }
+
/*
* this search will find all the extents that end after
* our range starts.
@@ -991,6 +1008,7 @@ hit_next:
*/
if (state->start == start && state->end <= end) {
set_state_bits(tree, state, &bits);
+ cache_state(state, cached_state);
state = clear_state_bit(tree, state, &clear_bits, 0);
if (last_end == (u64)-1)
goto out;
@@ -1031,6 +1049,7 @@ hit_next:
goto out;
if (state->end <= end) {
set_state_bits(tree, state, &bits);
+ cache_state(state, cached_state);
state = clear_state_bit(tree, state, &clear_bits, 0);
if (last_end == (u64)-1)
goto out;
@@ -1069,6 +1088,7 @@ hit_next:
&bits);
if (err)
extent_io_tree_panic(tree, err);
+ cache_state(prealloc, cached_state);
prealloc = NULL;
start = this_end + 1;
goto search_again;
@@ -1091,6 +1111,7 @@ hit_next:
extent_io_tree_panic(tree, err);
set_state_bits(tree, prealloc, &bits);
+ cache_state(prealloc, cached_state);
clear_state_bit(tree, prealloc, &clear_bits, 0);
prealloc = NULL;
goto out;
@@ -1143,6 +1164,14 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
NULL, cached_state, mask);
}
+int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end,
+ EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
+ NULL, cached_state, mask);
+}
+
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
@@ -1287,18 +1316,42 @@ out:
* If nothing was found, 1 is returned. If found something, return 0.
*/
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, int bits)
+ u64 *start_ret, u64 *end_ret, int bits,
+ struct extent_state **cached_state)
{
struct extent_state *state;
+ struct rb_node *n;
int ret = 1;
spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->end == start - 1 && state->tree) {
+ n = rb_next(&state->rb_node);
+ while (n) {
+ state = rb_entry(n, struct extent_state,
+ rb_node);
+ if (state->state & bits)
+ goto got_it;
+ n = rb_next(n);
+ }
+ free_extent_state(*cached_state);
+ *cached_state = NULL;
+ goto out;
+ }
+ free_extent_state(*cached_state);
+ *cached_state = NULL;
+ }
+
state = find_first_extent_bit_state(tree, start, bits);
+got_it:
if (state) {
+ cache_state(state, cached_state);
*start_ret = state->start;
*end_ret = state->end;
ret = 0;
}
+out:
spin_unlock(&tree->lock);
return ret;
}
@@ -1864,12 +1917,12 @@ static void repair_io_failure_callback(struct bio *bio, int err)
* the standard behavior is to write all copies in a raid setup. here we only
* want to write the one bad copy. so we do the mapping for ourselves and issue
* submit_bio directly.
- * to avoid any synchonization issues, wait for the data after writing, which
+ * to avoid any synchronization issues, wait for the data after writing, which
* actually prevents the read that triggered the error from finishing.
* currently, there can be no more than two copies of every data bit. thus,
* exactly one rewrite is required.
*/
-int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
u64 length, u64 logical, struct page *page,
int mirror_num)
{
@@ -1891,7 +1944,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
bio->bi_size = 0;
map_length = length;
- ret = btrfs_map_block(map_tree, WRITE, logical,
+ ret = btrfs_map_block(fs_info, WRITE, logical,
&map_length, &bbio, mirror_num);
if (ret) {
bio_put(bio);
@@ -1918,7 +1971,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
return -EIO;
}
- printk_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu "
+ printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu "
"(dev %s sector %llu)\n", page->mapping->host->i_ino,
start, rcu_str_deref(dev->name), sector);
@@ -1929,14 +1982,13 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
int mirror_num)
{
- struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
u64 start = eb->start;
unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
int ret = 0;
for (i = 0; i < num_pages; i++) {
struct page *p = extent_buffer_page(eb, i);
- ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE,
+ ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
start, p, mirror_num);
if (ret)
break;
@@ -1955,7 +2007,7 @@ static int clean_io_failure(u64 start, struct page *page)
u64 private;
u64 private_failure;
struct io_failure_record *failrec;
- struct btrfs_mapping_tree *map_tree;
+ struct btrfs_fs_info *fs_info;
struct extent_state *state;
int num_copies;
int did_repair = 0;
@@ -1991,11 +2043,11 @@ static int clean_io_failure(u64 start, struct page *page)
spin_unlock(&BTRFS_I(inode)->io_tree.lock);
if (state && state->start == failrec->start) {
- map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
- num_copies = btrfs_num_copies(map_tree, failrec->logical,
- failrec->len);
+ fs_info = BTRFS_I(inode)->root->fs_info;
+ num_copies = btrfs_num_copies(fs_info, failrec->logical,
+ failrec->len);
if (num_copies > 1) {
- ret = repair_io_failure(map_tree, start, failrec->len,
+ ret = repair_io_failure(fs_info, start, failrec->len,
failrec->logical, page,
failrec->failed_mirror);
did_repair = !ret;
@@ -2061,7 +2113,7 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
}
read_unlock(&em_tree->lock);
- if (!em || IS_ERR(em)) {
+ if (!em) {
kfree(failrec);
return -EIO;
}
@@ -2104,9 +2156,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
* clean_io_failure() clean all those errors at once.
*/
}
- num_copies = btrfs_num_copies(
- &BTRFS_I(inode)->root->fs_info->mapping_tree,
- failrec->logical, failrec->len);
+ num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
+ failrec->logical, failrec->len);
if (num_copies == 1) {
/*
* we only have a single copy of the data, so don't bother with
@@ -2297,8 +2348,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
struct extent_state *cached = NULL;
struct extent_state *state;
- pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
- "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
+ pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
+ "mirror=%ld\n", (u64)bio->bi_sector, err,
(long int)bio->bi_bdev);
tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -2329,23 +2380,10 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
ret = tree->ops->readpage_end_io_hook(page, start, end,
state, mirror);
- if (ret) {
- /* no IO indicated but software detected errors
- * in the block, either checksum errors or
- * issues with the contents */
- struct btrfs_root *root =
- BTRFS_I(page->mapping->host)->root;
- struct btrfs_device *device;
-
+ if (ret)
uptodate = 0;
- device = btrfs_find_device_for_logical(
- root, start, mirror);
- if (device)
- btrfs_dev_stat_inc_and_print(device,
- BTRFS_DEV_STAT_CORRUPTION_ERRS);
- } else {
+ else
clean_io_failure(start, page);
- }
}
if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
@@ -2424,10 +2462,6 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
return bio;
}
-/*
- * Since writes are async, they will only return -ENOMEM.
- * Reads can return the full range of I/O error conditions.
- */
static int __must_check submit_one_bio(int rw, struct bio *bio,
int mirror_num, unsigned long bio_flags)
{
@@ -2715,12 +2749,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
end_bio_extent_readpage, mirror_num,
*bio_flags,
this_bio_flag);
- BUG_ON(ret == -ENOMEM);
- nr++;
- *bio_flags = this_bio_flag;
+ if (!ret) {
+ nr++;
+ *bio_flags = this_bio_flag;
+ }
}
- if (ret)
+ if (ret) {
SetPageError(page);
+ unlock_extent(tree, cur, cur + iosize - 1);
+ }
cur = cur + iosize;
pg_offset += iosize;
}
@@ -3077,8 +3114,15 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
}
}
+ /*
+ * We need to do this to prevent races in people who check if the eb is
+ * under IO since we can end up having no IO bits set for a short period
+ * of time.
+ */
+ spin_lock(&eb->refs_lock);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+ spin_unlock(&eb->refs_lock);
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
spin_lock(&fs_info->delalloc_lock);
if (fs_info->dirty_metadata_bytes >= eb->len)
@@ -3087,6 +3131,8 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
WARN_ON(1);
spin_unlock(&fs_info->delalloc_lock);
ret = 1;
+ } else {
+ spin_unlock(&eb->refs_lock);
}
btrfs_tree_unlock(eb);
@@ -3158,12 +3204,16 @@ static int write_one_eb(struct extent_buffer *eb,
struct block_device *bdev = fs_info->fs_devices->latest_bdev;
u64 offset = eb->start;
unsigned long i, num_pages;
+ unsigned long bio_flags = 0;
int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
int ret = 0;
clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
num_pages = num_extent_pages(eb->start, eb->len);
atomic_set(&eb->io_pages, num_pages);
+ if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
+ bio_flags = EXTENT_BIO_TREE_LOG;
+
for (i = 0; i < num_pages; i++) {
struct page *p = extent_buffer_page(eb, i);
@@ -3172,7 +3222,8 @@ static int write_one_eb(struct extent_buffer *eb,
ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
-1, end_bio_extent_buffer_writepage,
- 0, 0, 0);
+ 0, epd->bio_flags, bio_flags);
+ epd->bio_flags = bio_flags;
if (ret) {
set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
SetPageError(p);
@@ -3207,6 +3258,7 @@ int btree_write_cache_pages(struct address_space *mapping,
.tree = tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ .bio_flags = 0,
};
int ret = 0;
int done = 0;
@@ -3251,19 +3303,34 @@ retry:
break;
}
+ spin_lock(&mapping->private_lock);
+ if (!PagePrivate(page)) {
+ spin_unlock(&mapping->private_lock);
+ continue;
+ }
+
eb = (struct extent_buffer *)page->private;
+
+ /*
+ * Shouldn't happen and normally this would be a BUG_ON
+ * but no sense in crashing the users box for something
+ * we can survive anyway.
+ */
if (!eb) {
+ spin_unlock(&mapping->private_lock);
WARN_ON(1);
continue;
}
- if (eb == prev_eb)
+ if (eb == prev_eb) {
+ spin_unlock(&mapping->private_lock);
continue;
+ }
- if (!atomic_inc_not_zero(&eb->refs)) {
- WARN_ON(1);
+ ret = atomic_inc_not_zero(&eb->refs);
+ spin_unlock(&mapping->private_lock);
+ if (!ret)
continue;
- }
prev_eb = eb;
ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
@@ -3454,7 +3521,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd)
if (epd->sync_io)
rw = WRITE_SYNC;
- ret = submit_one_bio(rw, epd->bio, 0, 0);
+ ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags);
BUG_ON(ret < 0); /* -ENOMEM */
epd->bio = NULL;
}
@@ -3477,6 +3544,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
.get_extent = get_extent,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ .bio_flags = 0,
};
ret = __extent_writepage(page, wbc, &epd);
@@ -3501,6 +3569,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
.get_extent = get_extent,
.extent_locked = 1,
.sync_io = mode == WB_SYNC_ALL,
+ .bio_flags = 0,
};
struct writeback_control wbc_writepages = {
.sync_mode = mode,
@@ -3540,6 +3609,7 @@ int extent_writepages(struct extent_io_tree *tree,
.get_extent = get_extent,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ .bio_flags = 0,
};
ret = extent_write_cache_pages(tree, mapping, wbc,
@@ -3557,19 +3627,38 @@ int extent_readpages(struct extent_io_tree *tree,
struct bio *bio = NULL;
unsigned page_idx;
unsigned long bio_flags = 0;
+ struct page *pagepool[16];
+ struct page *page;
+ int i = 0;
+ int nr = 0;
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
- struct page *page = list_entry(pages->prev, struct page, lru);
+ page = list_entry(pages->prev, struct page, lru);
prefetchw(&page->flags);
list_del(&page->lru);
- if (!add_to_page_cache_lru(page, mapping,
+ if (add_to_page_cache_lru(page, mapping,
page->index, GFP_NOFS)) {
- __extent_read_full_page(tree, page, get_extent,
- &bio, 0, &bio_flags);
+ page_cache_release(page);
+ continue;
}
- page_cache_release(page);
+
+ pagepool[nr++] = page;
+ if (nr < ARRAY_SIZE(pagepool))
+ continue;
+ for (i = 0; i < nr; i++) {
+ __extent_read_full_page(tree, pagepool[i], get_extent,
+ &bio, 0, &bio_flags);
+ page_cache_release(pagepool[i]);
+ }
+ nr = 0;
+ }
+ for (i = 0; i < nr; i++) {
+ __extent_read_full_page(tree, pagepool[i], get_extent,
+ &bio, 0, &bio_flags);
+ page_cache_release(pagepool[i]);
}
+
BUG_ON(!list_empty(pages));
if (bio)
return submit_one_bio(READ, bio, 0, bio_flags);
@@ -3898,18 +3987,6 @@ out:
return ret;
}
-inline struct page *extent_buffer_page(struct extent_buffer *eb,
- unsigned long i)
-{
- return eb->pages[i];
-}
-
-inline unsigned long num_extent_pages(u64 start, u64 len)
-{
- return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
- (start >> PAGE_CACHE_SHIFT);
-}
-
static void __free_extent_buffer(struct extent_buffer *eb)
{
#if LEAK_DEBUG
@@ -4025,8 +4102,8 @@ struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
return eb;
err:
- for (i--; i > 0; i--)
- __free_page(eb->pages[i]);
+ for (; i > 0; i--)
+ __free_page(eb->pages[i - 1]);
__free_extent_buffer(eb);
return NULL;
}
@@ -4123,11 +4200,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
* So bump the ref count first, then set the bit. If someone
* beat us to it, drop the ref we added.
*/
- if (!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+ spin_lock(&eb->refs_lock);
+ if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
atomic_inc(&eb->refs);
- if (test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
- atomic_dec(&eb->refs);
- }
+ spin_unlock(&eb->refs_lock);
}
static void mark_extent_buffer_accessed(struct extent_buffer *eb)
@@ -4171,10 +4247,8 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
for (i = 0; i < num_pages; i++, index++) {
p = find_or_create_page(mapping, index, GFP_NOFS);
- if (!p) {
- WARN_ON(1);
+ if (!p)
goto free_eb;
- }
spin_lock(&mapping->private_lock);
if (PagePrivate(p)) {
@@ -4239,9 +4313,7 @@ again:
goto free_eb;
}
/* add one reference for the tree */
- spin_lock(&eb->refs_lock);
check_buffer_tree_ref(eb);
- spin_unlock(&eb->refs_lock);
spin_unlock(&tree->buffer_lock);
radix_tree_preload_end();
@@ -4300,7 +4372,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
}
/* Expects to have eb->eb_lock already held */
-static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
+static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
{
WARN_ON(atomic_read(&eb->refs) == 0);
if (atomic_dec_and_test(&eb->refs)) {
@@ -4319,11 +4391,12 @@ static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
/* Should be safe to release our pages at this point */
btrfs_release_extent_buffer_page(eb, 0);
-
call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
- return;
+ return 1;
}
spin_unlock(&eb->refs_lock);
+
+ return 0;
}
void free_extent_buffer(struct extent_buffer *eb)
@@ -4640,10 +4713,9 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
}
if (start + min_len > eb->len) {
- printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
+ WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
"wanted %lu %lu\n", (unsigned long long)eb->start,
eb->len, start, min_len);
- WARN_ON(1);
return -EINVAL;
}
@@ -4962,7 +5034,6 @@ int try_release_extent_buffer(struct page *page, gfp_t mask)
spin_unlock(&eb->refs_lock);
return 0;
}
- release_extent_buffer(eb, mask);
- return 1;
+ return release_extent_buffer(eb, mask);
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 25900af5b15..2eacfabd326 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -27,6 +27,7 @@
* type for this bio
*/
#define EXTENT_BIO_COMPRESSED 1
+#define EXTENT_BIO_TREE_LOG 2
#define EXTENT_BIO_FLAG_SHIFT 16
/* these are bit numbers for test/set bit */
@@ -232,11 +233,15 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int clear_bits, gfp_t mask);
+ int bits, int clear_bits,
+ struct extent_state **cached_state, gfp_t mask);
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
+int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask);
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, int bits);
+ u64 *start_ret, u64 *end_ret, int bits,
+ struct extent_state **cached_state);
struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
u64 start, int bits);
int extent_invalidatepage(struct extent_io_tree *tree,
@@ -277,8 +282,18 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
int read_extent_buffer_pages(struct extent_io_tree *tree,
struct extent_buffer *eb, u64 start, int wait,
get_extent_t *get_extent, int mirror_num);
-unsigned long num_extent_pages(u64 start, u64 len);
-struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i);
+
+static inline unsigned long num_extent_pages(u64 start, u64 len)
+{
+ return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
+ (start >> PAGE_CACHE_SHIFT);
+}
+
+static inline struct page *extent_buffer_page(struct extent_buffer *eb,
+ unsigned long i)
+{
+ return eb->pages[i];
+}
static inline void extent_buffer_get(struct extent_buffer *eb)
{
@@ -322,9 +337,9 @@ struct bio *
btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
gfp_t gfp_flags);
-struct btrfs_mapping_tree;
+struct btrfs_fs_info;
-int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
u64 length, u64 logical, struct page *page,
int mirror_num);
int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 7c97b330145..fdb7a8db3b5 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -11,7 +11,7 @@ static struct kmem_cache *extent_map_cache;
int __init extent_map_init(void)
{
- extent_map_cache = kmem_cache_create("extent_map",
+ extent_map_cache = kmem_cache_create("btrfs_extent_map",
sizeof(struct extent_map), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!extent_map_cache)
@@ -35,6 +35,7 @@ void extent_map_exit(void)
void extent_map_tree_init(struct extent_map_tree *tree)
{
tree->map = RB_ROOT;
+ INIT_LIST_HEAD(&tree->modified_extents);
rwlock_init(&tree->lock);
}
@@ -48,13 +49,15 @@ void extent_map_tree_init(struct extent_map_tree *tree)
struct extent_map *alloc_extent_map(void)
{
struct extent_map *em;
- em = kmem_cache_alloc(extent_map_cache, GFP_NOFS);
+ em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
if (!em)
return NULL;
em->in_tree = 0;
em->flags = 0;
em->compress_type = BTRFS_COMPRESS_NONE;
+ em->generation = 0;
atomic_set(&em->refs, 1);
+ INIT_LIST_HEAD(&em->list);
return em;
}
@@ -72,6 +75,7 @@ void free_extent_map(struct extent_map *em)
WARN_ON(atomic_read(&em->refs) == 0);
if (atomic_dec_and_test(&em->refs)) {
WARN_ON(em->in_tree);
+ WARN_ON(!list_empty(&em->list));
kmem_cache_free(extent_map_cache, em);
}
}
@@ -167,6 +171,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
return 0;
+ if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
+ test_bit(EXTENT_FLAG_LOGGING, &next->flags))
+ return 0;
+
if (extent_map_end(prev) == next->start &&
prev->flags == next->flags &&
prev->bdev == next->bdev &&
@@ -194,10 +202,17 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
merge = rb_entry(rb, struct extent_map, rb_node);
if (rb && mergable_maps(merge, em)) {
em->start = merge->start;
+ em->orig_start = merge->orig_start;
em->len += merge->len;
em->block_len += merge->block_len;
em->block_start = merge->block_start;
merge->in_tree = 0;
+ em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
+ em->mod_start = merge->mod_start;
+ em->generation = max(em->generation, merge->generation);
+ list_move(&em->list, &tree->modified_extents);
+
+ list_del_init(&merge->list);
rb_erase(&merge->rb_node, &tree->map);
free_extent_map(merge);
}
@@ -211,14 +226,30 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->block_len += merge->len;
rb_erase(&merge->rb_node, &tree->map);
merge->in_tree = 0;
+ em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
+ em->generation = max(em->generation, merge->generation);
+ list_del_init(&merge->list);
free_extent_map(merge);
}
}
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+/**
+ * unpin_extent_cache - unpin an extent from the cache
+ * @tree: tree to unpin the extent in
+ * @start: logical offset in the file
+ * @len: length of the extent
+ * @gen: generation that this extent has been modified in
+ *
+ * Called after an extent has been written to disk properly. Set the generation
+ * to the generation that actually added the file item to the inode so we know
+ * we need to sync this extent when we call fsync().
+ */
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
+ u64 gen)
{
int ret = 0;
struct extent_map *em;
+ bool prealloc = false;
write_lock(&tree->lock);
em = lookup_extent_mapping(tree, start, len);
@@ -228,10 +259,25 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
if (!em)
goto out;
+ if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+ list_move(&em->list, &tree->modified_extents);
+ em->generation = gen;
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->mod_start = em->start;
+ em->mod_len = em->len;
+
+ if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
+ prealloc = true;
+ clear_bit(EXTENT_FLAG_FILLING, &em->flags);
+ }
try_merge_map(tree, em);
+ if (prealloc) {
+ em->mod_start = em->start;
+ em->mod_len = em->len;
+ }
+
free_extent_map(em);
out:
write_unlock(&tree->lock);
@@ -239,6 +285,13 @@ out:
}
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
+{
+ clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ if (em->in_tree)
+ try_merge_map(tree, em);
+}
+
/**
* add_extent_mapping - add new extent map to the extent tree
* @tree: tree to insert new map in
@@ -269,6 +322,9 @@ int add_extent_mapping(struct extent_map_tree *tree,
}
atomic_inc(&em->refs);
+ em->mod_start = em->start;
+ em->mod_len = em->len;
+
try_merge_map(tree, em);
out:
return ret;
@@ -358,6 +414,8 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
rb_erase(&em->rb_node, &tree->map);
+ if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+ list_del_init(&em->list);
em->in_tree = 0;
return ret;
}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 1195f09761f..c6598c89cff 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -13,6 +13,8 @@
#define EXTENT_FLAG_COMPRESSED 1
#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
+#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
+#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
struct extent_map {
struct rb_node rb_node;
@@ -20,18 +22,24 @@ struct extent_map {
/* all of these are in bytes */
u64 start;
u64 len;
+ u64 mod_start;
+ u64 mod_len;
u64 orig_start;
+ u64 orig_block_len;
u64 block_start;
u64 block_len;
+ u64 generation;
unsigned long flags;
struct block_device *bdev;
atomic_t refs;
unsigned int in_tree;
unsigned int compress_type;
+ struct list_head list;
};
struct extent_map_tree {
struct rb_root map;
+ struct list_head modified_extents;
rwlock_t lock;
};
@@ -60,7 +68,8 @@ struct extent_map *alloc_extent_map(void);
void free_extent_map(struct extent_map *em);
int __init extent_map_init(void);
void extent_map_exit(void);
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len);
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 5d158d32023..94aa53b3872 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -25,11 +25,12 @@
#include "transaction.h"
#include "print-tree.h"
-#define __MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
+#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
sizeof(struct btrfs_item) * 2) / \
size) - 1))
-#define MAX_CSUM_ITEMS(r, size) (min(__MAX_CSUM_ITEMS(r, size), PAGE_CACHE_SIZE))
+#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
+ PAGE_CACHE_SIZE))
#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
sizeof(struct btrfs_ordered_sum)) / \
@@ -132,7 +133,6 @@ fail:
return ERR_PTR(ret);
}
-
int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid,
@@ -150,6 +150,26 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
return ret;
}
+u64 btrfs_file_extent_length(struct btrfs_path *path)
+{
+ int extent_type;
+ struct btrfs_file_extent_item *fi;
+ u64 len;
+
+ fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(path->nodes[0], fi);
+
+ if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC)
+ len = btrfs_file_extent_num_bytes(path->nodes[0], fi);
+ else if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ len = btrfs_file_extent_inline_len(path->nodes[0], fi);
+ else
+ BUG();
+
+ return len;
+}
static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
struct inode *inode, struct bio *bio,
@@ -183,7 +203,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
* read from the commit root and sidestep a nasty deadlock
* between reading the free space cache and updating the csum tree.
*/
- if (btrfs_is_free_space_inode(root, inode)) {
+ if (btrfs_is_free_space_inode(inode)) {
path->search_commit_root = 1;
path->skip_locking = 1;
}
@@ -272,9 +292,9 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
}
int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
- struct bio *bio, u64 offset, u32 *dst)
+ struct bio *bio, u64 offset)
{
- return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
+ return __btrfs_lookup_bio_sums(root, inode, bio, offset, NULL, 1);
}
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
@@ -440,8 +460,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
if (!contig)
offset = page_offset(bvec->bv_page) + bvec->bv_offset;
- if (!contig && (offset >= ordered->file_offset + ordered->len ||
- offset < ordered->file_offset)) {
+ if (offset >= ordered->file_offset + ordered->len ||
+ offset < ordered->file_offset) {
unsigned long bytes_left;
sums->len = this_sum_bytes;
this_sum_bytes = 0;
@@ -690,6 +710,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
return -ENOMEM;
sector_sum = sums->sums;
+ trans->adding_csums = 1;
again:
next_offset = (u64)-1;
found_next = 0;
@@ -853,6 +874,7 @@ next_sector:
goto again;
}
out:
+ trans->adding_csums = 0;
btrfs_free_path(path);
return ret;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9aa01ec2138..aeb84469d2c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -39,7 +39,9 @@
#include "tree-log.h"
#include "locking.h"
#include "compat.h"
+#include "volumes.h"
+static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
* when auto defrag is enabled we
* queue up these defrag structs to remember which
@@ -89,7 +91,7 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1,
* If an existing record is found the defrag item you
* pass in is freed
*/
-static void __btrfs_add_inode_defrag(struct inode *inode,
+static int __btrfs_add_inode_defrag(struct inode *inode,
struct inode_defrag *defrag)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -117,18 +119,24 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
entry->transid = defrag->transid;
if (defrag->last_offset > entry->last_offset)
entry->last_offset = defrag->last_offset;
- goto exists;
+ return -EEXIST;
}
}
set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
rb_link_node(&defrag->rb_node, parent, p);
rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
- return;
+ return 0;
+}
-exists:
- kfree(defrag);
- return;
+static inline int __need_auto_defrag(struct btrfs_root *root)
+{
+ if (!btrfs_test_opt(root, AUTO_DEFRAG))
+ return 0;
+ if (btrfs_fs_closing(root->fs_info))
+ return 0;
+
+ return 1;
}
/*
@@ -141,11 +149,9 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct btrfs_root *root = BTRFS_I(inode)->root;
struct inode_defrag *defrag;
u64 transid;
+ int ret;
- if (!btrfs_test_opt(root, AUTO_DEFRAG))
- return 0;
-
- if (btrfs_fs_closing(root->fs_info))
+ if (!__need_auto_defrag(root))
return 0;
if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
@@ -156,7 +162,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
else
transid = BTRFS_I(inode)->root->last_trans;
- defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
+ defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
if (!defrag)
return -ENOMEM;
@@ -165,20 +171,56 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
defrag->root = root->root_key.objectid;
spin_lock(&root->fs_info->defrag_inodes_lock);
- if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
- __btrfs_add_inode_defrag(inode, defrag);
- else
- kfree(defrag);
+ if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
+ /*
+ * If we set IN_DEFRAG flag and evict the inode from memory,
+ * and then re-read this inode, this new inode doesn't have
+ * IN_DEFRAG flag. At the case, we may find the existed defrag.
+ */
+ ret = __btrfs_add_inode_defrag(inode, defrag);
+ if (ret)
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ } else {
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ }
spin_unlock(&root->fs_info->defrag_inodes_lock);
return 0;
}
/*
- * must be called with the defrag_inodes lock held
+ * Requeue the defrag object. If there is a defrag object that points to
+ * the same inode in the tree, we will merge them together (by
+ * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
*/
-struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
- u64 root, u64 ino,
- struct rb_node **next)
+void btrfs_requeue_inode_defrag(struct inode *inode,
+ struct inode_defrag *defrag)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ if (!__need_auto_defrag(root))
+ goto out;
+
+ /*
+ * Here we don't check the IN_DEFRAG flag, because we need merge
+ * them together.
+ */
+ spin_lock(&root->fs_info->defrag_inodes_lock);
+ ret = __btrfs_add_inode_defrag(inode, defrag);
+ spin_unlock(&root->fs_info->defrag_inodes_lock);
+ if (ret)
+ goto out;
+ return;
+out:
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+}
+
+/*
+ * pick the defragable inode that we want, if it doesn't exist, we will get
+ * the next one.
+ */
+static struct inode_defrag *
+btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
struct inode_defrag *entry = NULL;
struct inode_defrag tmp;
@@ -189,7 +231,8 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
tmp.ino = ino;
tmp.root = root;
- p = info->defrag_inodes.rb_node;
+ spin_lock(&fs_info->defrag_inodes_lock);
+ p = fs_info->defrag_inodes.rb_node;
while (p) {
parent = p;
entry = rb_entry(parent, struct inode_defrag, rb_node);
@@ -200,52 +243,145 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
else if (ret > 0)
p = parent->rb_right;
else
- return entry;
+ goto out;
}
- if (next) {
- while (parent && __compare_inode_defrag(&tmp, entry) > 0) {
- parent = rb_next(parent);
+ if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
+ parent = rb_next(parent);
+ if (parent)
entry = rb_entry(parent, struct inode_defrag, rb_node);
- }
- *next = parent;
+ else
+ entry = NULL;
}
- return NULL;
+out:
+ if (entry)
+ rb_erase(parent, &fs_info->defrag_inodes);
+ spin_unlock(&fs_info->defrag_inodes_lock);
+ return entry;
}
-/*
- * run through the list of inodes in the FS that need
- * defragging
- */
-int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
struct inode_defrag *defrag;
+ struct rb_node *node;
+
+ spin_lock(&fs_info->defrag_inodes_lock);
+ node = rb_first(&fs_info->defrag_inodes);
+ while (node) {
+ rb_erase(node, &fs_info->defrag_inodes);
+ defrag = rb_entry(node, struct inode_defrag, rb_node);
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+
+ if (need_resched()) {
+ spin_unlock(&fs_info->defrag_inodes_lock);
+ cond_resched();
+ spin_lock(&fs_info->defrag_inodes_lock);
+ }
+
+ node = rb_first(&fs_info->defrag_inodes);
+ }
+ spin_unlock(&fs_info->defrag_inodes_lock);
+}
+
+#define BTRFS_DEFRAG_BATCH 1024
+
+static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
+ struct inode_defrag *defrag)
+{
struct btrfs_root *inode_root;
struct inode *inode;
- struct rb_node *n;
struct btrfs_key key;
struct btrfs_ioctl_defrag_range_args range;
- u64 first_ino = 0;
- u64 root_objectid = 0;
int num_defrag;
- int defrag_batch = 1024;
+ int index;
+ int ret;
+
+ /* get the inode */
+ key.objectid = defrag->root;
+ btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.offset = (u64)-1;
+
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(inode_root)) {
+ ret = PTR_ERR(inode_root);
+ goto cleanup;
+ }
+ if (btrfs_root_refs(&inode_root->root_item) == 0) {
+ ret = -ENOENT;
+ goto cleanup;
+ }
+ key.objectid = defrag->ino;
+ btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.offset = 0;
+ inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ goto cleanup;
+ }
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+ /* do a chunk of defrag */
+ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
memset(&range, 0, sizeof(range));
range.len = (u64)-1;
+ range.start = defrag->last_offset;
+
+ sb_start_write(fs_info->sb);
+ num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+ BTRFS_DEFRAG_BATCH);
+ sb_end_write(fs_info->sb);
+ /*
+ * if we filled the whole defrag batch, there
+ * must be more work to do. Queue this defrag
+ * again
+ */
+ if (num_defrag == BTRFS_DEFRAG_BATCH) {
+ defrag->last_offset = range.start;
+ btrfs_requeue_inode_defrag(inode, defrag);
+ } else if (defrag->last_offset && !defrag->cycled) {
+ /*
+ * we didn't fill our defrag batch, but
+ * we didn't start at zero. Make sure we loop
+ * around to the start of the file.
+ */
+ defrag->last_offset = 0;
+ defrag->cycled = 1;
+ btrfs_requeue_inode_defrag(inode, defrag);
+ } else {
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ }
+
+ iput(inode);
+ return 0;
+cleanup:
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ return ret;
+}
+
+/*
+ * run through the list of inodes in the FS that need
+ * defragging
+ */
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+ struct inode_defrag *defrag;
+ u64 first_ino = 0;
+ u64 root_objectid = 0;
atomic_inc(&fs_info->defrag_running);
- spin_lock(&fs_info->defrag_inodes_lock);
while(1) {
- n = NULL;
+ if (!__need_auto_defrag(fs_info->tree_root))
+ break;
/* find an inode to defrag */
- defrag = btrfs_find_defrag_inode(fs_info, root_objectid,
- first_ino, &n);
+ defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
+ first_ino);
if (!defrag) {
- if (n) {
- defrag = rb_entry(n, struct inode_defrag,
- rb_node);
- } else if (root_objectid || first_ino) {
+ if (root_objectid || first_ino) {
root_objectid = 0;
first_ino = 0;
continue;
@@ -254,70 +390,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
}
}
- /* remove it from the rbtree */
first_ino = defrag->ino + 1;
root_objectid = defrag->root;
- rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
-
- if (btrfs_fs_closing(fs_info))
- goto next_free;
-
- spin_unlock(&fs_info->defrag_inodes_lock);
-
- /* get the inode */
- key.objectid = defrag->root;
- btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
- key.offset = (u64)-1;
- inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
- if (IS_ERR(inode_root))
- goto next;
-
- key.objectid = defrag->ino;
- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
- key.offset = 0;
-
- inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
- if (IS_ERR(inode))
- goto next;
-
- /* do a chunk of defrag */
- clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
- range.start = defrag->last_offset;
- num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
- defrag_batch);
- /*
- * if we filled the whole defrag batch, there
- * must be more work to do. Queue this defrag
- * again
- */
- if (num_defrag == defrag_batch) {
- defrag->last_offset = range.start;
- __btrfs_add_inode_defrag(inode, defrag);
- /*
- * we don't want to kfree defrag, we added it back to
- * the rbtree
- */
- defrag = NULL;
- } else if (defrag->last_offset && !defrag->cycled) {
- /*
- * we didn't fill our defrag batch, but
- * we didn't start at zero. Make sure we loop
- * around to the start of the file.
- */
- defrag->last_offset = 0;
- defrag->cycled = 1;
- __btrfs_add_inode_defrag(inode, defrag);
- defrag = NULL;
- }
- iput(inode);
-next:
- spin_lock(&fs_info->defrag_inodes_lock);
-next_free:
- kfree(defrag);
+ __btrfs_run_defrag_inode(fs_info, defrag);
}
- spin_unlock(&fs_info->defrag_inodes_lock);
-
atomic_dec(&fs_info->defrag_running);
/*
@@ -458,14 +535,15 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
* this drops all the extents in the cache that intersect the range
* [start, end]. Existing extents are split as required.
*/
-int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
- int skip_pinned)
+void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+ int skip_pinned)
{
struct extent_map *em;
struct extent_map *split = NULL;
struct extent_map *split2 = NULL;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
u64 len = end - start + 1;
+ u64 gen;
int ret;
int testend = 1;
unsigned long flags;
@@ -477,11 +555,14 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
testend = 0;
}
while (1) {
+ int no_splits = 0;
+
if (!split)
split = alloc_extent_map();
if (!split2)
split2 = alloc_extent_map();
- BUG_ON(!split || !split2); /* -ENOMEM */
+ if (!split || !split2)
+ no_splits = 1;
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
@@ -490,6 +571,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
break;
}
flags = em->flags;
+ gen = em->generation;
if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
if (testend && em->start + em->len >= start + len) {
free_extent_map(em);
@@ -506,6 +588,8 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
remove_extent_mapping(em_tree, em);
+ if (no_splits)
+ goto next;
if (em->block_start < EXTENT_MAP_LAST_BYTE &&
em->start < start) {
@@ -518,12 +602,15 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->block_len = em->block_len;
else
split->block_len = split->len;
-
+ split->orig_block_len = max(split->block_len,
+ em->orig_block_len);
+ split->generation = gen;
split->bdev = em->bdev;
split->flags = flags;
split->compress_type = em->compress_type;
ret = add_extent_mapping(em_tree, split);
BUG_ON(ret); /* Logic error */
+ list_move(&split->list, &em_tree->modified_extents);
free_extent_map(split);
split = split2;
split2 = NULL;
@@ -537,6 +624,9 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->bdev = em->bdev;
split->flags = flags;
split->compress_type = em->compress_type;
+ split->generation = gen;
+ split->orig_block_len = max(em->block_len,
+ em->orig_block_len);
if (compressed) {
split->block_len = em->block_len;
@@ -545,14 +635,16 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
} else {
split->block_len = split->len;
split->block_start = em->block_start + diff;
- split->orig_start = split->start;
+ split->orig_start = em->orig_start;
}
ret = add_extent_mapping(em_tree, split);
BUG_ON(ret); /* Logic error */
+ list_move(&split->list, &em_tree->modified_extents);
free_extent_map(split);
split = NULL;
}
+next:
write_unlock(&em_tree->lock);
/* once for us */
@@ -564,7 +656,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
free_extent_map(split);
if (split2)
free_extent_map(split2);
- return 0;
}
/*
@@ -576,13 +667,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
* it is either truncated or split. Anything entirely inside the range
* is deleted from the tree.
*/
-int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
- u64 start, u64 end, u64 *hint_byte, int drop_cache)
+int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct btrfs_path *path, u64 start, u64 end,
+ u64 *drop_end, int drop_cache)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
- struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key new_key;
u64 ino = btrfs_ino(inode);
@@ -597,14 +688,12 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
int recow;
int ret;
int modify_tree = -1;
+ int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
+ int found = 0;
if (drop_cache)
btrfs_drop_extent_cache(inode, start, end - 1, 0);
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
if (start >= BTRFS_I(inode)->disk_i_size)
modify_tree = 0;
@@ -666,6 +755,7 @@ next_slot:
goto next_slot;
}
+ found = 1;
search_start = max(key.offset, start);
if (recow || !modify_tree) {
modify_tree = -1;
@@ -707,14 +797,13 @@ next_slot:
extent_end - start);
btrfs_mark_buffer_dirty(leaf);
- if (disk_bytenr > 0) {
+ if (update_refs && disk_bytenr > 0) {
ret = btrfs_inc_extent_ref(trans, root,
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
new_key.objectid,
start - extent_offset, 0);
BUG_ON(ret); /* -ENOMEM */
- *hint_byte = disk_bytenr;
}
key.offset = start;
}
@@ -734,10 +823,8 @@ next_slot:
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - end);
btrfs_mark_buffer_dirty(leaf);
- if (disk_bytenr > 0) {
+ if (update_refs && disk_bytenr > 0)
inode_sub_bytes(inode, end - key.offset);
- *hint_byte = disk_bytenr;
- }
break;
}
@@ -753,10 +840,8 @@ next_slot:
btrfs_set_file_extent_num_bytes(leaf, fi,
start - key.offset);
btrfs_mark_buffer_dirty(leaf);
- if (disk_bytenr > 0) {
+ if (update_refs && disk_bytenr > 0)
inode_sub_bytes(inode, extent_end - start);
- *hint_byte = disk_bytenr;
- }
if (end == extent_end)
break;
@@ -777,12 +862,13 @@ next_slot:
del_nr++;
}
- if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ if (update_refs &&
+ extent_type == BTRFS_FILE_EXTENT_INLINE) {
inode_sub_bytes(inode,
extent_end - key.offset);
extent_end = ALIGN(extent_end,
root->sectorsize);
- } else if (disk_bytenr > 0) {
+ } else if (update_refs && disk_bytenr > 0) {
ret = btrfs_free_extent(trans, root,
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
@@ -791,7 +877,6 @@ next_slot:
BUG_ON(ret); /* -ENOMEM */
inode_sub_bytes(inode,
extent_end - key.offset);
- *hint_byte = disk_bytenr;
}
if (end == extent_end)
@@ -806,7 +891,7 @@ next_slot:
del_nr);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
- goto out;
+ break;
}
del_nr = 0;
@@ -825,7 +910,24 @@ next_slot:
btrfs_abort_transaction(trans, root, ret);
}
-out:
+ if (drop_end)
+ *drop_end = found ? min(end, extent_end) : end;
+ btrfs_release_path(path);
+ return ret;
+}
+
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode, u64 start,
+ u64 end, int drop_cache)
+{
+ struct btrfs_path *path;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
+ drop_cache);
btrfs_free_path(path);
return ret;
}
@@ -892,8 +994,6 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
int ret;
u64 ino = btrfs_ino(inode);
- btrfs_drop_extent_cache(inode, start, end - 1, 0);
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -935,12 +1035,16 @@ again:
btrfs_set_item_key_safe(trans, root, path, &new_key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - end);
btrfs_set_file_extent_offset(leaf, fi,
end - orig_offset);
fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
end - other_start);
btrfs_mark_buffer_dirty(leaf);
@@ -958,12 +1062,16 @@ again:
struct btrfs_file_extent_item);
btrfs_set_file_extent_num_bytes(leaf, fi,
start - key.offset);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
path->slots[0]++;
new_key.offset = start;
btrfs_set_item_key_safe(trans, root, path, &new_key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
other_end - start);
btrfs_set_file_extent_offset(leaf, fi,
@@ -991,12 +1099,14 @@ again:
leaf = path->nodes[0];
fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
split - key.offset);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - split);
@@ -1056,12 +1166,14 @@ again:
struct btrfs_file_extent_item);
btrfs_set_file_extent_type(leaf, fi,
BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_mark_buffer_dirty(leaf);
} else {
fi = btrfs_item_ptr(leaf, del_slot - 1,
struct btrfs_file_extent_item);
btrfs_set_file_extent_type(leaf, fi,
BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - key.offset);
btrfs_mark_buffer_dirty(leaf);
@@ -1173,8 +1285,8 @@ again:
clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
- GFP_NOFS);
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+ 0, 0, &cached_state, GFP_NOFS);
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
start_pos, last_pos - 1, &cached_state,
GFP_NOFS);
@@ -1314,10 +1426,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
cond_resched();
- balance_dirty_pages_ratelimited_nr(inode->i_mapping,
- dirty_pages);
+ balance_dirty_pages_ratelimited(inode->i_mapping);
if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
- btrfs_btree_balance_dirty(root, 1);
+ btrfs_btree_balance_dirty(root);
pos += copied;
num_written += copied;
@@ -1366,6 +1477,24 @@ out:
return written ? written : err;
}
+static void update_time_for_write(struct inode *inode)
+{
+ struct timespec now;
+
+ if (IS_NOCMTIME(inode))
+ return;
+
+ now = current_fs_time(inode->i_sb);
+ if (!timespec_equal(&inode->i_mtime, &now))
+ inode->i_mtime = now;
+
+ if (!timespec_equal(&inode->i_ctime, &now))
+ inode->i_ctime = now;
+
+ if (IS_I_VERSION(inode))
+ inode_inc_iversion(inode);
+}
+
static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
@@ -1378,8 +1507,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
ssize_t num_written = 0;
ssize_t err = 0;
size_t count, ocount;
+ bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
- vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+ sb_start_write(inode->i_sb);
mutex_lock(&inode->i_mutex);
@@ -1420,11 +1550,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
goto out;
}
- err = file_update_time(file);
- if (err) {
- mutex_unlock(&inode->i_mutex);
- goto out;
- }
+ /*
+ * We reserve space for updating the inode when we reserve space for the
+ * extent we are going to write, so we will enospc out there. We don't
+ * need to start yet another transaction to update the inode as we will
+ * update the inode when we finish writing whatever data we write.
+ */
+ update_time_for_write(inode);
start_pos = round_down(pos, root->sectorsize);
if (start_pos > i_size_read(inode)) {
@@ -1435,6 +1567,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
}
}
+ if (sync)
+ atomic_inc(&BTRFS_I(inode)->sync_writers);
+
if (unlikely(file->f_flags & O_DIRECT)) {
num_written = __btrfs_direct_write(iocb, iov, nr_segs,
pos, ppos, count, ocount);
@@ -1461,14 +1596,23 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
* this will either be one more than the running transaction
* or the generation used for the next transaction if there isn't
* one running right now.
+ *
+ * We also have to set last_sub_trans to the current log transid,
+ * otherwise subsequent syncs to a file that's been synced in this
+ * transaction will appear to have already occured.
*/
BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+ BTRFS_I(inode)->last_sub_trans = root->log_transid;
if (num_written > 0 || num_written == -EIOCBQUEUED) {
err = generic_write_sync(file, pos, num_written);
if (err < 0 && num_written > 0)
num_written = err;
}
+
+ if (sync)
+ atomic_dec(&BTRFS_I(inode)->sync_writers);
out:
+ sb_end_write(inode->i_sb);
current->backing_dev_info = NULL;
return num_written ? num_written : err;
}
@@ -1513,16 +1657,26 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trace_btrfs_sync_file(file, datasync);
+ /*
+ * We write the dirty pages in the range and wait until they complete
+ * out of the ->i_mutex. If so, we can flush the dirty pages by
+ * multi-task, and make the performance up.
+ */
+ atomic_inc(&BTRFS_I(inode)->sync_writers);
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ atomic_dec(&BTRFS_I(inode)->sync_writers);
+ if (ret)
+ return ret;
+
mutex_lock(&inode->i_mutex);
/*
- * we wait first, since the writeback may change the inode, also wait
- * ordered range does a filemape_write_and_wait_range which is why we
- * don't do it above like other file systems.
+ * We flush the dirty pages again to avoid some dirty pages in the
+ * range being left.
*/
- root->log_batch++;
- btrfs_wait_ordered_range(inode, start, end);
- root->log_batch++;
+ atomic_inc(&root->log_batch);
+ btrfs_wait_ordered_range(inode, start, end - start + 1);
+ atomic_inc(&root->log_batch);
/*
* check the transaction that last modified this inode
@@ -1543,6 +1697,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
BTRFS_I(inode)->last_trans <=
root->fs_info->last_trans_committed) {
BTRFS_I(inode)->last_trans = 0;
+
+ /*
+ * We'v had everything committed since the last time we were
+ * modified so clear this flag in case it was set for whatever
+ * reason, it's no longer relevant.
+ */
+ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
mutex_unlock(&inode->i_mutex);
goto out;
}
@@ -1598,6 +1760,7 @@ out:
static const struct vm_operations_struct btrfs_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = btrfs_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
@@ -1609,11 +1772,333 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
file_accessed(filp);
vma->vm_ops = &btrfs_file_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
+static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
+ int slot, u64 start, u64 end)
+{
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+
+ if (slot < 0 || slot >= btrfs_header_nritems(leaf))
+ return 0;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != btrfs_ino(inode) ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ return 0;
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
+ return 0;
+
+ if (btrfs_file_extent_disk_bytenr(leaf, fi))
+ return 0;
+
+ if (key.offset == end)
+ return 1;
+ if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
+ return 1;
+ return 0;
+}
+
+static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
+ struct btrfs_path *path, u64 offset, u64 end)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *fi;
+ struct extent_map *hole_em;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = offset;
+
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret < 0)
+ return ret;
+ BUG_ON(!ret);
+
+ leaf = path->nodes[0];
+ if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
+ u64 num_bytes;
+
+ path->slots[0]--;
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
+ end - offset;
+ btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_offset(leaf, fi, 0);
+ btrfs_mark_buffer_dirty(leaf);
+ goto out;
+ }
+
+ if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) {
+ u64 num_bytes;
+
+ path->slots[0]++;
+ key.offset = offset;
+ btrfs_set_item_key_safe(trans, root, path, &key);
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
+ offset;
+ btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_offset(leaf, fi, 0);
+ btrfs_mark_buffer_dirty(leaf);
+ goto out;
+ }
+ btrfs_release_path(path);
+
+ ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
+ 0, 0, end - offset, 0, end - offset,
+ 0, 0, 0);
+ if (ret)
+ return ret;
+
+out:
+ btrfs_release_path(path);
+
+ hole_em = alloc_extent_map();
+ if (!hole_em) {
+ btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ } else {
+ hole_em->start = offset;
+ hole_em->len = end - offset;
+ hole_em->orig_start = offset;
+
+ hole_em->block_start = EXTENT_MAP_HOLE;
+ hole_em->block_len = 0;
+ hole_em->orig_block_len = 0;
+ hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+ hole_em->compress_type = BTRFS_COMPRESS_NONE;
+ hole_em->generation = trans->transid;
+
+ do {
+ btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, hole_em);
+ if (!ret)
+ list_move(&hole_em->list,
+ &em_tree->modified_extents);
+ write_unlock(&em_tree->lock);
+ } while (ret == -EEXIST);
+ free_extent_map(hole_em);
+ if (ret)
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ }
+
+ return 0;
+}
+
+static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_state *cached_state = NULL;
+ struct btrfs_path *path;
+ struct btrfs_block_rsv *rsv;
+ struct btrfs_trans_handle *trans;
+ u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
+ u64 lockend = round_down(offset + len,
+ BTRFS_I(inode)->root->sectorsize) - 1;
+ u64 cur_offset = lockstart;
+ u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
+ u64 drop_end;
+ int ret = 0;
+ int err = 0;
+ bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
+ ((offset + len - 1) >> PAGE_CACHE_SHIFT));
+
+ btrfs_wait_ordered_range(inode, offset, len);
+
+ mutex_lock(&inode->i_mutex);
+ /*
+ * We needn't truncate any page which is beyond the end of the file
+ * because we are sure there is no data there.
+ */
+ /*
+ * Only do this if we are in the same page and we aren't doing the
+ * entire page.
+ */
+ if (same_page && len < PAGE_CACHE_SIZE) {
+ if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
+ ret = btrfs_truncate_page(inode, offset, len, 0);
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
+
+ /* zero back part of the first page */
+ if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+ ret = btrfs_truncate_page(inode, offset, 0, 0);
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
+ }
+
+ /* zero the front end of the last page */
+ if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+ ret = btrfs_truncate_page(inode, offset + len, 0, 1);
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
+ }
+
+ if (lockend < lockstart) {
+ mutex_unlock(&inode->i_mutex);
+ return 0;
+ }
+
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+
+ truncate_pagecache_range(inode, lockstart, lockend);
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ 0, &cached_state);
+ ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+
+ /*
+ * We need to make sure we have no ordered extents in this range
+ * and nobody raced in and read a page in this range, if we did
+ * we need to try again.
+ */
+ if ((!ordered ||
+ (ordered->file_offset + ordered->len < lockstart ||
+ ordered->file_offset > lockend)) &&
+ !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, EXTENT_UPTODATE, 0,
+ cached_state)) {
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, &cached_state, GFP_NOFS);
+ btrfs_wait_ordered_range(inode, lockstart,
+ lockend - lockstart + 1);
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+ if (!rsv) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+ rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
+ rsv->failfast = 1;
+
+ /*
+ * 1 - update the inode
+ * 1 - removing the extents in the range
+ * 1 - adding the hole extent
+ */
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_free;
+ }
+
+ ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
+ min_size);
+ BUG_ON(ret);
+ trans->block_rsv = rsv;
+
+ while (cur_offset < lockend) {
+ ret = __btrfs_drop_extents(trans, root, inode, path,
+ cur_offset, lockend + 1,
+ &drop_end, 1);
+ if (ret != -ENOSPC)
+ break;
+
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+
+ ret = fill_holes(trans, inode, path, cur_offset, drop_end);
+ if (ret) {
+ err = ret;
+ break;
+ }
+
+ cur_offset = drop_end;
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ err = ret;
+ break;
+ }
+
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root);
+
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ break;
+ }
+
+ ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
+ rsv, min_size);
+ BUG_ON(ret); /* shouldn't happen */
+ trans->block_rsv = rsv;
+ }
+
+ if (ret) {
+ err = ret;
+ goto out_trans;
+ }
+
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ ret = fill_holes(trans, inode, path, cur_offset, drop_end);
+ if (ret) {
+ err = ret;
+ goto out_trans;
+ }
+
+out_trans:
+ if (!trans)
+ goto out_free;
+
+ inode_inc_iversion(inode);
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ ret = btrfs_update_inode(trans, root, inode);
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root);
+out_free:
+ btrfs_free_path(path);
+ btrfs_free_block_rsv(root, rsv);
+out:
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state, GFP_NOFS);
+ mutex_unlock(&inode->i_mutex);
+ if (ret && !err)
+ err = ret;
+ return err;
+}
+
static long btrfs_fallocate(struct file *file, int mode,
loff_t offset, loff_t len)
{
@@ -1625,22 +2110,25 @@ static long btrfs_fallocate(struct file *file, int mode,
u64 alloc_end;
u64 alloc_hint = 0;
u64 locked_end;
- u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
struct extent_map *em;
+ int blocksize = BTRFS_I(inode)->root->sectorsize;
int ret;
- alloc_start = offset & ~mask;
- alloc_end = (offset + len + mask) & ~mask;
+ alloc_start = round_down(offset, blocksize);
+ alloc_end = round_up(offset + len, blocksize);
- /* We only support the FALLOC_FL_KEEP_SIZE mode */
- if (mode & ~FALLOC_FL_KEEP_SIZE)
+ /* Make sure we aren't being give some crap mode */
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ return btrfs_punch_hole(inode, offset, len);
+
/*
* Make sure we have enough space before we do the
* allocation.
*/
- ret = btrfs_check_data_free_space(inode, len);
+ ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
if (ret)
return ret;
@@ -1708,7 +2196,7 @@ static long btrfs_fallocate(struct file *file, int mode,
}
last_byte = min(extent_map_end(em), alloc_end);
actual_end = min_t(u64, extent_map_end(em), offset + len);
- last_byte = (last_byte + mask) & ~mask;
+ last_byte = ALIGN(last_byte, blocksize);
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
@@ -1747,11 +2235,11 @@ static long btrfs_fallocate(struct file *file, int mode,
out:
mutex_unlock(&inode->i_mutex);
/* Let go of our reservation. */
- btrfs_free_reserved_data_space(inode, len);
+ btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
return ret;
}
-static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
+static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_map *em;
@@ -1768,6 +2256,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
if (lockend <= lockstart)
lockend = lockstart + root->sectorsize;
+ lockend--;
len = lockend - lockstart + 1;
len = max_t(u64, len, root->sectorsize);
@@ -1785,7 +2274,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
* before the position we want in case there is outstanding delalloc
* going on here.
*/
- if (origin == SEEK_HOLE && start != 0) {
+ if (whence == SEEK_HOLE && start != 0) {
if (start <= root->sectorsize)
em = btrfs_get_extent_fiemap(inode, NULL, 0, 0,
root->sectorsize, 0);
@@ -1819,13 +2308,13 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
}
}
- if (origin == SEEK_HOLE) {
+ if (whence == SEEK_HOLE) {
*offset = start;
free_extent_map(em);
break;
}
} else {
- if (origin == SEEK_DATA) {
+ if (whence == SEEK_DATA) {
if (em->block_start == EXTENT_MAP_DELALLOC) {
if (start >= inode->i_size) {
free_extent_map(em);
@@ -1834,9 +2323,12 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
}
}
- *offset = start;
- free_extent_map(em);
- break;
+ if (!test_bit(EXTENT_FLAG_PREALLOC,
+ &em->flags)) {
+ *offset = start;
+ free_extent_map(em);
+ break;
+ }
}
}
@@ -1862,16 +2354,16 @@ out:
return ret;
}
-static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
+static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
int ret;
mutex_lock(&inode->i_mutex);
- switch (origin) {
+ switch (whence) {
case SEEK_END:
case SEEK_CUR:
- offset = generic_file_llseek(file, offset, origin);
+ offset = generic_file_llseek(file, offset, whence);
goto out;
case SEEK_DATA:
case SEEK_HOLE:
@@ -1880,7 +2372,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
return -ENXIO;
}
- ret = find_desired_extent(inode, &offset, origin);
+ ret = find_desired_extent(inode, &offset, whence);
if (ret) {
mutex_unlock(&inode->i_mutex);
return ret;
@@ -1923,3 +2415,21 @@ const struct file_operations btrfs_file_operations = {
.compat_ioctl = btrfs_ioctl,
#endif
};
+
+void btrfs_auto_defrag_exit(void)
+{
+ if (btrfs_inode_defrag_cachep)
+ kmem_cache_destroy(btrfs_inode_defrag_cachep);
+}
+
+int btrfs_auto_defrag_init(void)
+{
+ btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
+ sizeof(struct inode_defrag), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!btrfs_inode_defrag_cachep)
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6c4e2baa929..0be7a8742a4 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -307,7 +307,6 @@ static void io_ctl_unmap_page(struct io_ctl *io_ctl)
static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
{
- WARN_ON(io_ctl->cur);
BUG_ON(io_ctl->index >= io_ctl->num_pages);
io_ctl->page = io_ctl->pages[io_ctl->index++];
io_ctl->cur = kmap(io_ctl->page);
@@ -966,7 +965,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
block_group->key.offset)) {
ret = find_first_extent_bit(unpin, start,
&extent_start, &extent_end,
- EXTENT_DIRTY);
+ EXTENT_DIRTY, NULL);
if (ret) {
ret = 0;
break;
@@ -1250,18 +1249,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
* if previous extent entry covers the offset,
* we should return it instead of the bitmap entry
*/
- n = &entry->offset_index;
- while (1) {
- n = rb_prev(n);
- if (!n)
- break;
+ n = rb_prev(&entry->offset_index);
+ if (n) {
prev = rb_entry(n, struct btrfs_free_space,
offset_index);
- if (!prev->bitmap) {
- if (prev->offset + prev->bytes > offset)
- entry = prev;
- break;
- }
+ if (!prev->bitmap &&
+ prev->offset + prev->bytes > offset)
+ entry = prev;
}
}
return entry;
@@ -1287,18 +1281,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
}
if (entry->bitmap) {
- n = &entry->offset_index;
- while (1) {
- n = rb_prev(n);
- if (!n)
- break;
+ n = rb_prev(&entry->offset_index);
+ if (n) {
prev = rb_entry(n, struct btrfs_free_space,
offset_index);
- if (!prev->bitmap) {
- if (prev->offset + prev->bytes > offset)
- return prev;
- break;
- }
+ if (!prev->bitmap &&
+ prev->offset + prev->bytes > offset)
+ return prev;
}
if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
return entry;
@@ -1364,7 +1353,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
u64 bitmap_bytes;
u64 extent_bytes;
u64 size = block_group->key.offset;
- u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
+ u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
BUG_ON(ctl->total_bitmaps > max_bitmaps);
@@ -1454,9 +1443,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
max_t(u64, *offset, bitmap_info->offset));
bits = bytes_to_bits(*bytes, ctl->unit);
- for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
- i < BITS_PER_BITMAP;
- i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
+ for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
next_zero = find_next_zero_bit(bitmap_info->bitmap,
BITS_PER_BITMAP, i);
if ((next_zero - i) >= bits) {
@@ -1652,8 +1639,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
* some block groups are so tiny they can't be enveloped by a bitmap, so
* don't even bother to create a bitmap for this
*/
- if (BITS_PER_BITMAP * block_group->sectorsize >
- block_group->key.offset)
+ if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset)
return false;
return true;
@@ -1876,11 +1862,13 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *info;
- int ret = 0;
+ int ret;
+ bool re_search = false;
spin_lock(&ctl->tree_lock);
again:
+ ret = 0;
if (!bytes)
goto out_lock;
@@ -1893,17 +1881,17 @@ again:
info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1, 0);
if (!info) {
- /* the tree logging code might be calling us before we
- * have fully loaded the free space rbtree for this
- * block group. So it is possible the entry won't
- * be in the rbtree yet at all. The caching code
- * will make sure not to put it in the rbtree if
- * the logging code has pinned it.
+ /*
+ * If we found a partial bit of our free space in a
+ * bitmap but then couldn't find the other part this may
+ * be a problem, so WARN about it.
*/
+ WARN_ON(re_search);
goto out_lock;
}
}
+ re_search = false;
if (!info->bitmap) {
unlink_free_space(ctl, info);
if (offset == info->offset) {
@@ -1949,8 +1937,10 @@ again:
}
ret = remove_from_bitmap(ctl, info, &offset, &bytes);
- if (ret == -EAGAIN)
+ if (ret == -EAGAIN) {
+ re_search = true;
goto again;
+ }
BUG_ON(ret); /* logic error */
out_lock:
spin_unlock(&ctl->tree_lock);
@@ -1968,7 +1958,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
info = rb_entry(n, struct btrfs_free_space, offset_index);
- if (info->bytes >= bytes)
+ if (info->bytes >= bytes && !block_group->ro)
count++;
printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
(unsigned long long)info->offset,
@@ -2300,16 +2290,14 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
unsigned long total_found = 0;
int ret;
- i = offset_to_bit(entry->offset, block_group->sectorsize,
+ i = offset_to_bit(entry->offset, ctl->unit,
max_t(u64, offset, entry->offset));
- want_bits = bytes_to_bits(bytes, block_group->sectorsize);
- min_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
+ want_bits = bytes_to_bits(bytes, ctl->unit);
+ min_bits = bytes_to_bits(min_bytes, ctl->unit);
again:
found_bits = 0;
- for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
- i < BITS_PER_BITMAP;
- i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
+ for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
next_zero = find_next_zero_bit(entry->bitmap,
BITS_PER_BITMAP, i);
if (next_zero - i >= min_bits) {
@@ -2329,23 +2317,22 @@ again:
total_found += found_bits;
- if (cluster->max_size < found_bits * block_group->sectorsize)
- cluster->max_size = found_bits * block_group->sectorsize;
+ if (cluster->max_size < found_bits * ctl->unit)
+ cluster->max_size = found_bits * ctl->unit;
if (total_found < want_bits || cluster->max_size < cont1_bytes) {
i = next_zero + 1;
goto again;
}
- cluster->window_start = start * block_group->sectorsize +
- entry->offset;
+ cluster->window_start = start * ctl->unit + entry->offset;
rb_erase(&entry->offset_index, &ctl->free_space_offset);
ret = tree_insert_offset(&cluster->root, entry->offset,
&entry->offset_index, 1);
BUG_ON(ret); /* -EEXIST; Logic error */
trace_btrfs_setup_cluster(block_group, cluster,
- total_found * block_group->sectorsize, 1);
+ total_found * ctl->unit, 1);
return 0;
}
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index db2ff9773b9..1d982812ab6 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -24,4 +24,14 @@ static inline u64 btrfs_name_hash(const char *name, int len)
{
return crc32c((u32)~1, name, len);
}
+
+/*
+ * Figure the key offset of an extended inode ref
+ */
+static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
+ int len)
+{
+ return (u64) crc32c(parent_objectid, name, len);
+}
+
#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index a13cf1a96c7..48b8fda9313 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -18,6 +18,7 @@
#include "ctree.h"
#include "disk-io.h"
+#include "hash.h"
#include "transaction.h"
#include "print-tree.h"
@@ -50,18 +51,57 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
return 0;
}
-struct btrfs_inode_ref *
+int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid,
+ const char *name, int name_len,
+ struct btrfs_inode_extref **extref_ret)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_inode_extref *extref;
+ unsigned long ptr;
+ unsigned long name_ptr;
+ u32 item_size;
+ u32 cur_offset = 0;
+ int ref_name_len;
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+ /*
+ * Search all extended backrefs in this item. We're only
+ * looking through any collisions so most of the time this is
+ * just going to compare against one buffer. If all is well,
+ * we'll return success and the inode ref object.
+ */
+ while (cur_offset < item_size) {
+ extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
+ name_ptr = (unsigned long)(&extref->name);
+ ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+ if (ref_name_len == name_len &&
+ btrfs_inode_extref_parent(leaf, extref) == ref_objectid &&
+ (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) {
+ if (extref_ret)
+ *extref_ret = extref;
+ return 1;
+ }
+
+ cur_offset += ref_name_len + sizeof(*extref);
+ }
+ return 0;
+}
+
+static struct btrfs_inode_ref *
btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, int mod)
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int ins_len,
+ int cow)
{
+ int ret;
struct btrfs_key key;
struct btrfs_inode_ref *ref;
- int ins_len = mod < 0 ? -1 : 0;
- int cow = mod != 0;
- int ret;
key.objectid = inode_objectid;
key.type = BTRFS_INODE_REF_KEY;
@@ -77,13 +117,150 @@ btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
return ref;
}
-int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+/* Returns NULL if no extref found */
+struct btrfs_inode_extref *
+btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int ins_len,
+ int cow)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_inode_extref *extref;
+
+ key.objectid = inode_objectid;
+ key.type = BTRFS_INODE_EXTREF_KEY;
+ key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+ ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ return NULL;
+ if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref))
+ return NULL;
+ return extref;
+}
+
+int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int mod,
+ u64 *ret_index)
+{
+ struct btrfs_inode_ref *ref;
+ struct btrfs_inode_extref *extref;
+ int ins_len = mod < 0 ? -1 : 0;
+ int cow = mod != 0;
+
+ ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len,
+ inode_objectid, ref_objectid, ins_len,
+ cow);
+ if (IS_ERR(ref))
+ return PTR_ERR(ref);
+
+ if (ref != NULL) {
+ *ret_index = btrfs_inode_ref_index(path->nodes[0], ref);
+ return 0;
+ }
+
+ btrfs_release_path(path);
+
+ extref = btrfs_lookup_inode_extref(trans, root, path, name,
+ name_len, inode_objectid,
+ ref_objectid, ins_len, cow);
+ if (IS_ERR(extref))
+ return PTR_ERR(extref);
+
+ if (extref) {
+ *ret_index = btrfs_inode_extref_index(path->nodes[0], extref);
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
u64 inode_objectid, u64 ref_objectid, u64 *index)
{
struct btrfs_path *path;
struct btrfs_key key;
+ struct btrfs_inode_extref *extref;
+ struct extent_buffer *leaf;
+ int ret;
+ int del_len = name_len + sizeof(*extref);
+ unsigned long ptr;
+ unsigned long item_start;
+ u32 item_size;
+
+ key.objectid = inode_objectid;
+ btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+ key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->leave_spinning = 1;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret < 0)
+ goto out;
+
+ /*
+ * Sanity check - did we find the right item for this name?
+ * This should always succeed so error here will make the FS
+ * readonly.
+ */
+ if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
+ name, name_len, &extref)) {
+ btrfs_std_error(root->fs_info, -ENOENT);
+ ret = -EROFS;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ if (index)
+ *index = btrfs_inode_extref_index(leaf, extref);
+
+ if (del_len == item_size) {
+ /*
+ * Common case only one ref in the item, remove the
+ * whole item.
+ */
+ ret = btrfs_del_item(trans, root, path);
+ goto out;
+ }
+
+ ptr = (unsigned long)extref;
+ item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+ memmove_extent_buffer(leaf, ptr, ptr + del_len,
+ item_size - (ptr + del_len - item_start));
+
+ btrfs_truncate_item(trans, root, path, item_size - del_len, 1);
+
+out:
+ btrfs_free_path(path);
+
+ return ret;
+}
+
+int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 *index)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
struct btrfs_inode_ref *ref;
struct extent_buffer *leaf;
unsigned long ptr;
@@ -91,6 +268,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
u32 item_size;
u32 sub_item_len;
int ret;
+ int search_ext_refs = 0;
int del_len = name_len + sizeof(*ref);
key.objectid = inode_objectid;
@@ -106,12 +284,14 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
ret = -ENOENT;
+ search_ext_refs = 1;
goto out;
} else if (ret < 0) {
goto out;
}
if (!find_name_in_backref(path, name, name_len, &ref)) {
ret = -ENOENT;
+ search_ext_refs = 1;
goto out;
}
leaf = path->nodes[0];
@@ -129,8 +309,78 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_size - (ptr + sub_item_len - item_start));
- btrfs_truncate_item(trans, root, path,
- item_size - sub_item_len, 1);
+ btrfs_truncate_item(trans, root, path, item_size - sub_item_len, 1);
+out:
+ btrfs_free_path(path);
+
+ if (search_ext_refs) {
+ /*
+ * No refs were found, or we could not find the
+ * name in our ref array. Find and remove the extended
+ * inode ref then.
+ */
+ return btrfs_del_inode_extref(trans, root, name, name_len,
+ inode_objectid, ref_objectid, index);
+ }
+
+ return ret;
+}
+
+/*
+ * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree.
+ *
+ * The caller must have checked against BTRFS_LINK_MAX already.
+ */
+static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 index)
+{
+ struct btrfs_inode_extref *extref;
+ int ret;
+ int ins_len = name_len + sizeof(*extref);
+ unsigned long ptr;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_item *item;
+
+ key.objectid = inode_objectid;
+ key.type = BTRFS_INODE_EXTREF_KEY;
+ key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ ins_len);
+ if (ret == -EEXIST) {
+ if (btrfs_find_name_in_ext_backref(path, ref_objectid,
+ name, name_len, NULL))
+ goto out;
+
+ btrfs_extend_item(trans, root, path, ins_len);
+ ret = 0;
+ }
+ if (ret < 0)
+ goto out;
+
+ leaf = path->nodes[0];
+ item = btrfs_item_nr(leaf, path->slots[0]);
+ ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
+ ptr += btrfs_item_size(leaf, item) - ins_len;
+ extref = (struct btrfs_inode_extref *)ptr;
+
+ btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
+ btrfs_set_inode_extref_index(path->nodes[0], extref, index);
+ btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid);
+
+ ptr = (unsigned long)&extref->name;
+ write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+
out:
btrfs_free_path(path);
return ret;
@@ -191,6 +441,19 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
out:
btrfs_free_path(path);
+
+ if (ret == -EMLINK) {
+ struct btrfs_super_block *disk_super = root->fs_info->super_copy;
+ /* We ran out of space in the ref array. Need to
+ * add an extended ref. */
+ if (btrfs_super_incompat_flags(disk_super)
+ & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
+ ret = btrfs_insert_inode_extref(trans, root, name,
+ name_len,
+ inode_objectid,
+ ref_objectid, index);
+ }
+
return ret;
}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b1a1c929ba8..d26f67a59e3 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -434,8 +434,9 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
* 3 items for pre-allocation
*/
trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
- ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv,
- trans->bytes_reserved);
+ ret = btrfs_block_rsv_add(root, trans->block_rsv,
+ trans->bytes_reserved,
+ BTRFS_RESERVE_NO_FLUSH);
if (ret)
goto out;
trace_btrfs_space_reservation(root->fs_info, "ino_cache",
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a7d1921ac76..cc93b23ca35 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -71,6 +71,7 @@ static const struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;
static struct kmem_cache *btrfs_inode_cachep;
+static struct kmem_cache *btrfs_delalloc_work_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
@@ -87,15 +88,17 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
};
-static int btrfs_setsize(struct inode *inode, loff_t newsize);
+static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written, int unlock);
-static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode);
+static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
+ u64 len, u64 orig_start,
+ u64 block_start, u64 block_len,
+ u64 orig_block_len, int type);
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir,
@@ -230,7 +233,6 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
u64 inline_len = actual_end - start;
u64 aligned_end = (end + root->sectorsize - 1) &
~((u64)root->sectorsize - 1);
- u64 hint_byte;
u64 data_len = inline_len;
int ret;
@@ -247,8 +249,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
return 1;
}
- ret = btrfs_drop_extents(trans, inode, start, aligned_end,
- &hint_byte, 1);
+ ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1);
if (ret)
return ret;
@@ -324,7 +325,8 @@ static noinline int add_async_extent(struct async_cow *cow,
* If this code finds it can't get good compression, it puts an
* entry onto the work queue to write the uncompressed bytes. This
* makes sure that both compressed inodes and uncompressed inodes
- * are written in the same order that pdflush sent them down.
+ * are written in the same order that the flusher thread sent them
+ * down.
*/
static noinline int compress_file_range(struct inode *inode,
struct page *locked_page,
@@ -663,7 +665,7 @@ retry:
async_extent->compressed_size,
async_extent->compressed_size,
0, alloc_hint, &ins, 1);
- if (ret)
+ if (ret && ret != -ENOSPC)
btrfs_abort_transaction(trans, root, ret);
btrfs_end_transaction(trans, root);
}
@@ -701,14 +703,19 @@ retry:
em->block_start = ins.objectid;
em->block_len = ins.offset;
+ em->orig_block_len = ins.offset;
em->bdev = root->fs_info->fs_devices->latest_bdev;
em->compress_type = async_extent->compress_type;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->generation = -1;
while (1) {
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
+ if (!ret)
+ list_move(&em->list,
+ &em_tree->modified_extents);
write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
@@ -806,14 +813,14 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
* required to start IO on it. It may be clean and already done with
* IO when we return.
*/
-static noinline int cow_file_range(struct inode *inode,
- struct page *locked_page,
- u64 start, u64 end, int *page_started,
- unsigned long *nr_written,
- int unlock)
+static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct btrfs_root *root,
+ struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written,
+ int unlock)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_trans_handle *trans;
u64 alloc_hint = 0;
u64 num_bytes;
unsigned long ram_size;
@@ -825,26 +832,11 @@ static noinline int cow_file_range(struct inode *inode,
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
int ret = 0;
- BUG_ON(btrfs_is_free_space_inode(root, inode));
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans)) {
- extent_clear_unlock_delalloc(inode,
- &BTRFS_I(inode)->io_tree,
- start, end, locked_page,
- EXTENT_CLEAR_UNLOCK_PAGE |
- EXTENT_CLEAR_UNLOCK |
- EXTENT_CLEAR_DELALLOC |
- EXTENT_CLEAR_DIRTY |
- EXTENT_SET_WRITEBACK |
- EXTENT_END_WRITEBACK);
- return PTR_ERR(trans);
- }
- trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+ BUG_ON(btrfs_is_free_space_inode(inode));
num_bytes = (end - start + blocksize) & ~(blocksize - 1);
num_bytes = max(blocksize, num_bytes);
disk_num_bytes = num_bytes;
- ret = 0;
/* if this is a small write inside eof, kick off defrag */
if (num_bytes < 64 * 1024 &&
@@ -903,12 +895,17 @@ static noinline int cow_file_range(struct inode *inode,
em->block_start = ins.objectid;
em->block_len = ins.offset;
+ em->orig_block_len = ins.offset;
em->bdev = root->fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->generation = -1;
while (1) {
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
+ if (!ret)
+ list_move(&em->list,
+ &em_tree->modified_extents);
write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
@@ -955,11 +952,9 @@ static noinline int cow_file_range(struct inode *inode,
alloc_hint = ins.objectid + ins.offset;
start += cur_alloc_size;
}
- ret = 0;
out:
- btrfs_end_transaction(trans, root);
-
return ret;
+
out_unlock:
extent_clear_unlock_delalloc(inode,
&BTRFS_I(inode)->io_tree,
@@ -974,6 +969,39 @@ out_unlock:
goto out;
}
+static noinline int cow_file_range(struct inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written,
+ int unlock)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ extent_clear_unlock_delalloc(inode,
+ &BTRFS_I(inode)->io_tree,
+ start, end, locked_page,
+ EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_DIRTY |
+ EXTENT_SET_WRITEBACK |
+ EXTENT_END_WRITEBACK);
+ return PTR_ERR(trans);
+ }
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+ ret = __cow_file_range(trans, inode, root, locked_page, start, end,
+ page_started, nr_written, unlock);
+
+ btrfs_end_transaction(trans, root);
+
+ return ret;
+}
+
/*
* work queue call back to started compression on a file and pages
*/
@@ -1007,10 +1035,8 @@ static noinline void async_cow_submit(struct btrfs_work *work)
nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
PAGE_CACHE_SHIFT;
- atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
-
- if (atomic_read(&root->fs_info->async_delalloc_pages) <
- 5 * 1042 * 1024 &&
+ if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
+ 5 * 1024 * 1024 &&
waitqueue_active(&root->fs_info->async_submit_wait))
wake_up(&root->fs_info->async_submit_wait);
@@ -1035,7 +1061,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
struct btrfs_root *root = BTRFS_I(inode)->root;
unsigned long nr_pages;
u64 cur_end;
- int limit = 10 * 1024 * 1042;
+ int limit = 10 * 1024 * 1024;
clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1, 0, NULL, GFP_NOFS);
@@ -1131,6 +1157,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
u64 extent_offset;
u64 disk_bytenr;
u64 num_bytes;
+ u64 disk_num_bytes;
int extent_type;
int ret, err;
int type;
@@ -1153,7 +1180,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
return -ENOMEM;
}
- nolock = btrfs_is_free_space_inode(root, inode);
+ nolock = btrfs_is_free_space_inode(inode);
if (nolock)
trans = btrfs_join_transaction_nolock(root);
@@ -1233,6 +1260,8 @@ next_slot:
extent_offset = btrfs_file_extent_offset(leaf, fi);
extent_end = found_key.offset +
btrfs_file_extent_num_bytes(leaf, fi);
+ disk_num_bytes =
+ btrfs_file_extent_disk_num_bytes(leaf, fi);
if (extent_end <= start) {
path->slots[0]++;
goto next_slot;
@@ -1286,9 +1315,9 @@ out_check:
btrfs_release_path(path);
if (cow_start != (u64)-1) {
- ret = cow_file_range(inode, locked_page, cow_start,
- found_key.offset - 1, page_started,
- nr_written, 1);
+ ret = __cow_file_range(trans, inode, root, locked_page,
+ cow_start, found_key.offset - 1,
+ page_started, nr_written, 1);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto error;
@@ -1303,15 +1332,21 @@ out_check:
em = alloc_extent_map();
BUG_ON(!em); /* -ENOMEM */
em->start = cur_offset;
- em->orig_start = em->start;
+ em->orig_start = found_key.offset - extent_offset;
em->len = num_bytes;
em->block_len = num_bytes;
em->block_start = disk_bytenr;
+ em->orig_block_len = disk_num_bytes;
em->bdev = root->fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ set_bit(EXTENT_FLAG_FILLING, &em->flags);
+ em->generation = -1;
while (1) {
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
+ if (!ret)
+ list_move(&em->list,
+ &em_tree->modified_extents);
write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
@@ -1356,8 +1391,9 @@ out_check:
}
if (cow_start != (u64)-1) {
- ret = cow_file_range(inode, locked_page, cow_start, end,
- page_started, nr_written, 1);
+ ret = __cow_file_range(trans, inode, root, locked_page,
+ cow_start, end,
+ page_started, nr_written, 1);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto error;
@@ -1365,11 +1401,7 @@ out_check:
}
error:
- if (nolock) {
- err = btrfs_end_transaction_nolock(trans, root);
- } else {
- err = btrfs_end_transaction(trans, root);
- }
+ err = btrfs_end_transaction(trans, root);
if (!ret)
ret = err;
@@ -1466,7 +1498,7 @@ static void btrfs_set_bit_hook(struct inode *inode,
if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 len = state->end + 1 - state->start;
- bool do_list = !btrfs_is_free_space_inode(root, inode);
+ bool do_list = !btrfs_is_free_space_inode(inode);
if (*bits & EXTENT_FIRST_DELALLOC) {
*bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1501,7 +1533,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 len = state->end + 1 - state->start;
- bool do_list = !btrfs_is_free_space_inode(root, inode);
+ bool do_list = !btrfs_is_free_space_inode(inode);
if (*bits & EXTENT_FIRST_DELALLOC) {
*bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1539,7 +1571,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
unsigned long bio_flags)
{
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
- struct btrfs_mapping_tree *map_tree;
u64 logical = (u64)bio->bi_sector << 9;
u64 length = 0;
u64 map_length;
@@ -1549,11 +1580,10 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
return 0;
length = bio->bi_size;
- map_tree = &root->fs_info->mapping_tree;
map_length = length;
- ret = btrfs_map_block(map_tree, READ, logical,
+ ret = btrfs_map_block(root->fs_info, READ, logical,
&map_length, NULL, 0);
- /* Will always return 0 or 1 with map_multi == NULL */
+ /* Will always return 0 with map_multi == NULL */
BUG_ON(ret < 0);
if (map_length < length + size)
return 1;
@@ -1594,7 +1624,12 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
u64 bio_offset)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- return btrfs_map_bio(root, rw, bio, mirror_num, 1);
+ int ret;
+
+ ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
+ if (ret)
+ bio_endio(bio, ret);
+ return ret;
}
/*
@@ -1609,40 +1644,53 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
int ret = 0;
int skip_sum;
int metadata = 0;
+ int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
- if (btrfs_is_free_space_inode(root, inode))
+ if (btrfs_is_free_space_inode(inode))
metadata = 2;
if (!(rw & REQ_WRITE)) {
ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
if (ret)
- return ret;
+ goto out;
if (bio_flags & EXTENT_BIO_COMPRESSED) {
- return btrfs_submit_compressed_read(inode, bio,
- mirror_num, bio_flags);
+ ret = btrfs_submit_compressed_read(inode, bio,
+ mirror_num,
+ bio_flags);
+ goto out;
} else if (!skip_sum) {
ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
if (ret)
- return ret;
+ goto out;
}
goto mapit;
- } else if (!skip_sum) {
+ } else if (async && !skip_sum) {
/* csum items have already been cloned */
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
goto mapit;
/* we're doing a write, do the async checksumming */
- return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+ ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
inode, rw, bio, mirror_num,
bio_flags, bio_offset,
__btrfs_submit_bio_start,
__btrfs_submit_bio_done);
+ goto out;
+ } else if (!skip_sum) {
+ ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
+ if (ret)
+ goto out;
}
mapit:
- return btrfs_map_bio(root, rw, bio, mirror_num, 0);
+ ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+
+out:
+ if (ret < 0)
+ bio_endio(bio, ret);
+ return ret;
}
/*
@@ -1665,8 +1713,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
struct extent_state **cached_state)
{
- if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
- WARN_ON(1);
+ WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
cached_state, GFP_NOFS);
}
@@ -1786,7 +1833,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key ins;
- u64 hint;
int ret;
path = btrfs_alloc_path();
@@ -1804,8 +1850,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
* the caller is expected to unpin it and allow it to be merged
* with the others.
*/
- ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
- &hint, 0);
+ ret = btrfs_drop_extents(trans, root, inode, file_pos,
+ file_pos + num_bytes, 0);
if (ret)
goto out;
@@ -1829,10 +1875,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_encryption(leaf, fi, encryption);
btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
- btrfs_unlock_up_safe(path, 1);
- btrfs_set_lock_blocking(leaf);
-
btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
inode_add_bytes(inode, num_bytes);
@@ -1869,7 +1913,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
int ret;
bool nolock;
- nolock = btrfs_is_free_space_inode(root, inode);
+ nolock = btrfs_is_free_space_inode(inode);
if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
ret = -EIO;
@@ -1878,19 +1922,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
- ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
- if (!ret) {
- if (nolock)
- trans = btrfs_join_transaction_nolock(root);
- else
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- trans->block_rsv = &root->fs_info->delalloc_block_rsv;
- ret = btrfs_update_inode_fallback(trans, root, inode);
- if (ret) /* -ENOMEM or corruption */
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+ if (nolock)
+ trans = btrfs_join_transaction_nolock(root);
+ else
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
}
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+ ret = btrfs_update_inode_fallback(trans, root, inode);
+ if (ret) /* -ENOMEM or corruption */
+ btrfs_abort_transaction(trans, root, ret);
goto out;
}
@@ -1927,11 +1972,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->len,
compress_type, 0, 0,
BTRFS_FILE_EXTENT_REG);
- unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
- ordered_extent->file_offset,
- ordered_extent->len);
}
-
+ unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+ ordered_extent->file_offset, ordered_extent->len,
+ trans->transid);
if (ret < 0) {
btrfs_abort_transaction(trans, root, ret);
goto out_unlock;
@@ -1940,13 +1984,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
add_pending_csums(trans, inode, ordered_extent->file_offset,
&ordered_extent->list);
- ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
- if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
- ret = btrfs_update_inode_fallback(trans, root, inode);
- if (ret) { /* -ENOMEM or corruption */
- btrfs_abort_transaction(trans, root, ret);
- goto out_unlock;
- }
+ btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+ ret = btrfs_update_inode_fallback(trans, root, inode);
+ if (ret) { /* -ENOMEM or corruption */
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_unlock;
}
ret = 0;
out_unlock:
@@ -1956,12 +1998,8 @@ out_unlock:
out:
if (root != root->fs_info->tree_root)
btrfs_delalloc_release_metadata(inode, ordered_extent->len);
- if (trans) {
- if (nolock)
- btrfs_end_transaction_nolock(trans, root);
- else
- btrfs_end_transaction(trans, root);
- }
+ if (trans)
+ btrfs_end_transaction(trans, root);
if (ret)
clear_extent_uptodate(io_tree, ordered_extent->file_offset,
@@ -1969,8 +2007,8 @@ out:
ordered_extent->len - 1, NULL, GFP_NOFS);
/*
- * This needs to be dont to make sure anybody waiting knows we are done
- * upating everything for this ordered extent.
+ * This needs to be done to make sure anybody waiting knows we are done
+ * updating everything for this ordered extent.
*/
btrfs_remove_ordered_extent(inode, ordered_extent);
@@ -2007,7 +2045,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
ordered_extent->work.func = finish_ordered_fn;
ordered_extent->work.flags = 0;
- if (btrfs_is_free_space_inode(root, inode))
+ if (btrfs_is_free_space_inode(inode))
workers = &root->fs_info->endio_freespace_worker;
else
workers = &root->fs_info->endio_write_workers;
@@ -2117,7 +2155,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
if (empty)
return;
- down_read(&root->fs_info->cleanup_work_sem);
spin_lock(&fs_info->delayed_iput_lock);
list_splice_init(&fs_info->delayed_iputs, &list);
spin_unlock(&fs_info->delayed_iput_lock);
@@ -2128,7 +2165,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
iput(delayed->inode);
kfree(delayed);
}
- up_read(&root->fs_info->cleanup_work_sem);
}
enum btrfs_orphan_cleanup_state {
@@ -2196,7 +2232,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
int ret;
if (!root->orphan_block_rsv) {
- block_rsv = btrfs_alloc_block_rsv(root);
+ block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
if (!block_rsv)
return -ENOMEM;
}
@@ -2223,7 +2259,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
insert = 1;
#endif
insert = 1;
- atomic_dec(&root->orphan_inodes);
+ atomic_inc(&root->orphan_inodes);
}
if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
@@ -2442,6 +2478,18 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
continue;
}
nr_truncate++;
+
+ /* 1 for the orphan item deletion. */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ ret = btrfs_orphan_add(trans, inode);
+ btrfs_end_transaction(trans, root);
+ if (ret)
+ goto out;
+
ret = btrfs_truncate(inode);
} else {
nr_unlink++;
@@ -2570,8 +2618,8 @@ static void btrfs_read_locked_inode(struct inode *inode)
struct btrfs_inode_item);
inode->i_mode = btrfs_inode_mode(leaf, inode_item);
set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
- inode->i_uid = btrfs_inode_uid(leaf, inode_item);
- inode->i_gid = btrfs_inode_gid(leaf, inode_item);
+ i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
+ i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
tspec = btrfs_inode_atime(inode_item);
@@ -2588,6 +2636,18 @@ static void btrfs_read_locked_inode(struct inode *inode)
inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
+ BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
+
+ /*
+ * If we were modified in the current generation and evicted from memory
+ * and then re-read we need to do a full sync since we don't have any
+ * idea about which extents were modified before we were evicted from
+ * cache.
+ */
+ if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+
inode->i_version = btrfs_inode_sequence(leaf, inode_item);
inode->i_generation = BTRFS_I(inode)->generation;
inode->i_rdev = 0;
@@ -2649,8 +2709,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *item,
struct inode *inode)
{
- btrfs_set_inode_uid(leaf, item, inode->i_uid);
- btrfs_set_inode_gid(leaf, item, inode->i_gid);
+ btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
+ btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
btrfs_set_inode_mode(leaf, item, inode->i_mode);
btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
@@ -2732,8 +2792,10 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
* The data relocation inode should also be directly updated
* without delay
*/
- if (!btrfs_is_free_space_inode(root, inode)
+ if (!btrfs_is_free_space_inode(inode)
&& root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ btrfs_update_root_times(trans, root);
+
ret = btrfs_delayed_update_inode(trans, root, inode);
if (!ret)
btrfs_set_inode_last_trans(trans, inode);
@@ -2743,8 +2805,9 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
return btrfs_update_inode_item(trans, root, inode);
}
-static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode)
+noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode)
{
int ret;
@@ -2833,7 +2896,7 @@ err:
inode_inc_iversion(inode);
inode_inc_iversion(dir);
inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
- btrfs_update_inode(trans, root, dir);
+ ret = btrfs_update_inode(trans, root, dir);
out:
return ret;
}
@@ -2890,7 +2953,6 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_path *path;
- struct btrfs_inode_ref *ref;
struct btrfs_dir_item *di;
struct inode *inode = dentry->d_inode;
u64 index;
@@ -3004,17 +3066,17 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
}
btrfs_release_path(path);
- ref = btrfs_lookup_inode_ref(trans, root, path,
- dentry->d_name.name, dentry->d_name.len,
- ino, dir_ino, 0);
- if (IS_ERR(ref)) {
- err = PTR_ERR(ref);
+ ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name,
+ dentry->d_name.len, ino, dir_ino, 0,
+ &index);
+ if (ret) {
+ err = ret;
goto out;
}
- BUG_ON(!ref); /* Logic error */
+
if (check_path_shared(root, path))
goto out;
- index = btrfs_inode_ref_index(path->nodes[0], ref);
+
btrfs_release_path(path);
/*
@@ -3057,7 +3119,7 @@ out:
static void __unlink_end_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- if (trans->block_rsv == &root->fs_info->global_block_rsv) {
+ if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {
btrfs_block_rsv_release(root, trans->block_rsv,
trans->bytes_reserved);
trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -3073,7 +3135,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
struct btrfs_trans_handle *trans;
struct inode *inode = dentry->d_inode;
int ret;
- unsigned long nr = 0;
trans = __unlink_start_trans(dir, dentry);
if (IS_ERR(trans))
@@ -3093,9 +3154,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
}
out:
- nr = trans->blocks_used;
__unlink_end_trans(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
return ret;
}
@@ -3171,7 +3231,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_i_size_write(dir, dir->i_size - name_len * 2);
inode_inc_iversion(dir);
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
- ret = btrfs_update_inode(trans, root, dir);
+ ret = btrfs_update_inode_fallback(trans, root, dir);
if (ret)
btrfs_abort_transaction(trans, root, ret);
out:
@@ -3185,11 +3245,11 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
int err = 0;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
- unsigned long nr = 0;
- if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
- btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
+ if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
+ if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
+ return -EPERM;
trans = __unlink_start_trans(dir, dentry);
if (IS_ERR(trans))
@@ -3213,9 +3273,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
if (!err)
btrfs_i_size_write(inode, 0);
out:
- nr = trans->blocks_used;
__unlink_end_trans(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
return err;
}
@@ -3263,8 +3322,13 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
return -ENOMEM;
path->reada = -1;
+ /*
+ * We want to drop from the next block forward in case this new size is
+ * not block aligned since we will be keeping the last block of the
+ * extent just the way it is.
+ */
if (root->ref_cows || root == root->fs_info->tree_root)
- btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
+ btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0);
/*
* This function is also used to drop the items in the log tree before
@@ -3425,12 +3489,6 @@ delete:
if (path->slots[0] == 0 ||
path->slots[0] != pending_del_slot) {
- if (root->ref_cows &&
- BTRFS_I(inode)->location.objectid !=
- BTRFS_FREE_INO_OBJECTID) {
- err = -EAGAIN;
- goto out;
- }
if (pending_del_nr) {
ret = btrfs_del_items(trans, root, path,
pending_del_slot,
@@ -3461,12 +3519,20 @@ error:
}
/*
- * taken from block_truncate_page, but does cow as it zeros out
- * any bytes left in the last page in the file.
+ * btrfs_truncate_page - read, zero a chunk and write a page
+ * @inode - inode that we're zeroing
+ * @from - the offset to start zeroing
+ * @len - the length to zero, 0 to zero the entire range respective to the
+ * offset
+ * @front - zero up to the offset instead of from the offset on
+ *
+ * This will find the page for the "from" offset and cow the page and zero the
+ * part we want to zero. This is used with truncate and hole punching.
*/
-static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
+int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+ int front)
{
- struct inode *inode = mapping->host;
+ struct address_space *mapping = inode->i_mapping;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_ordered_extent *ordered;
@@ -3481,17 +3547,18 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
u64 page_start;
u64 page_end;
- if ((offset & (blocksize - 1)) == 0)
+ if ((offset & (blocksize - 1)) == 0 &&
+ (!len || ((len & (blocksize - 1)) == 0)))
goto out;
ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
if (ret)
goto out;
- ret = -ENOMEM;
again:
page = find_or_create_page(mapping, index, mask);
if (!page) {
btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+ ret = -ENOMEM;
goto out;
}
@@ -3528,7 +3595,8 @@ again:
}
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
- EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, &cached_state, GFP_NOFS);
ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
@@ -3539,10 +3607,14 @@ again:
goto out_unlock;
}
- ret = 0;
if (offset != PAGE_CACHE_SIZE) {
+ if (!len)
+ len = PAGE_CACHE_SIZE - offset;
kaddr = kmap(page);
- memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+ if (front)
+ memset(kaddr, 0, offset);
+ else
+ memset(kaddr + offset, 0, len);
flush_dcache_page(page);
kunmap(page);
}
@@ -3573,6 +3645,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
u64 mask = root->sectorsize - 1;
u64 hole_start = (oldsize + mask) & ~mask;
u64 block_end = (size + mask) & ~mask;
@@ -3604,12 +3677,13 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
block_end - cur_offset, 0);
if (IS_ERR(em)) {
err = PTR_ERR(em);
+ em = NULL;
break;
}
last_byte = min(extent_map_end(em), block_end);
last_byte = (last_byte + mask) & ~mask;
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
- u64 hint_byte = 0;
+ struct extent_map *hole_em;
hole_size = last_byte - cur_offset;
trans = btrfs_start_transaction(root, 3);
@@ -3618,9 +3692,9 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
break;
}
- err = btrfs_drop_extents(trans, inode, cur_offset,
- cur_offset + hole_size,
- &hint_byte, 1);
+ err = btrfs_drop_extents(trans, root, inode,
+ cur_offset,
+ cur_offset + hole_size, 1);
if (err) {
btrfs_abort_transaction(trans, root, err);
btrfs_end_transaction(trans, root);
@@ -3637,9 +3711,40 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
break;
}
- btrfs_drop_extent_cache(inode, hole_start,
- last_byte - 1, 0);
+ btrfs_drop_extent_cache(inode, cur_offset,
+ cur_offset + hole_size - 1, 0);
+ hole_em = alloc_extent_map();
+ if (!hole_em) {
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ goto next;
+ }
+ hole_em->start = cur_offset;
+ hole_em->len = hole_size;
+ hole_em->orig_start = cur_offset;
+
+ hole_em->block_start = EXTENT_MAP_HOLE;
+ hole_em->block_len = 0;
+ hole_em->orig_block_len = 0;
+ hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+ hole_em->compress_type = BTRFS_COMPRESS_NONE;
+ hole_em->generation = trans->transid;
+ while (1) {
+ write_lock(&em_tree->lock);
+ err = add_extent_mapping(em_tree, hole_em);
+ if (!err)
+ list_move(&hole_em->list,
+ &em_tree->modified_extents);
+ write_unlock(&em_tree->lock);
+ if (err != -EEXIST)
+ break;
+ btrfs_drop_extent_cache(inode, cur_offset,
+ cur_offset +
+ hole_size - 1, 0);
+ }
+ free_extent_map(hole_em);
+next:
btrfs_update_inode(trans, root, inode);
btrfs_end_transaction(trans, root);
}
@@ -3656,16 +3761,27 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
return err;
}
-static int btrfs_setsize(struct inode *inode, loff_t newsize)
+static int btrfs_setsize(struct inode *inode, struct iattr *attr)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
loff_t oldsize = i_size_read(inode);
+ loff_t newsize = attr->ia_size;
+ int mask = attr->ia_valid;
int ret;
if (newsize == oldsize)
return 0;
+ /*
+ * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
+ * special case where we need to update the times despite not having
+ * these flags set. For all other operations the VFS set these flags
+ * explicitly if it wants a timestamp update.
+ */
+ if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
+ inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
+
if (newsize > oldsize) {
truncate_pagecache(inode, oldsize, newsize);
ret = btrfs_cont_expand(inode, oldsize, newsize);
@@ -3691,9 +3807,34 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
&BTRFS_I(inode)->runtime_flags);
+ /*
+ * 1 for the orphan item we're going to add
+ * 1 for the orphan item deletion.
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ /*
+ * We need to do this in case we fail at _any_ point during the
+ * actual truncate. Once we do the truncate_setsize we could
+ * invalidate pages which forces any outstanding ordered io to
+ * be instantly completed which will give us extents that need
+ * to be truncated. If we fail to get an orphan inode down we
+ * could have left over extents that were never meant to live,
+ * so we need to garuntee from this point on that everything
+ * will be consistent.
+ */
+ ret = btrfs_orphan_add(trans, inode);
+ btrfs_end_transaction(trans, root);
+ if (ret)
+ return ret;
+
/* we don't support swapfiles, so vmtruncate shouldn't fail */
truncate_setsize(inode, newsize);
ret = btrfs_truncate(inode);
+ if (ret && inode->i_nlink)
+ btrfs_orphan_del(NULL, inode);
}
return ret;
@@ -3713,7 +3854,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
return err;
if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
- err = btrfs_setsize(inode, attr->ia_size);
+ err = btrfs_setsize(inode, attr);
if (err)
return err;
}
@@ -3736,14 +3877,13 @@ void btrfs_evict_inode(struct inode *inode)
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv, *global_rsv;
u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
- unsigned long nr;
int ret;
trace_btrfs_inode_evict(inode);
truncate_inode_pages(&inode->i_data, 0);
if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
- btrfs_is_free_space_inode(root, inode)))
+ btrfs_is_free_space_inode(inode)))
goto no_delete;
if (is_bad_inode(inode)) {
@@ -3764,29 +3904,26 @@ void btrfs_evict_inode(struct inode *inode)
goto no_delete;
}
- rsv = btrfs_alloc_block_rsv(root);
+ rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
if (!rsv) {
btrfs_orphan_del(NULL, inode);
goto no_delete;
}
rsv->size = min_size;
+ rsv->failfast = 1;
global_rsv = &root->fs_info->global_block_rsv;
btrfs_i_size_write(inode, 0);
/*
- * This is a bit simpler than btrfs_truncate since
- *
- * 1) We've already reserved our space for our orphan item in the
- * unlink.
- * 2) We're going to delete the inode item, so we don't need to update
- * it at all.
- *
- * So we just need to reserve some slack space in case we add bytes when
- * doing the truncate.
+ * This is a bit simpler than btrfs_truncate since we've already
+ * reserved our space for our orphan item in the unlink, so we just
+ * need to reserve some slack space in case we add bytes and update
+ * inode item when doing the truncate.
*/
while (1) {
- ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
+ ret = btrfs_block_rsv_refill(root, rsv, min_size,
+ BTRFS_RESERVE_FLUSH_LIMIT);
/*
* Try and steal from the global reserve since we will
@@ -3804,7 +3941,7 @@ void btrfs_evict_inode(struct inode *inode)
goto no_delete;
}
- trans = btrfs_start_transaction(root, 0);
+ trans = btrfs_start_transaction_lflush(root, 1);
if (IS_ERR(trans)) {
btrfs_orphan_del(NULL, inode);
btrfs_free_block_rsv(root, rsv);
@@ -3814,13 +3951,16 @@ void btrfs_evict_inode(struct inode *inode)
trans->block_rsv = rsv;
ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
- if (ret != -EAGAIN)
+ if (ret != -ENOSPC)
break;
- nr = trans->blocks_used;
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ ret = btrfs_update_inode(trans, root, inode);
+ BUG_ON(ret);
+
btrfs_end_transaction(trans, root);
trans = NULL;
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
}
btrfs_free_block_rsv(root, rsv);
@@ -3836,9 +3976,8 @@ void btrfs_evict_inode(struct inode *inode)
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
btrfs_return_ino(root, btrfs_ino(inode));
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
no_delete:
clear_inode(inode);
return;
@@ -4082,7 +4221,6 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
struct btrfs_iget_args *args = p;
inode->i_ino = args->ino;
BTRFS_I(inode)->root = args->root;
- btrfs_set_inode_space_info(args->root, inode);
return 0;
}
@@ -4173,16 +4311,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (dentry->d_name.len > BTRFS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
- if (unlikely(d_need_lookup(dentry))) {
- memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key));
- kfree(dentry->d_fsdata);
- dentry->d_fsdata = NULL;
- /* This thing is hashed, drop it for now */
- d_drop(dentry);
- } else {
- ret = btrfs_inode_by_name(dir, dentry, &location);
- }
-
+ ret = btrfs_inode_by_name(dir, dentry, &location);
if (ret < 0)
return ERR_PTR(ret);
@@ -4247,16 +4376,11 @@ static void btrfs_dentry_release(struct dentry *dentry)
}
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd)
+ unsigned int flags)
{
struct dentry *ret;
ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry);
- if (unlikely(d_need_lookup(dentry))) {
- spin_lock(&dentry->d_lock);
- dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
- spin_unlock(&dentry->d_lock);
- }
return ret;
}
@@ -4457,7 +4581,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
return 0;
- if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode))
+ if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode))
nolock = true;
if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -4467,10 +4591,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
- if (nolock)
- ret = btrfs_end_transaction_nolock(trans, root);
- else
- ret = btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans, root);
}
return ret;
}
@@ -4518,6 +4639,11 @@ int btrfs_dirty_inode(struct inode *inode)
static int btrfs_update_time(struct inode *inode, struct timespec *now,
int flags)
{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+
if (flags & S_VERSION)
inode_inc_iversion(inode);
if (flags & S_CTIME)
@@ -4662,7 +4788,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
BTRFS_I(inode)->root = root;
BTRFS_I(inode)->generation = trans->transid;
inode->i_generation = BTRFS_I(inode)->generation;
- btrfs_set_inode_space_info(root, inode);
+
+ /*
+ * We could have gotten an inode number from somebody who was fsynced
+ * and then removed in this same transaction, so let's just set full
+ * sync since it will be a full sync anyway and this will blow away the
+ * old info in the log.
+ */
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
if (S_ISDIR(mode))
owner = 0;
@@ -4673,6 +4806,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
key[0].offset = 0;
+ /*
+ * Start new inodes with an inode_ref. This is slightly more
+ * efficient for small numbers of hard links since they will
+ * be packed into one item. Extended refs will kick in if we
+ * add more hard links than can fit in the ref item.
+ */
key[1].objectid = objectid;
btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
key[1].offset = ref_objectid;
@@ -4690,6 +4829,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
+ memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
+ sizeof(*inode_item));
fill_inode_item(trans, path->nodes[0], inode_item, inode);
ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
@@ -4712,8 +4853,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
if (S_ISREG(mode)) {
if (btrfs_test_opt(root, NODATASUM))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
- if (btrfs_test_opt(root, NODATACOW) ||
- (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
+ if (btrfs_test_opt(root, NODATACOW))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
}
@@ -4723,6 +4863,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
trace_btrfs_inode_new(inode);
btrfs_set_inode_last_trans(trans, inode);
+ btrfs_update_root_times(trans, root);
+
return inode;
fail:
if (dir)
@@ -4777,7 +4919,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
ret = btrfs_insert_dir_item(trans, root, name, name_len,
parent_inode, &key,
btrfs_inode_type(inode), index);
- if (ret == -EEXIST)
+ if (ret == -EEXIST || ret == -EOVERFLOW)
goto fail_dir_item;
else if (ret) {
btrfs_abort_transaction(trans, root, ret);
@@ -4832,7 +4974,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
int err;
int drop_inode = 0;
u64 objectid;
- unsigned long nr = 0;
u64 index = 0;
if (!new_valid_dev(rdev))
@@ -4865,6 +5006,12 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
goto out_unlock;
}
+ err = btrfs_update_inode(trans, root, inode);
+ if (err) {
+ drop_inode = 1;
+ goto out_unlock;
+ }
+
/*
* If the active LSM wants to access the inode during
* d_instantiate it needs these. Smack checks to see
@@ -4882,9 +5029,8 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
d_instantiate(dentry, inode);
}
out_unlock:
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -4893,14 +5039,13 @@ out_unlock:
}
static int btrfs_create(struct inode *dir, struct dentry *dentry,
- umode_t mode, struct nameidata *nd)
+ umode_t mode, bool excl)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = NULL;
- int drop_inode = 0;
+ int drop_inode_on_err = 0;
int err;
- unsigned long nr = 0;
u64 objectid;
u64 index = 0;
@@ -4924,12 +5069,15 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
err = PTR_ERR(inode);
goto out_unlock;
}
+ drop_inode_on_err = 1;
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
- if (err) {
- drop_inode = 1;
+ if (err)
+ goto out_unlock;
+
+ err = btrfs_update_inode(trans, root, inode);
+ if (err)
goto out_unlock;
- }
/*
* If the active LSM wants to access the inode during
@@ -4942,21 +5090,20 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
if (err)
- drop_inode = 1;
- else {
- inode->i_mapping->a_ops = &btrfs_aops;
- inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
- d_instantiate(dentry, inode);
- }
+ goto out_unlock;
+
+ inode->i_mapping->a_ops = &btrfs_aops;
+ inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+ BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+ d_instantiate(dentry, inode);
+
out_unlock:
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
- if (drop_inode) {
+ if (err && drop_inode_on_err) {
inode_dec_link_count(inode);
iput(inode);
}
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
return err;
}
@@ -4967,7 +5114,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = old_dentry->d_inode;
u64 index;
- unsigned long nr = 0;
int err;
int drop_inode = 0;
@@ -4975,7 +5121,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
if (root->objectid != BTRFS_I(inode)->root->objectid)
return -EXDEV;
- if (inode->i_nlink == ~0U)
+ if (inode->i_nlink >= BTRFS_LINK_MAX)
return -EMLINK;
err = btrfs_set_inode_index(dir, &index);
@@ -4997,6 +5143,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
inode_inc_iversion(inode);
inode->i_ctime = CURRENT_TIME;
ihold(inode);
+ set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
@@ -5011,14 +5158,13 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
btrfs_log_new_name(trans, inode, NULL, parent);
}
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
fail:
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
}
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
return err;
}
@@ -5031,7 +5177,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
int drop_on_err = 0;
u64 objectid = 0;
u64 index = 0;
- unsigned long nr = 1;
/*
* 2 items for inode and ref
@@ -5077,11 +5222,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
drop_on_err = 0;
out_fail:
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
if (drop_on_err)
iput(inode);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
return err;
}
@@ -5275,6 +5419,7 @@ again:
if (start + len <= found_key.offset)
goto not_found;
em->start = start;
+ em->orig_start = start;
em->len = found_key.offset - start;
goto not_found_em;
}
@@ -5285,6 +5430,8 @@ again:
em->len = extent_end - extent_start;
em->orig_start = extent_start -
btrfs_file_extent_offset(leaf, item);
+ em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf,
+ item);
bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
if (bytenr == 0) {
em->block_start = EXTENT_MAP_HOLE;
@@ -5294,8 +5441,7 @@ again:
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
em->compress_type = compress_type;
em->block_start = bytenr;
- em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
- item);
+ em->block_len = em->orig_block_len;
} else {
bytenr += btrfs_file_extent_offset(leaf, item);
em->block_start = bytenr;
@@ -5325,7 +5471,8 @@ again:
em->start = extent_start + extent_offset;
em->len = (copy_size + root->sectorsize - 1) &
~((u64)root->sectorsize - 1);
- em->orig_start = EXTENT_MAP_INLINE;
+ em->orig_block_len = em->len;
+ em->orig_start = em->start;
if (compress_type) {
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
em->compress_type = compress_type;
@@ -5374,11 +5521,11 @@ again:
extent_map_end(em) - 1, NULL, GFP_NOFS);
goto insert;
} else {
- printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
- WARN_ON(1);
+ WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);
}
not_found:
em->start = start;
+ em->orig_start = start;
em->len = len;
not_found_em:
em->block_start = EXTENT_MAP_HOLE;
@@ -5439,7 +5586,8 @@ insert:
write_unlock(&em_tree->lock);
out:
- trace_btrfs_get_extent(root, em);
+ if (em)
+ trace_btrfs_get_extent(root, em);
if (path)
btrfs_free_path(path);
@@ -5473,10 +5621,13 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
return em;
if (em) {
/*
- * if our em maps to a hole, there might
- * actually be delalloc bytes behind it
+ * if our em maps to
+ * - a hole or
+ * - a pre-alloc extent,
+ * there might actually be delalloc bytes behind it.
*/
- if (em->block_start != EXTENT_MAP_HOLE)
+ if (em->block_start != EXTENT_MAP_HOLE &&
+ !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
return em;
else
hole_em = em;
@@ -5558,6 +5709,8 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
*/
em->block_start = hole_em->block_start;
em->block_len = hole_len;
+ if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
+ set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
} else {
em->start = range_start;
em->len = found;
@@ -5579,38 +5732,19 @@ out:
}
static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
- struct extent_map *em,
u64 start, u64 len)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
struct btrfs_key ins;
u64 alloc_hint;
int ret;
- bool insert = false;
-
- /*
- * Ok if the extent map we looked up is a hole and is for the exact
- * range we want, there is no reason to allocate a new one, however if
- * it is not right then we need to free this one and drop the cache for
- * our range.
- */
- if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
- em->len != len) {
- free_extent_map(em);
- em = NULL;
- insert = true;
- btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
- }
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return ERR_CAST(trans);
- if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024)
- btrfs_add_inode_defrag(trans, inode);
-
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
alloc_hint = get_extent_allocation_hint(inode, start, len);
@@ -5621,37 +5755,10 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
goto out;
}
- if (!em) {
- em = alloc_extent_map();
- if (!em) {
- em = ERR_PTR(-ENOMEM);
- goto out;
- }
- }
-
- em->start = start;
- em->orig_start = em->start;
- em->len = ins.offset;
-
- em->block_start = ins.objectid;
- em->block_len = ins.offset;
- em->bdev = root->fs_info->fs_devices->latest_bdev;
-
- /*
- * We need to do this because if we're using the original em we searched
- * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
- */
- em->flags = 0;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
-
- while (insert) {
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
- if (ret != -EEXIST)
- break;
- btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
- }
+ em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
+ ins.offset, ins.offset, 0);
+ if (IS_ERR(em))
+ goto out;
ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
ins.offset, ins.offset, 0);
@@ -5764,18 +5871,159 @@ out:
return ret;
}
+static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
+ struct extent_state **cached_state, int writing)
+{
+ struct btrfs_ordered_extent *ordered;
+ int ret = 0;
+
+ while (1) {
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ 0, cached_state);
+ /*
+ * We're concerned with the entire range that we're going to be
+ * doing DIO to, so we need to make sure theres no ordered
+ * extents in this range.
+ */
+ ordered = btrfs_lookup_ordered_range(inode, lockstart,
+ lockend - lockstart + 1);
+
+ /*
+ * We need to make sure there are no buffered pages in this
+ * range either, we could have raced between the invalidate in
+ * generic_file_direct_write and locking the extent. The
+ * invalidate needs to happen so that reads after a write do not
+ * get stale data.
+ */
+ if (!ordered && (!writing ||
+ !test_range_bit(&BTRFS_I(inode)->io_tree,
+ lockstart, lockend, EXTENT_UPTODATE, 0,
+ *cached_state)))
+ break;
+
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ cached_state, GFP_NOFS);
+
+ if (ordered) {
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ } else {
+ /* Screw you mmap */
+ ret = filemap_write_and_wait_range(inode->i_mapping,
+ lockstart,
+ lockend);
+ if (ret)
+ break;
+
+ /*
+ * If we found a page that couldn't be invalidated just
+ * fall back to buffered.
+ */
+ ret = invalidate_inode_pages2_range(inode->i_mapping,
+ lockstart >> PAGE_CACHE_SHIFT,
+ lockend >> PAGE_CACHE_SHIFT);
+ if (ret)
+ break;
+ }
+
+ cond_resched();
+ }
+
+ return ret;
+}
+
+static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
+ u64 len, u64 orig_start,
+ u64 block_start, u64 block_len,
+ u64 orig_block_len, int type)
+{
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ em_tree = &BTRFS_I(inode)->extent_tree;
+ em = alloc_extent_map();
+ if (!em)
+ return ERR_PTR(-ENOMEM);
+
+ em->start = start;
+ em->orig_start = orig_start;
+ em->len = len;
+ em->block_len = block_len;
+ em->block_start = block_start;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ em->orig_block_len = orig_block_len;
+ em->generation = -1;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ if (type == BTRFS_ORDERED_PREALLOC)
+ set_bit(EXTENT_FLAG_FILLING, &em->flags);
+
+ do {
+ btrfs_drop_extent_cache(inode, em->start,
+ em->start + em->len - 1, 0);
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ if (!ret)
+ list_move(&em->list,
+ &em_tree->modified_extents);
+ write_unlock(&em_tree->lock);
+ } while (ret == -EEXIST);
+
+ if (ret) {
+ free_extent_map(em);
+ return ERR_PTR(ret);
+ }
+
+ return em;
+}
+
+
static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
struct extent_map *em;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_state *cached_state = NULL;
u64 start = iblock << inode->i_blkbits;
+ u64 lockstart, lockend;
u64 len = bh_result->b_size;
struct btrfs_trans_handle *trans;
+ int unlock_bits = EXTENT_LOCKED;
+ int ret;
+
+ if (create) {
+ ret = btrfs_delalloc_reserve_space(inode, len);
+ if (ret)
+ return ret;
+ unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
+ } else {
+ len = min_t(u64, len, root->sectorsize);
+ }
+
+ lockstart = start;
+ lockend = start + len - 1;
+
+ /*
+ * If this errors out it's because we couldn't invalidate pagecache for
+ * this range and we need to fallback to buffered.
+ */
+ if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
+ return -ENOTBLK;
+
+ if (create) {
+ ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, EXTENT_DELALLOC, NULL,
+ &cached_state, GFP_NOFS);
+ if (ret)
+ goto unlock_err;
+ }
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
- if (IS_ERR(em))
- return PTR_ERR(em);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto unlock_err;
+ }
/*
* Ok for INLINE and COMPRESSED extents we need to fallback on buffered
@@ -5794,17 +6042,16 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
em->block_start == EXTENT_MAP_INLINE) {
free_extent_map(em);
- return -ENOTBLK;
+ ret = -ENOTBLK;
+ goto unlock_err;
}
/* Just a good old fashioned hole, return */
if (!create && (em->block_start == EXTENT_MAP_HOLE ||
test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
free_extent_map(em);
- /* DIO will do one hole at a time, so just unlock a sector */
- unlock_extent(&BTRFS_I(inode)->io_tree, start,
- start + root->sectorsize - 1);
- return 0;
+ ret = 0;
+ goto unlock_err;
}
/*
@@ -5817,8 +6064,9 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
*
*/
if (!create) {
- len = em->len - (start - em->start);
- goto map;
+ len = min(len, em->len - (start - em->start));
+ lockstart = start + len;
+ goto unlock;
}
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
@@ -5845,12 +6093,27 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
goto must_cow;
if (can_nocow_odirect(trans, inode, start, len) == 1) {
+ u64 orig_start = em->orig_start;
+ u64 orig_block_len = em->orig_block_len;
+
+ if (type == BTRFS_ORDERED_PREALLOC) {
+ free_extent_map(em);
+ em = create_pinned_em(inode, start, len,
+ orig_start,
+ block_start, len,
+ orig_block_len, type);
+ if (IS_ERR(em)) {
+ btrfs_end_transaction(trans, root);
+ goto unlock_err;
+ }
+ }
+
ret = btrfs_add_ordered_extent_dio(inode, start,
block_start, len, len, type);
btrfs_end_transaction(trans, root);
if (ret) {
free_extent_map(em);
- return ret;
+ goto unlock_err;
}
goto unlock;
}
@@ -5862,15 +6125,14 @@ must_cow:
* it above
*/
len = bh_result->b_size;
- em = btrfs_new_extent_direct(inode, em, start, len);
- if (IS_ERR(em))
- return PTR_ERR(em);
+ free_extent_map(em);
+ em = btrfs_new_extent_direct(inode, start, len);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto unlock_err;
+ }
len = min(len, em->len - (start - em->start));
unlock:
- clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
- EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
- 0, NULL, GFP_NOFS);
-map:
bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
inode->i_blkbits;
bh_result->b_size = len;
@@ -5888,9 +6150,45 @@ map:
i_size_write(inode, start + len);
}
+ /*
+ * In the case of write we need to clear and unlock the entire range,
+ * in the case of read we need to unlock only the end area that we
+ * aren't using if there is any left over space.
+ */
+ if (lockstart < lockend) {
+ if (create && len < lockend - lockstart) {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockstart + len - 1,
+ unlock_bits | EXTENT_DEFRAG, 1, 0,
+ &cached_state, GFP_NOFS);
+ /*
+ * Beside unlock, we also need to cleanup reserved space
+ * for the left range by attaching EXTENT_DO_ACCOUNTING.
+ */
+ clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ lockstart + len, lockend,
+ unlock_bits | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS);
+ } else {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, unlock_bits, 1, 0,
+ &cached_state, GFP_NOFS);
+ }
+ } else {
+ free_extent_state(cached_state);
+ }
+
free_extent_map(em);
return 0;
+
+unlock_err:
+ if (create)
+ unlock_bits |= EXTENT_DO_ACCOUNTING;
+
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+ return ret;
}
struct btrfs_dio_private {
@@ -5898,7 +6196,6 @@ struct btrfs_dio_private {
u64 logical_offset;
u64 disk_bytenr;
u64 bytes;
- u32 *csums;
void *private;
/* number of bios pending for this dio */
@@ -5918,7 +6215,6 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
struct inode *inode = dip->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 start;
- u32 *private = dip->csums;
start = dip->logical_offset;
do {
@@ -5926,8 +6222,12 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
struct page *page = bvec->bv_page;
char *kaddr;
u32 csum = ~(u32)0;
+ u64 private = ~(u32)0;
unsigned long flags;
+ if (get_state_private(&BTRFS_I(inode)->io_tree,
+ start, &private))
+ goto failed;
local_irq_save(flags);
kaddr = kmap_atomic(page);
csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
@@ -5937,18 +6237,18 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
local_irq_restore(flags);
flush_dcache_page(bvec->bv_page);
- if (csum != *private) {
+ if (csum != private) {
+failed:
printk(KERN_ERR "btrfs csum failed ino %llu off"
" %llu csum %u private %u\n",
(unsigned long long)btrfs_ino(inode),
(unsigned long long)start,
- csum, *private);
+ csum, (unsigned)private);
err = -EIO;
}
}
start += bvec->bv_len;
- private++;
bvec++;
} while (bvec <= bvec_end);
@@ -5956,7 +6256,6 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
dip->logical_offset + dip->bytes - 1);
bio->bi_private = dip->private;
- kfree(dip->csums);
kfree(dip);
/* If we had a csum failure make sure to clear the uptodate flag */
@@ -6062,12 +6361,15 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
int rw, u64 file_offset, int skip_sum,
- u32 *csums, int async_submit)
+ int async_submit)
{
int write = rw & REQ_WRITE;
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
+ if (async_submit)
+ async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
+
bio_get(bio);
if (!write) {
@@ -6095,8 +6397,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
if (ret)
goto err;
} else if (!skip_sum) {
- ret = btrfs_lookup_bio_sums_dio(root, inode, bio,
- file_offset, csums);
+ ret = btrfs_lookup_bio_sums_dio(root, inode, bio, file_offset);
if (ret)
goto err;
}
@@ -6113,7 +6414,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
{
struct inode *inode = dip->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
struct bio *bio;
struct bio *orig_bio = dip->orig_bio;
struct bio_vec *bvec = orig_bio->bi_io_vec;
@@ -6122,13 +6422,11 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
u64 submit_len = 0;
u64 map_length;
int nr_pages = 0;
- u32 *csums = dip->csums;
int ret = 0;
int async_submit = 0;
- int write = rw & REQ_WRITE;
map_length = orig_bio->bi_size;
- ret = btrfs_map_block(map_tree, READ, start_sector << 9,
+ ret = btrfs_map_block(root->fs_info, READ, start_sector << 9,
&map_length, NULL, 0);
if (ret) {
bio_put(orig_bio);
@@ -6161,16 +6459,13 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
atomic_inc(&dip->pending_bios);
ret = __btrfs_submit_dio_bio(bio, inode, rw,
file_offset, skip_sum,
- csums, async_submit);
+ async_submit);
if (ret) {
bio_put(bio);
atomic_dec(&dip->pending_bios);
goto out_err;
}
- /* Write's use the ordered csums */
- if (!write && !skip_sum)
- csums = csums + nr_pages;
start_sector += submit_len >> 9;
file_offset += submit_len;
@@ -6185,7 +6480,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
bio->bi_end_io = btrfs_end_dio_bio;
map_length = orig_bio->bi_size;
- ret = btrfs_map_block(map_tree, READ, start_sector << 9,
+ ret = btrfs_map_block(root->fs_info, READ,
+ start_sector << 9,
&map_length, NULL, 0);
if (ret) {
bio_put(bio);
@@ -6200,7 +6496,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
submit:
ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
- csums, async_submit);
+ async_submit);
if (!ret)
return 0;
@@ -6236,17 +6532,6 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
ret = -ENOMEM;
goto free_ordered;
}
- dip->csums = NULL;
-
- /* Write's use the ordered csum stuff, so we don't need dip->csums */
- if (!write && !skip_sum) {
- dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
- if (!dip->csums) {
- kfree(dip);
- ret = -ENOMEM;
- goto free_ordered;
- }
- }
dip->private = bio->bi_private;
dip->inode = inode;
@@ -6331,137 +6616,35 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
out:
return retval;
}
+
static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset,
unsigned long nr_segs)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- struct btrfs_ordered_extent *ordered;
- struct extent_state *cached_state = NULL;
- u64 lockstart, lockend;
- ssize_t ret;
- int writing = rw & WRITE;
- int write_bits = 0;
- size_t count = iov_length(iov, nr_segs);
if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
- offset, nr_segs)) {
+ offset, nr_segs))
return 0;
- }
- lockstart = offset;
- lockend = offset + count - 1;
-
- if (writing) {
- ret = btrfs_delalloc_reserve_space(inode, count);
- if (ret)
- goto out;
- }
-
- while (1) {
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- 0, &cached_state);
- /*
- * We're concerned with the entire range that we're going to be
- * doing DIO to, so we need to make sure theres no ordered
- * extents in this range.
- */
- ordered = btrfs_lookup_ordered_range(inode, lockstart,
- lockend - lockstart + 1);
-
- /*
- * We need to make sure there are no buffered pages in this
- * range either, we could have raced between the invalidate in
- * generic_file_direct_write and locking the extent. The
- * invalidate needs to happen so that reads after a write do not
- * get stale data.
- */
- if (!ordered && (!writing ||
- !test_range_bit(&BTRFS_I(inode)->io_tree,
- lockstart, lockend, EXTENT_UPTODATE, 0,
- cached_state)))
- break;
-
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state, GFP_NOFS);
-
- if (ordered) {
- btrfs_start_ordered_extent(inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
- } else {
- /* Screw you mmap */
- ret = filemap_write_and_wait_range(file->f_mapping,
- lockstart,
- lockend);
- if (ret)
- goto out;
-
- /*
- * If we found a page that couldn't be invalidated just
- * fall back to buffered.
- */
- ret = invalidate_inode_pages2_range(file->f_mapping,
- lockstart >> PAGE_CACHE_SHIFT,
- lockend >> PAGE_CACHE_SHIFT);
- if (ret) {
- if (ret == -EBUSY)
- ret = 0;
- goto out;
- }
- }
-
- cond_resched();
- }
-
- /*
- * we don't use btrfs_set_extent_delalloc because we don't want
- * the dirty or uptodate bits
- */
- if (writing) {
- write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
- ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- EXTENT_DELALLOC, NULL, &cached_state,
- GFP_NOFS);
- if (ret) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, EXTENT_LOCKED | write_bits,
- 1, 0, &cached_state, GFP_NOFS);
- goto out;
- }
- }
-
- free_extent_state(cached_state);
- cached_state = NULL;
-
- ret = __blockdev_direct_IO(rw, iocb, inode,
+ return __blockdev_direct_IO(rw, iocb, inode,
BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
btrfs_submit_direct, 0);
-
- if (ret < 0 && ret != -EIOCBQUEUED) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
- offset + iov_length(iov, nr_segs) - 1,
- EXTENT_LOCKED | write_bits, 1, 0,
- &cached_state, GFP_NOFS);
- } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
- /*
- * We're falling back to buffered, unlock the section we didn't
- * do IO on.
- */
- clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
- offset + iov_length(iov, nr_segs) - 1,
- EXTENT_LOCKED | write_bits, 1, 0,
- &cached_state, GFP_NOFS);
- }
-out:
- free_extent_state(cached_state);
- return ret;
}
+#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
+
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len)
{
+ int ret;
+
+ ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
+ if (ret)
+ return ret;
+
return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
}
@@ -6561,8 +6744,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
*/
clear_extent_bit(tree, page_start, page_end,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
- &cached_state, GFP_NOFS);
+ EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS);
/*
* whoever cleared the private bit is responsible
* for the finish_ordered_io
@@ -6578,7 +6761,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
}
clear_extent_bit(tree, page_start, page_end,
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
+ &cached_state, GFP_NOFS);
__btrfs_releasepage(page, GFP_NOFS);
ClearPageChecked(page);
@@ -6620,6 +6804,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
u64 page_start;
u64 page_end;
+ sb_start_pagefault(inode->i_sb);
ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
if (!ret) {
ret = file_update_time(vma->vm_file);
@@ -6674,7 +6859,8 @@ again:
* prepare_pages in the normal write path.
*/
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
- EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, &cached_state, GFP_NOFS);
ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
@@ -6705,16 +6891,20 @@ again:
BTRFS_I(inode)->last_trans = root->fs_info->generation;
BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+ BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
out_unlock:
- if (!ret)
+ if (!ret) {
+ sb_end_pagefault(inode->i_sb);
return VM_FAULT_LOCKED;
+ }
unlock_page(page);
out:
btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
out_noreserve:
+ sb_end_pagefault(inode->i_sb);
return ret;
}
@@ -6725,11 +6915,10 @@ static int btrfs_truncate(struct inode *inode)
int ret;
int err = 0;
struct btrfs_trans_handle *trans;
- unsigned long nr;
u64 mask = root->sectorsize - 1;
u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
- ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
+ ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
if (ret)
return ret;
@@ -6772,18 +6961,17 @@ static int btrfs_truncate(struct inode *inode)
* 3) fs_info->trans_block_rsv - this will have 1 items worth left for
* updating the inode.
*/
- rsv = btrfs_alloc_block_rsv(root);
+ rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
if (!rsv)
return -ENOMEM;
rsv->size = min_size;
+ rsv->failfast = 1;
/*
* 1 for the truncate slack space
- * 1 for the orphan item we're going to add
- * 1 for the orphan item deletion
* 1 for updating the inode.
*/
- trans = btrfs_start_transaction(root, 4);
+ trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto out;
@@ -6794,12 +6982,6 @@ static int btrfs_truncate(struct inode *inode)
min_size);
BUG_ON(ret);
- ret = btrfs_orphan_add(trans, inode);
- if (ret) {
- btrfs_end_transaction(trans, root);
- goto out;
- }
-
/*
* setattr is responsible for setting the ordered_data_close flag,
* but that is only tested during the last file release. That
@@ -6821,36 +7003,21 @@ static int btrfs_truncate(struct inode *inode)
&BTRFS_I(inode)->runtime_flags))
btrfs_add_ordered_operation(trans, root, inode);
- while (1) {
- ret = btrfs_block_rsv_refill(root, rsv, min_size);
- if (ret) {
- /*
- * This can only happen with the original transaction we
- * started above, every other time we shouldn't have a
- * transaction started yet.
- */
- if (ret == -EAGAIN)
- goto end_trans;
- err = ret;
- break;
- }
-
- if (!trans) {
- /* Just need the 1 for updating the inode */
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- ret = err = PTR_ERR(trans);
- trans = NULL;
- break;
- }
- }
-
- trans->block_rsv = rsv;
+ /*
+ * So if we truncate and then write and fsync we normally would just
+ * write the extents that changed, which is a problem if we need to
+ * first truncate that entire inode. So set this flag so we write out
+ * all of the extents in the inode to the sync log so we're completely
+ * safe.
+ */
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+ trans->block_rsv = rsv;
+ while (1) {
ret = btrfs_truncate_inode_items(trans, root, inode,
inode->i_size,
BTRFS_EXTENT_DATA_KEY);
- if (ret != -EAGAIN) {
+ if (ret != -ENOSPC) {
err = ret;
break;
}
@@ -6861,11 +7028,21 @@ static int btrfs_truncate(struct inode *inode)
err = ret;
break;
}
-end_trans:
- nr = trans->blocks_used;
+
btrfs_end_transaction(trans, root);
- trans = NULL;
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
+
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ ret = err = PTR_ERR(trans);
+ trans = NULL;
+ break;
+ }
+
+ ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
+ rsv, min_size);
+ BUG_ON(ret); /* shouldn't happen */
+ trans->block_rsv = rsv;
}
if (ret == 0 && inode->i_nlink > 0) {
@@ -6873,12 +7050,6 @@ end_trans:
ret = btrfs_orphan_del(trans, inode);
if (ret)
err = ret;
- } else if (ret && inode->i_nlink > 0) {
- /*
- * Failed to do the truncate, remove us from the in memory
- * orphan list.
- */
- ret = btrfs_orphan_del(NULL, inode);
}
if (trans) {
@@ -6887,9 +7058,8 @@ end_trans:
if (ret && !err)
err = ret;
- nr = trans->blocks_used;
ret = btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
}
out:
@@ -6939,7 +7109,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
return NULL;
ei->root = NULL;
- ei->space_info = NULL;
ei->generation = 0;
ei->last_trans = 0;
ei->last_sub_trans = 0;
@@ -6950,6 +7119,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->csum_bytes = 0;
ei->index_cnt = (u64)-1;
ei->last_unlink_trans = 0;
+ ei->last_log_commit = 0;
spin_lock_init(&ei->lock);
ei->outstanding_extents = 0;
@@ -6966,6 +7136,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
ei->io_tree.track_uptodate = 1;
ei->io_failure_tree.track_uptodate = 1;
+ atomic_set(&ei->sync_writers, 0);
mutex_init(&ei->log_mutex);
mutex_init(&ei->delalloc_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
@@ -6987,7 +7158,7 @@ void btrfs_destroy_inode(struct inode *inode)
struct btrfs_ordered_extent *ordered;
struct btrfs_root *root = BTRFS_I(inode)->root;
- WARN_ON(!list_empty(&inode->i_dentry));
+ WARN_ON(!hlist_empty(&inode->i_dentry));
WARN_ON(inode->i_data.nrpages);
WARN_ON(BTRFS_I(inode)->outstanding_extents);
WARN_ON(BTRFS_I(inode)->reserved_extents);
@@ -7046,7 +7217,7 @@ int btrfs_drop_inode(struct inode *inode)
struct btrfs_root *root = BTRFS_I(inode)->root;
if (btrfs_root_refs(&root->root_item) == 0 &&
- !btrfs_is_free_space_inode(root, inode))
+ !btrfs_is_free_space_inode(inode))
return 1;
else
return generic_drop_inode(inode);
@@ -7061,6 +7232,11 @@ static void init_once(void *foo)
void btrfs_destroy_cachep(void)
{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
if (btrfs_inode_cachep)
kmem_cache_destroy(btrfs_inode_cachep);
if (btrfs_trans_handle_cachep)
@@ -7071,40 +7247,49 @@ void btrfs_destroy_cachep(void)
kmem_cache_destroy(btrfs_path_cachep);
if (btrfs_free_space_cachep)
kmem_cache_destroy(btrfs_free_space_cachep);
+ if (btrfs_delalloc_work_cachep)
+ kmem_cache_destroy(btrfs_delalloc_work_cachep);
}
int btrfs_init_cachep(void)
{
- btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
+ btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
sizeof(struct btrfs_inode), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
if (!btrfs_inode_cachep)
goto fail;
- btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
+ btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
sizeof(struct btrfs_trans_handle), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!btrfs_trans_handle_cachep)
goto fail;
- btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
+ btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
sizeof(struct btrfs_transaction), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!btrfs_transaction_cachep)
goto fail;
- btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
+ btrfs_path_cachep = kmem_cache_create("btrfs_path",
sizeof(struct btrfs_path), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!btrfs_path_cachep)
goto fail;
- btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache",
+ btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
sizeof(struct btrfs_free_space), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!btrfs_free_space_cachep)
goto fail;
+ btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
+ sizeof(struct btrfs_delalloc_work), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!btrfs_delalloc_work_cachep)
+ goto fail;
+
return 0;
fail:
btrfs_destroy_cachep();
@@ -7176,6 +7361,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (S_ISDIR(old_inode->i_mode) && new_inode &&
new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
+
+
+ /* check for collisions, even if the name isn't there */
+ ret = btrfs_check_dir_item_collision(root, new_dir->i_ino,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len);
+
+ if (ret) {
+ if (ret == -EEXIST) {
+ /* we shouldn't get
+ * eexist without a new_inode */
+ if (!new_inode) {
+ WARN_ON(1);
+ return ret;
+ }
+ } else {
+ /* maybe -EOVERFLOW */
+ return ret;
+ }
+ }
+ ret = 0;
+
/*
* we're using rename to replace one file with another.
* and the replacement file is large. Start IO on it now so
@@ -7315,39 +7522,110 @@ out_notrans:
return ret;
}
+static void btrfs_run_delalloc_work(struct btrfs_work *work)
+{
+ struct btrfs_delalloc_work *delalloc_work;
+
+ delalloc_work = container_of(work, struct btrfs_delalloc_work,
+ work);
+ if (delalloc_work->wait)
+ btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1);
+ else
+ filemap_flush(delalloc_work->inode->i_mapping);
+
+ if (delalloc_work->delay_iput)
+ btrfs_add_delayed_iput(delalloc_work->inode);
+ else
+ iput(delalloc_work->inode);
+ complete(&delalloc_work->completion);
+}
+
+struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
+ int wait, int delay_iput)
+{
+ struct btrfs_delalloc_work *work;
+
+ work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
+ if (!work)
+ return NULL;
+
+ init_completion(&work->completion);
+ INIT_LIST_HEAD(&work->list);
+ work->inode = inode;
+ work->wait = wait;
+ work->delay_iput = delay_iput;
+ work->work.func = btrfs_run_delalloc_work;
+
+ return work;
+}
+
+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
+{
+ wait_for_completion(&work->completion);
+ kmem_cache_free(btrfs_delalloc_work_cachep, work);
+}
+
/*
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
{
- struct list_head *head = &root->fs_info->delalloc_inodes;
struct btrfs_inode *binode;
struct inode *inode;
+ struct btrfs_delalloc_work *work, *next;
+ struct list_head works;
+ struct list_head splice;
+ int ret = 0;
if (root->fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
+ INIT_LIST_HEAD(&works);
+ INIT_LIST_HEAD(&splice);
+again:
spin_lock(&root->fs_info->delalloc_lock);
- while (!list_empty(head)) {
- binode = list_entry(head->next, struct btrfs_inode,
+ list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+ while (!list_empty(&splice)) {
+ binode = list_entry(splice.next, struct btrfs_inode,
delalloc_inodes);
+
+ list_del_init(&binode->delalloc_inodes);
+
inode = igrab(&binode->vfs_inode);
if (!inode)
- list_del_init(&binode->delalloc_inodes);
+ continue;
+
+ list_add_tail(&binode->delalloc_inodes,
+ &root->fs_info->delalloc_inodes);
spin_unlock(&root->fs_info->delalloc_lock);
- if (inode) {
- filemap_flush(inode->i_mapping);
- if (delay_iput)
- btrfs_add_delayed_iput(inode);
- else
- iput(inode);
+
+ work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+ if (unlikely(!work)) {
+ ret = -ENOMEM;
+ goto out;
}
+ list_add_tail(&work->list, &works);
+ btrfs_queue_worker(&root->fs_info->flush_workers,
+ &work->work);
+
cond_resched();
spin_lock(&root->fs_info->delalloc_lock);
}
spin_unlock(&root->fs_info->delalloc_lock);
+ list_for_each_entry_safe(work, next, &works, list) {
+ list_del_init(&work->list);
+ btrfs_wait_and_free_delalloc_work(work);
+ }
+
+ spin_lock(&root->fs_info->delalloc_lock);
+ if (!list_empty(&root->fs_info->delalloc_inodes)) {
+ spin_unlock(&root->fs_info->delalloc_lock);
+ goto again;
+ }
+ spin_unlock(&root->fs_info->delalloc_lock);
+
/* the filemap_flush will queue IO into the worker threads, but
* we have to make sure the IO is actually started and that
* ordered extents get created before we return
@@ -7361,6 +7639,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
}
atomic_dec(&root->fs_info->async_submit_draining);
return 0;
+out:
+ list_for_each_entry_safe(work, next, &works, list) {
+ list_del_init(&work->list);
+ btrfs_wait_and_free_delalloc_work(work);
+ }
+
+ if (!list_empty_careful(&splice)) {
+ spin_lock(&root->fs_info->delalloc_lock);
+ list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
+ spin_unlock(&root->fs_info->delalloc_lock);
+ }
+ return ret;
}
static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
@@ -7380,7 +7670,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
unsigned long ptr;
struct btrfs_file_extent_item *ei;
struct extent_buffer *leaf;
- unsigned long nr = 0;
name_len = strlen(symname) + 1;
if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
@@ -7478,13 +7767,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
out_unlock:
if (!err)
d_instantiate(dentry, inode);
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
}
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
return err;
}
@@ -7493,6 +7781,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
loff_t actual_len, u64 *alloc_hint,
struct btrfs_trans_handle *trans)
{
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key ins;
u64 cur_offset = start;
@@ -7533,6 +7823,38 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
btrfs_drop_extent_cache(inode, cur_offset,
cur_offset + ins.offset -1, 0);
+ em = alloc_extent_map();
+ if (!em) {
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ goto next;
+ }
+
+ em->start = cur_offset;
+ em->orig_start = cur_offset;
+ em->len = ins.offset;
+ em->block_start = ins.objectid;
+ em->block_len = ins.offset;
+ em->orig_block_len = ins.offset;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ em->generation = trans->transid;
+
+ while (1) {
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ if (!ret)
+ list_move(&em->list,
+ &em_tree->modified_extents);
+ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST)
+ break;
+ btrfs_drop_extent_cache(inode, cur_offset,
+ cur_offset + ins.offset - 1,
+ 0);
+ }
+ free_extent_map(em);
+next:
num_bytes -= ins.offset;
cur_offset += ins.offset;
*alloc_hint = ins.objectid + ins.offset;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0e92e576300..338f2597bf7 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -41,6 +41,7 @@
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
+#include <linux/uuid.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
@@ -53,6 +54,8 @@
#include "inode-map.h"
#include "backref.h"
#include "rcu-string.h"
+#include "send.h"
+#include "dev-replace.h"
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -138,8 +141,11 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
}
- if (flags & BTRFS_INODE_NODATACOW)
+ if (flags & BTRFS_INODE_NODATACOW) {
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
+ if (S_ISREG(inode->i_mode))
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+ }
btrfs_update_iflags(inode);
}
@@ -179,6 +185,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
int ret;
u64 ip_oldflags;
unsigned int i_oldflags;
+ umode_t mode;
if (btrfs_root_readonly(root))
return -EROFS;
@@ -193,10 +200,15 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
if (!inode_owner_or_capable(inode))
return -EACCES;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
mutex_lock(&inode->i_mutex);
ip_oldflags = ip->flags;
i_oldflags = inode->i_flags;
+ mode = inode->i_mode;
flags = btrfs_mask_flags(inode->i_mode, flags);
oldflags = btrfs_flags_to_ioctl(ip->flags);
@@ -207,10 +219,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
}
}
- ret = mnt_want_write_file(file);
- if (ret)
- goto out_unlock;
-
if (flags & FS_SYNC_FL)
ip->flags |= BTRFS_INODE_SYNC;
else
@@ -235,10 +243,31 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
ip->flags |= BTRFS_INODE_DIRSYNC;
else
ip->flags &= ~BTRFS_INODE_DIRSYNC;
- if (flags & FS_NOCOW_FL)
- ip->flags |= BTRFS_INODE_NODATACOW;
- else
- ip->flags &= ~BTRFS_INODE_NODATACOW;
+ if (flags & FS_NOCOW_FL) {
+ if (S_ISREG(mode)) {
+ /*
+ * It's safe to turn csums off here, no extents exist.
+ * Otherwise we want the flag to reflect the real COW
+ * status of the file and will not set it.
+ */
+ if (inode->i_size == 0)
+ ip->flags |= BTRFS_INODE_NODATACOW
+ | BTRFS_INODE_NODATASUM;
+ } else {
+ ip->flags |= BTRFS_INODE_NODATACOW;
+ }
+ } else {
+ /*
+ * Revert back under same assuptions as above
+ */
+ if (S_ISREG(mode)) {
+ if (inode->i_size == 0)
+ ip->flags &= ~(BTRFS_INODE_NODATACOW
+ | BTRFS_INODE_NODATASUM);
+ } else {
+ ip->flags &= ~BTRFS_INODE_NODATACOW;
+ }
+ }
/*
* The COMPRESS flag can only be changed by users, while the NOCOMPRESS
@@ -273,9 +302,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
inode->i_flags = i_oldflags;
}
- mnt_drop_write_file(file);
out_unlock:
mutex_unlock(&inode->i_mutex);
+ mnt_drop_write_file(file);
return ret;
}
@@ -318,7 +347,8 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
return -EOPNOTSUPP;
if (copy_from_user(&range, arg, sizeof(range)))
return -EFAULT;
- if (range.start > total_bytes)
+ if (range.start > total_bytes ||
+ range.len < fs_info->sb->s_blocksize)
return -EINVAL;
range.len = min(range.len, total_bytes - range.start);
@@ -336,7 +366,8 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
static noinline int create_subvol(struct btrfs_root *root,
struct dentry *dentry,
char *name, int namelen,
- u64 *async_transid)
+ u64 *async_transid,
+ struct btrfs_qgroup_inherit **inherit)
{
struct btrfs_trans_handle *trans;
struct btrfs_key key;
@@ -346,11 +377,13 @@ static noinline int create_subvol(struct btrfs_root *root,
struct btrfs_root *new_root;
struct dentry *parent = dentry->d_parent;
struct inode *dir;
+ struct timespec cur_time = CURRENT_TIME;
int ret;
int err;
u64 objectid;
u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
u64 index = 0;
+ uuid_le new_uuid;
ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
if (ret)
@@ -368,6 +401,11 @@ static noinline int create_subvol(struct btrfs_root *root,
if (IS_ERR(trans))
return PTR_ERR(trans);
+ ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid,
+ inherit ? *inherit : NULL);
+ if (ret)
+ goto fail;
+
leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
0, objectid, NULL, 0, 0, 0);
if (IS_ERR(leaf)) {
@@ -389,8 +427,9 @@ static noinline int create_subvol(struct btrfs_root *root,
BTRFS_UUID_SIZE);
btrfs_mark_buffer_dirty(leaf);
+ memset(&root_item, 0, sizeof(root_item));
+
inode_item = &root_item.inode;
- memset(inode_item, 0, sizeof(*inode_item));
inode_item->generation = cpu_to_le64(1);
inode_item->size = cpu_to_le64(3);
inode_item->nlink = cpu_to_le32(1);
@@ -408,8 +447,15 @@ static noinline int create_subvol(struct btrfs_root *root,
btrfs_set_root_used(&root_item, leaf->len);
btrfs_set_root_last_snapshot(&root_item, 0);
- memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
- root_item.drop_level = 0;
+ btrfs_set_root_generation_v2(&root_item,
+ btrfs_root_generation(&root_item));
+ uuid_le_gen(&new_uuid);
+ memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE);
+ root_item.otime.sec = cpu_to_le64(cur_time.tv_sec);
+ root_item.otime.nsec = cpu_to_le32(cur_time.tv_nsec);
+ root_item.ctime = root_item.otime;
+ btrfs_set_root_ctransid(&root_item, trans->transid);
+ btrfs_set_root_otransid(&root_item, trans->transid);
btrfs_tree_unlock(leaf);
free_extent_buffer(leaf);
@@ -469,7 +515,6 @@ static noinline int create_subvol(struct btrfs_root *root,
BUG_ON(ret);
- d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
fail:
if (async_transid) {
*async_transid = trans->transid;
@@ -479,12 +524,16 @@ fail:
}
if (err && !ret)
ret = err;
+
+ if (!ret)
+ d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
+
return ret;
}
static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
char *name, int namelen, u64 *async_transid,
- bool readonly)
+ bool readonly, struct btrfs_qgroup_inherit **inherit)
{
struct inode *inode;
struct btrfs_pending_snapshot *pending_snapshot;
@@ -498,12 +547,17 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
if (!pending_snapshot)
return -ENOMEM;
- btrfs_init_block_rsv(&pending_snapshot->block_rsv);
+ btrfs_init_block_rsv(&pending_snapshot->block_rsv,
+ BTRFS_BLOCK_RSV_TEMP);
pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
pending_snapshot->readonly = readonly;
+ if (inherit) {
+ pending_snapshot->inherit = *inherit;
+ *inherit = NULL; /* take responsibility to free it */
+ }
- trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
+ trans = btrfs_start_transaction(root->fs_info->extent_root, 6);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto fail;
@@ -524,7 +578,12 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
ret = btrfs_commit_transaction(trans,
root->fs_info->extent_root);
}
- BUG_ON(ret);
+ if (ret) {
+ /* cleanup_transaction has freed this for us */
+ if (trans->aborted)
+ pending_snapshot = NULL;
+ goto fail;
+ }
ret = pending_snapshot->error;
if (ret)
@@ -553,13 +612,13 @@ fail:
*/
static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
{
- uid_t fsuid = current_fsuid();
+ kuid_t fsuid = current_fsuid();
if (!(dir->i_mode & S_ISVTX))
return 0;
- if (inode->i_uid == fsuid)
+ if (uid_eq(inode->i_uid, fsuid))
return 0;
- if (dir->i_uid == fsuid)
+ if (uid_eq(dir->i_uid, fsuid))
return 0;
return !capable(CAP_FOWNER);
}
@@ -592,7 +651,7 @@ static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir)
return -ENOENT;
BUG_ON(victim->d_parent->d_inode != dir);
- audit_inode_child(victim, dir);
+ audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
if (error)
@@ -635,7 +694,8 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
static noinline int btrfs_mksubvol(struct path *parent,
char *name, int namelen,
struct btrfs_root *snap_src,
- u64 *async_transid, bool readonly)
+ u64 *async_transid, bool readonly,
+ struct btrfs_qgroup_inherit **inherit)
{
struct inode *dir = parent->dentry->d_inode;
struct dentry *dentry;
@@ -652,13 +712,19 @@ static noinline int btrfs_mksubvol(struct path *parent,
if (dentry->d_inode)
goto out_dput;
- error = mnt_want_write(parent->mnt);
+ error = btrfs_may_create(dir, dentry);
if (error)
goto out_dput;
- error = btrfs_may_create(dir, dentry);
+ /*
+ * even if this name doesn't exist, we may get hash collisions.
+ * check for them now when we can safely fail
+ */
+ error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
+ dir->i_ino, name,
+ namelen);
if (error)
- goto out_drop_write;
+ goto out_dput;
down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
@@ -666,18 +732,16 @@ static noinline int btrfs_mksubvol(struct path *parent,
goto out_up_read;
if (snap_src) {
- error = create_snapshot(snap_src, dentry,
- name, namelen, async_transid, readonly);
+ error = create_snapshot(snap_src, dentry, name, namelen,
+ async_transid, readonly, inherit);
} else {
error = create_subvol(BTRFS_I(dir)->root, dentry,
- name, namelen, async_transid);
+ name, namelen, async_transid, inherit);
}
if (!error)
fsnotify_mkdir(dir, dentry);
out_up_read:
up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
-out_drop_write:
- mnt_drop_write(parent->mnt);
out_dput:
dput(dentry);
out_unlock:
@@ -832,7 +896,8 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
}
static int should_defrag_range(struct inode *inode, u64 start, int thresh,
- u64 *last_len, u64 *skip, u64 *defrag_end)
+ u64 *last_len, u64 *skip, u64 *defrag_end,
+ int compress)
{
struct extent_map *em;
int ret = 1;
@@ -863,7 +928,7 @@ static int should_defrag_range(struct inode *inode, u64 start, int thresh,
* we hit a real extent, if it is big or the next extent is not a
* real extent, don't bother defragging it
*/
- if ((*last_len == 0 || *last_len >= thresh) &&
+ if (!compress && (*last_len == 0 || *last_len >= thresh) &&
(em->len >= thresh || !next_mergeable))
ret = 0;
out:
@@ -1004,8 +1069,8 @@ again:
page_start, page_end - 1, 0, &cached_state);
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
- GFP_NOFS);
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
+ &cached_state, GFP_NOFS);
if (i_done != page_cnt) {
spin_lock(&BTRFS_I(inode)->lock);
@@ -1016,8 +1081,8 @@ again:
}
- btrfs_set_extent_delalloc(inode, page_start, page_end - 1,
- &cached_state);
+ set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
+ &cached_state, GFP_NOFS);
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
page_start, page_end - 1, &cached_state,
@@ -1047,11 +1112,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
u64 newer_than, unsigned long max_to_defrag)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_super_block *disk_super;
struct file_ra_state *ra = NULL;
unsigned long last_index;
u64 isize = i_size_read(inode);
- u64 features;
u64 last_len = 0;
u64 skip = 0;
u64 defrag_end = 0;
@@ -1145,7 +1208,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
extent_thresh, &last_len, &skip,
- &defrag_end)) {
+ &defrag_end, range->flags &
+ BTRFS_DEFRAG_RANGE_COMPRESS)) {
unsigned long next;
/*
* the should_defrag function tells us how much to skip
@@ -1182,7 +1246,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
}
defrag_count += ret;
- balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
+ balance_dirty_pages_ratelimited(inode->i_mapping);
mutex_unlock(&inode->i_mutex);
if (newer_than) {
@@ -1237,11 +1301,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
mutex_unlock(&inode->i_mutex);
}
- disk_super = root->fs_info->super_copy;
- features = btrfs_super_incompat_flags(disk_super);
if (range->compress_type == BTRFS_COMPRESS_LZO) {
- features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
- btrfs_set_super_incompat_flags(disk_super, features);
+ btrfs_set_fs_incompat(root->fs_info, COMPRESS_LZO);
}
ret = defrag_count;
@@ -1253,12 +1314,13 @@ out_ra:
return ret;
}
-static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
+static noinline int btrfs_ioctl_resize(struct file *file,
void __user *arg)
{
u64 new_size;
u64 old_size;
u64 devid = 1;
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
struct btrfs_ioctl_vol_args *vol_args;
struct btrfs_trans_handle *trans;
struct btrfs_device *device = NULL;
@@ -1273,13 +1335,18 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- mutex_lock(&root->fs_info->volume_mutex);
- if (root->fs_info->balance_ctl) {
- printk(KERN_INFO "btrfs: balance in progress\n");
- ret = -EINVAL;
- goto out;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+ mnt_drop_write_file(file);
+ return -EINVAL;
}
+ mutex_lock(&root->fs_info->volume_mutex);
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
@@ -1299,16 +1366,18 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
printk(KERN_INFO "btrfs: resizing devid %llu\n",
(unsigned long long)devid);
}
- device = btrfs_find_device(root, devid, NULL, NULL);
+
+ device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
if (!device) {
printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
(unsigned long long)devid);
ret = -EINVAL;
goto out_free;
}
- if (device->fs_devices && device->fs_devices->seeding) {
+
+ if (!device->writeable) {
printk(KERN_INFO "btrfs: resizer unable to apply on "
- "seeding device %llu\n",
+ "readonly device %llu\n",
(unsigned long long)devid);
ret = -EINVAL;
goto out_free;
@@ -1331,6 +1400,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
}
}
+ if (device->is_tgtdev_for_dev_replace) {
+ ret = -EINVAL;
+ goto out_free;
+ }
+
old_size = device->total_bytes;
if (mod < 0) {
@@ -1369,66 +1443,66 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
btrfs_commit_transaction(trans, root);
} else if (new_size < old_size) {
ret = btrfs_shrink_device(device, new_size);
- }
+ } /* equal, nothing need to do */
out_free:
kfree(vol_args);
out:
mutex_unlock(&root->fs_info->volume_mutex);
+ atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ mnt_drop_write_file(file);
return ret;
}
static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
- char *name,
- unsigned long fd,
- int subvol,
- u64 *transid,
- bool readonly)
+ char *name, unsigned long fd, int subvol,
+ u64 *transid, bool readonly,
+ struct btrfs_qgroup_inherit **inherit)
{
- struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
- struct file *src_file;
int namelen;
int ret = 0;
- if (root->fs_info->sb->s_flags & MS_RDONLY)
- return -EROFS;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
namelen = strlen(name);
if (strchr(name, '/')) {
ret = -EINVAL;
- goto out;
+ goto out_drop_write;
}
if (name[0] == '.' &&
(namelen == 1 || (name[1] == '.' && namelen == 2))) {
ret = -EEXIST;
- goto out;
+ goto out_drop_write;
}
if (subvol) {
ret = btrfs_mksubvol(&file->f_path, name, namelen,
- NULL, transid, readonly);
+ NULL, transid, readonly, inherit);
} else {
+ struct fd src = fdget(fd);
struct inode *src_inode;
- src_file = fget(fd);
- if (!src_file) {
+ if (!src.file) {
ret = -EINVAL;
- goto out;
+ goto out_drop_write;
}
- src_inode = src_file->f_path.dentry->d_inode;
+ src_inode = src.file->f_path.dentry->d_inode;
if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) {
printk(KERN_INFO "btrfs: Snapshot src from "
"another FS\n");
ret = -EINVAL;
- fput(src_file);
- goto out;
+ } else {
+ ret = btrfs_mksubvol(&file->f_path, name, namelen,
+ BTRFS_I(src_inode)->root,
+ transid, readonly, inherit);
}
- ret = btrfs_mksubvol(&file->f_path, name, namelen,
- BTRFS_I(src_inode)->root,
- transid, readonly);
- fput(src_file);
+ fdput(src);
}
+out_drop_write:
+ mnt_drop_write_file(file);
out:
return ret;
}
@@ -1446,7 +1520,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
vol_args->fd, subvol,
- NULL, false);
+ NULL, false, NULL);
kfree(vol_args);
return ret;
@@ -1460,6 +1534,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
u64 transid = 0;
u64 *ptr = NULL;
bool readonly = false;
+ struct btrfs_qgroup_inherit *inherit = NULL;
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args))
@@ -1467,7 +1542,8 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
if (vol_args->flags &
- ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) {
+ ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
+ BTRFS_SUBVOL_QGROUP_INHERIT)) {
ret = -EOPNOTSUPP;
goto out;
}
@@ -1476,10 +1552,21 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
ptr = &transid;
if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
readonly = true;
+ if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
+ if (vol_args->size > PAGE_CACHE_SIZE) {
+ ret = -EINVAL;
+ goto out;
+ }
+ inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
+ if (IS_ERR(inherit)) {
+ ret = PTR_ERR(inherit);
+ goto out;
+ }
+ }
ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
- vol_args->fd, subvol,
- ptr, readonly);
+ vol_args->fd, subvol, ptr,
+ readonly, &inherit);
if (ret == 0 && ptr &&
copy_to_user(arg +
@@ -1488,6 +1575,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
ret = -EFAULT;
out:
kfree(vol_args);
+ kfree(inherit);
return ret;
}
@@ -1523,29 +1611,40 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
u64 flags;
int ret = 0;
- if (root->fs_info->sb->s_flags & MS_RDONLY)
- return -EROFS;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
- if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
- return -EINVAL;
+ if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
+ ret = -EINVAL;
+ goto out_drop_write;
+ }
- if (copy_from_user(&flags, arg, sizeof(flags)))
- return -EFAULT;
+ if (copy_from_user(&flags, arg, sizeof(flags))) {
+ ret = -EFAULT;
+ goto out_drop_write;
+ }
- if (flags & BTRFS_SUBVOL_CREATE_ASYNC)
- return -EINVAL;
+ if (flags & BTRFS_SUBVOL_CREATE_ASYNC) {
+ ret = -EINVAL;
+ goto out_drop_write;
+ }
- if (flags & ~BTRFS_SUBVOL_RDONLY)
- return -EOPNOTSUPP;
+ if (flags & ~BTRFS_SUBVOL_RDONLY) {
+ ret = -EOPNOTSUPP;
+ goto out_drop_write;
+ }
- if (!inode_owner_or_capable(inode))
- return -EACCES;
+ if (!inode_owner_or_capable(inode)) {
+ ret = -EACCES;
+ goto out_drop_write;
+ }
down_write(&root->fs_info->subvol_sem);
/* nothing to do */
if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
- goto out;
+ goto out_drop_sem;
root_flags = btrfs_root_flags(&root->root_item);
if (flags & BTRFS_SUBVOL_RDONLY)
@@ -1568,8 +1667,11 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
out_reset:
if (ret)
btrfs_set_root_flags(&root->root_item, root_flags);
-out:
+out_drop_sem:
up_write(&root->fs_info->subvol_sem);
+out_drop_write:
+ mnt_drop_write_file(file);
+out:
return ret;
}
@@ -1999,13 +2101,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
if (err)
goto out_dput;
-
- /* check if subvolume may be deleted by a non-root user */
- err = btrfs_may_delete(dir, dentry, 1);
- if (err)
- goto out_dput;
}
+ /* check if subvolume may be deleted by a user */
+ err = btrfs_may_delete(dir, dentry, 1);
+ if (err)
+ goto out_dput;
+
if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
err = -EINVAL;
goto out_dput;
@@ -2087,13 +2189,22 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
struct btrfs_ioctl_defrag_range_args *range;
int ret;
- if (btrfs_root_readonly(root))
- return -EROFS;
-
ret = mnt_want_write_file(file);
if (ret)
return ret;
+ if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+ mnt_drop_write_file(file);
+ return -EINVAL;
+ }
+
+ if (btrfs_root_readonly(root)) {
+ ret = -EROFS;
+ goto out;
+ }
+
switch (inode->i_mode & S_IFMT) {
case S_IFDIR:
if (!capable(CAP_SYS_ADMIN)) {
@@ -2143,6 +2254,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
ret = -EINVAL;
}
out:
+ atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
mnt_drop_write_file(file);
return ret;
}
@@ -2155,13 +2267,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- mutex_lock(&root->fs_info->volume_mutex);
- if (root->fs_info->balance_ctl) {
- printk(KERN_INFO "btrfs: balance in progress\n");
- ret = -EINVAL;
- goto out;
+ if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+ return -EINVAL;
}
+ mutex_lock(&root->fs_info->volume_mutex);
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
@@ -2174,27 +2286,31 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
kfree(vol_args);
out:
mutex_unlock(&root->fs_info->volume_mutex);
+ atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
return ret;
}
-static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
{
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
struct btrfs_ioctl_vol_args *vol_args;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (root->fs_info->sb->s_flags & MS_RDONLY)
- return -EROFS;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
- mutex_lock(&root->fs_info->volume_mutex);
- if (root->fs_info->balance_ctl) {
- printk(KERN_INFO "btrfs: balance in progress\n");
- ret = -EINVAL;
- goto out;
+ if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+ mnt_drop_write_file(file);
+ return -EINVAL;
}
+ mutex_lock(&root->fs_info->volume_mutex);
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
@@ -2207,6 +2323,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
kfree(vol_args);
out:
mutex_unlock(&root->fs_info->volume_mutex);
+ atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ mnt_drop_write_file(file);
return ret;
}
@@ -2262,7 +2380,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
s_uuid = di_args->uuid;
mutex_lock(&fs_devices->device_list_mutex);
- dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL);
+ dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL);
mutex_unlock(&fs_devices->device_list_mutex);
if (!dev) {
@@ -2299,7 +2417,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
{
struct inode *inode = fdentry(file)->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct file *src_file;
+ struct fd src_file;
struct inode *src;
struct btrfs_trans_handle *trans;
struct btrfs_path *path;
@@ -2311,7 +2429,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
int ret;
u64 len = olen;
u64 bs = root->fs_info->sb->s_blocksize;
- u64 hint_byte;
/*
* TODO:
@@ -2334,20 +2451,24 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
if (ret)
return ret;
- src_file = fget(srcfd);
- if (!src_file) {
+ src_file = fdget(srcfd);
+ if (!src_file.file) {
ret = -EBADF;
goto out_drop_write;
}
- src = src_file->f_dentry->d_inode;
+ ret = -EXDEV;
+ if (src_file.file->f_path.mnt != file->f_path.mnt)
+ goto out_fput;
+
+ src = src_file.file->f_dentry->d_inode;
ret = -EINVAL;
if (src == inode)
goto out_fput;
/* the src must be open for reading */
- if (!(src_file->f_mode & FMODE_READ))
+ if (!(src_file.file->f_mode & FMODE_READ))
goto out_fput;
/* don't make the dst file partly checksummed */
@@ -2360,7 +2481,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
goto out_fput;
ret = -EXDEV;
- if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root)
+ if (src->i_sb != inode->i_sb)
goto out_fput;
ret = -ENOMEM;
@@ -2412,13 +2533,13 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
another, and lock file content */
while (1) {
struct btrfs_ordered_extent *ordered;
- lock_extent(&BTRFS_I(src)->io_tree, off, off+len);
- ordered = btrfs_lookup_first_ordered_extent(src, off+len);
+ lock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
+ ordered = btrfs_lookup_first_ordered_extent(src, off + len - 1);
if (!ordered &&
- !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len,
- EXTENT_DELALLOC, 0, NULL))
+ !test_range_bit(&BTRFS_I(src)->io_tree, off, off + len - 1,
+ EXTENT_DELALLOC, 0, NULL))
break;
- unlock_extent(&BTRFS_I(src)->io_tree, off, off+len);
+ unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
if (ordered)
btrfs_put_ordered_extent(ordered);
btrfs_wait_ordered_range(src, off, len);
@@ -2434,13 +2555,14 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
* note the key will change type as we walk through the
* tree.
*/
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
+ 0, 0);
if (ret < 0)
goto out;
nritems = btrfs_header_nritems(path->nodes[0]);
if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(root, path);
+ ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
if (ret < 0)
goto out;
if (ret > 0)
@@ -2491,7 +2613,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
btrfs_release_path(path);
if (key.offset + datal <= off ||
- key.offset >= off+len)
+ key.offset >= off + len - 1)
goto next;
memcpy(&new_key, &key, sizeof(new_key));
@@ -2529,10 +2651,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
datal -= off - key.offset;
}
- ret = btrfs_drop_extents(trans, inode,
+ ret = btrfs_drop_extents(trans, root, inode,
new_key.offset,
new_key.offset + datal,
- &hint_byte, 1);
+ 1);
if (ret) {
btrfs_abort_transaction(trans, root,
ret);
@@ -2592,8 +2714,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
new_key.offset += skip;
}
- if (key.offset + datal > off+len)
- trim = key.offset + datal - (off+len);
+ if (key.offset + datal > off + len)
+ trim = key.offset + datal - (off + len);
if (comp && (skip || trim)) {
ret = -EINVAL;
@@ -2603,10 +2725,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
size -= skip + trim;
datal -= skip + trim;
- ret = btrfs_drop_extents(trans, inode,
+ ret = btrfs_drop_extents(trans, root, inode,
new_key.offset,
new_key.offset + datal,
- &hint_byte, 1);
+ 1);
if (ret) {
btrfs_abort_transaction(trans, root,
ret);
@@ -2670,14 +2792,14 @@ next:
ret = 0;
out:
btrfs_release_path(path);
- unlock_extent(&BTRFS_I(src)->io_tree, off, off+len);
+ unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
out_unlock:
mutex_unlock(&src->i_mutex);
mutex_unlock(&inode->i_mutex);
vfree(buf);
btrfs_free_path(path);
out_fput:
- fput(src_file);
+ fdput(src_file);
out_drop_write:
mnt_drop_write_file(file);
return ret;
@@ -2749,16 +2871,21 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
struct btrfs_path *path;
struct btrfs_key location;
struct btrfs_disk_key disk_key;
- struct btrfs_super_block *disk_super;
- u64 features;
u64 objectid = 0;
u64 dir_id;
+ int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (copy_from_user(&objectid, argp, sizeof(objectid)))
- return -EFAULT;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(&objectid, argp, sizeof(objectid))) {
+ ret = -EFAULT;
+ goto out;
+ }
if (!objectid)
objectid = root->root_key.objectid;
@@ -2768,21 +2895,28 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
location.offset = (u64)-1;
new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
- if (IS_ERR(new_root))
- return PTR_ERR(new_root);
+ if (IS_ERR(new_root)) {
+ ret = PTR_ERR(new_root);
+ goto out;
+ }
- if (btrfs_root_refs(&new_root->root_item) == 0)
- return -ENOENT;
+ if (btrfs_root_refs(&new_root->root_item) == 0) {
+ ret = -ENOENT;
+ goto out;
+ }
path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
path->leave_spinning = 1;
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
btrfs_free_path(path);
- return PTR_ERR(trans);
+ ret = PTR_ERR(trans);
+ goto out;
}
dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
@@ -2793,7 +2927,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
btrfs_end_transaction(trans, root);
printk(KERN_ERR "Umm, you don't have the default dir item, "
"this isn't going to work\n");
- return -ENOENT;
+ ret = -ENOENT;
+ goto out;
}
btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
@@ -2801,19 +2936,15 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
- disk_super = root->fs_info->super_copy;
- features = btrfs_super_incompat_flags(disk_super);
- if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
- features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
- btrfs_set_super_incompat_flags(disk_super, features);
- }
+ btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
btrfs_end_transaction(trans, root);
-
- return 0;
+out:
+ mnt_drop_write_file(file);
+ return ret;
}
-static void get_block_group_info(struct list_head *groups_list,
- struct btrfs_ioctl_space_info *space)
+void btrfs_get_block_group_info(struct list_head *groups_list,
+ struct btrfs_ioctl_space_info *space)
{
struct btrfs_block_group_cache *block_group;
@@ -2921,8 +3052,8 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
down_read(&info->groups_sem);
for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
if (!list_empty(&info->block_groups[c])) {
- get_block_group_info(&info->block_groups[c],
- &space);
+ btrfs_get_block_group_info(
+ &info->block_groups[c], &space);
memcpy(dest, &space, sizeof(space));
dest++;
space_args.total_spaces++;
@@ -2973,32 +3104,38 @@ long btrfs_ioctl_trans_end(struct file *file)
return 0;
}
-static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp)
+static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
+ void __user *argp)
{
- struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
struct btrfs_trans_handle *trans;
u64 transid;
int ret;
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ trans = btrfs_attach_transaction(root);
+ if (IS_ERR(trans)) {
+ if (PTR_ERR(trans) != -ENOENT)
+ return PTR_ERR(trans);
+
+ /* No running transaction, don't bother */
+ transid = root->fs_info->last_trans_committed;
+ goto out;
+ }
transid = trans->transid;
ret = btrfs_commit_transaction_async(trans, root, 0);
if (ret) {
btrfs_end_transaction(trans, root);
return ret;
}
-
+out:
if (argp)
if (copy_to_user(argp, &transid, sizeof(transid)))
return -EFAULT;
return 0;
}
-static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
+static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
+ void __user *argp)
{
- struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
u64 transid;
if (argp) {
@@ -3010,10 +3147,11 @@ static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
return btrfs_wait_for_commit(root, transid);
}
-static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
{
- int ret;
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
struct btrfs_ioctl_scrub_args *sa;
+ int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -3022,12 +3160,22 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
if (IS_ERR(sa))
return PTR_ERR(sa);
- ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end,
- &sa->progress, sa->flags & BTRFS_SCRUB_READONLY);
+ if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
+ }
+
+ ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end,
+ &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
+ 0);
if (copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
+ if (!(sa->flags & BTRFS_SCRUB_READONLY))
+ mnt_drop_write_file(file);
+out:
kfree(sa);
return ret;
}
@@ -3037,7 +3185,7 @@ static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- return btrfs_scrub_cancel(root);
+ return btrfs_scrub_cancel(root->fs_info);
}
static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
@@ -3063,19 +3211,21 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
}
static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
- void __user *arg, int reset_after_read)
+ void __user *arg)
{
struct btrfs_ioctl_get_dev_stats *sa;
int ret;
- if (reset_after_read && !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
sa = memdup_user(arg, sizeof(*sa));
if (IS_ERR(sa))
return PTR_ERR(sa);
- ret = btrfs_get_dev_stats(root, sa, reset_after_read);
+ if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) {
+ kfree(sa);
+ return -EPERM;
+ }
+
+ ret = btrfs_get_dev_stats(root, sa);
if (copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
@@ -3084,6 +3234,51 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
return ret;
}
+static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
+{
+ struct btrfs_ioctl_dev_replace_args *p;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ p = memdup_user(arg, sizeof(*p));
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ switch (p->cmd) {
+ case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
+ if (atomic_xchg(
+ &root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+ ret = -EINPROGRESS;
+ } else {
+ ret = btrfs_dev_replace_start(root, p);
+ atomic_set(
+ &root->fs_info->mutually_exclusive_operation_running,
+ 0);
+ }
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
+ btrfs_dev_replace_status(root->fs_info, p);
+ ret = 0;
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
+ ret = btrfs_dev_replace_cancel(root->fs_info, p);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ if (copy_to_user(arg, p, sizeof(*p)))
+ ret = -EFAULT;
+
+ kfree(p);
+ return ret;
+}
+
static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
{
int ret = 0;
@@ -3168,11 +3363,9 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
{
int ret = 0;
int size;
- u64 extent_item_pos;
struct btrfs_ioctl_logical_ino_args *loi;
struct btrfs_data_container *inodes = NULL;
struct btrfs_path *path = NULL;
- struct btrfs_key key;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -3190,7 +3383,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
goto out;
}
- size = min_t(u32, loi->size, 4096);
+ size = min_t(u32, loi->size, 64 * 1024);
inodes = init_data_container(size);
if (IS_ERR(inodes)) {
ret = PTR_ERR(inodes);
@@ -3198,22 +3391,13 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
goto out;
}
- ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
- btrfs_release_path(path);
-
- if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path,
+ build_ino_list, inodes);
+ if (ret == -EINVAL)
ret = -ENOENT;
if (ret < 0)
goto out;
- extent_item_pos = loi->logical - key.objectid;
- ret = iterate_extent_inodes(root->fs_info, key.objectid,
- extent_item_pos, 0, build_ino_list,
- inodes);
-
- if (ret < 0)
- goto out;
-
ret = copy_to_user((void *)(unsigned long)loi->inodes,
(void *)(unsigned long)inodes, size);
if (ret)
@@ -3221,7 +3405,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
out:
btrfs_free_path(path);
- kfree(inodes);
+ vfree(inodes);
kfree(loi);
return ret;
@@ -3260,26 +3444,71 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_ioctl_balance_args *bargs;
struct btrfs_balance_control *bctl;
+ bool need_unlock; /* for mut. excl. ops lock */
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (fs_info->sb->s_flags & MS_RDONLY)
- return -EROFS;
-
- ret = mnt_want_write(file->f_path.mnt);
+ ret = mnt_want_write_file(file);
if (ret)
return ret;
- mutex_lock(&fs_info->volume_mutex);
+again:
+ if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
+ need_unlock = true;
+ goto locked;
+ }
+
+ /*
+ * mut. excl. ops lock is locked. Three possibilites:
+ * (1) some other op is running
+ * (2) balance is running
+ * (3) balance is paused -- special case (think resume)
+ */
mutex_lock(&fs_info->balance_mutex);
+ if (fs_info->balance_ctl) {
+ /* this is either (2) or (3) */
+ if (!atomic_read(&fs_info->balance_running)) {
+ mutex_unlock(&fs_info->balance_mutex);
+ if (!mutex_trylock(&fs_info->volume_mutex))
+ goto again;
+ mutex_lock(&fs_info->balance_mutex);
+
+ if (fs_info->balance_ctl &&
+ !atomic_read(&fs_info->balance_running)) {
+ /* this is (3) */
+ need_unlock = false;
+ goto locked;
+ }
+
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
+ goto again;
+ } else {
+ /* this is (2) */
+ mutex_unlock(&fs_info->balance_mutex);
+ ret = -EINPROGRESS;
+ goto out;
+ }
+ } else {
+ /* this is (1) */
+ mutex_unlock(&fs_info->balance_mutex);
+ pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+locked:
+ BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running));
if (arg) {
bargs = memdup_user(arg, sizeof(*bargs));
if (IS_ERR(bargs)) {
ret = PTR_ERR(bargs);
- goto out;
+ goto out_unlock;
}
if (bargs->flags & BTRFS_BALANCE_RESUME) {
@@ -3323,11 +3552,17 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
}
do_balance:
- ret = btrfs_balance(bctl, bargs);
/*
- * bctl is freed in __cancel_balance or in free_fs_info if
- * restriper was paused all the way until unmount
+ * Ownership of bctl and mutually_exclusive_operation_running
+ * goes to to btrfs_balance. bctl is freed in __cancel_balance,
+ * or, if restriper was paused all the way until unmount, in
+ * free_fs_info. mutually_exclusive_operation_running is
+ * cleared in __cancel_balance.
*/
+ need_unlock = false;
+
+ ret = btrfs_balance(bctl, bargs);
+
if (arg) {
if (copy_to_user(arg, bargs, sizeof(*bargs)))
ret = -EFAULT;
@@ -3335,10 +3570,13 @@ do_balance:
out_bargs:
kfree(bargs);
-out:
+out_unlock:
mutex_unlock(&fs_info->balance_mutex);
mutex_unlock(&fs_info->volume_mutex);
- mnt_drop_write(file->f_path.mnt);
+ if (need_unlock)
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+out:
+ mnt_drop_write_file(file);
return ret;
}
@@ -3390,6 +3628,292 @@ out:
return ret;
}
+static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+ struct btrfs_ioctl_quota_ctl_args *sa;
+ struct btrfs_trans_handle *trans = NULL;
+ int ret;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ goto drop_write;
+ }
+
+ if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) {
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ }
+
+ switch (sa->cmd) {
+ case BTRFS_QUOTA_CTL_ENABLE:
+ ret = btrfs_quota_enable(trans, root->fs_info);
+ break;
+ case BTRFS_QUOTA_CTL_DISABLE:
+ ret = btrfs_quota_disable(trans, root->fs_info);
+ break;
+ case BTRFS_QUOTA_CTL_RESCAN:
+ ret = btrfs_quota_rescan(root->fs_info);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ if (copy_to_user(arg, sa, sizeof(*sa)))
+ ret = -EFAULT;
+
+ if (trans) {
+ err = btrfs_commit_transaction(trans, root);
+ if (err && !ret)
+ ret = err;
+ }
+out:
+ kfree(sa);
+drop_write:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+ struct btrfs_ioctl_qgroup_assign_args *sa;
+ struct btrfs_trans_handle *trans;
+ int ret;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ goto drop_write;
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ /* FIXME: check if the IDs really exist */
+ if (sa->assign) {
+ ret = btrfs_add_qgroup_relation(trans, root->fs_info,
+ sa->src, sa->dst);
+ } else {
+ ret = btrfs_del_qgroup_relation(trans, root->fs_info,
+ sa->src, sa->dst);
+ }
+
+ err = btrfs_end_transaction(trans, root);
+ if (err && !ret)
+ ret = err;
+
+out:
+ kfree(sa);
+drop_write:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+ struct btrfs_ioctl_qgroup_create_args *sa;
+ struct btrfs_trans_handle *trans;
+ int ret;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ goto drop_write;
+ }
+
+ if (!sa->qgroupid) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ /* FIXME: check if the IDs really exist */
+ if (sa->create) {
+ ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid,
+ NULL);
+ } else {
+ ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid);
+ }
+
+ err = btrfs_end_transaction(trans, root);
+ if (err && !ret)
+ ret = err;
+
+out:
+ kfree(sa);
+drop_write:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+ struct btrfs_ioctl_qgroup_limit_args *sa;
+ struct btrfs_trans_handle *trans;
+ int ret;
+ int err;
+ u64 qgroupid;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ goto drop_write;
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ qgroupid = sa->qgroupid;
+ if (!qgroupid) {
+ /* take the current subvol as qgroup */
+ qgroupid = root->root_key.objectid;
+ }
+
+ /* FIXME: check if the IDs really exist */
+ ret = btrfs_limit_qgroup(trans, root->fs_info, qgroupid, &sa->lim);
+
+ err = btrfs_end_transaction(trans, root);
+ if (err && !ret)
+ ret = err;
+
+out:
+ kfree(sa);
+drop_write:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_set_received_subvol(struct file *file,
+ void __user *arg)
+{
+ struct btrfs_ioctl_received_subvol_args *sa = NULL;
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root_item *root_item = &root->root_item;
+ struct btrfs_trans_handle *trans;
+ struct timespec ct = CURRENT_TIME;
+ int ret = 0;
+
+ ret = mnt_want_write_file(file);
+ if (ret < 0)
+ return ret;
+
+ down_write(&root->fs_info->subvol_sem);
+
+ if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (btrfs_root_readonly(root)) {
+ ret = -EROFS;
+ goto out;
+ }
+
+ if (!inode_owner_or_capable(inode)) {
+ ret = -EACCES;
+ goto out;
+ }
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ sa = NULL;
+ goto out;
+ }
+
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
+ }
+
+ sa->rtransid = trans->transid;
+ sa->rtime.sec = ct.tv_sec;
+ sa->rtime.nsec = ct.tv_nsec;
+
+ memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
+ btrfs_set_root_stransid(root_item, sa->stransid);
+ btrfs_set_root_rtransid(root_item, sa->rtransid);
+ root_item->stime.sec = cpu_to_le64(sa->stime.sec);
+ root_item->stime.nsec = cpu_to_le32(sa->stime.nsec);
+ root_item->rtime.sec = cpu_to_le64(sa->rtime.sec);
+ root_item->rtime.nsec = cpu_to_le32(sa->rtime.nsec);
+
+ ret = btrfs_update_root(trans, root->fs_info->tree_root,
+ &root->root_key, &root->root_item);
+ if (ret < 0) {
+ btrfs_end_transaction(trans, root);
+ trans = NULL;
+ goto out;
+ } else {
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = copy_to_user(arg, sa, sizeof(*sa));
+ if (ret)
+ ret = -EFAULT;
+
+out:
+ kfree(sa);
+ up_write(&root->fs_info->subvol_sem);
+ mnt_drop_write_file(file);
+ return ret;
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -3411,6 +3935,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_snap_create_v2(file, argp, 0);
case BTRFS_IOC_SUBVOL_CREATE:
return btrfs_ioctl_snap_create(file, argp, 1);
+ case BTRFS_IOC_SUBVOL_CREATE_V2:
+ return btrfs_ioctl_snap_create_v2(file, argp, 1);
case BTRFS_IOC_SNAP_DESTROY:
return btrfs_ioctl_snap_destroy(file, argp);
case BTRFS_IOC_SUBVOL_GETFLAGS:
@@ -3424,11 +3950,11 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_DEFRAG_RANGE:
return btrfs_ioctl_defrag(file, argp);
case BTRFS_IOC_RESIZE:
- return btrfs_ioctl_resize(root, argp);
+ return btrfs_ioctl_resize(file, argp);
case BTRFS_IOC_ADD_DEV:
return btrfs_ioctl_add_dev(root, argp);
case BTRFS_IOC_RM_DEV:
- return btrfs_ioctl_rm_dev(root, argp);
+ return btrfs_ioctl_rm_dev(file, argp);
case BTRFS_IOC_FS_INFO:
return btrfs_ioctl_fs_info(root, argp);
case BTRFS_IOC_DEV_INFO:
@@ -3457,11 +3983,11 @@ long btrfs_ioctl(struct file *file, unsigned int
btrfs_sync_fs(file->f_dentry->d_sb, 1);
return 0;
case BTRFS_IOC_START_SYNC:
- return btrfs_ioctl_start_sync(file, argp);
+ return btrfs_ioctl_start_sync(root, argp);
case BTRFS_IOC_WAIT_SYNC:
- return btrfs_ioctl_wait_sync(file, argp);
+ return btrfs_ioctl_wait_sync(root, argp);
case BTRFS_IOC_SCRUB:
- return btrfs_ioctl_scrub(root, argp);
+ return btrfs_ioctl_scrub(file, argp);
case BTRFS_IOC_SCRUB_CANCEL:
return btrfs_ioctl_scrub_cancel(root, argp);
case BTRFS_IOC_SCRUB_PROGRESS:
@@ -3472,10 +3998,22 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_balance_ctl(root, arg);
case BTRFS_IOC_BALANCE_PROGRESS:
return btrfs_ioctl_balance_progress(root, argp);
+ case BTRFS_IOC_SET_RECEIVED_SUBVOL:
+ return btrfs_ioctl_set_received_subvol(file, argp);
+ case BTRFS_IOC_SEND:
+ return btrfs_ioctl_send(file, argp);
case BTRFS_IOC_GET_DEV_STATS:
- return btrfs_ioctl_get_dev_stats(root, argp, 0);
- case BTRFS_IOC_GET_AND_RESET_DEV_STATS:
- return btrfs_ioctl_get_dev_stats(root, argp, 1);
+ return btrfs_ioctl_get_dev_stats(root, argp);
+ case BTRFS_IOC_QUOTA_CTL:
+ return btrfs_ioctl_quota_ctl(file, argp);
+ case BTRFS_IOC_QGROUP_ASSIGN:
+ return btrfs_ioctl_qgroup_assign(file, argp);
+ case BTRFS_IOC_QGROUP_CREATE:
+ return btrfs_ioctl_qgroup_create(file, argp);
+ case BTRFS_IOC_QGROUP_LIMIT:
+ return btrfs_ioctl_qgroup_limit(file, argp);
+ case BTRFS_IOC_DEV_REPLACE:
+ return btrfs_ioctl_dev_replace(root, argp);
}
return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index e440aa653c3..dabca9cc8c2 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,17 +30,50 @@ struct btrfs_ioctl_vol_args {
char name[BTRFS_PATH_NAME_MAX + 1];
};
+#define BTRFS_DEVICE_PATH_NAME_MAX 1024
+
#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
+#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2)
#define BTRFS_FSID_SIZE 16
#define BTRFS_UUID_SIZE 16
+#define BTRFS_QGROUP_INHERIT_SET_LIMITS (1ULL << 0)
+
+struct btrfs_qgroup_limit {
+ __u64 flags;
+ __u64 max_rfer;
+ __u64 max_excl;
+ __u64 rsv_rfer;
+ __u64 rsv_excl;
+};
+
+struct btrfs_qgroup_inherit {
+ __u64 flags;
+ __u64 num_qgroups;
+ __u64 num_ref_copies;
+ __u64 num_excl_copies;
+ struct btrfs_qgroup_limit lim;
+ __u64 qgroups[0];
+};
+
+struct btrfs_ioctl_qgroup_limit_args {
+ __u64 qgroupid;
+ struct btrfs_qgroup_limit lim;
+};
+
#define BTRFS_SUBVOL_NAME_MAX 4039
struct btrfs_ioctl_vol_args_v2 {
__s64 fd;
__u64 transid;
__u64 flags;
- __u64 unused[4];
+ union {
+ struct {
+ __u64 size;
+ struct btrfs_qgroup_inherit __user *qgroup_inherit;
+ };
+ __u64 unused[4];
+ };
char name[BTRFS_SUBVOL_NAME_MAX + 1];
};
@@ -92,7 +125,48 @@ struct btrfs_ioctl_scrub_args {
__u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
};
-#define BTRFS_DEVICE_PATH_NAME_MAX 1024
+#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
+#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
+struct btrfs_ioctl_dev_replace_start_params {
+ __u64 srcdevid; /* in, if 0, use srcdev_name instead */
+ __u64 cont_reading_from_srcdev_mode; /* in, see #define
+ * above */
+ __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
+ __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
+};
+
+#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0
+#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1
+#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2
+#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3
+#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4
+struct btrfs_ioctl_dev_replace_status_params {
+ __u64 replace_state; /* out, see #define above */
+ __u64 progress_1000; /* out, 0 <= x <= 1000 */
+ __u64 time_started; /* out, seconds since 1-Jan-1970 */
+ __u64 time_stopped; /* out, seconds since 1-Jan-1970 */
+ __u64 num_write_errors; /* out */
+ __u64 num_uncorrectable_read_errors; /* out */
+};
+
+#define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0
+#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1
+#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2
+#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0
+#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1
+#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2
+struct btrfs_ioctl_dev_replace_args {
+ __u64 cmd; /* in */
+ __u64 result; /* out */
+
+ union {
+ struct btrfs_ioctl_dev_replace_start_params start;
+ struct btrfs_ioctl_dev_replace_status_params status;
+ }; /* in/out */
+
+ __u64 spare[64];
+};
+
struct btrfs_ioctl_dev_info_args {
__u64 devid; /* in/out */
__u8 uuid[BTRFS_UUID_SIZE]; /* in/out */
@@ -285,9 +359,13 @@ enum btrfs_dev_stat_values {
BTRFS_DEV_STAT_VALUES_MAX
};
+/* Reset statistics after reading; needs SYS_ADMIN capability */
+#define BTRFS_DEV_STATS_RESET (1ULL << 0)
+
struct btrfs_ioctl_get_dev_stats {
__u64 devid; /* in */
__u64 nr_items; /* in/out */
+ __u64 flags; /* in/out */
/* out values: */
__u64 values[BTRFS_DEV_STAT_VALUES_MAX];
@@ -295,6 +373,48 @@ struct btrfs_ioctl_get_dev_stats {
__u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */
};
+#define BTRFS_QUOTA_CTL_ENABLE 1
+#define BTRFS_QUOTA_CTL_DISABLE 2
+#define BTRFS_QUOTA_CTL_RESCAN 3
+struct btrfs_ioctl_quota_ctl_args {
+ __u64 cmd;
+ __u64 status;
+};
+
+struct btrfs_ioctl_qgroup_assign_args {
+ __u64 assign;
+ __u64 src;
+ __u64 dst;
+};
+
+struct btrfs_ioctl_qgroup_create_args {
+ __u64 create;
+ __u64 qgroupid;
+};
+struct btrfs_ioctl_timespec {
+ __u64 sec;
+ __u32 nsec;
+};
+
+struct btrfs_ioctl_received_subvol_args {
+ char uuid[BTRFS_UUID_SIZE]; /* in */
+ __u64 stransid; /* in */
+ __u64 rtransid; /* out */
+ struct btrfs_ioctl_timespec stime; /* in */
+ struct btrfs_ioctl_timespec rtime; /* out */
+ __u64 flags; /* in */
+ __u64 reserved[16]; /* in */
+};
+
+struct btrfs_ioctl_send_args {
+ __s64 send_fd; /* in */
+ __u64 clone_sources_count; /* in */
+ __u64 __user *clone_sources; /* in */
+ __u64 parent_root; /* in */
+ __u64 flags; /* in */
+ __u64 reserved[4]; /* in */
+};
+
#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
struct btrfs_ioctl_vol_args)
#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -339,6 +459,8 @@ struct btrfs_ioctl_get_dev_stats {
#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
struct btrfs_ioctl_vol_args_v2)
+#define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \
+ struct btrfs_ioctl_vol_args_v2)
#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64)
#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \
@@ -359,9 +481,22 @@ struct btrfs_ioctl_get_dev_stats {
struct btrfs_ioctl_ino_path_args)
#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
struct btrfs_ioctl_ino_path_args)
+#define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \
+ struct btrfs_ioctl_received_subvol_args)
+#define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct btrfs_ioctl_send_args)
+#define BTRFS_IOC_DEVICES_READY _IOR(BTRFS_IOCTL_MAGIC, 39, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_QUOTA_CTL _IOWR(BTRFS_IOCTL_MAGIC, 40, \
+ struct btrfs_ioctl_quota_ctl_args)
+#define BTRFS_IOC_QGROUP_ASSIGN _IOW(BTRFS_IOCTL_MAGIC, 41, \
+ struct btrfs_ioctl_qgroup_assign_args)
+#define BTRFS_IOC_QGROUP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 42, \
+ struct btrfs_ioctl_qgroup_create_args)
+#define BTRFS_IOC_QGROUP_LIMIT _IOR(BTRFS_IOCTL_MAGIC, 43, \
+ struct btrfs_ioctl_qgroup_limit_args)
#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
struct btrfs_ioctl_get_dev_stats)
-#define BTRFS_IOC_GET_AND_RESET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \
- struct btrfs_ioctl_get_dev_stats)
+#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
+ struct btrfs_ioctl_dev_replace_args)
#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 272f911203f..2a1762c6604 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -67,7 +67,7 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
{
if (eb->lock_nested) {
read_lock(&eb->lock);
- if (&eb->lock_nested && current->pid == eb->lock_owner) {
+ if (eb->lock_nested && current->pid == eb->lock_owner) {
read_unlock(&eb->lock);
return;
}
@@ -78,13 +78,15 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
write_lock(&eb->lock);
WARN_ON(atomic_read(&eb->spinning_writers));
atomic_inc(&eb->spinning_writers);
- if (atomic_dec_and_test(&eb->blocking_writers))
+ if (atomic_dec_and_test(&eb->blocking_writers) &&
+ waitqueue_active(&eb->write_lock_wq))
wake_up(&eb->write_lock_wq);
} else if (rw == BTRFS_READ_LOCK_BLOCKING) {
BUG_ON(atomic_read(&eb->blocking_readers) == 0);
read_lock(&eb->lock);
atomic_inc(&eb->spinning_readers);
- if (atomic_dec_and_test(&eb->blocking_readers))
+ if (atomic_dec_and_test(&eb->blocking_readers) &&
+ waitqueue_active(&eb->read_lock_wq))
wake_up(&eb->read_lock_wq);
}
return;
@@ -199,7 +201,8 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
}
btrfs_assert_tree_read_locked(eb);
WARN_ON(atomic_read(&eb->blocking_readers) == 0);
- if (atomic_dec_and_test(&eb->blocking_readers))
+ if (atomic_dec_and_test(&eb->blocking_readers) &&
+ waitqueue_active(&eb->read_lock_wq))
wake_up(&eb->read_lock_wq);
atomic_dec(&eb->read_locks);
}
@@ -247,8 +250,9 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
if (blockers) {
WARN_ON(atomic_read(&eb->spinning_writers));
atomic_dec(&eb->blocking_writers);
- smp_wmb();
- wake_up(&eb->write_lock_wq);
+ smp_mb();
+ if (waitqueue_active(&eb->write_lock_wq))
+ wake_up(&eb->write_lock_wq);
} else {
WARN_ON(atomic_read(&eb->spinning_writers) != 1);
atomic_dec(&eb->spinning_writers);
diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h
new file mode 100644
index 00000000000..b7816cefbd1
--- /dev/null
+++ b/fs/btrfs/math.h
@@ -0,0 +1,44 @@
+
+/*
+ * Copyright (C) 2012 Fujitsu. All rights reserved.
+ * Written by Miao Xie <miaox@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_MATH_H
+#define __BTRFS_MATH_H
+
+#include <asm/div64.h>
+
+static inline u64 div_factor(u64 num, int factor)
+{
+ if (factor == 10)
+ return num;
+ num *= factor;
+ do_div(num, 10);
+ return num;
+}
+
+static inline u64 div_factor_fine(u64 num, int factor)
+{
+ if (factor == 100)
+ return num;
+ num *= factor;
+ do_div(num, 100);
+ return num;
+}
+
+#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 643335a4fe3..e5ed5672960 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -25,6 +25,8 @@
#include "btrfs_inode.h"
#include "extent_io.h"
+static struct kmem_cache *btrfs_ordered_extent_cache;
+
static u64 entry_end(struct btrfs_ordered_extent *entry)
{
if (entry->file_offset + entry->len < entry->file_offset)
@@ -187,7 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
struct btrfs_ordered_extent *entry;
tree = &BTRFS_I(inode)->ordered_tree;
- entry = kzalloc(sizeof(*entry), GFP_NOFS);
+ entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
if (!entry)
return -ENOMEM;
@@ -209,6 +211,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
init_waitqueue_head(&entry->wait);
INIT_LIST_HEAD(&entry->list);
INIT_LIST_HEAD(&entry->root_extent_list);
+ INIT_LIST_HEAD(&entry->work_list);
+ init_completion(&entry->completion);
trace_btrfs_ordered_extent_add(inode, entry);
@@ -421,7 +425,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
list_del(&sum->list);
kfree(sum);
}
- kfree(entry);
+ kmem_cache_free(btrfs_ordered_extent_cache, entry);
}
}
@@ -462,19 +466,28 @@ void btrfs_remove_ordered_extent(struct inode *inode,
wake_up(&entry->wait);
}
+static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
+ btrfs_start_ordered_extent(ordered->inode, ordered, 1);
+ complete(&ordered->completion);
+}
+
/*
* wait for all the ordered extents in a root. This is done when balancing
* space between drives.
*/
-void btrfs_wait_ordered_extents(struct btrfs_root *root,
- int nocow_only, int delay_iput)
+void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
{
- struct list_head splice;
+ struct list_head splice, works;
struct list_head *cur;
- struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *ordered, *next;
struct inode *inode;
INIT_LIST_HEAD(&splice);
+ INIT_LIST_HEAD(&works);
spin_lock(&root->fs_info->ordered_extent_lock);
list_splice_init(&root->fs_info->ordered_extents, &splice);
@@ -482,15 +495,6 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root,
cur = splice.next;
ordered = list_entry(cur, struct btrfs_ordered_extent,
root_extent_list);
- if (nocow_only &&
- !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
- !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
- list_move(&ordered->root_extent_list,
- &root->fs_info->ordered_extents);
- cond_resched_lock(&root->fs_info->ordered_extent_lock);
- continue;
- }
-
list_del_init(&ordered->root_extent_list);
atomic_inc(&ordered->refs);
@@ -502,19 +506,32 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root,
spin_unlock(&root->fs_info->ordered_extent_lock);
if (inode) {
- btrfs_start_ordered_extent(inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
- if (delay_iput)
- btrfs_add_delayed_iput(inode);
- else
- iput(inode);
+ ordered->flush_work.func = btrfs_run_ordered_extent_work;
+ list_add_tail(&ordered->work_list, &works);
+ btrfs_queue_worker(&root->fs_info->flush_workers,
+ &ordered->flush_work);
} else {
btrfs_put_ordered_extent(ordered);
}
+ cond_resched();
spin_lock(&root->fs_info->ordered_extent_lock);
}
spin_unlock(&root->fs_info->ordered_extent_lock);
+
+ list_for_each_entry_safe(ordered, next, &works, work_list) {
+ list_del_init(&ordered->work_list);
+ wait_for_completion(&ordered->completion);
+
+ inode = ordered->inode;
+ btrfs_put_ordered_extent(ordered);
+ if (delay_iput)
+ btrfs_add_delayed_iput(inode);
+ else
+ iput(inode);
+
+ cond_resched();
+ }
}
/*
@@ -527,13 +544,17 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root,
* extra check to make sure the ordered operation list really is empty
* before we return
*/
-void btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
{
struct btrfs_inode *btrfs_inode;
struct inode *inode;
struct list_head splice;
+ struct list_head works;
+ struct btrfs_delalloc_work *work, *next;
+ int ret = 0;
INIT_LIST_HEAD(&splice);
+ INIT_LIST_HEAD(&works);
mutex_lock(&root->fs_info->ordered_operations_mutex);
spin_lock(&root->fs_info->ordered_extent_lock);
@@ -541,6 +562,7 @@ again:
list_splice_init(&root->fs_info->ordered_operations, &splice);
while (!list_empty(&splice)) {
+
btrfs_inode = list_entry(splice.next, struct btrfs_inode,
ordered_operations);
@@ -557,15 +579,26 @@ again:
list_add_tail(&BTRFS_I(inode)->ordered_operations,
&root->fs_info->ordered_operations);
}
+
+ if (!inode)
+ continue;
spin_unlock(&root->fs_info->ordered_extent_lock);
- if (inode) {
- if (wait)
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
- else
- filemap_flush(inode->i_mapping);
- btrfs_add_delayed_iput(inode);
+ work = btrfs_alloc_delalloc_work(inode, wait, 1);
+ if (!work) {
+ if (list_empty(&BTRFS_I(inode)->ordered_operations))
+ list_add_tail(&btrfs_inode->ordered_operations,
+ &splice);
+ spin_lock(&root->fs_info->ordered_extent_lock);
+ list_splice_tail(&splice,
+ &root->fs_info->ordered_operations);
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+ ret = -ENOMEM;
+ goto out;
}
+ list_add_tail(&work->list, &works);
+ btrfs_queue_worker(&root->fs_info->flush_workers,
+ &work->work);
cond_resched();
spin_lock(&root->fs_info->ordered_extent_lock);
@@ -574,7 +607,13 @@ again:
goto again;
spin_unlock(&root->fs_info->ordered_extent_lock);
+out:
+ list_for_each_entry_safe(work, next, &works, list) {
+ list_del_init(&work->list);
+ btrfs_wait_and_free_delalloc_work(work);
+ }
mutex_unlock(&root->fs_info->ordered_operations_mutex);
+ return ret;
}
/*
@@ -596,7 +635,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
/*
* pages in the range can be dirty, clean or writeback. We
* start IO on any dirty ones so the wait doesn't stall waiting
- * for pdflush to find them
+ * for the flusher thread to find them
*/
if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
filemap_fdatawrite_range(inode->i_mapping, start, end);
@@ -614,7 +653,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
u64 end;
u64 orig_end;
struct btrfs_ordered_extent *ordered;
- int found;
if (start + len < start) {
orig_end = INT_LIMIT(loff_t);
@@ -650,7 +688,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
filemap_fdatawait_range(inode->i_mapping, start, orig_end);
end = orig_end;
- found = 0;
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, end);
if (!ordered)
@@ -663,7 +700,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
btrfs_put_ordered_extent(ordered);
break;
}
- found++;
btrfs_start_ordered_extent(inode, ordered, 1);
end = ordered->file_offset;
btrfs_put_ordered_extent(ordered);
@@ -775,7 +811,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
u64 disk_i_size;
u64 new_i_size;
- u64 i_size_test;
u64 i_size = i_size_read(inode);
struct rb_node *node;
struct rb_node *prev = NULL;
@@ -801,9 +836,16 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
* if the disk i_size is already at the inode->i_size, or
* this ordered extent is inside the disk i_size, we're done
*/
- if (disk_i_size == i_size || offset <= disk_i_size) {
+ if (disk_i_size == i_size)
+ goto out;
+
+ /*
+ * We still need to update disk_i_size if outstanding_isize is greater
+ * than disk_i_size.
+ */
+ if (offset <= disk_i_size &&
+ (!ordered || ordered->outstanding_isize <= disk_i_size))
goto out;
- }
/*
* walk backward from this ordered extent to disk_i_size.
@@ -835,55 +877,30 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
break;
if (test->file_offset >= i_size)
break;
- if (test->file_offset >= disk_i_size)
+ if (entry_end(test) > disk_i_size) {
+ /*
+ * we don't update disk_i_size now, so record this
+ * undealt i_size. Or we will not know the real
+ * i_size.
+ */
+ if (test->outstanding_isize < offset)
+ test->outstanding_isize = offset;
+ if (ordered &&
+ ordered->outstanding_isize >
+ test->outstanding_isize)
+ test->outstanding_isize =
+ ordered->outstanding_isize;
goto out;
- }
- new_i_size = min_t(u64, offset, i_size);
-
- /*
- * at this point, we know we can safely update i_size to at least
- * the offset from this ordered extent. But, we need to
- * walk forward and see if ios from higher up in the file have
- * finished.
- */
- if (ordered) {
- node = rb_next(&ordered->rb_node);
- } else {
- if (prev)
- node = rb_next(prev);
- else
- node = rb_first(&tree->tree);
- }
-
- /*
- * We are looking for an area between our current extent and the next
- * ordered extent to update the i_size to. There are 3 cases here
- *
- * 1) We don't actually have anything and we can update to i_size.
- * 2) We have stuff but they already did their i_size update so again we
- * can just update to i_size.
- * 3) We have an outstanding ordered extent so the most we can update
- * our disk_i_size to is the start of the next offset.
- */
- i_size_test = i_size;
- for (; node; node = rb_next(node)) {
- test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-
- if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
- continue;
- if (test->file_offset > offset) {
- i_size_test = test->file_offset;
- break;
}
}
+ new_i_size = min_t(u64, offset, i_size);
/*
- * i_size_test is the end of a region after this ordered
- * extent where there are no ordered extents, we can safely set
- * disk_i_size to this.
+ * Some ordered extents may completed before the current one, and
+ * we hold the real i_size in ->outstanding_isize.
*/
- if (i_size_test > offset)
- new_i_size = min_t(u64, i_size_test, i_size);
+ if (ordered && ordered->outstanding_isize > new_i_size)
+ new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
BTRFS_I(inode)->disk_i_size = new_i_size;
ret = 0;
out:
@@ -968,15 +985,6 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
if (last_mod < root->fs_info->last_trans_committed)
return;
- /*
- * the transaction is already committing. Just start the IO and
- * don't bother with all of this list nonsense
- */
- if (trans && root->fs_info->running_transaction->blocked) {
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
- return;
- }
-
spin_lock(&root->fs_info->ordered_extent_lock);
if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
list_add_tail(&BTRFS_I(inode)->ordered_operations,
@@ -984,3 +992,21 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
}
spin_unlock(&root->fs_info->ordered_extent_lock);
}
+
+int __init ordered_data_init(void)
+{
+ btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
+ sizeof(struct btrfs_ordered_extent), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!btrfs_ordered_extent_cache)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void ordered_data_exit(void)
+{
+ if (btrfs_ordered_extent_cache)
+ kmem_cache_destroy(btrfs_ordered_extent_cache);
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index e03c560d299..f29d4bf5fbe 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -76,7 +76,7 @@ struct btrfs_ordered_sum {
#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
-#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates wether this ordered extent
+#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
* has done its due diligence in updating
* the isize. */
@@ -96,6 +96,13 @@ struct btrfs_ordered_extent {
/* number of bytes that still need writing */
u64 bytes_left;
+ /*
+ * the end of the ordered extent which is behind it but
+ * didn't update disk_i_size. Please see the comment of
+ * btrfs_ordered_update_i_size();
+ */
+ u64 outstanding_isize;
+
/* flags (described above) */
unsigned long flags;
@@ -121,8 +128,11 @@ struct btrfs_ordered_extent {
struct list_head root_extent_list;
struct btrfs_work work;
-};
+ struct completion completion;
+ struct btrfs_work flush_work;
+ struct list_head work_list;
+};
/*
* calculates the total size you need to allocate for an ordered sum
@@ -179,10 +189,11 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
-void btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
-void btrfs_wait_ordered_extents(struct btrfs_root *root,
- int nocow_only, int delay_iput);
+void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
+int __init ordered_data_init(void);
+void ordered_data_exit(void);
#endif
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 5e23684887e..50d95fd190a 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -297,6 +297,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
case BTRFS_DEV_STATS_KEY:
printk(KERN_INFO "\t\tdevice stats\n");
break;
+ case BTRFS_DEV_REPLACE_KEY:
+ printk(KERN_INFO "\t\tdev replace\n");
+ break;
};
}
}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
new file mode 100644
index 00000000000..a5c85623432
--- /dev/null
+++ b/fs/btrfs/qgroup.c
@@ -0,0 +1,1608 @@
+/*
+ * Copyright (C) 2011 STRATO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "ulist.h"
+#include "ioctl.h"
+#include "backref.h"
+
+/* TODO XXX FIXME
+ * - subvol delete -> delete when ref goes to 0? delete limits also?
+ * - reorganize keys
+ * - compressed
+ * - sync
+ * - rescan
+ * - copy also limits on subvol creation
+ * - limit
+ * - caches fuer ulists
+ * - performance benchmarks
+ * - check all ioctl parameters
+ */
+
+/*
+ * one struct for each qgroup, organized in fs_info->qgroup_tree.
+ */
+struct btrfs_qgroup {
+ u64 qgroupid;
+
+ /*
+ * state
+ */
+ u64 rfer; /* referenced */
+ u64 rfer_cmpr; /* referenced compressed */
+ u64 excl; /* exclusive */
+ u64 excl_cmpr; /* exclusive compressed */
+
+ /*
+ * limits
+ */
+ u64 lim_flags; /* which limits are set */
+ u64 max_rfer;
+ u64 max_excl;
+ u64 rsv_rfer;
+ u64 rsv_excl;
+
+ /*
+ * reservation tracking
+ */
+ u64 reserved;
+
+ /*
+ * lists
+ */
+ struct list_head groups; /* groups this group is member of */
+ struct list_head members; /* groups that are members of this group */
+ struct list_head dirty; /* dirty groups */
+ struct rb_node node; /* tree of qgroups */
+
+ /*
+ * temp variables for accounting operations
+ */
+ u64 tag;
+ u64 refcnt;
+};
+
+/*
+ * glue structure to represent the relations between qgroups.
+ */
+struct btrfs_qgroup_list {
+ struct list_head next_group;
+ struct list_head next_member;
+ struct btrfs_qgroup *group;
+ struct btrfs_qgroup *member;
+};
+
+/* must be called with qgroup_lock held */
+static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
+ u64 qgroupid)
+{
+ struct rb_node *n = fs_info->qgroup_tree.rb_node;
+ struct btrfs_qgroup *qgroup;
+
+ while (n) {
+ qgroup = rb_entry(n, struct btrfs_qgroup, node);
+ if (qgroup->qgroupid < qgroupid)
+ n = n->rb_left;
+ else if (qgroup->qgroupid > qgroupid)
+ n = n->rb_right;
+ else
+ return qgroup;
+ }
+ return NULL;
+}
+
+/* must be called with qgroup_lock held */
+static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
+ u64 qgroupid)
+{
+ struct rb_node **p = &fs_info->qgroup_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct btrfs_qgroup *qgroup;
+
+ while (*p) {
+ parent = *p;
+ qgroup = rb_entry(parent, struct btrfs_qgroup, node);
+
+ if (qgroup->qgroupid < qgroupid)
+ p = &(*p)->rb_left;
+ else if (qgroup->qgroupid > qgroupid)
+ p = &(*p)->rb_right;
+ else
+ return qgroup;
+ }
+
+ qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
+ if (!qgroup)
+ return ERR_PTR(-ENOMEM);
+
+ qgroup->qgroupid = qgroupid;
+ INIT_LIST_HEAD(&qgroup->groups);
+ INIT_LIST_HEAD(&qgroup->members);
+ INIT_LIST_HEAD(&qgroup->dirty);
+
+ rb_link_node(&qgroup->node, parent, p);
+ rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
+
+ return qgroup;
+}
+
+/* must be called with qgroup_lock held */
+static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
+{
+ struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);
+ struct btrfs_qgroup_list *list;
+
+ if (!qgroup)
+ return -ENOENT;
+
+ rb_erase(&qgroup->node, &fs_info->qgroup_tree);
+ list_del(&qgroup->dirty);
+
+ while (!list_empty(&qgroup->groups)) {
+ list = list_first_entry(&qgroup->groups,
+ struct btrfs_qgroup_list, next_group);
+ list_del(&list->next_group);
+ list_del(&list->next_member);
+ kfree(list);
+ }
+
+ while (!list_empty(&qgroup->members)) {
+ list = list_first_entry(&qgroup->members,
+ struct btrfs_qgroup_list, next_member);
+ list_del(&list->next_group);
+ list_del(&list->next_member);
+ kfree(list);
+ }
+ kfree(qgroup);
+
+ return 0;
+}
+
+/* must be called with qgroup_lock held */
+static int add_relation_rb(struct btrfs_fs_info *fs_info,
+ u64 memberid, u64 parentid)
+{
+ struct btrfs_qgroup *member;
+ struct btrfs_qgroup *parent;
+ struct btrfs_qgroup_list *list;
+
+ member = find_qgroup_rb(fs_info, memberid);
+ parent = find_qgroup_rb(fs_info, parentid);
+ if (!member || !parent)
+ return -ENOENT;
+
+ list = kzalloc(sizeof(*list), GFP_ATOMIC);
+ if (!list)
+ return -ENOMEM;
+
+ list->group = parent;
+ list->member = member;
+ list_add_tail(&list->next_group, &member->groups);
+ list_add_tail(&list->next_member, &parent->members);
+
+ return 0;
+}
+
+/* must be called with qgroup_lock held */
+static int del_relation_rb(struct btrfs_fs_info *fs_info,
+ u64 memberid, u64 parentid)
+{
+ struct btrfs_qgroup *member;
+ struct btrfs_qgroup *parent;
+ struct btrfs_qgroup_list *list;
+
+ member = find_qgroup_rb(fs_info, memberid);
+ parent = find_qgroup_rb(fs_info, parentid);
+ if (!member || !parent)
+ return -ENOENT;
+
+ list_for_each_entry(list, &member->groups, next_group) {
+ if (list->group == parent) {
+ list_del(&list->next_group);
+ list_del(&list->next_member);
+ kfree(list);
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+/*
+ * The full config is read in one go, only called from open_ctree()
+ * It doesn't use any locking, as at this point we're still single-threaded
+ */
+int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_root *quota_root = fs_info->quota_root;
+ struct btrfs_path *path = NULL;
+ struct extent_buffer *l;
+ int slot;
+ int ret = 0;
+ u64 flags = 0;
+
+ if (!fs_info->quota_enabled)
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* default this to quota off, in case no status key is found */
+ fs_info->qgroup_flags = 0;
+
+ /*
+ * pass 1: read status, all qgroup infos and limits
+ */
+ key.objectid = 0;
+ key.type = 0;
+ key.offset = 0;
+ ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
+ if (ret)
+ goto out;
+
+ while (1) {
+ struct btrfs_qgroup *qgroup;
+
+ slot = path->slots[0];
+ l = path->nodes[0];
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+
+ if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
+ struct btrfs_qgroup_status_item *ptr;
+
+ ptr = btrfs_item_ptr(l, slot,
+ struct btrfs_qgroup_status_item);
+
+ if (btrfs_qgroup_status_version(l, ptr) !=
+ BTRFS_QGROUP_STATUS_VERSION) {
+ printk(KERN_ERR
+ "btrfs: old qgroup version, quota disabled\n");
+ goto out;
+ }
+ if (btrfs_qgroup_status_generation(l, ptr) !=
+ fs_info->generation) {
+ flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ printk(KERN_ERR
+ "btrfs: qgroup generation mismatch, "
+ "marked as inconsistent\n");
+ }
+ fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
+ ptr);
+ /* FIXME read scan element */
+ goto next1;
+ }
+
+ if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
+ found_key.type != BTRFS_QGROUP_LIMIT_KEY)
+ goto next1;
+
+ qgroup = find_qgroup_rb(fs_info, found_key.offset);
+ if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
+ (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
+ printk(KERN_ERR "btrfs: inconsitent qgroup config\n");
+ flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ }
+ if (!qgroup) {
+ qgroup = add_qgroup_rb(fs_info, found_key.offset);
+ if (IS_ERR(qgroup)) {
+ ret = PTR_ERR(qgroup);
+ goto out;
+ }
+ }
+ switch (found_key.type) {
+ case BTRFS_QGROUP_INFO_KEY: {
+ struct btrfs_qgroup_info_item *ptr;
+
+ ptr = btrfs_item_ptr(l, slot,
+ struct btrfs_qgroup_info_item);
+ qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
+ qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
+ qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
+ qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
+ /* generation currently unused */
+ break;
+ }
+ case BTRFS_QGROUP_LIMIT_KEY: {
+ struct btrfs_qgroup_limit_item *ptr;
+
+ ptr = btrfs_item_ptr(l, slot,
+ struct btrfs_qgroup_limit_item);
+ qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
+ qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
+ qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
+ qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
+ qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
+ break;
+ }
+ }
+next1:
+ ret = btrfs_next_item(quota_root, path);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ break;
+ }
+ btrfs_release_path(path);
+
+ /*
+ * pass 2: read all qgroup relations
+ */
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_RELATION_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
+ if (ret)
+ goto out;
+ while (1) {
+ slot = path->slots[0];
+ l = path->nodes[0];
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+
+ if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
+ goto next2;
+
+ if (found_key.objectid > found_key.offset) {
+ /* parent <- member, not needed to build config */
+ /* FIXME should we omit the key completely? */
+ goto next2;
+ }
+
+ ret = add_relation_rb(fs_info, found_key.objectid,
+ found_key.offset);
+ if (ret == -ENOENT) {
+ printk(KERN_WARNING
+ "btrfs: orphan qgroup relation 0x%llx->0x%llx\n",
+ (unsigned long long)found_key.objectid,
+ (unsigned long long)found_key.offset);
+ ret = 0; /* ignore the error */
+ }
+ if (ret)
+ goto out;
+next2:
+ ret = btrfs_next_item(quota_root, path);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ break;
+ }
+out:
+ fs_info->qgroup_flags |= flags;
+ if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
+ fs_info->quota_enabled = 0;
+ fs_info->pending_quota_state = 0;
+ }
+ btrfs_free_path(path);
+
+ return ret < 0 ? ret : 0;
+}
+
+/*
+ * This is only called from close_ctree() or open_ctree(), both in single-
+ * treaded paths. Clean up the in-memory structures. No locking needed.
+ */
+void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
+{
+ struct rb_node *n;
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup_list *list;
+
+ while ((n = rb_first(&fs_info->qgroup_tree))) {
+ qgroup = rb_entry(n, struct btrfs_qgroup, node);
+ rb_erase(n, &fs_info->qgroup_tree);
+
+ WARN_ON(!list_empty(&qgroup->dirty));
+
+ while (!list_empty(&qgroup->groups)) {
+ list = list_first_entry(&qgroup->groups,
+ struct btrfs_qgroup_list,
+ next_group);
+ list_del(&list->next_group);
+ list_del(&list->next_member);
+ kfree(list);
+ }
+
+ while (!list_empty(&qgroup->members)) {
+ list = list_first_entry(&qgroup->members,
+ struct btrfs_qgroup_list,
+ next_member);
+ list_del(&list->next_group);
+ list_del(&list->next_member);
+ kfree(list);
+ }
+ kfree(qgroup);
+ }
+}
+
+static int add_qgroup_relation_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *quota_root,
+ u64 src, u64 dst)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = src;
+ key.type = BTRFS_QGROUP_RELATION_KEY;
+ key.offset = dst;
+
+ ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
+
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int del_qgroup_relation_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *quota_root,
+ u64 src, u64 dst)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = src;
+ key.type = BTRFS_QGROUP_RELATION_KEY;
+ key.offset = dst;
+
+ ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, quota_root, path);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int add_qgroup_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *quota_root, u64 qgroupid)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_qgroup_info_item *qgroup_info;
+ struct btrfs_qgroup_limit_item *qgroup_limit;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_INFO_KEY;
+ key.offset = qgroupid;
+
+ ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
+ sizeof(*qgroup_info));
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_qgroup_info_item);
+ btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
+ btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
+ btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
+ btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
+ btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);
+
+ btrfs_mark_buffer_dirty(leaf);
+
+ btrfs_release_path(path);
+
+ key.type = BTRFS_QGROUP_LIMIT_KEY;
+ ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
+ sizeof(*qgroup_limit));
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_qgroup_limit_item);
+ btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
+ btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
+ btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
+ btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
+ btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
+
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int del_qgroup_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *quota_root, u64 qgroupid)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_INFO_KEY;
+ key.offset = qgroupid;
+ ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, quota_root, path);
+ if (ret)
+ goto out;
+
+ btrfs_release_path(path);
+
+ key.type = BTRFS_QGROUP_LIMIT_KEY;
+ ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, quota_root, path);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 qgroupid,
+ u64 flags, u64 max_rfer, u64 max_excl,
+ u64 rsv_rfer, u64 rsv_excl)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *l;
+ struct btrfs_qgroup_limit_item *qgroup_limit;
+ int ret;
+ int slot;
+
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_LIMIT_KEY;
+ key.offset = qgroupid;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+
+ if (ret)
+ goto out;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ qgroup_limit = btrfs_item_ptr(l, path->slots[0],
+ struct btrfs_qgroup_limit_item);
+ btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags);
+ btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer);
+ btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl);
+ btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, rsv_rfer);
+ btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, rsv_excl);
+
+ btrfs_mark_buffer_dirty(l);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_qgroup *qgroup)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *l;
+ struct btrfs_qgroup_info_item *qgroup_info;
+ int ret;
+ int slot;
+
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_INFO_KEY;
+ key.offset = qgroup->qgroupid;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+
+ if (ret)
+ goto out;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ qgroup_info = btrfs_item_ptr(l, path->slots[0],
+ struct btrfs_qgroup_info_item);
+ btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
+ btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
+ btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
+ btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
+ btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
+
+ btrfs_mark_buffer_dirty(l);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *l;
+ struct btrfs_qgroup_status_item *ptr;
+ int ret;
+ int slot;
+
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_STATUS_KEY;
+ key.offset = 0;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+
+ if (ret)
+ goto out;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
+ btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
+ btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
+ /* XXX scan */
+
+ btrfs_mark_buffer_dirty(l);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * called with qgroup_lock held
+ */
+static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret;
+
+ if (!root)
+ return -EINVAL;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (1) {
+ key.objectid = 0;
+ key.offset = 0;
+ key.type = 0;
+
+ path->leave_spinning = 1;
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ } else if (ret < 0) {
+ break;
+ }
+
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ }
+ ret = 0;
+out:
+ root->fs_info->pending_quota_state = 0;
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_quota_enable(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *quota_root;
+ struct btrfs_path *path = NULL;
+ struct btrfs_qgroup_status_item *ptr;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ int ret = 0;
+
+ spin_lock(&fs_info->qgroup_lock);
+ if (fs_info->quota_root) {
+ fs_info->pending_quota_state = 1;
+ spin_unlock(&fs_info->qgroup_lock);
+ goto out;
+ }
+ spin_unlock(&fs_info->qgroup_lock);
+
+ /*
+ * initially create the quota tree
+ */
+ quota_root = btrfs_create_tree(trans, fs_info,
+ BTRFS_QUOTA_TREE_OBJECTID);
+ if (IS_ERR(quota_root)) {
+ ret = PTR_ERR(quota_root);
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out_free_root;
+ }
+
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_STATUS_KEY;
+ key.offset = 0;
+
+ ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
+ sizeof(*ptr));
+ if (ret)
+ goto out_free_path;
+
+ leaf = path->nodes[0];
+ ptr = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_qgroup_status_item);
+ btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
+ btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
+ fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
+ BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
+ btrfs_set_qgroup_status_scan(leaf, ptr, 0);
+
+ btrfs_mark_buffer_dirty(leaf);
+
+ spin_lock(&fs_info->qgroup_lock);
+ fs_info->quota_root = quota_root;
+ fs_info->pending_quota_state = 1;
+ spin_unlock(&fs_info->qgroup_lock);
+out_free_path:
+ btrfs_free_path(path);
+out_free_root:
+ if (ret) {
+ free_extent_buffer(quota_root->node);
+ free_extent_buffer(quota_root->commit_root);
+ kfree(quota_root);
+ }
+out:
+ return ret;
+}
+
+int btrfs_quota_disable(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *quota_root;
+ int ret = 0;
+
+ spin_lock(&fs_info->qgroup_lock);
+ fs_info->quota_enabled = 0;
+ fs_info->pending_quota_state = 0;
+ quota_root = fs_info->quota_root;
+ fs_info->quota_root = NULL;
+ btrfs_free_qgroup_config(fs_info);
+ spin_unlock(&fs_info->qgroup_lock);
+
+ if (!quota_root)
+ return -EINVAL;
+
+ ret = btrfs_clean_quota_tree(trans, quota_root);
+ if (ret)
+ goto out;
+
+ ret = btrfs_del_root(trans, tree_root, &quota_root->root_key);
+ if (ret)
+ goto out;
+
+ list_del(&quota_root->dirty_list);
+
+ btrfs_tree_lock(quota_root->node);
+ clean_tree_block(trans, tree_root, quota_root->node);
+ btrfs_tree_unlock(quota_root->node);
+ btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
+
+ free_extent_buffer(quota_root->node);
+ free_extent_buffer(quota_root->commit_root);
+ kfree(quota_root);
+out:
+ return ret;
+}
+
+int btrfs_quota_rescan(struct btrfs_fs_info *fs_info)
+{
+ /* FIXME */
+ return 0;
+}
+
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 src, u64 dst)
+{
+ struct btrfs_root *quota_root;
+ int ret = 0;
+
+ quota_root = fs_info->quota_root;
+ if (!quota_root)
+ return -EINVAL;
+
+ ret = add_qgroup_relation_item(trans, quota_root, src, dst);
+ if (ret)
+ return ret;
+
+ ret = add_qgroup_relation_item(trans, quota_root, dst, src);
+ if (ret) {
+ del_qgroup_relation_item(trans, quota_root, src, dst);
+ return ret;
+ }
+
+ spin_lock(&fs_info->qgroup_lock);
+ ret = add_relation_rb(quota_root->fs_info, src, dst);
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return ret;
+}
+
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 src, u64 dst)
+{
+ struct btrfs_root *quota_root;
+ int ret = 0;
+ int err;
+
+ quota_root = fs_info->quota_root;
+ if (!quota_root)
+ return -EINVAL;
+
+ ret = del_qgroup_relation_item(trans, quota_root, src, dst);
+ err = del_qgroup_relation_item(trans, quota_root, dst, src);
+ if (err && !ret)
+ ret = err;
+
+ spin_lock(&fs_info->qgroup_lock);
+ del_relation_rb(fs_info, src, dst);
+
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return ret;
+}
+
+int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid, char *name)
+{
+ struct btrfs_root *quota_root;
+ struct btrfs_qgroup *qgroup;
+ int ret = 0;
+
+ quota_root = fs_info->quota_root;
+ if (!quota_root)
+ return -EINVAL;
+
+ ret = add_qgroup_item(trans, quota_root, qgroupid);
+
+ spin_lock(&fs_info->qgroup_lock);
+ qgroup = add_qgroup_rb(fs_info, qgroupid);
+ spin_unlock(&fs_info->qgroup_lock);
+
+ if (IS_ERR(qgroup))
+ ret = PTR_ERR(qgroup);
+
+ return ret;
+}
+
+int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid)
+{
+ struct btrfs_root *quota_root;
+ struct btrfs_qgroup *qgroup;
+ int ret = 0;
+
+ quota_root = fs_info->quota_root;
+ if (!quota_root)
+ return -EINVAL;
+
+ /* check if there are no relations to this qgroup */
+ spin_lock(&fs_info->qgroup_lock);
+ qgroup = find_qgroup_rb(fs_info, qgroupid);
+ if (qgroup) {
+ if (!list_empty(&qgroup->groups) || !list_empty(&qgroup->members)) {
+ spin_unlock(&fs_info->qgroup_lock);
+ return -EBUSY;
+ }
+ }
+ spin_unlock(&fs_info->qgroup_lock);
+
+ ret = del_qgroup_item(trans, quota_root, qgroupid);
+
+ spin_lock(&fs_info->qgroup_lock);
+ del_qgroup_rb(quota_root->fs_info, qgroupid);
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return ret;
+}
+
+int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid,
+ struct btrfs_qgroup_limit *limit)
+{
+ struct btrfs_root *quota_root = fs_info->quota_root;
+ struct btrfs_qgroup *qgroup;
+ int ret = 0;
+
+ if (!quota_root)
+ return -EINVAL;
+
+ ret = update_qgroup_limit_item(trans, quota_root, qgroupid,
+ limit->flags, limit->max_rfer,
+ limit->max_excl, limit->rsv_rfer,
+ limit->rsv_excl);
+ if (ret) {
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ printk(KERN_INFO "unable to update quota limit for %llu\n",
+ (unsigned long long)qgroupid);
+ }
+
+ spin_lock(&fs_info->qgroup_lock);
+
+ qgroup = find_qgroup_rb(fs_info, qgroupid);
+ if (!qgroup) {
+ ret = -ENOENT;
+ goto unlock;
+ }
+ qgroup->lim_flags = limit->flags;
+ qgroup->max_rfer = limit->max_rfer;
+ qgroup->max_excl = limit->max_excl;
+ qgroup->rsv_rfer = limit->rsv_rfer;
+ qgroup->rsv_excl = limit->rsv_excl;
+
+unlock:
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return ret;
+}
+
+static void qgroup_dirty(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup)
+{
+ if (list_empty(&qgroup->dirty))
+ list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
+}
+
+/*
+ * btrfs_qgroup_record_ref is called when the ref is added or deleted. it puts
+ * the modification into a list that's later used by btrfs_end_transaction to
+ * pass the recorded modifications on to btrfs_qgroup_account_ref.
+ */
+int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct qgroup_update *u;
+
+ BUG_ON(!trans->delayed_ref_elem.seq);
+ u = kmalloc(sizeof(*u), GFP_NOFS);
+ if (!u)
+ return -ENOMEM;
+
+ u->node = node;
+ u->extent_op = extent_op;
+ list_add_tail(&u->list, &trans->qgroup_ref_list);
+
+ return 0;
+}
+
+/*
+ * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
+ * from the fs. First, all roots referencing the extent are searched, and
+ * then the space is accounted accordingly to the different roots. The
+ * accounting algorithm works in 3 steps documented inline.
+ */
+int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct btrfs_key ins;
+ struct btrfs_root *quota_root;
+ u64 ref_root;
+ struct btrfs_qgroup *qgroup;
+ struct ulist_node *unode;
+ struct ulist *roots = NULL;
+ struct ulist *tmp = NULL;
+ struct ulist_iterator uiter;
+ u64 seq;
+ int ret = 0;
+ int sgn;
+
+ if (!fs_info->quota_enabled)
+ return 0;
+
+ BUG_ON(!fs_info->quota_root);
+
+ ins.objectid = node->bytenr;
+ ins.offset = node->num_bytes;
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+ if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
+ struct btrfs_delayed_tree_ref *ref;
+ ref = btrfs_delayed_node_to_tree_ref(node);
+ ref_root = ref->root;
+ } else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
+ node->type == BTRFS_SHARED_DATA_REF_KEY) {
+ struct btrfs_delayed_data_ref *ref;
+ ref = btrfs_delayed_node_to_data_ref(node);
+ ref_root = ref->root;
+ } else {
+ BUG();
+ }
+
+ if (!is_fstree(ref_root)) {
+ /*
+ * non-fs-trees are not being accounted
+ */
+ return 0;
+ }
+
+ switch (node->action) {
+ case BTRFS_ADD_DELAYED_REF:
+ case BTRFS_ADD_DELAYED_EXTENT:
+ sgn = 1;
+ break;
+ case BTRFS_DROP_DELAYED_REF:
+ sgn = -1;
+ break;
+ case BTRFS_UPDATE_DELAYED_HEAD:
+ return 0;
+ default:
+ BUG();
+ }
+
+ /*
+ * the delayed ref sequence number we pass depends on the direction of
+ * the operation. for add operations, we pass (node->seq - 1) to skip
+ * the delayed ref's current sequence number, because we need the state
+ * of the tree before the add operation. for delete operations, we pass
+ * (node->seq) to include the delayed ref's current sequence number,
+ * because we need the state of the tree after the delete operation.
+ */
+ ret = btrfs_find_all_roots(trans, fs_info, node->bytenr,
+ sgn > 0 ? node->seq - 1 : node->seq, &roots);
+ if (ret < 0)
+ goto out;
+
+ spin_lock(&fs_info->qgroup_lock);
+ quota_root = fs_info->quota_root;
+ if (!quota_root)
+ goto unlock;
+
+ qgroup = find_qgroup_rb(fs_info, ref_root);
+ if (!qgroup)
+ goto unlock;
+
+ /*
+ * step 1: for each old ref, visit all nodes once and inc refcnt
+ */
+ tmp = ulist_alloc(GFP_ATOMIC);
+ if (!tmp) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+ seq = fs_info->qgroup_seq;
+ fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
+
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(roots, &uiter))) {
+ struct ulist_node *tmp_unode;
+ struct ulist_iterator tmp_uiter;
+ struct btrfs_qgroup *qg;
+
+ qg = find_qgroup_rb(fs_info, unode->val);
+ if (!qg)
+ continue;
+
+ ulist_reinit(tmp);
+ /* XXX id not needed */
+ ulist_add(tmp, qg->qgroupid, (u64)(uintptr_t)qg, GFP_ATOMIC);
+ ULIST_ITER_INIT(&tmp_uiter);
+ while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
+ struct btrfs_qgroup_list *glist;
+
+ qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
+ if (qg->refcnt < seq)
+ qg->refcnt = seq + 1;
+ else
+ ++qg->refcnt;
+
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ulist_add(tmp, glist->group->qgroupid,
+ (u64)(uintptr_t)glist->group,
+ GFP_ATOMIC);
+ }
+ }
+ }
+
+ /*
+ * step 2: walk from the new root
+ */
+ ulist_reinit(tmp);
+ ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(tmp, &uiter))) {
+ struct btrfs_qgroup *qg;
+ struct btrfs_qgroup_list *glist;
+
+ qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
+ if (qg->refcnt < seq) {
+ /* not visited by step 1 */
+ qg->rfer += sgn * node->num_bytes;
+ qg->rfer_cmpr += sgn * node->num_bytes;
+ if (roots->nnodes == 0) {
+ qg->excl += sgn * node->num_bytes;
+ qg->excl_cmpr += sgn * node->num_bytes;
+ }
+ qgroup_dirty(fs_info, qg);
+ }
+ WARN_ON(qg->tag >= seq);
+ qg->tag = seq;
+
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ulist_add(tmp, glist->group->qgroupid,
+ (uintptr_t)glist->group, GFP_ATOMIC);
+ }
+ }
+
+ /*
+ * step 3: walk again from old refs
+ */
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(roots, &uiter))) {
+ struct btrfs_qgroup *qg;
+ struct ulist_node *tmp_unode;
+ struct ulist_iterator tmp_uiter;
+
+ qg = find_qgroup_rb(fs_info, unode->val);
+ if (!qg)
+ continue;
+
+ ulist_reinit(tmp);
+ ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC);
+ ULIST_ITER_INIT(&tmp_uiter);
+ while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
+ struct btrfs_qgroup_list *glist;
+
+ qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
+ if (qg->tag == seq)
+ continue;
+
+ if (qg->refcnt - seq == roots->nnodes) {
+ qg->excl -= sgn * node->num_bytes;
+ qg->excl_cmpr -= sgn * node->num_bytes;
+ qgroup_dirty(fs_info, qg);
+ }
+
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ulist_add(tmp, glist->group->qgroupid,
+ (uintptr_t)glist->group,
+ GFP_ATOMIC);
+ }
+ }
+ }
+ ret = 0;
+unlock:
+ spin_unlock(&fs_info->qgroup_lock);
+out:
+ ulist_free(roots);
+ ulist_free(tmp);
+
+ return ret;
+}
+
+/*
+ * called from commit_transaction. Writes all changed qgroups to disk.
+ */
+int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *quota_root = fs_info->quota_root;
+ int ret = 0;
+
+ if (!quota_root)
+ goto out;
+
+ fs_info->quota_enabled = fs_info->pending_quota_state;
+
+ spin_lock(&fs_info->qgroup_lock);
+ while (!list_empty(&fs_info->dirty_qgroups)) {
+ struct btrfs_qgroup *qgroup;
+ qgroup = list_first_entry(&fs_info->dirty_qgroups,
+ struct btrfs_qgroup, dirty);
+ list_del_init(&qgroup->dirty);
+ spin_unlock(&fs_info->qgroup_lock);
+ ret = update_qgroup_info_item(trans, quota_root, qgroup);
+ if (ret)
+ fs_info->qgroup_flags |=
+ BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ spin_lock(&fs_info->qgroup_lock);
+ }
+ if (fs_info->quota_enabled)
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
+ else
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ ret = update_qgroup_status_item(trans, fs_info, quota_root);
+ if (ret)
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+
+out:
+
+ return ret;
+}
+
+/*
+ * copy the acounting information between qgroups. This is necessary when a
+ * snapshot or a subvolume is created
+ */
+int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
+ struct btrfs_qgroup_inherit *inherit)
+{
+ int ret = 0;
+ int i;
+ u64 *i_qgroups;
+ struct btrfs_root *quota_root = fs_info->quota_root;
+ struct btrfs_qgroup *srcgroup;
+ struct btrfs_qgroup *dstgroup;
+ u32 level_size = 0;
+
+ if (!fs_info->quota_enabled)
+ return 0;
+
+ if (!quota_root)
+ return -EINVAL;
+
+ /*
+ * create a tracking group for the subvol itself
+ */
+ ret = add_qgroup_item(trans, quota_root, objectid);
+ if (ret)
+ goto out;
+
+ if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
+ ret = update_qgroup_limit_item(trans, quota_root, objectid,
+ inherit->lim.flags,
+ inherit->lim.max_rfer,
+ inherit->lim.max_excl,
+ inherit->lim.rsv_rfer,
+ inherit->lim.rsv_excl);
+ if (ret)
+ goto out;
+ }
+
+ if (srcid) {
+ struct btrfs_root *srcroot;
+ struct btrfs_key srckey;
+ int srcroot_level;
+
+ srckey.objectid = srcid;
+ srckey.type = BTRFS_ROOT_ITEM_KEY;
+ srckey.offset = (u64)-1;
+ srcroot = btrfs_read_fs_root_no_name(fs_info, &srckey);
+ if (IS_ERR(srcroot)) {
+ ret = PTR_ERR(srcroot);
+ goto out;
+ }
+
+ rcu_read_lock();
+ srcroot_level = btrfs_header_level(srcroot->node);
+ level_size = btrfs_level_size(srcroot, srcroot_level);
+ rcu_read_unlock();
+ }
+
+ /*
+ * add qgroup to all inherited groups
+ */
+ if (inherit) {
+ i_qgroups = (u64 *)(inherit + 1);
+ for (i = 0; i < inherit->num_qgroups; ++i) {
+ ret = add_qgroup_relation_item(trans, quota_root,
+ objectid, *i_qgroups);
+ if (ret)
+ goto out;
+ ret = add_qgroup_relation_item(trans, quota_root,
+ *i_qgroups, objectid);
+ if (ret)
+ goto out;
+ ++i_qgroups;
+ }
+ }
+
+
+ spin_lock(&fs_info->qgroup_lock);
+
+ dstgroup = add_qgroup_rb(fs_info, objectid);
+ if (IS_ERR(dstgroup)) {
+ ret = PTR_ERR(dstgroup);
+ goto unlock;
+ }
+
+ if (srcid) {
+ srcgroup = find_qgroup_rb(fs_info, srcid);
+ if (!srcgroup)
+ goto unlock;
+ dstgroup->rfer = srcgroup->rfer - level_size;
+ dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size;
+ srcgroup->excl = level_size;
+ srcgroup->excl_cmpr = level_size;
+ qgroup_dirty(fs_info, dstgroup);
+ qgroup_dirty(fs_info, srcgroup);
+ }
+
+ if (!inherit)
+ goto unlock;
+
+ i_qgroups = (u64 *)(inherit + 1);
+ for (i = 0; i < inherit->num_qgroups; ++i) {
+ ret = add_relation_rb(quota_root->fs_info, objectid,
+ *i_qgroups);
+ if (ret)
+ goto unlock;
+ ++i_qgroups;
+ }
+
+ for (i = 0; i < inherit->num_ref_copies; ++i) {
+ struct btrfs_qgroup *src;
+ struct btrfs_qgroup *dst;
+
+ src = find_qgroup_rb(fs_info, i_qgroups[0]);
+ dst = find_qgroup_rb(fs_info, i_qgroups[1]);
+
+ if (!src || !dst) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ dst->rfer = src->rfer - level_size;
+ dst->rfer_cmpr = src->rfer_cmpr - level_size;
+ i_qgroups += 2;
+ }
+ for (i = 0; i < inherit->num_excl_copies; ++i) {
+ struct btrfs_qgroup *src;
+ struct btrfs_qgroup *dst;
+
+ src = find_qgroup_rb(fs_info, i_qgroups[0]);
+ dst = find_qgroup_rb(fs_info, i_qgroups[1]);
+
+ if (!src || !dst) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ dst->excl = src->excl + level_size;
+ dst->excl_cmpr = src->excl_cmpr + level_size;
+ i_qgroups += 2;
+ }
+
+unlock:
+ spin_unlock(&fs_info->qgroup_lock);
+out:
+ return ret;
+}
+
+/*
+ * reserve some space for a qgroup and all its parents. The reservation takes
+ * place with start_transaction or dealloc_reserve, similar to ENOSPC
+ * accounting. If not enough space is available, EDQUOT is returned.
+ * We assume that the requested space is new for all qgroups.
+ */
+int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
+{
+ struct btrfs_root *quota_root;
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 ref_root = root->root_key.objectid;
+ int ret = 0;
+ struct ulist *ulist = NULL;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+
+ if (!is_fstree(ref_root))
+ return 0;
+
+ if (num_bytes == 0)
+ return 0;
+
+ spin_lock(&fs_info->qgroup_lock);
+ quota_root = fs_info->quota_root;
+ if (!quota_root)
+ goto out;
+
+ qgroup = find_qgroup_rb(fs_info, ref_root);
+ if (!qgroup)
+ goto out;
+
+ /*
+ * in a first step, we check all affected qgroups if any limits would
+ * be exceeded
+ */
+ ulist = ulist_alloc(GFP_ATOMIC);
+ if (!ulist) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(ulist, &uiter))) {
+ struct btrfs_qgroup *qg;
+ struct btrfs_qgroup_list *glist;
+
+ qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
+
+ if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
+ qg->reserved + qg->rfer + num_bytes >
+ qg->max_rfer)
+ ret = -EDQUOT;
+
+ if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
+ qg->reserved + qg->excl + num_bytes >
+ qg->max_excl)
+ ret = -EDQUOT;
+
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ulist_add(ulist, glist->group->qgroupid,
+ (uintptr_t)glist->group, GFP_ATOMIC);
+ }
+ }
+ if (ret)
+ goto out;
+
+ /*
+ * no limits exceeded, now record the reservation into all qgroups
+ */
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(ulist, &uiter))) {
+ struct btrfs_qgroup *qg;
+
+ qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
+
+ qg->reserved += num_bytes;
+ }
+
+out:
+ spin_unlock(&fs_info->qgroup_lock);
+ ulist_free(ulist);
+
+ return ret;
+}
+
+void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
+{
+ struct btrfs_root *quota_root;
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct ulist *ulist = NULL;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ u64 ref_root = root->root_key.objectid;
+
+ if (!is_fstree(ref_root))
+ return;
+
+ if (num_bytes == 0)
+ return;
+
+ spin_lock(&fs_info->qgroup_lock);
+
+ quota_root = fs_info->quota_root;
+ if (!quota_root)
+ goto out;
+
+ qgroup = find_qgroup_rb(fs_info, ref_root);
+ if (!qgroup)
+ goto out;
+
+ ulist = ulist_alloc(GFP_ATOMIC);
+ if (!ulist) {
+ btrfs_std_error(fs_info, -ENOMEM);
+ goto out;
+ }
+ ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(ulist, &uiter))) {
+ struct btrfs_qgroup *qg;
+ struct btrfs_qgroup_list *glist;
+
+ qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
+
+ qg->reserved -= num_bytes;
+
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ulist_add(ulist, glist->group->qgroupid,
+ (uintptr_t)glist->group, GFP_ATOMIC);
+ }
+ }
+
+out:
+ spin_unlock(&fs_info->qgroup_lock);
+ ulist_free(ulist);
+}
+
+void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
+{
+ if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
+ return;
+ printk(KERN_ERR "btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %llu\n",
+ trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
+ trans->delayed_ref_elem.seq);
+ BUG();
+}
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 48a4882d8ad..96b93daa0bb 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -27,6 +27,7 @@
#include "volumes.h"
#include "disk-io.h"
#include "transaction.h"
+#include "dev-replace.h"
#undef DEBUG
@@ -68,7 +69,7 @@ struct reada_extent {
u32 blocksize;
int err;
struct list_head extctl;
- struct kref refcnt;
+ int refcnt;
spinlock_t lock;
struct reada_zone *zones[BTRFS_MAX_MIRRORS];
int nzones;
@@ -126,7 +127,7 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
spin_lock(&fs_info->reada_lock);
re = radix_tree_lookup(&fs_info->reada_tree, index);
if (re)
- kref_get(&re->refcnt);
+ re->refcnt++;
spin_unlock(&fs_info->reada_lock);
if (!re)
@@ -323,7 +324,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
struct reada_extent *re = NULL;
struct reada_extent *re_exist = NULL;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
struct btrfs_bio *bbio = NULL;
struct btrfs_device *dev;
struct btrfs_device *prev_dev;
@@ -332,11 +332,12 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
int nzones = 0;
int i;
unsigned long index = logical >> PAGE_CACHE_SHIFT;
+ int dev_replace_is_ongoing;
spin_lock(&fs_info->reada_lock);
re = radix_tree_lookup(&fs_info->reada_tree, index);
if (re)
- kref_get(&re->refcnt);
+ re->refcnt++;
spin_unlock(&fs_info->reada_lock);
if (re)
@@ -352,13 +353,14 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
re->top = *top;
INIT_LIST_HEAD(&re->extctl);
spin_lock_init(&re->lock);
- kref_init(&re->refcnt);
+ re->refcnt = 1;
/*
* map block
*/
length = blocksize;
- ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0);
+ ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length,
+ &bbio, 0);
if (ret || !bbio || length < blocksize)
goto error;
@@ -393,20 +395,25 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
}
/* insert extent in reada_tree + all per-device trees, all or nothing */
+ btrfs_dev_replace_lock(&fs_info->dev_replace);
spin_lock(&fs_info->reada_lock);
ret = radix_tree_insert(&fs_info->reada_tree, index, re);
if (ret == -EEXIST) {
re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
BUG_ON(!re_exist);
- kref_get(&re_exist->refcnt);
+ re_exist->refcnt++;
spin_unlock(&fs_info->reada_lock);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
goto error;
}
if (ret) {
spin_unlock(&fs_info->reada_lock);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
goto error;
}
prev_dev = NULL;
+ dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
+ &fs_info->dev_replace);
for (i = 0; i < nzones; ++i) {
dev = bbio->stripes[i].dev;
if (dev == prev_dev) {
@@ -419,21 +426,36 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
*/
continue;
}
+ if (!dev->bdev) {
+ /* cannot read ahead on missing device */
+ continue;
+ }
+ if (dev_replace_is_ongoing &&
+ dev == fs_info->dev_replace.tgtdev) {
+ /*
+ * as this device is selected for reading only as
+ * a last resort, skip it for read ahead.
+ */
+ continue;
+ }
prev_dev = dev;
ret = radix_tree_insert(&dev->reada_extents, index, re);
if (ret) {
while (--i >= 0) {
dev = bbio->stripes[i].dev;
BUG_ON(dev == NULL);
+ /* ignore whether the entry was inserted */
radix_tree_delete(&dev->reada_extents, index);
}
BUG_ON(fs_info == NULL);
radix_tree_delete(&fs_info->reada_tree, index);
spin_unlock(&fs_info->reada_lock);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
goto error;
}
}
spin_unlock(&fs_info->reada_lock);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
kfree(bbio);
return re;
@@ -465,10 +487,6 @@ error:
return re_exist;
}
-static void reada_kref_dummy(struct kref *kr)
-{
-}
-
static void reada_extent_put(struct btrfs_fs_info *fs_info,
struct reada_extent *re)
{
@@ -476,7 +494,7 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info,
unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
spin_lock(&fs_info->reada_lock);
- if (!kref_put(&re->refcnt, reada_kref_dummy)) {
+ if (--re->refcnt) {
spin_unlock(&fs_info->reada_lock);
return;
}
@@ -671,7 +689,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
return 0;
}
dev->reada_next = re->logical + re->blocksize;
- kref_get(&re->refcnt);
+ re->refcnt++;
spin_unlock(&fs_info->reada_lock);
@@ -919,7 +937,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
generation = btrfs_header_generation(node);
free_extent_buffer(node);
- reada_add_block(rc, start, &max_key, level, generation);
+ if (reada_add_block(rc, start, &max_key, level, generation)) {
+ kfree(rc);
+ return ERR_PTR(-ENOMEM);
+ }
reada_start_machine(root->fs_info);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 646ee21bb03..300e09ac365 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1239,10 +1239,11 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)
node->bytenr, &node->rb_node);
spin_unlock(&rc->reloc_root_tree.lock);
if (rb_node) {
- kfree(node);
btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found "
"for start=%llu while inserting into relocation "
- "tree\n");
+ "tree\n", node->bytenr);
+ kfree(node);
+ return -EEXIST;
}
list_add_tail(&root->root_list, &rc->reloc_roots);
@@ -2024,7 +2025,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
struct btrfs_root_item *root_item;
struct btrfs_path *path;
struct extent_buffer *leaf;
- unsigned long nr;
int level;
int max_level;
int replaced = 0;
@@ -2073,7 +2073,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
BUG_ON(IS_ERR(trans));
trans->block_rsv = rc->block_rsv;
- ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved);
+ ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
+ BTRFS_RESERVE_FLUSH_ALL);
if (ret) {
BUG_ON(ret != -EAGAIN);
ret = btrfs_commit_transaction(trans, root);
@@ -2124,10 +2125,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
path->slots[level]);
root_item->drop_level = level;
- nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
if (replaced && rc->stage == UPDATE_DATA_PTRS)
invalidate_extent_cache(root, &key, &next_key);
@@ -2154,10 +2154,9 @@ out:
btrfs_update_reloc_root(trans, root);
}
- nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
if (replaced && rc->stage == UPDATE_DATA_PTRS)
invalidate_extent_cache(root, &key, &next_key);
@@ -2183,7 +2182,8 @@ int prepare_to_merge(struct reloc_control *rc, int err)
again:
if (!err) {
num_bytes = rc->merging_rsv_size;
- ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
+ ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
+ BTRFS_RESERVE_FLUSH_ALL);
if (ret)
err = ret;
}
@@ -2458,7 +2458,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
num_bytes = calcu_metadata_size(rc, node, 1) * 2;
trans->block_rsv = rc->block_rsv;
- ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
+ ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
+ BTRFS_RESERVE_FLUSH_ALL);
if (ret) {
if (ret == -EAGAIN)
rc->commit_transaction = 1;
@@ -3258,7 +3259,6 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
struct btrfs_path *path;
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
- unsigned long nr;
int ret = 0;
if (inode)
@@ -3269,8 +3269,8 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
key.offset = 0;
inode = btrfs_iget(fs_info->sb, &key, root, NULL);
- if (IS_ERR_OR_NULL(inode) || is_bad_inode(inode)) {
- if (inode && !IS_ERR(inode))
+ if (IS_ERR(inode) || is_bad_inode(inode)) {
+ if (!IS_ERR(inode))
iput(inode);
return -ENOENT;
}
@@ -3292,9 +3292,8 @@ truncate:
ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
btrfs_free_path(path);
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
out:
iput(inode);
return ret;
@@ -3620,7 +3619,7 @@ next:
ret = find_first_extent_bit(&rc->processed_blocks,
key.objectid, &start, &end,
- EXTENT_DIRTY);
+ EXTENT_DIRTY, NULL);
if (ret == 0 && start <= key.objectid) {
btrfs_release_path(path);
@@ -3673,7 +3672,8 @@ int prepare_to_relocate(struct reloc_control *rc)
struct btrfs_trans_handle *trans;
int ret;
- rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root);
+ rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
+ BTRFS_BLOCK_RSV_TEMP);
if (!rc->block_rsv)
return -ENOMEM;
@@ -3683,7 +3683,8 @@ int prepare_to_relocate(struct reloc_control *rc)
* is no reservation in transaction handle.
*/
ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
- rc->extent_root->nodesize * 256);
+ rc->extent_root->nodesize * 256,
+ BTRFS_RESERVE_FLUSH_ALL);
if (ret)
return ret;
@@ -3709,7 +3710,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
struct btrfs_trans_handle *trans = NULL;
struct btrfs_path *path;
struct btrfs_extent_item *ei;
- unsigned long nr;
u64 flags;
u32 item_size;
int ret;
@@ -3826,9 +3826,8 @@ restart:
ret = btrfs_commit_transaction(trans, rc->extent_root);
BUG_ON(ret);
} else {
- nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, rc->extent_root);
- btrfs_btree_balance_dirty(rc->extent_root, nr);
+ btrfs_btree_balance_dirty(rc->extent_root);
}
trans = NULL;
@@ -3858,9 +3857,8 @@ restart:
GFP_NOFS);
if (trans) {
- nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, rc->extent_root);
- btrfs_btree_balance_dirty(rc->extent_root, nr);
+ btrfs_btree_balance_dirty(rc->extent_root);
}
if (!err) {
@@ -3939,7 +3937,6 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans;
struct btrfs_root *root;
struct btrfs_key key;
- unsigned long nr;
u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
int err = 0;
@@ -3967,9 +3964,8 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
err = btrfs_orphan_add(trans, inode);
out:
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
if (err) {
if (inode)
iput(inode);
@@ -4055,8 +4051,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
(unsigned long long)rc->block_group->key.objectid,
(unsigned long long)rc->block_group->flags);
- btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
- btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
+ ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ btrfs_wait_ordered_extents(fs_info->tree_root, 0);
while (1) {
mutex_lock(&fs_info->cleaner_mutex);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 24fb8ce4e07..668af537a3e 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -16,12 +16,55 @@
* Boston, MA 021110-1307, USA.
*/
+#include <linux/uuid.h>
#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "print-tree.h"
/*
+ * Read a root item from the tree. In case we detect a root item smaller then
+ * sizeof(root_item), we know it's an old version of the root structure and
+ * initialize all new fields to zero. The same happens if we detect mismatching
+ * generation numbers as then we know the root was once mounted with an older
+ * kernel that was not aware of the root item structure change.
+ */
+void btrfs_read_root_item(struct btrfs_root *root,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_root_item *item)
+{
+ uuid_le uuid;
+ int len;
+ int need_reset = 0;
+
+ len = btrfs_item_size_nr(eb, slot);
+ read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
+ min_t(int, len, (int)sizeof(*item)));
+ if (len < sizeof(*item))
+ need_reset = 1;
+ if (!need_reset && btrfs_root_generation(item)
+ != btrfs_root_generation_v2(item)) {
+ if (btrfs_root_generation_v2(item) != 0) {
+ printk(KERN_WARNING "btrfs: mismatching "
+ "generation and generation_v2 "
+ "found in root item. This root "
+ "was probably mounted with an "
+ "older kernel. Resetting all "
+ "new fields.\n");
+ }
+ need_reset = 1;
+ }
+ if (need_reset) {
+ memset(&item->generation_v2, 0,
+ sizeof(*item) - offsetof(struct btrfs_root_item,
+ generation_v2));
+
+ uuid_le_gen(&uuid);
+ memcpy(item->uuid, uuid.b, BTRFS_UUID_SIZE);
+ }
+}
+
+/*
* lookup the root with the highest offset for a given objectid. The key we do
* find is copied into 'key'. If we find something return 0, otherwise 1, < 0
* on error.
@@ -61,10 +104,10 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
goto out;
}
if (item)
- read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
- sizeof(*item));
+ btrfs_read_root_item(root, l, slot, item);
if (key)
memcpy(key, &found_key, sizeof(found_key));
+
ret = 0;
out:
btrfs_free_path(path);
@@ -91,6 +134,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
int ret;
int slot;
unsigned long ptr;
+ int old_len;
path = btrfs_alloc_path();
if (!path)
@@ -113,6 +157,45 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
l = path->nodes[0];
slot = path->slots[0];
ptr = btrfs_item_ptr_offset(l, slot);
+ old_len = btrfs_item_size_nr(l, slot);
+
+ /*
+ * If this is the first time we update the root item which originated
+ * from an older kernel, we need to enlarge the item size to make room
+ * for the added fields.
+ */
+ if (old_len < sizeof(*item)) {
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(trans, root, key, path,
+ -1, 1);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, root, path);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, root, path,
+ key, sizeof(*item));
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+ l = path->nodes[0];
+ slot = path->slots[0];
+ ptr = btrfs_item_ptr_offset(l, slot);
+ }
+
+ /*
+ * Update generation_v2 so at the next mount we know the new root
+ * fields are valid.
+ */
+ btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
+
write_extent_buffer(l, item, ptr, sizeof(*item));
btrfs_mark_buffer_dirty(path->nodes[0]);
out:
@@ -123,6 +206,10 @@ out:
int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_key *key, struct btrfs_root_item *item)
{
+ /*
+ * Make sure generation v1 and v2 match. See update_root for details.
+ */
+ btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
return btrfs_insert_item(trans, root, key, item, sizeof(*item));
}
@@ -454,3 +541,16 @@ void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item